diff --git a/external/cddl/osnet/dist/cmd/zdb/zdb.c b/external/cddl/osnet/dist/cmd/zdb/zdb.c
index 48a84d693e449..6b2f3b7cd6a9d 100644
--- a/external/cddl/osnet/dist/cmd/zdb/zdb.c
+++ b/external/cddl/osnet/dist/cmd/zdb/zdb.c
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
  * Use is subject to license terms.
  */

@@ -51,10 +51,25 @@
 #include
 #include
 #include
+#include
 #undef ZFS_MAXNAMELEN
 #undef verify
 #include
+#define ZDB_COMPRESS_NAME(idx) ((idx) < ZIO_COMPRESS_FUNCTIONS ? \
+	zio_compress_table[(idx)].ci_name : "UNKNOWN")
+#define ZDB_CHECKSUM_NAME(idx) ((idx) < ZIO_CHECKSUM_FUNCTIONS ? \
+	zio_checksum_table[(idx)].ci_name : "UNKNOWN")
+#define ZDB_OT_NAME(idx) ((idx) < DMU_OT_NUMTYPES ? \
+	dmu_ot[(idx)].ot_name : "UNKNOWN")
+#define ZDB_OT_TYPE(idx) ((idx) < DMU_OT_NUMTYPES ? (idx) : DMU_OT_NUMTYPES)
+
+#ifndef lint
+extern int zfs_recover;
+#else
+int zfs_recover;
+#endif
+
 const char cmdname[] = "zdb";
 uint8_t dump_opt[256];

@@ -64,8 +79,6 @@ extern void dump_intent_log(zilog_t *);
 uint64_t *zopt_object = NULL;
 int zopt_objects = 0;
 libzfs_handle_t *g_zfs;
-boolean_t zdb_sig_user_data = B_TRUE;
-int zdb_sig_cksumalg = ZIO_CHECKSUM_SHA256;

 /*
  * These libumem hooks provide a reasonable set of defaults for the allocator's
@@ -87,42 +100,66 @@ static void
 usage(void)
 {
 	(void) fprintf(stderr,
-	    "Usage: %s [-udibcsvL] [-U cachefile_path] "
-	    "[-S user:cksumalg] "
-	    "dataset [object...]\n"
-	    " %s -C [pool]\n"
-	    " %s -l dev\n"
-	    " %s -R pool:vdev:offset:size:flags\n"
-	    " %s [-p path_to_vdev_dir]\n"
-	    " %s -e pool | GUID | devid ...\n",
-	    cmdname, cmdname, cmdname, cmdname, cmdname, cmdname);
-
-	(void) fprintf(stderr, " -u uberblock\n");
-	(void) fprintf(stderr, " -d datasets\n");
-	(void) fprintf(stderr, " -C cached pool configuration\n");
-	(void) fprintf(stderr, " -i intent logs\n");
-	(void) fprintf(stderr, " -b block statistics\n");
-	(void) fprintf(stderr, " -c checksum all data blocks\n");
-	(void) fprintf(stderr, " -s report stats on zdb's I/O\n");
-	(void) fprintf(stderr, " -S : -- "
-	    "dump blkptr signatures\n");
-	(void) fprintf(stderr, " -v verbose (applies to all others)\n");
+	    "Usage: %s [-CumdibcsvhL] poolname [object...]\n"
+	    " %s [-div] dataset [object...]\n"
+	    " %s -m [-L] poolname [vdev [metaslab...]]\n"
+	    " %s -R poolname vdev:offset:size[:flags]\n"
+	    " %s -S poolname\n"
+	    " %s -l [-u] device\n"
+	    " %s -C\n\n",
+	    cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname);
+
+	(void) fprintf(stderr, " Dataset name must include at least one " "separator character '/' or '@'\n");
+	(void) fprintf(stderr, " If dataset name is specified, only that " "dataset is dumped\n");
+	(void) fprintf(stderr, " If object numbers are specified, only " "those objects are dumped\n\n");
+	(void) fprintf(stderr, " Options to control amount of output:\n");
+	(void) fprintf(stderr, " -u uberblock\n");
+	(void) fprintf(stderr, " -d dataset(s)\n");
+	(void) fprintf(stderr, " -i intent logs\n");
+	(void) fprintf(stderr, " -C config (or cachefile if alone)\n");
+	(void) fprintf(stderr, " -h pool history\n");
+	(void) fprintf(stderr, " -b block statistics\n");
+	(void) fprintf(stderr, " -m metaslabs\n");
+	(void) fprintf(stderr, " -c checksum all metadata (twice for " "all data) blocks\n");
+	(void) fprintf(stderr, " -s report stats on zdb's I/O\n");
+	(void) fprintf(stderr, " -S simulate dedup to measure effect\n");
+	(void) fprintf(stderr, " -v verbose (applies to all others)\n");
 	(void) fprintf(stderr, " -l dump label contents\n");
 	(void) fprintf(stderr, " -L disable leak tracking (do not " "load spacemaps)\n");
-	(void) fprintf(stderr, " -U cachefile_path -- use alternate "
-	    "cachefile\n");
 	(void) fprintf(stderr, " -R read and display block from a "
-	    "device\n");
-	(void) fprintf(stderr, " -e Pool is exported/destroyed/"
-	    "has altroot\n");
-	(void) fprintf(stderr, " -p (use with -e)\n");
+	    "device\n\n");
+	(void) fprintf(stderr, " Below options are intended for use " "with other options (except -l):\n");
+	(void) fprintf(stderr, " -A ignore assertions (-A), enable " "panic recovery (-AA) or both (-AAA)\n");
+	(void) fprintf(stderr, " -F attempt automatic rewind within " "safe range of transaction groups\n");
+	(void) fprintf(stderr, " -U -- use alternate " "cachefile\n");
+	(void) fprintf(stderr, " -X attempt extreme rewind (does not " "work with dataset)\n");
+	(void) fprintf(stderr, " -e pool is exported/destroyed/" "has altroot/not in a cachefile\n");
+	(void) fprintf(stderr, " -p -- use one or more with " "-e to specify path to vdev dir\n");
+	(void) fprintf(stderr, " -t -- highest txg to use when " "searching for uberblocks\n");
 	(void) fprintf(stderr, "Specify an option more than once (e.g. -bb) " "to make only that option verbose\n");
 	(void) fprintf(stderr, "Default is to dump everything non-verbosely\n");
 	exit(1);
 }

+/*
+ * Called for usage errors that are discovered after a call to spa_open(),
+ * dmu_bonus_hold(), or pool_match(). abort() is called for other errors.
+ */
+
 static void
 fatal(const char *fmt, ...)
 {
@@ -134,69 +171,7 @@ fatal(const char *fmt, ...)
 	va_end(ap);
 	(void) fprintf(stderr, "\n");

-	abort();
-}
-
-static void
-dump_nvlist(nvlist_t *list, int indent)
-{
-	nvpair_t *elem = NULL;
-
-	while ((elem = nvlist_next_nvpair(list, elem)) != NULL) {
-		switch (nvpair_type(elem)) {
-		case DATA_TYPE_STRING:
-		{
-			char *value;
-
-			VERIFY(nvpair_value_string(elem, &value) == 0);
-			(void) printf("%*s%s='%s'\n", indent, "",
-			    nvpair_name(elem), value);
-		}
-		break;
-
-		case DATA_TYPE_UINT64:
-		{
-			uint64_t value;
-
-			VERIFY(nvpair_value_uint64(elem, &value) == 0);
-			(void) printf("%*s%s=%llu\n", indent, "",
-			    nvpair_name(elem), (u_longlong_t)value);
-		}
-		break;
-
-		case DATA_TYPE_NVLIST:
-		{
-			nvlist_t *value;
-
-			VERIFY(nvpair_value_nvlist(elem, &value) == 0);
-			(void) printf("%*s%s\n", indent, "",
-			    nvpair_name(elem));
-			dump_nvlist(value, indent + 4);
-		}
-		break;
-
-		case DATA_TYPE_NVLIST_ARRAY:
-		{
-			nvlist_t **value;
-			uint_t c, count;
-
-			VERIFY(nvpair_value_nvlist_array(elem, &value,
-			    &count) == 0);
-
-			for (c = 0; c < count; c++) {
-				(void) printf("%*s%s[%u]\n", indent, "",
-				    nvpair_name(elem), c);
-				dump_nvlist(value[c], indent + 8);
-			}
-		}
-		break;
-
-		default:
-
-			(void) printf("bad config type %d for %s\n",
-			    nvpair_type(elem), nvpair_name(elem));
-		}
-	}
+	exit(1);
 }

 /* ARGSUSED */
@@ -207,7 +182,7 @@ dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size)
 	size_t nvsize = *(uint64_t *)data;
 	char *packed = umem_alloc(nvsize, UMEM_NOFAIL);

-	VERIFY(0 == dmu_read(os, object, 0, nvsize, packed));
+	VERIFY(0 == dmu_read(os, object, 0, nvsize, packed, DMU_READ_PREFETCH));

 	VERIFY(nvlist_unpack(packed, nvsize, &nv, 0) == 0);

@@ -315,6 +290,13 @@ dump_none(objset_t *os, uint64_t object, void *data, size_t size)
 {
 }

+/*ARGSUSED*/
+static void
+dump_unknown(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	(void) printf("\tUNKNOWN OBJECT TYPE\n");
+}
+
 /*ARGSUSED*/
 void
 dump_uint8(objset_t *os, uint64_t object, void *data, size_t size)
@@ -377,6 +359,14 @@ dump_zap(objset_t *os, uint64_t object, void *data, size_t size)
 	zap_cursor_fini(&zc);
 }

+/*ARGSUSED*/
+static void
+dump_ddt_zap(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	dump_zap_stats(os, object);
+	/* contents are printed elsewhere, properly decoded */
+}
+
 /*ARGSUSED*/
 static void
 dump_zpldir(objset_t *os, uint64_t object, void *data, size_t size)
@@ -433,16 +423,16 @@ dump_spacemap(objset_t *os, space_map_obj_t *smo, space_map_t *sm)
 	alloc = 0;
 	for (offset = 0; offset < smo->smo_objsize; offset += sizeof (entry)) {
 		VERIFY(0 == dmu_read(os, smo->smo_object, offset,
-		    sizeof (entry), &entry));
+		    sizeof (entry), &entry, DMU_READ_PREFETCH));
 		if (SM_DEBUG_DECODE(entry)) {
-			(void) printf("\t\t[%4llu] %s: txg %llu, pass %llu\n",
+			(void) printf("\t [%6llu] %s: txg %llu, pass %llu\n",
 			    (u_longlong_t)(offset / sizeof (entry)),
 			    ddata[SM_DEBUG_ACTION_DECODE(entry)],
 			    (u_longlong_t)SM_DEBUG_TXG_DECODE(entry),
 			    (u_longlong_t)SM_DEBUG_SYNCPASS_DECODE(entry));
 		} else {
-			(void) printf("\t\t[%4llu] %c range:"
-			    " %08llx-%08llx size: %06llx\n",
+			(void) printf("\t [%6llu] %c range:"
+			    " %010llx-%010llx size: %06llx\n",
 			    (u_longlong_t)(offset / sizeof (entry)),
 			    SM_TYPE_DECODE(entry) == SM_ALLOC ? 'A' : 'F',
 			    (u_longlong_t)((SM_OFFSET_DECODE(entry) <<
@@ -464,100 +454,348 @@ dump_spacemap(objset_t *os, space_map_obj_t *smo, space_map_t *sm)
 	}
 }

+static void
+dump_metaslab_stats(metaslab_t *msp)
+{
+	char maxbuf[5];
+	space_map_t *sm = &msp->ms_map;
+	avl_tree_t *t = sm->sm_pp_root;
+	int free_pct = sm->sm_space * 100 / sm->sm_size;
+
+	nicenum(space_map_maxsize(sm), maxbuf);
+
+	(void) printf("\t %25s %10lu %7s %6s %4s %4d%%\n",
+	    "segments", avl_numnodes(t), "maxsize", maxbuf,
+	    "freepct", free_pct);
+}
+
 static void
 dump_metaslab(metaslab_t *msp)
 {
-	char freebuf[5];
-	space_map_obj_t *smo = &msp->ms_smo;
 	vdev_t *vd = msp->ms_group->mg_vd;
 	spa_t *spa = vd->vdev_spa;
+	space_map_t *sm = &msp->ms_map;
+	space_map_obj_t *smo = &msp->ms_smo;
+	char freebuf[5];

-	nicenum(msp->ms_map.sm_size - smo->smo_alloc, freebuf);
+	nicenum(sm->sm_size - smo->smo_alloc, freebuf);

-	if (dump_opt['d'] <= 5) {
-		(void) printf("\t%10llx %10llu %5s\n",
-		    (u_longlong_t)msp->ms_map.sm_start,
-		    (u_longlong_t)smo->smo_object,
-		    freebuf);
-		return;
+	(void) printf(
+	    "\tmetaslab %6llu offset %12llx spacemap %6llu free %5s\n",
+	    (u_longlong_t)(sm->sm_start / sm->sm_size),
+	    (u_longlong_t)sm->sm_start, (u_longlong_t)smo->smo_object, freebuf);
+
+	if (dump_opt['m'] > 1 && !dump_opt['L']) {
+		mutex_enter(&msp->ms_lock);
+		space_map_load_wait(sm);
+		if (!sm->sm_loaded)
+			VERIFY(space_map_load(sm, zfs_metaslab_ops,
+			    SM_FREE, smo, spa->spa_meta_objset) == 0);
+		dump_metaslab_stats(msp);
+		space_map_unload(sm);
+		mutex_exit(&msp->ms_lock);
 	}

-	(void) printf(
-	    "\tvdev %llu offset %08llx spacemap %4llu free %5s\n",
-	    (u_longlong_t)vd->vdev_id, (u_longlong_t)msp->ms_map.sm_start,
-	    (u_longlong_t)smo->smo_object, freebuf);
+	if (dump_opt['d'] > 5 || dump_opt['m'] > 2) {
+		ASSERT(sm->sm_size == (1ULL << vd->vdev_ms_shift));

-	ASSERT(msp->ms_map.sm_size == (1ULL << vd->vdev_ms_shift));
+		mutex_enter(&msp->ms_lock);
+		dump_spacemap(spa->spa_meta_objset, smo, sm);
+		mutex_exit(&msp->ms_lock);
+	}
+}

-	dump_spacemap(spa->spa_meta_objset, smo, &msp->ms_map);
+static void
+print_vdev_metaslab_header(vdev_t *vd)
+{
+	(void) printf("\tvdev %10llu\n\t%-10s%5llu %-19s %-15s %-10s\n",
+	    (u_longlong_t)vd->vdev_id,
+	    "metaslabs", (u_longlong_t)vd->vdev_ms_count,
+	    "offset", "spacemap", "free");
+	(void) printf("\t%15s %19s %15s %10s\n",
+	    "---------------", "-------------------",
+	    "---------------", "-------------");
 }

 static void
 dump_metaslabs(spa_t *spa)
 {
-	vdev_t *rvd = spa->spa_root_vdev;
-	vdev_t *vd;
-	int c, m;
+	vdev_t *vd, *rvd = spa->spa_root_vdev;
+	uint64_t m, c = 0, children = rvd->vdev_children;

 	(void) printf("\nMetaslabs:\n");

-	for (c = 0; c < rvd->vdev_children; c++) {
-		vd = rvd->vdev_child[c];
+	if (!dump_opt['d'] && zopt_objects > 0) {
+		c = zopt_object[0];

-		(void) printf("\n vdev %llu\n\n", (u_longlong_t)vd->vdev_id);
+		if (c >= children)
+			(void) fatal("bad vdev id: %llu", (u_longlong_t)c);

-		if (dump_opt['d'] <= 5) {
-			(void) printf("\t%10s %10s %5s\n",
-			    "offset", "spacemap", "free");
-			(void) printf("\t%10s %10s %5s\n",
-			    "------", "--------", "----");
+		if (zopt_objects > 1) {
+			vd = rvd->vdev_child[c];
+			print_vdev_metaslab_header(vd);
+
+			for (m = 1; m < zopt_objects; m++) {
+				if (zopt_object[m] < vd->vdev_ms_count)
+					dump_metaslab(
+					    vd->vdev_ms[zopt_object[m]]);
+				else
+					(void) fprintf(stderr, "bad metaslab " "number %llu\n",
+					    (u_longlong_t)zopt_object[m]);
+			}
+			(void) printf("\n");
+			return;
 		}
+		children = c + 1;
+	}
+	for (; c < children; c++) {
+		vd = rvd->vdev_child[c];
+		print_vdev_metaslab_header(vd);
+
 		for (m = 0; m < vd->vdev_ms_count; m++)
 			dump_metaslab(vd->vdev_ms[m]);
 		(void) printf("\n");
 	}
 }

+static void
+dump_dde(const ddt_t *ddt, const ddt_entry_t *dde, uint64_t index)
+{
+	const ddt_phys_t *ddp = dde->dde_phys;
+	const ddt_key_t *ddk = &dde->dde_key;
+	char *types[4] = { "ditto", "single", "double", "triple" };
+	char blkbuf[BP_SPRINTF_LEN];
+	blkptr_t blk;
+
+	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+		if (ddp->ddp_phys_birth == 0)
+			continue;
+		ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
+		sprintf_blkptr(blkbuf, &blk);
+		(void) printf("index %llx refcnt %llu %s %s\n",
+		    (u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt,
+		    types[p], blkbuf);
+	}
+}
+
+static void
+dump_dedup_ratio(const ddt_stat_t *dds)
+{
+	double rL, rP, rD, D, dedup, compress, copies;
+
+	if (dds->dds_blocks == 0)
+		return;
+
+	rL = (double)dds->dds_ref_lsize;
+	rP = (double)dds->dds_ref_psize;
+	rD = (double)dds->dds_ref_dsize;
+	D = (double)dds->dds_dsize;
+
+	dedup = rD / D;
+	compress = rL / rP;
+	copies = rD / rP;
+
+	(void) printf("dedup = %.2f, compress = %.2f, copies = %.2f, "
+	    "dedup * compress / copies = %.2f\n\n",
+	    dedup, compress, copies, dedup * compress / copies);
+}
+
+static void
+dump_ddt(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
+{
+	char name[DDT_NAMELEN];
+	ddt_entry_t dde;
+	uint64_t walk = 0;
+	dmu_object_info_t doi;
+	uint64_t count, dspace, mspace;
+	int error;
+
+	error = ddt_object_info(ddt, type, class, &doi);
+
+	if (error == ENOENT)
+		return;
+	ASSERT(error == 0);
+
+	count = ddt_object_count(ddt, type, class);
+	dspace = doi.doi_physical_blocks_512 << 9;
+	mspace = doi.doi_fill_count * doi.doi_data_block_size;
+
+	ASSERT(count != 0);	/* we should have destroyed it */
+
+	ddt_object_name(ddt, type, class, name);
+
+	(void) printf("%s: %llu entries, size %llu on disk, %llu in core\n",
+	    name,
+	    (u_longlong_t)count,
+	    (u_longlong_t)(dspace / count),
+	    (u_longlong_t)(mspace / count));
+
+	if (dump_opt['D'] < 3)
+		return;
+
+	zpool_dump_ddt(NULL, &ddt->ddt_histogram[type][class]);
+
+	if (dump_opt['D'] < 4)
+		return;
+
+	if (dump_opt['D'] < 5 && class == DDT_CLASS_UNIQUE)
+		return;
+
+	(void) printf("%s contents:\n\n", name);
+
+	while ((error = ddt_object_walk(ddt, type, class, &walk, &dde)) == 0)
+		dump_dde(ddt, &dde, walk);
+
+	ASSERT(error == ENOENT);
+
+	(void) printf("\n");
+}
+
+static void
+dump_all_ddts(spa_t *spa)
+{
+	ddt_histogram_t ddh_total = { 0 };
+	ddt_stat_t dds_total = { 0 };
+
+	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+		ddt_t *ddt = spa->spa_ddt[c];
+		for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+			for (enum ddt_class class = 0; class < DDT_CLASSES;
+			    class++) {
+				dump_ddt(ddt, type, class);
+			}
+		}
+	}
+
+	ddt_get_dedup_stats(spa, &dds_total);
+
+	if (dds_total.dds_blocks == 0) {
+		(void) printf("All DDTs are empty\n");
+		return;
+	}
+
+	(void) printf("\n");
+
+	if (dump_opt['D'] > 1) {
+		(void) printf("DDT histogram (aggregated over all DDTs):\n");
+		ddt_get_dedup_histogram(spa, &ddh_total);
+		zpool_dump_ddt(&dds_total, &ddh_total);
+	}
+
+	dump_dedup_ratio(&dds_total);
+}
+
+static void
+dump_dtl_seg(space_map_t *sm, uint64_t start, uint64_t size)
+{
+	char *prefix = (void *)sm;
+
+	(void) printf("%s [%llu,%llu) length %llu\n",
+	    prefix,
+	    (u_longlong_t)start,
+	    (u_longlong_t)(start + size),
+	    (u_longlong_t)(size));
+}
+
 static void
 dump_dtl(vdev_t *vd, int indent)
 {
-	avl_tree_t *t = &vd->vdev_dtl_map.sm_root;
-	space_seg_t *ss;
-	vdev_t *pvd;
-	int c;
+	spa_t *spa = vd->vdev_spa;
+	boolean_t required;
+	char *name[DTL_TYPES] = { "missing", "partial", "scrub", "outage" };
+	char prefix[256];
+
+	spa_vdev_state_enter(spa, SCL_NONE);
+	required = vdev_dtl_required(vd);
+	(void) spa_vdev_state_exit(spa, NULL, 0);

 	if (indent == 0)
 		(void) printf("\nDirty time logs:\n\n");

-	(void) printf("\t%*s%s\n", indent, "",
+	(void) printf("\t%*s%s [%s]\n", indent, "",
 	    vd->vdev_path ? vd->vdev_path :
-	    vd->vdev_parent ? vd->vdev_ops->vdev_op_type :
-	    spa_name(vd->vdev_spa));
+	    vd->vdev_parent ? vd->vdev_ops->vdev_op_type : spa_name(spa),
+	    required ? "DTL-required" : "DTL-expendable");

-	for (ss = avl_first(t); ss; ss = AVL_NEXT(t, ss)) {
-		/*
-		 * Everything in this DTL must appear in all parent DTL unions.
- */ - for (pvd = vd; pvd; pvd = pvd->vdev_parent) - ASSERT(vdev_dtl_contains(&pvd->vdev_dtl_map, - ss->ss_start, ss->ss_end - ss->ss_start)); - (void) printf("\t%*soutage [%llu,%llu] length %llu\n", - indent, "", - (u_longlong_t)ss->ss_start, - (u_longlong_t)ss->ss_end - 1, - (u_longlong_t)(ss->ss_end - ss->ss_start)); + for (int t = 0; t < DTL_TYPES; t++) { + space_map_t *sm = &vd->vdev_dtl[t]; + if (sm->sm_space == 0) + continue; + (void) snprintf(prefix, sizeof (prefix), "\t%*s%s", + indent + 2, "", name[t]); + mutex_enter(sm->sm_lock); + space_map_walk(sm, dump_dtl_seg, (void *)prefix); + mutex_exit(sm->sm_lock); + if (dump_opt['d'] > 5 && vd->vdev_children == 0) + dump_spacemap(spa->spa_meta_objset, + &vd->vdev_dtl_smo, sm); } - (void) printf("\n"); + for (int c = 0; c < vd->vdev_children; c++) + dump_dtl(vd->vdev_child[c], indent + 4); +} - if (dump_opt['d'] > 5 && vd->vdev_children == 0) { - dump_spacemap(vd->vdev_spa->spa_meta_objset, &vd->vdev_dtl, - &vd->vdev_dtl_map); - (void) printf("\n"); - } +static void +dump_history(spa_t *spa) +{ + nvlist_t **events = NULL; + char buf[SPA_MAXBLOCKSIZE]; + uint64_t resid, len, off = 0; + uint_t num = 0; + int error; + time_t tsec; + struct tm t; + char tbuf[30]; + char internalstr[MAXPATHLEN]; + + do { + len = sizeof (buf); + + if ((error = spa_history_get(spa, &off, &len, buf)) != 0) { + (void) fprintf(stderr, "Unable to read history: " + "error %d\n", error); + return; + } - for (c = 0; c < vd->vdev_children; c++) - dump_dtl(vd->vdev_child[c], indent + 4); + if (zpool_history_unpack(buf, len, &resid, &events, &num) != 0) + break; + + off -= resid; + } while (len != 0); + + (void) printf("\nHistory:\n"); + for (int i = 0; i < num; i++) { + uint64_t time, txg, ievent; + char *cmd, *intstr; + + if (nvlist_lookup_uint64(events[i], ZPOOL_HIST_TIME, + &time) != 0) + continue; + if (nvlist_lookup_string(events[i], ZPOOL_HIST_CMD, + &cmd) != 0) { + if (nvlist_lookup_uint64(events[i], + ZPOOL_HIST_INT_EVENT, &ievent) != 0) + continue; + verify(nvlist_lookup_uint64(events[i], + ZPOOL_HIST_TXG, &txg) == 0); + verify(nvlist_lookup_string(events[i], + ZPOOL_HIST_INT_STR, &intstr) == 0); + if (ievent >= LOG_END) + continue; + + (void) snprintf(internalstr, + sizeof (internalstr), + "[internal %s txg:%lld] %s", + hist_event_table[ievent], txg, + intstr); + cmd = internalstr; + } + tsec = time; + (void) localtime_r(&tsec, &t); + (void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t); + (void) printf("%s %s\n", tbuf, cmd); + } } /*ARGSUSED*/ @@ -567,35 +805,48 @@ dump_dnode(objset_t *os, uint64_t object, void *data, size_t size) } static uint64_t -blkid2offset(const dnode_phys_t *dnp, int level, uint64_t blkid) +blkid2offset(const dnode_phys_t *dnp, const blkptr_t *bp, const zbookmark_t *zb) { - if (level < 0) - return (blkid); + if (dnp == NULL) { + ASSERT(zb->zb_level < 0); + if (zb->zb_object == 0) + return (zb->zb_blkid); + return (zb->zb_blkid * BP_GET_LSIZE(bp)); + } + + ASSERT(zb->zb_level >= 0); - return ((blkid << (level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) * + return ((zb->zb_blkid << + (zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) * dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); } static void -sprintf_blkptr_compact(char *blkbuf, blkptr_t *bp, int alldvas) +sprintf_blkptr_compact(char *blkbuf, blkptr_t *bp) { dva_t *dva = bp->blk_dva; - int ndvas = alldvas ? BP_GET_NDVAS(bp) : 1; - int i; + int ndvas = dump_opt['d'] > 5 ? 
BP_GET_NDVAS(bp) : 1; + + if (dump_opt['b'] >= 5) { + sprintf_blkptr(blkbuf, bp); + return; + } blkbuf[0] = '\0'; - for (i = 0; i < ndvas; i++) + for (int i = 0; i < ndvas; i++) (void) sprintf(blkbuf + strlen(blkbuf), "%llu:%llx:%llx ", (u_longlong_t)DVA_GET_VDEV(&dva[i]), (u_longlong_t)DVA_GET_OFFSET(&dva[i]), (u_longlong_t)DVA_GET_ASIZE(&dva[i])); - (void) sprintf(blkbuf + strlen(blkbuf), "%llxL/%llxP F=%llu B=%llu", + (void) sprintf(blkbuf + strlen(blkbuf), + "%llxL/%llxP F=%llu B=%llu/%llu", (u_longlong_t)BP_GET_LSIZE(bp), (u_longlong_t)BP_GET_PSIZE(bp), (u_longlong_t)bp->blk_fill, - (u_longlong_t)bp->blk_birth); + (u_longlong_t)bp->blk_birth, + (u_longlong_t)BP_PHYSICAL_BIRTH(bp)); } static void @@ -608,8 +859,7 @@ print_indirect(blkptr_t *bp, const zbookmark_t *zb, ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type); ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level); - (void) printf("%16llx ", - (u_longlong_t)blkid2offset(dnp, zb->zb_level, zb->zb_blkid)); + (void) printf("%16llx ", (u_longlong_t)blkid2offset(dnp, bp, zb)); ASSERT(zb->zb_level >= 0); @@ -621,23 +871,15 @@ print_indirect(blkptr_t *bp, const zbookmark_t *zb, } } - sprintf_blkptr_compact(blkbuf, bp, dump_opt['d'] > 5 ? 1 : 0); + sprintf_blkptr_compact(blkbuf, bp); (void) printf("%s\n", blkbuf); } -#define SET_BOOKMARK(zb, objset, object, level, blkid) \ -{ \ - (zb)->zb_objset = objset; \ - (zb)->zb_object = object; \ - (zb)->zb_level = level; \ - (zb)->zb_blkid = blkid; \ -} - static int visit_indirect(spa_t *spa, const dnode_phys_t *dnp, blkptr_t *bp, const zbookmark_t *zb) { - int err; + int err = 0; if (bp->blk_birth == 0) return (0); @@ -670,7 +912,8 @@ visit_indirect(spa_t *spa, const dnode_phys_t *dnp, break; fill += cbp->blk_fill; } - ASSERT3U(fill, ==, bp->blk_fill); + if (!err) + ASSERT3U(fill, ==, bp->blk_fill); (void) arc_buf_remove_ref(buf, &buf); } @@ -687,11 +930,11 @@ dump_indirect(dnode_t *dn) (void) printf("Indirect blocks:\n"); - SET_BOOKMARK(&czb, dmu_objset_id(&dn->dn_objset->os), + SET_BOOKMARK(&czb, dmu_objset_id(dn->dn_objset), dn->dn_object, dnp->dn_nlevels - 1, 0); for (j = 0; j < dnp->dn_nblkptr; j++) { czb.zb_blkid = j; - (void) visit_indirect(dmu_objset_spa(&dn->dn_objset->os), dnp, + (void) visit_indirect(dmu_objset_spa(dn->dn_objset), dnp, &dnp->dn_blkptr[j], &czb); } @@ -767,7 +1010,7 @@ dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size) nicenum(ds->ds_compressed_bytes, compressed); nicenum(ds->ds_uncompressed_bytes, uncompressed); nicenum(ds->ds_unique_bytes, unique); - sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, &ds->ds_bp); + sprintf_blkptr(blkbuf, &ds->ds_bp); (void) printf("\t\tdir_obj = %llu\n", (u_longlong_t)ds->ds_dir_obj); @@ -781,6 +1024,8 @@ dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size) (u_longlong_t)ds->ds_snapnames_zapobj); (void) printf("\t\tnum_children = %llu\n", (u_longlong_t)ds->ds_num_children); + (void) printf("\t\tuserrefs_obj = %llu\n", + (u_longlong_t)ds->ds_userrefs_obj); (void) printf("\t\tcreation_time = %s", ctime(&crtime)); (void) printf("\t\tcreation_txg = %llu\n", (u_longlong_t)ds->ds_creation_txg); @@ -816,11 +1061,11 @@ dump_bplist(objset_t *mos, uint64_t object, char *name) if (dump_opt['d'] < 3) return; - mutex_init(&bpl.bpl_lock, NULL, MUTEX_DEFAULT, NULL); + bplist_init(&bpl); VERIFY(0 == bplist_open(&bpl, mos, object)); if (bplist_empty(&bpl)) { bplist_close(&bpl); - mutex_destroy(&bpl.bpl_lock); + bplist_fini(&bpl); return; } @@ -838,7 +1083,7 @@ dump_bplist(objset_t *mos, uint64_t object, char *name) if (dump_opt['d'] < 
5) { bplist_close(&bpl); - mutex_destroy(&bpl.bpl_lock); + bplist_fini(&bpl); return; } @@ -848,13 +1093,13 @@ dump_bplist(objset_t *mos, uint64_t object, char *name) char blkbuf[BP_SPRINTF_LEN]; ASSERT(bp->blk_birth != 0); - sprintf_blkptr_compact(blkbuf, bp, dump_opt['d'] > 5 ? 1 : 0); + sprintf_blkptr_compact(blkbuf, bp); (void) printf("\tItem %3llu: %s\n", (u_longlong_t)itor - 1, blkbuf); } bplist_close(&bpl); - mutex_destroy(&bpl.bpl_lock); + bplist_fini(&bpl); } static avl_tree_t idx_tree; @@ -906,6 +1151,7 @@ dump_uidgid(objset_t *os, znode_phys_t *zp) /* first find the fuid object. It lives in the master node */ VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, &fuid_obj) == 0); + zfs_fuid_avl_tree_create(&idx_tree, &domain_tree); (void) zfs_fuid_table_load(os, fuid_obj, &idx_tree, &domain_tree); fuid_table_loaded = B_TRUE; @@ -969,7 +1215,7 @@ dump_dmu_objset(objset_t *os, uint64_t object, void *data, size_t size) { } -static object_viewer_t *object_viewer[DMU_OT_NUMTYPES] = { +static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = { dump_none, /* unallocated */ dump_zap, /* object directory */ dump_uint64, /* object array */ @@ -1009,6 +1255,12 @@ static object_viewer_t *object_viewer[DMU_OT_NUMTYPES] = { dump_packed_nvlist, /* FUID nvlist size */ dump_zap, /* DSL dataset next clones */ dump_zap, /* DSL scrub queue */ + dump_zap, /* ZFS user/group used */ + dump_zap, /* ZFS user/group quota */ + dump_zap, /* snapshot refcount tags */ + dump_ddt_zap, /* DDT ZAP object */ + dump_zap, /* DDT statistics */ + dump_unknown /* Unknown type, must be last */ }; static void @@ -1019,18 +1271,19 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header) dnode_t *dn; void *bonus = NULL; size_t bsize = 0; - char iblk[6], dblk[6], lsize[6], asize[6], bonus_size[6], segsize[6]; + char iblk[6], dblk[6], lsize[6], asize[6], bonus_size[6], fill[7]; char aux[50]; int error; if (*print_header) { - (void) printf("\n Object lvl iblk dblk lsize" - " asize type\n"); + (void) printf("\n%10s %3s %5s %5s %5s %5s %6s %s\n", + "Object", "lvl", "iblk", "dblk", "dsize", "lsize", + "%full", "type"); *print_header = 0; } if (object == 0) { - dn = os->os->os_meta_dnode; + dn = os->os_meta_dnode; } else { error = dmu_bonus_hold(os, object, FTAG, &db); if (error) @@ -1044,36 +1297,47 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header) nicenum(doi.doi_metadata_block_size, iblk); nicenum(doi.doi_data_block_size, dblk); - nicenum(doi.doi_data_block_size * (doi.doi_max_block_offset + 1), - lsize); - nicenum(doi.doi_physical_blks << 9, asize); + nicenum(doi.doi_max_offset, lsize); + nicenum(doi.doi_physical_blocks_512 << 9, asize); nicenum(doi.doi_bonus_size, bonus_size); + (void) sprintf(fill, "%6.2f", 100.0 * doi.doi_fill_count * + doi.doi_data_block_size / (object == 0 ? 
DNODES_PER_BLOCK : 1) / + doi.doi_max_offset); aux[0] = '\0'; if (doi.doi_checksum != ZIO_CHECKSUM_INHERIT || verbosity >= 6) { (void) snprintf(aux + strlen(aux), sizeof (aux), " (K=%s)", - zio_checksum_table[doi.doi_checksum].ci_name); + ZDB_CHECKSUM_NAME(doi.doi_checksum)); } if (doi.doi_compress != ZIO_COMPRESS_INHERIT || verbosity >= 6) { (void) snprintf(aux + strlen(aux), sizeof (aux), " (Z=%s)", - zio_compress_table[doi.doi_compress].ci_name); + ZDB_COMPRESS_NAME(doi.doi_compress)); } - (void) printf("%10lld %3u %5s %5s %5s %5s %s%s\n", - (u_longlong_t)object, doi.doi_indirection, iblk, dblk, lsize, - asize, dmu_ot[doi.doi_type].ot_name, aux); + (void) printf("%10lld %3u %5s %5s %5s %5s %6s %s%s\n", + (u_longlong_t)object, doi.doi_indirection, iblk, dblk, + asize, lsize, fill, ZDB_OT_NAME(doi.doi_type), aux); if (doi.doi_bonus_type != DMU_OT_NONE && verbosity > 3) { - (void) printf("%10s %3s %5s %5s %5s %5s %s\n", - "", "", "", "", bonus_size, "bonus", - dmu_ot[doi.doi_bonus_type].ot_name); + (void) printf("%10s %3s %5s %5s %5s %5s %6s %s\n", + "", "", "", "", "", bonus_size, "bonus", + ZDB_OT_NAME(doi.doi_bonus_type)); } if (verbosity >= 4) { - object_viewer[doi.doi_bonus_type](os, object, bonus, bsize); - object_viewer[doi.doi_type](os, object, NULL, 0); + (void) printf("\tdnode flags: %s%s\n", + (dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) ? + "USED_BYTES " : "", + (dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED) ? + "USERUSED_ACCOUNTED " : ""); + (void) printf("\tdnode maxblkid: %llu\n", + (longlong_t)dn->dn_phys->dn_maxblkid); + + object_viewer[ZDB_OT_TYPE(doi.doi_bonus_type)](os, object, + bonus, bsize); + object_viewer[ZDB_OT_TYPE(doi.doi_type)](os, object, NULL, 0); *print_header = 1; } @@ -1095,6 +1359,7 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header) } for (;;) { + char segsize[6]; error = dnode_next_offset(dn, 0, &start, minlvl, blkfill, 0); if (error) @@ -1126,7 +1391,7 @@ dump_dir(objset_t *os) uint64_t object, object_count; uint64_t refdbytes, usedobjs, scratch; char numbuf[8]; - char blkbuf[BP_SPRINTF_LEN]; + char blkbuf[BP_SPRINTF_LEN + 20]; char osname[MAXNAMELEN]; char *type = "UNKNOWN"; int verbosity = dump_opt['d']; @@ -1140,21 +1405,20 @@ dump_dir(objset_t *os) if (dds.dds_type == DMU_OST_META) { dds.dds_creation_txg = TXG_INITIAL; - usedobjs = os->os->os_rootbp->blk_fill; - refdbytes = os->os->os_spa->spa_dsl_pool-> + usedobjs = os->os_rootbp->blk_fill; + refdbytes = os->os_spa->spa_dsl_pool-> dp_mos_dir->dd_phys->dd_used_bytes; } else { dmu_objset_space(os, &refdbytes, &scratch, &usedobjs, &scratch); } - ASSERT3U(usedobjs, ==, os->os->os_rootbp->blk_fill); + ASSERT3U(usedobjs, ==, os->os_rootbp->blk_fill); nicenum(refdbytes, numbuf); if (verbosity >= 4) { - (void) strcpy(blkbuf, ", rootbp "); - sprintf_blkptr(blkbuf + strlen(blkbuf), - BP_SPRINTF_LEN - strlen(blkbuf), os->os->os_rootbp); + (void) sprintf(blkbuf, ", rootbp "); + (void) sprintf_blkptr(blkbuf + strlen(blkbuf), os->os_rootbp); } else { blkbuf[0] = '\0'; } @@ -1167,7 +1431,16 @@ dump_dir(objset_t *os) (u_longlong_t)dds.dds_creation_txg, numbuf, (u_longlong_t)usedobjs, blkbuf); - dump_intent_log(dmu_objset_zil(os)); + if (zopt_objects != 0) { + for (i = 0; i < zopt_objects; i++) + dump_object(os, zopt_object[i], verbosity, + &print_header); + (void) printf("\n"); + return; + } + + if (dump_opt['i'] != 0 || verbosity >= 2) + dump_intent_log(dmu_objset_zil(os)); if (dmu_objset_ds(os) != NULL) dump_bplist(dmu_objset_pool(os)->dp_meta_objset, @@ -1176,19 +1449,16 
@@ dump_dir(objset_t *os) if (verbosity < 2) return; - if (os->os->os_rootbp->blk_birth == 0) - return; - - if (zopt_objects != 0) { - for (i = 0; i < zopt_objects; i++) - dump_object(os, zopt_object[i], verbosity, - &print_header); - (void) printf("\n"); + if (os->os_rootbp->blk_birth == 0) return; - } dump_object(os, 0, verbosity, &print_header); - object_count = 1; + object_count = 0; + if (os->os_userused_dnode && + os->os_userused_dnode->dn_type != 0) { + dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header); + dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header); + } object = 0; while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) { @@ -1200,16 +1470,18 @@ dump_dir(objset_t *os) (void) printf("\n"); - if (error != ESRCH) - fatal("dmu_object_next() = %d", error); + if (error != ESRCH) { + (void) fprintf(stderr, "dmu_object_next() = %d\n", error); + abort(); + } } static void -dump_uberblock(uberblock_t *ub) +dump_uberblock(uberblock_t *ub, const char *header, const char *footer) { time_t timestamp = ub->ub_timestamp; - (void) printf("Uberblock\n\n"); + (void) printf(header ? header : ""); (void) printf("\tmagic = %016llx\n", (u_longlong_t)ub->ub_magic); (void) printf("\tversion = %llu\n", (u_longlong_t)ub->ub_version); (void) printf("\ttxg = %llu\n", (u_longlong_t)ub->ub_txg); @@ -1218,25 +1490,34 @@ dump_uberblock(uberblock_t *ub) (u_longlong_t)ub->ub_timestamp, asctime(localtime(×tamp))); if (dump_opt['u'] >= 3) { char blkbuf[BP_SPRINTF_LEN]; - sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, &ub->ub_rootbp); + sprintf_blkptr(blkbuf, &ub->ub_rootbp); (void) printf("\trootbp = %s\n", blkbuf); } - (void) printf("\n"); + (void) printf(footer ? footer : ""); } static void -dump_config(const char *pool) +dump_config(spa_t *spa) { - spa_t *spa = NULL; + dmu_buf_t *db; + size_t nvsize = 0; + int error = 0; - mutex_enter(&spa_namespace_lock); - while ((spa = spa_next(spa)) != NULL) { - if (pool == NULL) - (void) printf("%s\n", spa_name(spa)); - if (pool == NULL || strcmp(pool, spa_name(spa)) == 0) - dump_nvlist(spa->spa_config, 4); + + error = dmu_bonus_hold(spa->spa_meta_objset, + spa->spa_config_object, FTAG, &db); + + if (error == 0) { + nvsize = *(uint64_t *)db->db_data; + dmu_buf_rele(db, FTAG); + + (void) printf("\nMOS Configuration:\n"); + dump_packed_nvlist(spa->spa_meta_objset, + spa->spa_config_object, (void *)&nvsize, 1); + } else { + (void) fprintf(stderr, "dmu_bonus_hold(%llu) failed, errno %d", + (u_longlong_t)spa->spa_config_object, error); } - mutex_exit(&spa_namespace_lock); } static void @@ -1285,6 +1566,30 @@ dump_cachefile(const char *cachefile) nvlist_free(config); } +#define ZDB_MAX_UB_HEADER_SIZE 32 + +static void +dump_label_uberblocks(vdev_label_t *lbl, uint64_t ashift) +{ + vdev_t vd; + vdev_t *vdp = &vd; + char header[ZDB_MAX_UB_HEADER_SIZE]; + + vd.vdev_ashift = ashift; + vdp->vdev_top = vdp; + + for (int i = 0; i < VDEV_UBERBLOCK_COUNT(vdp); i++) { + uint64_t uoff = VDEV_UBERBLOCK_OFFSET(vdp, i); + uberblock_t *ub = (void *)((char *)lbl + uoff); + + if (uberblock_verify(ub)) + continue; + (void) snprintf(header, ZDB_MAX_UB_HEADER_SIZE, + "Uberblock[%d]\n", i); + dump_uberblock(ub, header, ""); + } +} + static void dump_label(const char *dev) { @@ -1293,8 +1598,7 @@ dump_label(const char *dev) char *buf = label.vl_vdev_phys.vp_nvlist; size_t buflen = sizeof (label.vl_vdev_phys.vp_nvlist); struct stat64 statbuf; - uint64_t psize; - int l; + uint64_t psize, ashift; if ((fd = open64(dev, O_RDONLY)) < 0) { (void) printf("cannot open '%s': %s\n", 
dev, strerror(errno)); @@ -1304,14 +1608,12 @@ dump_label(const char *dev) if (fstat64(fd, &statbuf) != 0) { (void) printf("failed to stat '%s': %s\n", dev, strerror(errno)); - exit(1); } psize = statbuf.st_size; psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t)); - for (l = 0; l < VDEV_LABELS; l++) { - + for (int l = 0; l < VDEV_LABELS; l++) { nvlist_t *config = NULL; (void) printf("--------------------------------------------\n"); @@ -1326,130 +1628,89 @@ dump_label(const char *dev) if (nvlist_unpack(buf, buflen, &config, 0) != 0) { (void) printf("failed to unpack label %d\n", l); - continue; + ashift = SPA_MINBLOCKSHIFT; + } else { + nvlist_t *vdev_tree = NULL; + + dump_nvlist(config, 4); + if ((nvlist_lookup_nvlist(config, + ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) || + (nvlist_lookup_uint64(vdev_tree, + ZPOOL_CONFIG_ASHIFT, &ashift) != 0)) + ashift = SPA_MINBLOCKSHIFT; + nvlist_free(config); } - dump_nvlist(config, 4); - nvlist_free(config); + if (dump_opt['u']) + dump_label_uberblocks(&label, ashift); } } /*ARGSUSED*/ static int -dump_one_dir(char *dsname, void *arg) +dump_one_dir(const char *dsname, void *arg) { int error; objset_t *os; - error = dmu_objset_open(dsname, DMU_OST_ANY, - DS_MODE_USER | DS_MODE_READONLY, &os); + error = dmu_objset_own(dsname, DMU_OST_ANY, B_TRUE, FTAG, &os); if (error) { - (void) printf("Could not open %s\n", dsname); + (void) printf("Could not open %s, error %d\n", dsname, error); return (0); } dump_dir(os); - dmu_objset_close(os); + dmu_objset_disown(os, FTAG); fuid_table_destroy(); return (0); } -static void -zdb_leak(space_map_t *sm, uint64_t start, uint64_t size) -{ - vdev_t *vd = sm->sm_ppd; +/* + * Block statistics. + */ +typedef struct zdb_blkstats { + uint64_t zb_asize; + uint64_t zb_lsize; + uint64_t zb_psize; + uint64_t zb_count; +} zdb_blkstats_t; - (void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n", - (u_longlong_t)vd->vdev_id, (u_longlong_t)start, (u_longlong_t)size); -} +/* + * Extended object types to report deferred frees and dedup auto-ditto blocks. 
+ */ +#define ZDB_OT_DEFERRED (DMU_OT_NUMTYPES + 0) +#define ZDB_OT_DITTO (DMU_OT_NUMTYPES + 1) +#define ZDB_OT_TOTAL (DMU_OT_NUMTYPES + 2) + +static char *zdb_ot_extname[] = { + "deferred free", + "dedup ditto", + "Total", +}; -/* ARGSUSED */ -static void -zdb_space_map_load(space_map_t *sm) -{ -} +#define ZB_TOTAL DN_MAX_LEVELS -static void -zdb_space_map_unload(space_map_t *sm) -{ - space_map_vacate(sm, zdb_leak, sm); -} +typedef struct zdb_cb { + zdb_blkstats_t zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1]; + uint64_t zcb_dedup_asize; + uint64_t zcb_dedup_blocks; + uint64_t zcb_errors[256]; + int zcb_readfails; + int zcb_haderrors; +} zdb_cb_t; -/* ARGSUSED */ static void -zdb_space_map_claim(space_map_t *sm, uint64_t start, uint64_t size) +zdb_count_block(spa_t *spa, zilog_t *zilog, zdb_cb_t *zcb, const blkptr_t *bp, + dmu_object_type_t type) { -} + uint64_t refcnt = 0; -static space_map_ops_t zdb_space_map_ops = { - zdb_space_map_load, - zdb_space_map_unload, - NULL, /* alloc */ - zdb_space_map_claim, - NULL /* free */ -}; + ASSERT(type < ZDB_OT_TOTAL); -static void -zdb_leak_init(spa_t *spa) -{ - vdev_t *rvd = spa->spa_root_vdev; - - for (int c = 0; c < rvd->vdev_children; c++) { - vdev_t *vd = rvd->vdev_child[c]; - for (int m = 0; m < vd->vdev_ms_count; m++) { - metaslab_t *msp = vd->vdev_ms[m]; - mutex_enter(&msp->ms_lock); - VERIFY(space_map_load(&msp->ms_map, &zdb_space_map_ops, - SM_ALLOC, &msp->ms_smo, spa->spa_meta_objset) == 0); - msp->ms_map.sm_ppd = vd; - mutex_exit(&msp->ms_lock); - } - } -} - -static void -zdb_leak_fini(spa_t *spa) -{ - vdev_t *rvd = spa->spa_root_vdev; - - for (int c = 0; c < rvd->vdev_children; c++) { - vdev_t *vd = rvd->vdev_child[c]; - for (int m = 0; m < vd->vdev_ms_count; m++) { - metaslab_t *msp = vd->vdev_ms[m]; - mutex_enter(&msp->ms_lock); - space_map_unload(&msp->ms_map); - mutex_exit(&msp->ms_lock); - } - } -} - -/* - * Verify that the sum of the sizes of all blocks in the pool adds up - * to the SPA's sa_alloc total. - */ -typedef struct zdb_blkstats { - uint64_t zb_asize; - uint64_t zb_lsize; - uint64_t zb_psize; - uint64_t zb_count; -} zdb_blkstats_t; - -#define DMU_OT_DEFERRED DMU_OT_NONE -#define DMU_OT_TOTAL DMU_OT_NUMTYPES - -#define ZB_TOTAL DN_MAX_LEVELS - -typedef struct zdb_cb { - zdb_blkstats_t zcb_type[ZB_TOTAL + 1][DMU_OT_TOTAL + 1]; - uint64_t zcb_errors[256]; - int zcb_readfails; - int zcb_haderrors; -} zdb_cb_t; + if (zilog && zil_bp_tree_add(zilog, bp) != 0) + return; -static void -zdb_count_block(spa_t *spa, zdb_cb_t *zcb, blkptr_t *bp, dmu_object_type_t type) -{ for (int i = 0; i < 4; i++) { int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL; - int t = (i & 1) ? type : DMU_OT_TOTAL; + int t = (i & 1) ? 
type : ZDB_OT_TOTAL; zdb_blkstats_t *zb = &zcb->zcb_type[l][t]; zb->zb_asize += BP_GET_ASIZE(bp); @@ -1458,114 +1719,240 @@ zdb_count_block(spa_t *spa, zdb_cb_t *zcb, blkptr_t *bp, dmu_object_type_t type) zb->zb_count++; } - if (dump_opt['S']) { - boolean_t print_sig; - - print_sig = !zdb_sig_user_data || (BP_GET_LEVEL(bp) == 0 && - BP_GET_TYPE(bp) == DMU_OT_PLAIN_FILE_CONTENTS); - - if (BP_GET_CHECKSUM(bp) < zdb_sig_cksumalg) - print_sig = B_FALSE; - - if (print_sig) { - (void) printf("%llu\t%lld\t%lld\t%s\t%s\t%s\t" - "%llx:%llx:%llx:%llx\n", - (u_longlong_t)BP_GET_LEVEL(bp), - (longlong_t)BP_GET_PSIZE(bp), - (longlong_t)BP_GET_NDVAS(bp), - dmu_ot[BP_GET_TYPE(bp)].ot_name, - zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name, - zio_compress_table[BP_GET_COMPRESS(bp)].ci_name, - (u_longlong_t)bp->blk_cksum.zc_word[0], - (u_longlong_t)bp->blk_cksum.zc_word[1], - (u_longlong_t)bp->blk_cksum.zc_word[2], - (u_longlong_t)bp->blk_cksum.zc_word[3]); + if (dump_opt['L']) + return; + + if (BP_GET_DEDUP(bp)) { + ddt_t *ddt; + ddt_entry_t *dde; + + ddt = ddt_select(spa, bp); + ddt_enter(ddt); + dde = ddt_lookup(ddt, bp, B_FALSE); + + if (dde == NULL) { + refcnt = 0; + } else { + ddt_phys_t *ddp = ddt_phys_select(dde, bp); + ddt_phys_decref(ddp); + refcnt = ddp->ddp_refcnt; + if (ddt_phys_total_refcnt(dde) == 0) + ddt_remove(ddt, dde); } + ddt_exit(ddt); } - if (!dump_opt['L']) - VERIFY(zio_wait(zio_claim(NULL, spa, spa_first_txg(spa), bp, - NULL, NULL, ZIO_FLAG_MUSTSUCCEED)) == 0); + VERIFY3U(zio_wait(zio_claim(NULL, spa, + refcnt ? 0 : spa_first_txg(spa), + bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0); } static int -zdb_blkptr_cb(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb, - const dnode_phys_t *dnp, void *arg) +zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, + const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) { zdb_cb_t *zcb = arg; char blkbuf[BP_SPRINTF_LEN]; + dmu_object_type_t type; + boolean_t is_metadata; if (bp == NULL) return (0); - zdb_count_block(spa, zcb, bp, BP_GET_TYPE(bp)); + type = BP_GET_TYPE(bp); + + zdb_count_block(spa, zilog, zcb, bp, type); + + is_metadata = (BP_GET_LEVEL(bp) != 0 || dmu_ot[type].ot_metadata); - if (dump_opt['c'] || dump_opt['S']) { - int ioerr, size; - void *data; + if (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata)) { + int ioerr; + size_t size = BP_GET_PSIZE(bp); + void *data = malloc(size); + int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW; + + /* If it's an intent log block, failure is expected. 
*/ + if (zb->zb_level == ZB_ZIL_LEVEL) + flags |= ZIO_FLAG_SPECULATIVE; - size = BP_GET_LSIZE(bp); - data = malloc(size); ioerr = zio_wait(zio_read(NULL, spa, bp, data, size, - NULL, NULL, ZIO_PRIORITY_ASYNC_READ, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB, zb)); + NULL, NULL, ZIO_PRIORITY_ASYNC_READ, flags, zb)); + free(data); - /* We expect io errors on intent log */ - if (ioerr && BP_GET_TYPE(bp) != DMU_OT_INTENT_LOG) { + if (ioerr && !(flags & ZIO_FLAG_SPECULATIVE)) { zcb->zcb_haderrors = 1; zcb->zcb_errors[ioerr]++; if (dump_opt['b'] >= 2) - sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, bp); + sprintf_blkptr(blkbuf, bp); else blkbuf[0] = '\0'; - if (!dump_opt['S']) { - (void) printf("zdb_blkptr_cb: " - "Got error %d reading " - "<%llu, %llu, %lld, %llx> %s -- skipping\n", - ioerr, - (u_longlong_t)zb->zb_objset, - (u_longlong_t)zb->zb_object, - (u_longlong_t)zb->zb_level, - (u_longlong_t)zb->zb_blkid, - blkbuf); - } + (void) printf("zdb_blkptr_cb: " + "Got error %d reading " + "<%llu, %llu, %lld, %llx> %s -- skipping\n", + ioerr, + (u_longlong_t)zb->zb_objset, + (u_longlong_t)zb->zb_object, + (u_longlong_t)zb->zb_level, + (u_longlong_t)zb->zb_blkid, + blkbuf); } } zcb->zcb_readfails = 0; if (dump_opt['b'] >= 4) { - sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, bp); - (void) printf("objset %llu object %llu offset 0x%llx %s\n", + sprintf_blkptr(blkbuf, bp); + (void) printf("objset %llu object %llu " + "level %lld offset 0x%llx %s\n", (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object, - (u_longlong_t)blkid2offset(dnp, zb->zb_level, zb->zb_blkid), + (longlong_t)zb->zb_level, + (u_longlong_t)blkid2offset(dnp, bp, zb), blkbuf); } return (0); } +static void +zdb_leak(space_map_t *sm, uint64_t start, uint64_t size) +{ + vdev_t *vd = sm->sm_ppd; + + (void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n", + (u_longlong_t)vd->vdev_id, (u_longlong_t)start, (u_longlong_t)size); +} + +/* ARGSUSED */ +static void +zdb_space_map_load(space_map_t *sm) +{ +} + +static void +zdb_space_map_unload(space_map_t *sm) +{ + space_map_vacate(sm, zdb_leak, sm); +} + +/* ARGSUSED */ +static void +zdb_space_map_claim(space_map_t *sm, uint64_t start, uint64_t size) +{ +} + +static space_map_ops_t zdb_space_map_ops = { + zdb_space_map_load, + zdb_space_map_unload, + NULL, /* alloc */ + zdb_space_map_claim, + NULL, /* free */ + NULL /* maxsize */ +}; + +static void +zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb) +{ + ddt_bookmark_t ddb = { 0 }; + ddt_entry_t dde; + int error; + + while ((error = ddt_walk(spa, &ddb, &dde)) == 0) { + blkptr_t blk; + ddt_phys_t *ddp = dde.dde_phys; + + if (ddb.ddb_class == DDT_CLASS_UNIQUE) + return; + + ASSERT(ddt_phys_total_refcnt(&dde) > 1); + + for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { + if (ddp->ddp_phys_birth == 0) + continue; + ddt_bp_create(ddb.ddb_checksum, + &dde.dde_key, ddp, &blk); + if (p == DDT_PHYS_DITTO) { + zdb_count_block(spa, NULL, zcb, &blk, + ZDB_OT_DITTO); + } else { + zcb->zcb_dedup_asize += + BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1); + zcb->zcb_dedup_blocks++; + } + } + if (!dump_opt['L']) { + ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum]; + ddt_enter(ddt); + VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL); + ddt_exit(ddt); + } + } + + ASSERT(error == ENOENT); +} + +static void +zdb_leak_init(spa_t *spa, zdb_cb_t *zcb) +{ + if (!dump_opt['L']) { + vdev_t *rvd = spa->spa_root_vdev; + for (int c = 0; c < rvd->vdev_children; c++) { + vdev_t *vd = rvd->vdev_child[c]; + for (int m = 0; m < vd->vdev_ms_count; m++) { + metaslab_t *msp = vd->vdev_ms[m]; + 
mutex_enter(&msp->ms_lock); + space_map_unload(&msp->ms_map); + VERIFY(space_map_load(&msp->ms_map, + &zdb_space_map_ops, SM_ALLOC, &msp->ms_smo, + spa->spa_meta_objset) == 0); + msp->ms_map.sm_ppd = vd; + mutex_exit(&msp->ms_lock); + } + } + } + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + + zdb_ddt_leak_init(spa, zcb); + + spa_config_exit(spa, SCL_CONFIG, FTAG); +} + +static void +zdb_leak_fini(spa_t *spa) +{ + if (!dump_opt['L']) { + vdev_t *rvd = spa->spa_root_vdev; + for (int c = 0; c < rvd->vdev_children; c++) { + vdev_t *vd = rvd->vdev_child[c]; + for (int m = 0; m < vd->vdev_ms_count; m++) { + metaslab_t *msp = vd->vdev_ms[m]; + mutex_enter(&msp->ms_lock); + space_map_unload(&msp->ms_map); + mutex_exit(&msp->ms_lock); + } + } + } +} + static int dump_block_stats(spa_t *spa) { zdb_cb_t zcb = { 0 }; zdb_blkstats_t *zb, *tzb; - uint64_t alloc, space, logalloc; - vdev_t *rvd = spa->spa_root_vdev; + uint64_t norm_alloc, norm_space, total_alloc, total_found; + int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_HARD; int leaks = 0; - int c, e; - if (!dump_opt['S']) { - (void) printf("\nTraversing all blocks %s%s%s%s...\n", - (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "", - dump_opt['c'] ? "checksums " : "", - (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "", - !dump_opt['L'] ? "nothing leaked " : ""); - } + (void) printf("\nTraversing all blocks %s%s%s%s%s...\n", + (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "", + (dump_opt['c'] == 1) ? "metadata " : "", + dump_opt['c'] ? "checksums " : "", + (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "", + !dump_opt['L'] ? "nothing leaked " : ""); /* * Load all space maps as SM_ALLOC maps, then traverse the pool @@ -1575,39 +1962,41 @@ dump_block_stats(spa_t *spa) * it's not part of any space map) is a double allocation, * reference to a freed block, or an unclaimed log block. */ - if (!dump_opt['L']) - zdb_leak_init(spa); + zdb_leak_init(spa, &zcb); /* * If there's a deferred-free bplist, process that first. */ - if (spa->spa_sync_bplist_obj != 0) { - bplist_t *bpl = &spa->spa_sync_bplist; + if (spa->spa_deferred_bplist_obj != 0) { + bplist_t *bpl = &spa->spa_deferred_bplist; blkptr_t blk; uint64_t itor = 0; VERIFY(0 == bplist_open(bpl, spa->spa_meta_objset, - spa->spa_sync_bplist_obj)); + spa->spa_deferred_bplist_obj)); while (bplist_iterate(bpl, &itor, &blk) == 0) { if (dump_opt['b'] >= 4) { char blkbuf[BP_SPRINTF_LEN]; - sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, &blk); + sprintf_blkptr(blkbuf, &blk); (void) printf("[%s] %s\n", "deferred free", blkbuf); } - zdb_count_block(spa, &zcb, &blk, DMU_OT_DEFERRED); + zdb_count_block(spa, NULL, &zcb, &blk, ZDB_OT_DEFERRED); } bplist_close(bpl); } - zcb.zcb_haderrors |= traverse_pool(spa, zdb_blkptr_cb, &zcb); + if (dump_opt['c'] > 1) + flags |= TRAVERSE_PREFETCH_DATA; + + zcb.zcb_haderrors |= traverse_pool(spa, 0, flags, zdb_blkptr_cb, &zcb); - if (zcb.zcb_haderrors && !dump_opt['S']) { + if (zcb.zcb_haderrors) { (void) printf("\nError counts:\n\n"); (void) printf("\t%5s %s\n", "errno", "count"); - for (e = 0; e < 256; e++) { + for (int e = 0; e < 256; e++) { if (zcb.zcb_errors[e] != 0) { (void) printf("\t%5d %llu\n", e, (u_longlong_t)zcb.zcb_errors[e]); @@ -1618,43 +2007,27 @@ dump_block_stats(spa_t *spa) /* * Report any leaked segments. */ - if (!dump_opt['L']) - zdb_leak_fini(spa); + zdb_leak_fini(spa); - /* - * If we're interested in printing out the blkptr signatures, - * return now as we don't print out anything else (including - * errors and leaks). 
- */ - if (dump_opt['S']) - return (zcb.zcb_haderrors ? 3 : 0); - - alloc = spa_get_alloc(spa); - space = spa_get_space(spa); - - /* - * Log blocks allocated from a separate log device don't count - * as part of the normal pool space; factor them in here. - */ - logalloc = 0; + tzb = &zcb.zcb_type[ZB_TOTAL][ZDB_OT_TOTAL]; - for (c = 0; c < rvd->vdev_children; c++) - if (rvd->vdev_child[c]->vdev_islog) - logalloc += rvd->vdev_child[c]->vdev_stat.vs_alloc; + norm_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); + norm_space = metaslab_class_get_space(spa_normal_class(spa)); - tzb = &zcb.zcb_type[ZB_TOTAL][DMU_OT_TOTAL]; + total_alloc = norm_alloc + metaslab_class_get_alloc(spa_log_class(spa)); + total_found = tzb->zb_asize - zcb.zcb_dedup_asize; - if (tzb->zb_asize == alloc + logalloc) { + if (total_found == total_alloc) { if (!dump_opt['L']) (void) printf("\n\tNo leaks (block sum matches space" " maps exactly)\n"); } else { (void) printf("block traversal size %llu != alloc %llu " "(%s %lld)\n", - (u_longlong_t)tzb->zb_asize, - (u_longlong_t)alloc + logalloc, + (u_longlong_t)total_found, + (u_longlong_t)total_alloc, (dump_opt['L']) ? "unreachable" : "leaked", - (longlong_t)(alloc + logalloc - tzb->zb_asize)); + (longlong_t)(total_alloc - total_found)); leaks = 1; } @@ -1664,33 +2037,40 @@ dump_block_stats(spa_t *spa) (void) printf("\n"); (void) printf("\tbp count: %10llu\n", (u_longlong_t)tzb->zb_count); - (void) printf("\tbp logical: %10llu\t avg: %6llu\n", + (void) printf("\tbp logical: %10llu avg: %6llu\n", (u_longlong_t)tzb->zb_lsize, (u_longlong_t)(tzb->zb_lsize / tzb->zb_count)); - (void) printf("\tbp physical: %10llu\t avg:" - " %6llu\tcompression: %6.2f\n", + (void) printf("\tbp physical: %10llu avg:" + " %6llu compression: %6.2f\n", (u_longlong_t)tzb->zb_psize, (u_longlong_t)(tzb->zb_psize / tzb->zb_count), (double)tzb->zb_lsize / tzb->zb_psize); - (void) printf("\tbp allocated: %10llu\t avg:" - " %6llu\tcompression: %6.2f\n", + (void) printf("\tbp allocated: %10llu avg:" + " %6llu compression: %6.2f\n", (u_longlong_t)tzb->zb_asize, (u_longlong_t)(tzb->zb_asize / tzb->zb_count), (double)tzb->zb_lsize / tzb->zb_asize); - (void) printf("\tSPA allocated: %10llu\tused: %5.2f%%\n", - (u_longlong_t)alloc, 100.0 * alloc / space); + (void) printf("\tbp deduped: %10llu ref>1:" + " %6llu deduplication: %6.2f\n", + (u_longlong_t)zcb.zcb_dedup_asize, + (u_longlong_t)zcb.zcb_dedup_blocks, + (double)zcb.zcb_dedup_asize / tzb->zb_asize + 1.0); + (void) printf("\tSPA allocated: %10llu used: %5.2f%%\n", + (u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space); if (dump_opt['b'] >= 2) { int l, t, level; (void) printf("\nBlocks\tLSIZE\tPSIZE\tASIZE" "\t avg\t comp\t%%Total\tType\n"); - for (t = 0; t <= DMU_OT_NUMTYPES; t++) { + for (t = 0; t <= ZDB_OT_TOTAL; t++) { char csize[6], lsize[6], psize[6], asize[6], avg[6]; char *typename; - typename = t == DMU_OT_DEFERRED ? "deferred free" : - t == DMU_OT_TOTAL ? 
"Total" : dmu_ot[t].ot_name; + if (t < DMU_OT_NUMTYPES) + typename = dmu_ot[t].ot_name; + else + typename = zdb_ot_extname[t - DMU_OT_NUMTYPES]; if (zcb.zcb_type[ZB_TOTAL][t].zb_asize == 0) { (void) printf("%6s\t%5s\t%5s\t%5s" @@ -1752,33 +2132,154 @@ dump_block_stats(spa_t *spa) return (0); } +typedef struct zdb_ddt_entry { + ddt_key_t zdde_key; + uint64_t zdde_ref_blocks; + uint64_t zdde_ref_lsize; + uint64_t zdde_ref_psize; + uint64_t zdde_ref_dsize; + avl_node_t zdde_node; +} zdb_ddt_entry_t; + +/* ARGSUSED */ +static int +zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, + const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) +{ + avl_tree_t *t = arg; + avl_index_t where; + zdb_ddt_entry_t *zdde, zdde_search; + + if (bp == NULL) + return (0); + + if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) { + (void) printf("traversing objset %llu, %llu objects, " + "%lu blocks so far\n", + (u_longlong_t)zb->zb_objset, + (u_longlong_t)bp->blk_fill, + avl_numnodes(t)); + } + + if (BP_IS_HOLE(bp) || BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_OFF || + BP_GET_LEVEL(bp) > 0 || dmu_ot[BP_GET_TYPE(bp)].ot_metadata) + return (0); + + ddt_key_fill(&zdde_search.zdde_key, bp); + + zdde = avl_find(t, &zdde_search, &where); + + if (zdde == NULL) { + zdde = umem_zalloc(sizeof (*zdde), UMEM_NOFAIL); + zdde->zdde_key = zdde_search.zdde_key; + avl_insert(t, zdde, where); + } + + zdde->zdde_ref_blocks += 1; + zdde->zdde_ref_lsize += BP_GET_LSIZE(bp); + zdde->zdde_ref_psize += BP_GET_PSIZE(bp); + zdde->zdde_ref_dsize += bp_get_dsize_sync(spa, bp); + + return (0); +} + +static void +dump_simulated_ddt(spa_t *spa) +{ + avl_tree_t t; + void *cookie = NULL; + zdb_ddt_entry_t *zdde; + ddt_histogram_t ddh_total = { 0 }; + ddt_stat_t dds_total = { 0 }; + + avl_create(&t, ddt_entry_compare, + sizeof (zdb_ddt_entry_t), offsetof(zdb_ddt_entry_t, zdde_node)); + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + + (void) traverse_pool(spa, 0, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, + zdb_ddt_add_cb, &t); + + spa_config_exit(spa, SCL_CONFIG, FTAG); + + while ((zdde = avl_destroy_nodes(&t, &cookie)) != NULL) { + ddt_stat_t dds; + uint64_t refcnt = zdde->zdde_ref_blocks; + ASSERT(refcnt != 0); + + dds.dds_blocks = zdde->zdde_ref_blocks / refcnt; + dds.dds_lsize = zdde->zdde_ref_lsize / refcnt; + dds.dds_psize = zdde->zdde_ref_psize / refcnt; + dds.dds_dsize = zdde->zdde_ref_dsize / refcnt; + + dds.dds_ref_blocks = zdde->zdde_ref_blocks; + dds.dds_ref_lsize = zdde->zdde_ref_lsize; + dds.dds_ref_psize = zdde->zdde_ref_psize; + dds.dds_ref_dsize = zdde->zdde_ref_dsize; + + ddt_stat_add(&ddh_total.ddh_stat[highbit(refcnt) - 1], &dds, 0); + + umem_free(zdde, sizeof (*zdde)); + } + + avl_destroy(&t); + + ddt_histogram_stat(&dds_total, &ddh_total); + + (void) printf("Simulated DDT histogram:\n"); + + zpool_dump_ddt(&dds_total, &ddh_total); + + dump_dedup_ratio(&dds_total); +} + static void dump_zpool(spa_t *spa) { dsl_pool_t *dp = spa_get_dsl(spa); int rc = 0; + if (dump_opt['S']) { + dump_simulated_ddt(spa); + return; + } + + if (!dump_opt['e'] && dump_opt['C'] > 1) { + (void) printf("\nCached configuration:\n"); + dump_nvlist(spa->spa_config, 8); + } + + if (dump_opt['C']) + dump_config(spa); + if (dump_opt['u']) - dump_uberblock(&spa->spa_uberblock); + dump_uberblock(&spa->spa_uberblock, "\nUberblock:\n", "\n"); + + if (dump_opt['D']) + dump_all_ddts(spa); + + if (dump_opt['d'] > 2 || dump_opt['m']) + dump_metaslabs(spa); if (dump_opt['d'] || dump_opt['i']) { dump_dir(dp->dp_meta_objset); if (dump_opt['d'] 
>= 3) { dump_bplist(dp->dp_meta_objset, - spa->spa_sync_bplist_obj, "Deferred frees"); + spa->spa_deferred_bplist_obj, "Deferred frees"); dump_dtl(spa->spa_root_vdev, 0); - dump_metaslabs(spa); } - (void) dmu_objset_find(spa_name(spa), dump_one_dir, NULL, - DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); + (void) dmu_objset_find(spa_name(spa), dump_one_dir, + NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); } - - if (dump_opt['b'] || dump_opt['c'] || dump_opt['S']) + if (dump_opt['b'] || dump_opt['c']) rc = dump_block_stats(spa); if (dump_opt['s']) show_pool_stats(spa); + if (dump_opt['h']) + dump_history(spa); + if (rc != 0) exit(rc); } @@ -1797,51 +2298,13 @@ int flagbits[256]; static void zdb_print_blkptr(blkptr_t *bp, int flags) { - dva_t *dva = bp->blk_dva; - int d; + char blkbuf[BP_SPRINTF_LEN]; if (flags & ZDB_FLAG_BSWAP) byteswap_uint64_array((void *)bp, sizeof (blkptr_t)); - /* - * Super-ick warning: This code is also duplicated in - * cmd/mdb/common/modules/zfs/zfs.c . Yeah, I hate code - * replication, too. - */ - for (d = 0; d < BP_GET_NDVAS(bp); d++) { - (void) printf("\tDVA[%d]: vdev_id %lld / %llx\n", d, - (longlong_t)DVA_GET_VDEV(&dva[d]), - (longlong_t)DVA_GET_OFFSET(&dva[d])); - (void) printf("\tDVA[%d]: GANG: %-5s GRID: %04llx\t" - "ASIZE: %llx\n", d, - DVA_GET_GANG(&dva[d]) ? "TRUE" : "FALSE", - (longlong_t)DVA_GET_GRID(&dva[d]), - (longlong_t)DVA_GET_ASIZE(&dva[d])); - (void) printf("\tDVA[%d]: :%llu:%llx:%llx:%s%s%s%s\n", d, - (u_longlong_t)DVA_GET_VDEV(&dva[d]), - (longlong_t)DVA_GET_OFFSET(&dva[d]), - (longlong_t)BP_GET_PSIZE(bp), - BP_SHOULD_BYTESWAP(bp) ? "e" : "", - !DVA_GET_GANG(&dva[d]) && BP_GET_LEVEL(bp) != 0 ? - "d" : "", - DVA_GET_GANG(&dva[d]) ? "g" : "", - BP_GET_COMPRESS(bp) != 0 ? "d" : ""); - } - (void) printf("\tLSIZE: %-16llx\t\tPSIZE: %llx\n", - (longlong_t)BP_GET_LSIZE(bp), (longlong_t)BP_GET_PSIZE(bp)); - (void) printf("\tENDIAN: %6s\t\t\t\t\tTYPE: %s\n", - BP_GET_BYTEORDER(bp) ? 
"LITTLE" : "BIG", - dmu_ot[BP_GET_TYPE(bp)].ot_name); - (void) printf("\tBIRTH: %-16llx LEVEL: %-2llu\tFILL: %llx\n", - (u_longlong_t)bp->blk_birth, (u_longlong_t)BP_GET_LEVEL(bp), - (u_longlong_t)bp->blk_fill); - (void) printf("\tCKFUNC: %-16s\t\tCOMP: %s\n", - zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name, - zio_compress_table[BP_GET_COMPRESS(bp)].ci_name); - (void) printf("\tCKSUM: %llx:%llx:%llx:%llx\n", - (u_longlong_t)bp->blk_cksum.zc_word[0], - (u_longlong_t)bp->blk_cksum.zc_word[1], - (u_longlong_t)bp->blk_cksum.zc_word[2], - (u_longlong_t)bp->blk_cksum.zc_word[3]); + + sprintf_blkptr(blkbuf, bp); + (void) printf("%s\n", blkbuf); } static void @@ -1864,7 +2327,7 @@ zdb_dump_block_raw(void *buf, uint64_t size, int flags) { if (flags & ZDB_FLAG_BSWAP) byteswap_uint64_array(buf, size); - (void) write(2, buf, size); + (void) write(1, buf, size); } static void @@ -1967,31 +2430,30 @@ zdb_vdev_lookup(vdev_t *vdev, char *path) * flags - A string of characters specifying options * b: Decode a blkptr at given offset within block * *c: Calculate and display checksums - * *d: Decompress data before dumping + * d: Decompress data before dumping * e: Byteswap data before dumping - * *g: Display data as a gang block header - * *i: Display as an indirect block + * g: Display data as a gang block header + * i: Display as an indirect block * p: Do I/O to physical offset * r: Dump raw data to stdout * * * = not yet implemented */ static void -zdb_read_block(char *thing, spa_t **spap) +zdb_read_block(char *thing, spa_t *spa) { - spa_t *spa = *spap; + blkptr_t blk, *bp = &blk; + dva_t *dva = bp->blk_dva; int flags = 0; - uint64_t offset = 0, size = 0, blkptr_offset = 0; + uint64_t offset = 0, size = 0, psize = 0, lsize = 0, blkptr_offset = 0; zio_t *zio; vdev_t *vd; - void *buf; - char *s, *p, *dup, *pool, *vdev, *flagstr; - int i, error, zio_flags; + void *pbuf, *lbuf, *buf; + char *s, *p, *dup, *vdev, *flagstr; + int i, error; dup = strdup(thing); s = strtok(dup, ":"); - pool = s ? s : ""; - s = strtok(NULL, ":"); vdev = s ? s : ""; s = strtok(NULL, ":"); offset = strtoull(s ? 
s : "", NULL, 16); @@ -2025,7 +2487,7 @@ zdb_read_block(char *thing, spa_t **spap) flags |= bit; /* If it's not something with an argument, keep going */ - if ((bit & (ZDB_FLAG_CHECKSUM | ZDB_FLAG_DECOMPRESS | + if ((bit & (ZDB_FLAG_CHECKSUM | ZDB_FLAG_PRINT_BLKPTR)) == 0) continue; @@ -2040,16 +2502,6 @@ zdb_read_block(char *thing, spa_t **spap) } } - if (spa == NULL || strcmp(spa_name(spa), pool) != 0) { - if (spa) - spa_close(spa, (void *)zdb_read_block); - error = spa_open(pool, spap, (void *)zdb_read_block); - if (error) - fatal("Failed to open pool '%s': %s", - pool, strerror(error)); - spa = *spap; - } - vd = zdb_vdev_lookup(spa->spa_root_vdev, vdev); if (vd == NULL) { (void) printf("***Invalid vdev: %s\n", vdev); @@ -2057,22 +2509,58 @@ zdb_read_block(char *thing, spa_t **spap) return; } else { if (vd->vdev_path) - (void) printf("Found vdev: %s\n", vd->vdev_path); + (void) fprintf(stderr, "Found vdev: %s\n", + vd->vdev_path); else - (void) printf("Found vdev type: %s\n", + (void) fprintf(stderr, "Found vdev type: %s\n", vd->vdev_ops->vdev_op_type); } - buf = umem_alloc(size, UMEM_NOFAIL); + psize = size; + lsize = size; + + pbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); + lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); + + BP_ZERO(bp); + + DVA_SET_VDEV(&dva[0], vd->vdev_id); + DVA_SET_OFFSET(&dva[0], offset); + DVA_SET_GANG(&dva[0], !!(flags & ZDB_FLAG_GBH)); + DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, psize)); + + BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL); - zio_flags = ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE | - ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY; + BP_SET_LSIZE(bp, lsize); + BP_SET_PSIZE(bp, psize); + BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); + BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF); + BP_SET_TYPE(bp, DMU_OT_NONE); + BP_SET_LEVEL(bp, 0); + BP_SET_DEDUP(bp, 0); + BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); zio = zio_root(spa, NULL, NULL, 0); - /* XXX todo - cons up a BP so RAID-Z will be happy */ - zio_nowait(zio_vdev_child_io(zio, NULL, vd, offset, buf, size, - ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ, zio_flags, NULL, NULL)); + + if (vd == vd->vdev_top) { + /* + * Treat this as a normal block read. + */ + zio_nowait(zio_read(zio, spa, bp, pbuf, psize, NULL, NULL, + ZIO_PRIORITY_SYNC_READ, + ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL)); + } else { + /* + * Treat this as a vdev child I/O. + */ + zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pbuf, psize, + ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ, + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE | + ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | + ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL, NULL)); + } + error = zio_wait(zio); spa_config_exit(spa, SCL_STATE, FTAG); @@ -2081,6 +2569,52 @@ zdb_read_block(char *thing, spa_t **spap) goto out; } + if (flags & ZDB_FLAG_DECOMPRESS) { + /* + * We don't know how the data was compressed, so just try + * every decompress function at every inflated blocksize. 
+ */ + enum zio_compress c; + void *pbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); + void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); + + bcopy(pbuf, pbuf2, psize); + + VERIFY(random_get_pseudo_bytes((uint8_t *)pbuf + psize, + SPA_MAXBLOCKSIZE - psize) == 0); + + VERIFY(random_get_pseudo_bytes((uint8_t *)pbuf2 + psize, + SPA_MAXBLOCKSIZE - psize) == 0); + + for (lsize = SPA_MAXBLOCKSIZE; lsize > psize; + lsize -= SPA_MINBLOCKSIZE) { + for (c = 0; c < ZIO_COMPRESS_FUNCTIONS; c++) { + if (zio_decompress_data(c, pbuf, lbuf, + psize, lsize) == 0 && + zio_decompress_data(c, pbuf2, lbuf2, + psize, lsize) == 0 && + bcmp(lbuf, lbuf2, lsize) == 0) + break; + } + if (c != ZIO_COMPRESS_FUNCTIONS) + break; + lsize -= SPA_MINBLOCKSIZE; + } + + umem_free(pbuf2, SPA_MAXBLOCKSIZE); + umem_free(lbuf2, SPA_MAXBLOCKSIZE); + + if (lsize <= psize) { + (void) printf("Decompress of %s failed\n", thing); + goto out; + } + buf = lbuf; + size = lsize; + } else { + buf = pbuf; + size = psize; + } + if (flags & ZDB_FLAG_PRINT_BLKPTR) zdb_print_blkptr((blkptr_t *)(void *) ((uintptr_t)buf + (uintptr_t)blkptr_offset), flags); @@ -2095,134 +2629,92 @@ zdb_read_block(char *thing, spa_t **spap) zdb_dump_block(thing, buf, size, flags); out: - umem_free(buf, size); + umem_free(pbuf, SPA_MAXBLOCKSIZE); + umem_free(lbuf, SPA_MAXBLOCKSIZE); free(dup); } static boolean_t -nvlist_string_match(nvlist_t *config, char *name, char *tgt) +pool_match(nvlist_t *cfg, char *tgt) { + uint64_t v, guid = strtoull(tgt, NULL, 0); char *s; - if (nvlist_lookup_string(config, name, &s) != 0) - return (B_FALSE); - - return (strcmp(s, tgt) == 0); -} - -static boolean_t -nvlist_uint64_match(nvlist_t *config, char *name, uint64_t tgt) -{ - uint64_t val; - - if (nvlist_lookup_uint64(config, name, &val) != 0) - return (B_FALSE); - - return (val == tgt); -} - -static boolean_t -vdev_child_guid_match(nvlist_t *vdev, uint64_t guid) -{ - nvlist_t **child; - uint_t c, children; - - verify(nvlist_lookup_nvlist_array(vdev, ZPOOL_CONFIG_CHILDREN, - &child, &children) == 0); - for (c = 0; c < children; ++c) - if (nvlist_uint64_match(child[c], ZPOOL_CONFIG_GUID, guid)) - return (B_TRUE); - return (B_FALSE); -} - -static boolean_t -vdev_child_string_match(nvlist_t *vdev, char *tgt) -{ - nvlist_t **child; - uint_t c, children; - - verify(nvlist_lookup_nvlist_array(vdev, ZPOOL_CONFIG_CHILDREN, - &child, &children) == 0); - for (c = 0; c < children; ++c) { - if (nvlist_string_match(child[c], ZPOOL_CONFIG_PATH, tgt) || - nvlist_string_match(child[c], ZPOOL_CONFIG_DEVID, tgt)) - return (B_TRUE); - } - return (B_FALSE); -} - -static boolean_t -vdev_guid_match(nvlist_t *config, uint64_t guid) -{ - nvlist_t *nvroot; - - verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, - &nvroot) == 0); - - return (nvlist_uint64_match(nvroot, ZPOOL_CONFIG_GUID, guid) || - vdev_child_guid_match(nvroot, guid)); -} - -static boolean_t -vdev_string_match(nvlist_t *config, char *tgt) -{ - nvlist_t *nvroot; - - verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, - &nvroot) == 0); - - return (vdev_child_string_match(nvroot, tgt)); -} - -static boolean_t -pool_match(nvlist_t *config, char *tgt) -{ - uint64_t guid = strtoull(tgt, NULL, 0); - if (guid != 0) { - return ( - nvlist_uint64_match(config, ZPOOL_CONFIG_POOL_GUID, guid) || - vdev_guid_match(config, guid)); + if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &v) == 0) + return (v == guid); } else { - return ( - nvlist_string_match(config, ZPOOL_CONFIG_POOL_NAME, tgt) || - vdev_string_match(config, tgt)); + 
if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &s) == 0) + return (strcmp(s, tgt) == 0); } + return (B_FALSE); } -static int -find_exported_zpool(char *pool_id, nvlist_t **configp, char *vdev_dir) +static char * +find_zpool(char **target, nvlist_t **configp, int dirc, char **dirv) { nvlist_t *pools; - int error = ENOENT; nvlist_t *match = NULL; + char *name = NULL; + char *sepp = NULL; + char sep; + int count = 0; + importargs_t args = { 0 }; - if (vdev_dir != NULL) - pools = zpool_find_import_activeok(g_zfs, 1, &vdev_dir); - else - pools = zpool_find_import_activeok(g_zfs, 0, NULL); + args.paths = dirc; + args.path = dirv; + args.can_be_active = B_TRUE; + + if ((sepp = strpbrk(*target, "/@")) != NULL) { + sep = *sepp; + *sepp = '\0'; + } + + pools = zpool_search_import(g_zfs, &args); if (pools != NULL) { nvpair_t *elem = NULL; - while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) { verify(nvpair_value_nvlist(elem, configp) == 0); - if (pool_match(*configp, pool_id)) { + if (pool_match(*configp, *target)) { + count++; if (match != NULL) { - (void) fatal( - "More than one matching pool - " - "specify guid/devid/device path."); + /* print previously found config */ + if (name != NULL) { + (void) printf("%s\n", name); + dump_nvlist(match, 8); + name = NULL; + } + (void) printf("%s\n", + nvpair_name(elem)); + dump_nvlist(*configp, 8); } else { match = *configp; - error = 0; + name = nvpair_name(elem); } } } } + if (count > 1) + (void) fatal("\tMatched %d pools - use pool GUID " + "instead of pool name or \n" + "\tpool name part of a dataset name to select pool", count); - *configp = error ? NULL : match; + if (sepp) + *sepp = sep; + /* + * If pool GUID was specified for pool id, replace it with pool name + */ + if (name && (strstr(*target, name) != *target)) { + int sz = 1 + strlen(name) + ((sepp) ? strlen(sepp) : 0); - return (error); + *target = umem_alloc(sz, UMEM_NOFAIL); + (void) snprintf(*target, sz, "%s%s", name, sepp ? sepp : ""); + } + + *configp = name ? 
match : NULL; + + return (name); } int @@ -2230,66 +2722,76 @@ main(int argc, char **argv) { int i, c; struct rlimit rl = { 1024, 1024 }; - spa_t *spa; + spa_t *spa = NULL; objset_t *os = NULL; - char *endstr; int dump_all = 1; int verbose = 0; - int error; - int exported = 0; - char *vdev_dir = NULL; + int error = 0; + char **searchdirs = NULL; + int nsearch = 0; + char *target; + nvlist_t *policy = NULL; + uint64_t max_txg = UINT64_MAX; + int rewind = ZPOOL_NEVER_REWIND; (void) setrlimit(RLIMIT_NOFILE, &rl); (void) enable_extended_FILE_stdio(-1, -1); dprintf_setup(&argc, argv); - while ((c = getopt(argc, argv, "udibcsvCLS:U:lRep:")) != -1) { + while ((c = getopt(argc, argv, "bcdhilmsuCDRSAFLXevp:t:U:")) != -1) { switch (c) { - case 'u': - case 'd': - case 'i': case 'b': case 'c': + case 'd': + case 'h': + case 'i': + case 'l': + case 'm': case 's': + case 'u': case 'C': - case 'l': + case 'D': case 'R': + case 'S': dump_opt[c]++; dump_all = 0; break; + case 'A': + case 'F': case 'L': + case 'X': + case 'e': dump_opt[c]++; break; case 'v': verbose++; break; - case 'U': - spa_config_path = optarg; - break; - case 'e': - exported = 1; - break; case 'p': - vdev_dir = optarg; + if (searchdirs == NULL) { + searchdirs = umem_alloc(sizeof (char *), + UMEM_NOFAIL); + } else { + char **tmp = umem_alloc((nsearch + 1) * + sizeof (char *), UMEM_NOFAIL); + bcopy(searchdirs, tmp, nsearch * + sizeof (char *)); + umem_free(searchdirs, + nsearch * sizeof (char *)); + searchdirs = tmp; + } + searchdirs[nsearch++] = optarg; break; - case 'S': - dump_opt[c]++; - dump_all = 0; - zdb_sig_user_data = (strncmp(optarg, "user:", 5) == 0); - if (!zdb_sig_user_data && strncmp(optarg, "all:", 4)) - usage(); - endstr = strchr(optarg, ':') + 1; - if (strcmp(endstr, "fletcher2") == 0) - zdb_sig_cksumalg = ZIO_CHECKSUM_FLETCHER_2; - else if (strcmp(endstr, "fletcher4") == 0) - zdb_sig_cksumalg = ZIO_CHECKSUM_FLETCHER_4; - else if (strcmp(endstr, "sha256") == 0) - zdb_sig_cksumalg = ZIO_CHECKSUM_SHA256; - else if (strcmp(endstr, "all") == 0) - zdb_sig_cksumalg = ZIO_CHECKSUM_FLETCHER_2; - else + case 't': + max_txg = strtoull(optarg, NULL, 0); + if (max_txg < TXG_INITIAL) { + (void) fprintf(stderr, "incorrect txg " + "specified: %s\n", optarg); usage(); + } + break; + case 'U': + spa_config_path = optarg; break; default: usage(); @@ -2297,7 +2799,7 @@ main(int argc, char **argv) } } - if (vdev_dir != NULL && exported == 0) { + if (!dump_opt['e'] && searchdirs != NULL) { (void) fprintf(stderr, "-p option requires use of -e\n"); usage(); } @@ -2306,18 +2808,26 @@ main(int argc, char **argv) g_zfs = libzfs_init(); ASSERT(g_zfs != NULL); + if (dump_all) + verbose = MAX(verbose, 1); + for (c = 0; c < 256; c++) { - if (dump_all && c != 'l' && c != 'R') + if (dump_all && !strchr("elAFLRSX", c)) dump_opt[c] = 1; if (dump_opt[c]) dump_opt[c] += verbose; } + aok = (dump_opt['A'] == 1) || (dump_opt['A'] > 2); + zfs_recover = (dump_opt['A'] > 1); + argc -= optind; argv += optind; + if (argc < 2 && dump_opt['R']) + usage(); if (argc < 1) { - if (dump_opt['C']) { + if (!dump_opt['e'] && dump_opt['C']) { dump_cachefile(spa_config_path); return (0); } @@ -2329,98 +2839,102 @@ main(int argc, char **argv) return (0); } - if (dump_opt['R']) { - flagbits['b'] = ZDB_FLAG_PRINT_BLKPTR; - flagbits['c'] = ZDB_FLAG_CHECKSUM; - flagbits['d'] = ZDB_FLAG_DECOMPRESS; - flagbits['e'] = ZDB_FLAG_BSWAP; - flagbits['g'] = ZDB_FLAG_GBH; - flagbits['i'] = ZDB_FLAG_INDIRECT; - flagbits['p'] = ZDB_FLAG_PHYS; - flagbits['r'] = ZDB_FLAG_RAW; - - spa = NULL; - 
while (argv[0]) { - zdb_read_block(argv[0], &spa); - argv++; - argc--; - } - if (spa) - spa_close(spa, (void *)zdb_read_block); - return (0); - } + if (dump_opt['X'] || dump_opt['F']) + rewind = ZPOOL_DO_REWIND | + (dump_opt['X'] ? ZPOOL_EXTREME_REWIND : 0); - if (dump_opt['C']) - dump_config(argv[0]); + if (nvlist_alloc(&policy, NV_UNIQUE_NAME_TYPE, 0) != 0 || + nvlist_add_uint64(policy, ZPOOL_REWIND_REQUEST_TXG, max_txg) != 0 || + nvlist_add_uint32(policy, ZPOOL_REWIND_REQUEST, rewind) != 0) + fatal("internal error: %s", strerror(ENOMEM)); error = 0; - if (exported) { - /* - * Check to see if the name refers to an exported zpool - */ - char *slash; - nvlist_t *exported_conf = NULL; - - if ((slash = strchr(argv[0], '/')) != NULL) - *slash = '\0'; - - error = find_exported_zpool(argv[0], &exported_conf, vdev_dir); - if (error == 0) { - nvlist_t *nvl = NULL; - - if (vdev_dir != NULL) { - if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) - error = ENOMEM; - else if (nvlist_add_string(nvl, - zpool_prop_to_name(ZPOOL_PROP_ALTROOT), - vdev_dir) != 0) - error = ENOMEM; - } + target = argv[0]; - if (error == 0) - error = spa_import_faulted(argv[0], - exported_conf, nvl); + if (dump_opt['e']) { + nvlist_t *cfg = NULL; + char *name = find_zpool(&target, &cfg, nsearch, searchdirs); - nvlist_free(nvl); + error = ENOENT; + if (name) { + if (dump_opt['C'] > 1) { + (void) printf("\nConfiguration for import:\n"); + dump_nvlist(cfg, 8); + } + if (nvlist_add_nvlist(cfg, + ZPOOL_REWIND_POLICY, policy) != 0) { + fatal("can't open '%s': %s", + target, strerror(ENOMEM)); + } + if ((error = spa_import(name, cfg, NULL)) != 0) + error = spa_import_verbatim(name, cfg, NULL); } - - if (slash != NULL) - *slash = '/'; } if (error == 0) { - if (strchr(argv[0], '/') != NULL) { - error = dmu_objset_open(argv[0], DMU_OST_ANY, - DS_MODE_USER | DS_MODE_READONLY, &os); + if (strpbrk(target, "/@") == NULL || dump_opt['R']) { + error = spa_open_rewind(target, &spa, FTAG, policy, + NULL); + if (error) { + /* + * If we're missing the log device then + * try opening the pool after clearing the + * log state. + */ + mutex_enter(&spa_namespace_lock); + if ((spa = spa_lookup(target)) != NULL && + spa->spa_log_state == SPA_LOG_MISSING) { + spa->spa_log_state = SPA_LOG_CLEAR; + error = 0; + } + mutex_exit(&spa_namespace_lock); + + if (!error) { + error = spa_open_rewind(target, &spa, + FTAG, policy, NULL); + } + } } else { - error = spa_open(argv[0], &spa, FTAG); + error = dmu_objset_own(target, DMU_OST_ANY, + B_TRUE, FTAG, &os); } } + nvlist_free(policy); if (error) - fatal("can't open %s: %s", argv[0], strerror(error)); + fatal("can't open '%s': %s", target, strerror(error)); argv++; - if (--argc > 0) { - zopt_objects = argc; - zopt_object = calloc(zopt_objects, sizeof (uint64_t)); - for (i = 0; i < zopt_objects; i++) { - errno = 0; - zopt_object[i] = strtoull(argv[i], NULL, 0); - if (zopt_object[i] == 0 && errno != 0) - fatal("bad object number %s: %s", - argv[i], strerror(errno)); + argc--; + if (!dump_opt['R']) { + if (argc > 0) { + zopt_objects = argc; + zopt_object = calloc(zopt_objects, sizeof (uint64_t)); + for (i = 0; i < zopt_objects; i++) { + errno = 0; + zopt_object[i] = strtoull(argv[i], NULL, 0); + if (zopt_object[i] == 0 && errno != 0) + fatal("bad number %s: %s", + argv[i], strerror(errno)); + } } - } - - if (os != NULL) { - dump_dir(os); - dmu_objset_close(os); + (os != NULL) ? 
dump_dir(os) : dump_zpool(spa); } else { - dump_zpool(spa); - spa_close(spa, FTAG); + flagbits['b'] = ZDB_FLAG_PRINT_BLKPTR; + flagbits['c'] = ZDB_FLAG_CHECKSUM; + flagbits['d'] = ZDB_FLAG_DECOMPRESS; + flagbits['e'] = ZDB_FLAG_BSWAP; + flagbits['g'] = ZDB_FLAG_GBH; + flagbits['i'] = ZDB_FLAG_INDIRECT; + flagbits['p'] = ZDB_FLAG_PHYS; + flagbits['r'] = ZDB_FLAG_RAW; + + for (i = 0; i < argc; i++) + zdb_read_block(argv[i], spa); } + (os != NULL) ? dmu_objset_disown(os, FTAG) : spa_close(spa, FTAG); + fuid_table_destroy(); libzfs_fini(g_zfs); diff --git a/external/cddl/osnet/dist/cmd/zdb/zdb_il.c b/external/cddl/osnet/dist/cmd/zdb/zdb_il.c index 02d35a050332e..a0ed985f52b77 100644 --- a/external/cddl/osnet/dist/cmd/zdb/zdb_il.c +++ b/external/cddl/osnet/dist/cmd/zdb/zdb_il.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Print intent log header and statistics. */ @@ -42,12 +40,14 @@ extern uint8_t dump_opt[256]; +static char prefix[4] = "\t\t\t"; + static void print_log_bp(const blkptr_t *bp, const char *prefix) { char blkbuf[BP_SPRINTF_LEN]; - sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, bp); + sprintf_blkptr(blkbuf, bp); (void) printf("%s%s\n", prefix, blkbuf); } @@ -56,19 +56,29 @@ static void zil_prt_rec_create(zilog_t *zilog, int txtype, lr_create_t *lr) { time_t crtime = lr->lr_crtime[0]; - char *name = (char *)(lr + 1); - char *link = name + strlen(name) + 1; + char *name, *link; + lr_attr_t *lrattr; - if (txtype == TX_SYMLINK) - (void) printf("\t\t\t%s -> %s\n", name, link); - else - (void) printf("\t\t\t%s\n", name); + name = (char *)(lr + 1); - (void) printf("\t\t\t%s", ctime(&crtime)); - (void) printf("\t\t\tdoid %llu, foid %llu, mode %llo\n", + if (lr->lr_common.lrc_txtype == TX_CREATE_ATTR || + lr->lr_common.lrc_txtype == TX_MKDIR_ATTR) { + lrattr = (lr_attr_t *)(lr + 1); + name += ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); + } + + if (txtype == TX_SYMLINK) { + link = name + strlen(name) + 1; + (void) printf("%s%s -> %s\n", prefix, name, link); + } else if (txtype != TX_MKXATTR) { + (void) printf("%s%s\n", prefix, name); + } + + (void) printf("%s%s", prefix, ctime(&crtime)); + (void) printf("%sdoid %llu, foid %llu, mode %llo\n", prefix, (u_longlong_t)lr->lr_doid, (u_longlong_t)lr->lr_foid, (longlong_t)lr->lr_mode); - (void) printf("\t\t\tuid %llu, gid %llu, gen %llu, rdev 0x%llx\n", + (void) printf("%suid %llu, gid %llu, gen %llu, rdev 0x%llx\n", prefix, (u_longlong_t)lr->lr_uid, (u_longlong_t)lr->lr_gid, (u_longlong_t)lr->lr_gen, (u_longlong_t)lr->lr_rdev); } @@ -77,7 +87,7 @@ zil_prt_rec_create(zilog_t *zilog, int txtype, lr_create_t *lr) static void zil_prt_rec_remove(zilog_t *zilog, int txtype, lr_remove_t *lr) { - (void) printf("\t\t\tdoid %llu, name %s\n", + (void) printf("%sdoid %llu, name %s\n", prefix, (u_longlong_t)lr->lr_doid, (char *)(lr + 1)); } @@ -85,7 +95,7 @@ zil_prt_rec_remove(zilog_t *zilog, int txtype, lr_remove_t *lr) static void zil_prt_rec_link(zilog_t *zilog, int txtype, lr_link_t *lr) { - (void) printf("\t\t\tdoid %llu, link_obj %llu, name %s\n", + (void) printf("%sdoid %llu, link_obj %llu, name %s\n", prefix, (u_longlong_t)lr->lr_doid, (u_longlong_t)lr->lr_link_obj, (char *)(lr + 1)); } @@ -97,9 +107,9 @@ zil_prt_rec_rename(zilog_t *zilog, int txtype, lr_rename_t *lr) char *snm = (char *)(lr + 1); char *tnm = snm + strlen(snm) + 1; - (void) printf("\t\t\tsdoid 
%llu, tdoid %llu\n", + (void) printf("%ssdoid %llu, tdoid %llu\n", prefix, (u_longlong_t)lr->lr_sdoid, (u_longlong_t)lr->lr_tdoid); - (void) printf("\t\t\tsrc %s tgt %s\n", snm, tnm); + (void) printf("%ssrc %s tgt %s\n", prefix, snm, tnm); } /* ARGSUSED */ @@ -108,43 +118,48 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr) { char *data, *dlimit; blkptr_t *bp = &lr->lr_blkptr; + zbookmark_t zb; char buf[SPA_MAXBLOCKSIZE]; int verbose = MAX(dump_opt['d'], dump_opt['i']); int error; - (void) printf("\t\t\tfoid %llu, offset 0x%llx," - " length 0x%llx, blkoff 0x%llx\n", - (u_longlong_t)lr->lr_foid, (longlong_t)lr->lr_offset, - (u_longlong_t)lr->lr_length, (u_longlong_t)lr->lr_blkoff); + (void) printf("%sfoid %llu, offset %llx, length %llx\n", prefix, + (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_offset, + (u_longlong_t)lr->lr_length); - if (verbose < 5) + if (txtype == TX_WRITE2 || verbose < 5) return; if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { - (void) printf("\t\t\thas blkptr, %s\n", + (void) printf("%shas blkptr, %s\n", prefix, bp->blk_birth >= spa_first_txg(zilog->zl_spa) ? "will claim" : "won't claim"); - print_log_bp(bp, "\t\t\t"); + print_log_bp(bp, prefix); + + if (BP_IS_HOLE(bp)) { + (void) printf("\t\t\tLSIZE 0x%llx\n", + (u_longlong_t)BP_GET_LSIZE(bp)); + } if (bp->blk_birth == 0) { bzero(buf, sizeof (buf)); - } else { - zbookmark_t zb; - - ASSERT3U(bp->blk_cksum.zc_word[ZIL_ZC_OBJSET], ==, - dmu_objset_id(zilog->zl_os)); - - zb.zb_objset = bp->blk_cksum.zc_word[ZIL_ZC_OBJSET]; - zb.zb_object = 0; - zb.zb_level = -1; - zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ]; - - error = zio_wait(zio_read(NULL, zilog->zl_spa, - bp, buf, BP_GET_LSIZE(bp), NULL, NULL, - ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &zb)); - if (error) - return; + (void) printf("%s\n", prefix); + return; } - data = buf + lr->lr_blkoff; + if (bp->blk_birth < zilog->zl_header->zh_claim_txg) { + (void) printf("%s\n", prefix); + return; + } + + SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), + lr->lr_foid, ZB_ZIL_LEVEL, + lr->lr_offset / BP_GET_LSIZE(bp)); + + error = zio_wait(zio_read(NULL, zilog->zl_spa, + bp, buf, BP_GET_LSIZE(bp), NULL, NULL, + ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &zb)); + if (error) + return; + data = buf; } else { data = (char *)(lr + 1); } @@ -152,7 +167,7 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr) dlimit = data + MIN(lr->lr_length, (verbose < 6 ? 
20 : SPA_MAXBLOCKSIZE)); - (void) printf("\t\t\t"); + (void) printf("%s", prefix); while (data < dlimit) { if (isprint(*data)) (void) printf("%c ", *data); @@ -167,7 +182,7 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr) static void zil_prt_rec_truncate(zilog_t *zilog, int txtype, lr_truncate_t *lr) { - (void) printf("\t\t\tfoid %llu, offset 0x%llx, length 0x%llx\n", + (void) printf("%sfoid %llu, offset 0x%llx, length 0x%llx\n", prefix, (u_longlong_t)lr->lr_foid, (longlong_t)lr->lr_offset, (u_longlong_t)lr->lr_length); } @@ -179,38 +194,38 @@ zil_prt_rec_setattr(zilog_t *zilog, int txtype, lr_setattr_t *lr) time_t atime = (time_t)lr->lr_atime[0]; time_t mtime = (time_t)lr->lr_mtime[0]; - (void) printf("\t\t\tfoid %llu, mask 0x%llx\n", + (void) printf("%sfoid %llu, mask 0x%llx\n", prefix, (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_mask); if (lr->lr_mask & AT_MODE) { - (void) printf("\t\t\tAT_MODE %llo\n", + (void) printf("%sAT_MODE %llo\n", prefix, (longlong_t)lr->lr_mode); } if (lr->lr_mask & AT_UID) { - (void) printf("\t\t\tAT_UID %llu\n", + (void) printf("%sAT_UID %llu\n", prefix, (u_longlong_t)lr->lr_uid); } if (lr->lr_mask & AT_GID) { - (void) printf("\t\t\tAT_GID %llu\n", + (void) printf("%sAT_GID %llu\n", prefix, (u_longlong_t)lr->lr_gid); } if (lr->lr_mask & AT_SIZE) { - (void) printf("\t\t\tAT_SIZE %llu\n", + (void) printf("%sAT_SIZE %llu\n", prefix, (u_longlong_t)lr->lr_size); } if (lr->lr_mask & AT_ATIME) { - (void) printf("\t\t\tAT_ATIME %llu.%09llu %s", + (void) printf("%sAT_ATIME %llu.%09llu %s", prefix, (u_longlong_t)lr->lr_atime[0], (u_longlong_t)lr->lr_atime[1], ctime(&atime)); } if (lr->lr_mask & AT_MTIME) { - (void) printf("\t\t\tAT_MTIME %llu.%09llu %s", + (void) printf("%sAT_MTIME %llu.%09llu %s", prefix, (u_longlong_t)lr->lr_mtime[0], (u_longlong_t)lr->lr_mtime[1], ctime(&mtime)); @@ -221,7 +236,7 @@ zil_prt_rec_setattr(zilog_t *zilog, int txtype, lr_setattr_t *lr) static void zil_prt_rec_acl(zilog_t *zilog, int txtype, lr_acl_t *lr) { - (void) printf("\t\t\tfoid %llu, aclcnt %llu\n", + (void) printf("%sfoid %llu, aclcnt %llu\n", prefix, (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_aclcnt); } @@ -253,10 +268,11 @@ static zil_rec_info_t zil_rec_info[TX_MAX_TYPE] = { { zil_prt_rec_create, "TX_MKDIR_ACL " }, { zil_prt_rec_create, "TX_MKDIR_ATTR " }, { zil_prt_rec_create, "TX_MKDIR_ACL_ATTR " }, + { zil_prt_rec_write, "TX_WRITE2 " }, }; /* ARGSUSED */ -static void +static int print_log_record(zilog_t *zilog, lr_t *lr, void *arg, uint64_t claim_txg) { int txtype; @@ -280,23 +296,24 @@ print_log_record(zilog_t *zilog, lr_t *lr, void *arg, uint64_t claim_txg) zil_rec_info[txtype].zri_count++; zil_rec_info[0].zri_count++; + + return (0); } /* ARGSUSED */ -static void +static int print_log_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) { - char blkbuf[BP_SPRINTF_LEN]; + char blkbuf[BP_SPRINTF_LEN + 10]; int verbose = MAX(dump_opt['d'], dump_opt['i']); char *claim; if (verbose <= 3) - return; + return (0); if (verbose >= 5) { (void) strcpy(blkbuf, ", "); - sprintf_blkptr(blkbuf + strlen(blkbuf), - BP_SPRINTF_LEN - strlen(blkbuf), bp); + sprintf_blkptr(blkbuf + strlen(blkbuf), bp); } else { blkbuf[0] = '\0'; } @@ -310,6 +327,8 @@ print_log_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) (void) printf("\tBlock seqno %llu, %s%s\n", (u_longlong_t)bp->blk_cksum.zc_word[ZIL_ZC_SEQ], claim, blkbuf); + + return (0); } static void @@ -342,14 +361,16 @@ dump_intent_log(zilog_t *zilog) int verbose = MAX(dump_opt['d'], 
dump_opt['i']); int i; - if (zh->zh_log.blk_birth == 0 || verbose < 2) + if (zh->zh_log.blk_birth == 0 || verbose < 1) return; - (void) printf("\n ZIL header: claim_txg %llu, seq %llu\n", - (u_longlong_t)zh->zh_claim_txg, (u_longlong_t)zh->zh_replay_seq); - - if (verbose >= 4) - print_log_bp(&zh->zh_log, "\n\tfirst block: "); + (void) printf("\n ZIL header: claim_txg %llu, " + "claim_blk_seq %llu, claim_lr_seq %llu", + (u_longlong_t)zh->zh_claim_txg, + (u_longlong_t)zh->zh_claim_blk_seq, + (u_longlong_t)zh->zh_claim_lr_seq); + (void) printf(" replay_seq %llu, flags 0x%llx\n", + (u_longlong_t)zh->zh_replay_seq, (u_longlong_t)zh->zh_flags); for (i = 0; i < TX_MAX_TYPE; i++) zil_rec_info[i].zri_count = 0; diff --git a/external/cddl/osnet/dist/cmd/zfs/zfs_iter.c b/external/cddl/osnet/dist/cmd/zfs/zfs_iter.c index a22370a027956..f70bebe00b53a 100644 --- a/external/cddl/osnet/dist/cmd/zfs/zfs_iter.c +++ b/external/cddl/osnet/dist/cmd/zfs/zfs_iter.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -53,11 +53,14 @@ typedef struct zfs_node { } zfs_node_t; typedef struct callback_data { - uu_avl_t *cb_avl; - int cb_flags; - zfs_type_t cb_types; - zfs_sort_column_t *cb_sortcol; - zprop_list_t **cb_proplist; + uu_avl_t *cb_avl; + int cb_flags; + zfs_type_t cb_types; + zfs_sort_column_t *cb_sortcol; + zprop_list_t **cb_proplist; + int cb_depth_limit; + int cb_depth; + uint8_t cb_props_table[ZFS_NUM_PROPS]; } callback_data_t; uu_avl_pool_t *avl_pool; @@ -98,10 +101,18 @@ zfs_callback(zfs_handle_t *zhp, void *data) uu_avl_node_init(node, &node->zn_avlnode, avl_pool); if (uu_avl_find(cb->cb_avl, node, cb->cb_sortcol, &idx) == NULL) { - if (cb->cb_proplist && - zfs_expand_proplist(zhp, cb->cb_proplist) != 0) { - free(node); - return (-1); + if (cb->cb_proplist) { + if ((*cb->cb_proplist) && + !(*cb->cb_proplist)->pl_all) + zfs_prune_proplist(zhp, + cb->cb_props_table); + + if (zfs_expand_proplist(zhp, cb->cb_proplist, + (cb->cb_flags & ZFS_ITER_RECVD_PROPS)) + != 0) { + free(node); + return (-1); + } } uu_avl_insert(cb->cb_avl, node, idx); dontclose = 1; @@ -113,11 +124,15 @@ zfs_callback(zfs_handle_t *zhp, void *data) /* * Recurse if necessary. */ - if (cb->cb_flags & ZFS_ITER_RECURSE) { + if (cb->cb_flags & ZFS_ITER_RECURSE && + ((cb->cb_flags & ZFS_ITER_DEPTH_LIMIT) == 0 || + cb->cb_depth < cb->cb_depth_limit)) { + cb->cb_depth++; if (zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) (void) zfs_iter_filesystems(zhp, zfs_callback, data); if ((zfs_get_type(zhp) != ZFS_TYPE_SNAPSHOT) && include_snaps) (void) zfs_iter_snapshots(zhp, zfs_callback, data); + cb->cb_depth--; } if (!dontclose) @@ -325,10 +340,10 @@ zfs_sort(const void *larg, const void *rarg, void *data) int zfs_for_each(int argc, char **argv, int flags, zfs_type_t types, - zfs_sort_column_t *sortcol, zprop_list_t **proplist, + zfs_sort_column_t *sortcol, zprop_list_t **proplist, int limit, zfs_iter_f callback, void *data) { - callback_data_t cb; + callback_data_t cb = {0}; int ret = 0; zfs_node_t *node; uu_avl_walk_t *walk; @@ -346,6 +361,45 @@ zfs_for_each(int argc, char **argv, int flags, zfs_type_t types, cb.cb_flags = flags; cb.cb_proplist = proplist; cb.cb_types = types; + cb.cb_depth_limit = limit; + /* + * If cb_proplist is provided then in the zfs_handles created we + * retain only those properties listed in cb_proplist and sortcol. + * The rest are pruned. 
So, the caller should make sure that no other + * properties other than those listed in cb_proplist/sortcol are + * accessed. + * + * If cb_proplist is NULL then we retain all the properties. We + * always retain the zoned property, which some other properties + * need (userquota & friends), and the createtxg property, which + * we need to sort snapshots. + */ + if (cb.cb_proplist && *cb.cb_proplist) { + zprop_list_t *p = *cb.cb_proplist; + + while (p) { + if (p->pl_prop >= ZFS_PROP_TYPE && + p->pl_prop < ZFS_NUM_PROPS) { + cb.cb_props_table[p->pl_prop] = B_TRUE; + } + p = p->pl_next; + } + + while (sortcol) { + if (sortcol->sc_prop >= ZFS_PROP_TYPE && + sortcol->sc_prop < ZFS_NUM_PROPS) { + cb.cb_props_table[sortcol->sc_prop] = B_TRUE; + } + sortcol = sortcol->sc_next; + } + + cb.cb_props_table[ZFS_PROP_ZONED] = B_TRUE; + cb.cb_props_table[ZFS_PROP_CREATETXG] = B_TRUE; + } else { + (void) memset(cb.cb_props_table, B_TRUE, + sizeof (cb.cb_props_table)); + } + if ((cb.cb_avl = uu_avl_create(avl_pool, NULL, UU_DEFAULT)) == NULL) { (void) fprintf(stderr, gettext("internal error: out of memory\n")); diff --git a/external/cddl/osnet/dist/cmd/zfs/zfs_iter.h b/external/cddl/osnet/dist/cmd/zfs/zfs_iter.h index 76a11085a1ef5..8c6b9fdef54f0 100644 --- a/external/cddl/osnet/dist/cmd/zfs/zfs_iter.h +++ b/external/cddl/osnet/dist/cmd/zfs/zfs_iter.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -41,9 +41,11 @@ typedef struct zfs_sort_column { #define ZFS_ITER_RECURSE (1 << 0) #define ZFS_ITER_ARGS_CAN_BE_PATHS (1 << 1) #define ZFS_ITER_PROP_LISTSNAPS (1 << 2) +#define ZFS_ITER_DEPTH_LIMIT (1 << 3) +#define ZFS_ITER_RECVD_PROPS (1 << 4) int zfs_for_each(int, char **, int options, zfs_type_t, - zfs_sort_column_t *, zprop_list_t **, zfs_iter_f, void *); + zfs_sort_column_t *, zprop_list_t **, int, zfs_iter_f, void *); int zfs_add_sort_column(zfs_sort_column_t **, const char *, boolean_t); void zfs_free_sort_columns(zfs_sort_column_t *); diff --git a/external/cddl/osnet/dist/cmd/zfs/zfs_main.c b/external/cddl/osnet/dist/cmd/zfs/zfs_main.c index a343b5c563fbd..ce65fd57a8d6b 100644 --- a/external/cddl/osnet/dist/cmd/zfs/zfs_main.c +++ b/external/cddl/osnet/dist/cmd/zfs/zfs_main.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -39,12 +39,14 @@ #include #include #include +#include +#include #include #include #include #include #include -#include +#include #include #include @@ -56,6 +58,7 @@ libzfs_handle_t *g_zfs; static FILE *mnttab_file; static char history_str[HIS_MAX_RECORD_LEN]; +const char *pypath = "/usr/lib/zfs/pyzfs.py"; static int zfs_do_clone(int argc, char **argv); static int zfs_do_create(int argc, char **argv); @@ -75,8 +78,10 @@ static int zfs_do_unshare(int argc, char **argv); static int zfs_do_send(int argc, char **argv); static int zfs_do_receive(int argc, char **argv); static int zfs_do_promote(int argc, char **argv); -static int zfs_do_allow(int argc, char **argv); -static int zfs_do_unallow(int argc, char **argv); +static int zfs_do_userspace(int argc, char **argv); +static int zfs_do_python(int argc, char **argv); +static int zfs_do_hold(int argc, char **argv); +static int zfs_do_release(int argc, char **argv); /* * Enable a reasonable set of defaults for libumem debugging on DEBUG builds. 
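
The zfs_iter.c changes above bound recursion with a depth counter: the callback recurses only while ZFS_ITER_DEPTH_LIMIT is unset or cb_depth is still below cb_depth_limit, and the counter is incremented around the child walk. Stripped of the libzfs types, the pattern is roughly the following; iterate_children() is a hypothetical stand-in for zfs_iter_filesystems()/zfs_iter_snapshots():

#define	ITER_RECURSE		(1 << 0)
#define	ITER_DEPTH_LIMIT	(1 << 3)

struct walk_state {
	int flags;
	int depth;		/* current recursion depth */
	int depth_limit;	/* honoured only with ITER_DEPTH_LIMIT */
};

/* hypothetical: invoke cb on each direct child of `node' */
extern void iterate_children(void *node,
    int (*cb)(void *, void *), void *arg);

static int
visit(void *node, void *arg)
{
	struct walk_state *ws = arg;

	/* ... process `node' here ... */

	if ((ws->flags & ITER_RECURSE) &&
	    ((ws->flags & ITER_DEPTH_LIMIT) == 0 ||
	    ws->depth < ws->depth_limit)) {
		ws->depth++;
		iterate_children(node, visit, ws);
		ws->depth--;
	}
	return (0);
}
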
@@ -116,7 +121,12 @@ typedef enum { HELP_UNMOUNT, HELP_UNSHARE, HELP_ALLOW, - HELP_UNALLOW + HELP_UNALLOW, + HELP_USERSPACE, + HELP_GROUPSPACE, + HELP_HOLD, + HELP_HOLDS, + HELP_RELEASE } zfs_help_t; typedef struct zfs_command { @@ -147,9 +157,11 @@ static zfs_command_t command_table[] = { { "list", zfs_do_list, HELP_LIST }, { NULL }, { "set", zfs_do_set, HELP_SET }, - { "get", zfs_do_get, HELP_GET }, + { "get", zfs_do_get, HELP_GET }, { "inherit", zfs_do_inherit, HELP_INHERIT }, { "upgrade", zfs_do_upgrade, HELP_UPGRADE }, + { "userspace", zfs_do_userspace, HELP_USERSPACE }, + { "groupspace", zfs_do_userspace, HELP_GROUPSPACE }, { NULL }, { "mount", zfs_do_mount, HELP_MOUNT }, { "unmount", zfs_do_unmount, HELP_UNMOUNT }, @@ -159,9 +171,13 @@ static zfs_command_t command_table[] = { { "send", zfs_do_send, HELP_SEND }, { "receive", zfs_do_receive, HELP_RECEIVE }, { NULL }, - { "allow", zfs_do_allow, HELP_ALLOW }, + { "allow", zfs_do_python, HELP_ALLOW }, { NULL }, - { "unallow", zfs_do_unallow, HELP_UNALLOW }, + { "unallow", zfs_do_python, HELP_UNALLOW }, + { NULL }, + { "hold", zfs_do_hold, HELP_HOLD }, + { "holds", zfs_do_python, HELP_HOLDS }, + { "release", zfs_do_release, HELP_RELEASE }, }; #define NCOMMAND (sizeof (command_table) / sizeof (command_table[0])) @@ -181,22 +197,22 @@ get_usage(zfs_help_t idx) "\tcreate [-ps] [-b blocksize] [-o property=value] ... " "-V \n")); case HELP_DESTROY: - return (gettext("\tdestroy [-rRf] " - "\n")); + return (gettext("\tdestroy [-rRf] \n" + "\tdestroy [-rRd] \n")); case HELP_GET: - return (gettext("\tget [-rHp] [-o field[,...]] " - "[-s source[,...]]\n" + return (gettext("\tget [-rHp] [-d max] " + "[-o \"all\" | field[,...]] [-s source[,...]]\n" "\t <\"all\" | property[,...]> " "[filesystem|volume|snapshot] ...\n")); case HELP_INHERIT: - return (gettext("\tinherit [-r] " + return (gettext("\tinherit [-rS] " " ...\n")); case HELP_UPGRADE: return (gettext("\tupgrade [-v]\n" "\tupgrade [-r] [-V version] <-a | filesystem ...>\n")); case HELP_LIST: - return (gettext("\tlist [-rH] [-o property[,...]] " - "[-t type[,...]] [-s property] ...\n" + return (gettext("\tlist [-rH][-d max] " + "[-o property[,...]] [-t type[,...]] [-s property] ...\n" "\t [-S property] ... " "[filesystem|volume|snapshot] ...\n")); case HELP_MOUNT: @@ -216,7 +232,7 @@ get_usage(zfs_help_t idx) case HELP_ROLLBACK: return (gettext("\trollback [-rRf] \n")); case HELP_SEND: - return (gettext("\tsend [-R] [-[iI] snapshot] \n")); + return (gettext("\tsend [-RDp] [-[iI] snapshot] \n")); case HELP_SET: return (gettext("\tset " " ...\n")); @@ -229,10 +245,11 @@ get_usage(zfs_help_t idx) return (gettext("\tunmount [-f] " "<-a | filesystem|mountpoint>\n")); case HELP_UNSHARE: - return (gettext("\tunshare [-f] " + return (gettext("\tunshare " "<-a | filesystem|mountpoint>\n")); case HELP_ALLOW: - return (gettext("\tallow [-ldug] " + return (gettext("\tallow \n" + "\tallow [-ldug] " "<\"everyone\"|user|group>[,...] [,...]\n" "\t \n" "\tallow [-ld] -e [,...] " @@ -250,6 +267,20 @@ get_usage(zfs_help_t idx) "\n" "\tunallow [-r] -s @setname [[,...]] " "\n")); + case HELP_USERSPACE: + return (gettext("\tuserspace [-hniHp] [-o field[,...]] " + "[-sS field] ... [-t type[,...]]\n" + "\t \n")); + case HELP_GROUPSPACE: + return (gettext("\tgroupspace [-hniHpU] [-o field[,...]] " + "[-sS field] ... 
[-t type[,...]]\n" + "\t \n")); + case HELP_HOLD: + return (gettext("\thold [-r] ...\n")); + case HELP_HOLDS: + return (gettext("\tholds [-r] ...\n")); + case HELP_RELEASE: + return (gettext("\trelease [-r] ...\n")); } abort(); @@ -311,7 +342,6 @@ usage(boolean_t requested) { int i; boolean_t show_properties = B_FALSE; - boolean_t show_permissions = B_FALSE; FILE *fp = requested ? stdout : stderr; if (current_command == NULL) { @@ -342,13 +372,7 @@ usage(boolean_t requested) strcmp(current_command->name, "list") == 0)) show_properties = B_TRUE; - if (current_command != NULL && - (strcmp(current_command->name, "allow") == 0 || - strcmp(current_command->name, "unallow") == 0)) - show_permissions = B_TRUE; - if (show_properties) { - (void) fprintf(fp, gettext("\nThe following properties are supported:\n")); @@ -359,29 +383,33 @@ usage(boolean_t requested) (void) zprop_iter(usage_prop_cb, fp, B_FALSE, B_TRUE, ZFS_TYPE_DATASET); + (void) fprintf(fp, "\t%-15s ", "userused@..."); + (void) fprintf(fp, " NO NO \n"); + (void) fprintf(fp, "\t%-15s ", "groupused@..."); + (void) fprintf(fp, " NO NO \n"); + (void) fprintf(fp, "\t%-15s ", "userquota@..."); + (void) fprintf(fp, "YES NO | none\n"); + (void) fprintf(fp, "\t%-15s ", "groupquota@..."); + (void) fprintf(fp, "YES NO | none\n"); + (void) fprintf(fp, gettext("\nSizes are specified in bytes " "with standard units such as K, M, G, etc.\n")); (void) fprintf(fp, gettext("\nUser-defined properties can " "be specified by using a name containing a colon (:).\n")); - - } else if (show_permissions) { - (void) fprintf(fp, - gettext("\nThe following permissions are supported:\n")); - - zfs_deleg_permissions(); + (void) fprintf(fp, gettext("\nThe {user|group}{used|quota}@ " + "properties must be appended with\n" + "a user or group specifier of one of these forms:\n" + " POSIX name (eg: \"matt\")\n" + " POSIX id (eg: \"126829\")\n" + " SMB name@domain (eg: \"matt@sun\")\n" + " SMB SID (eg: \"S-1-234-567-89\")\n")); } else { - /* - * TRANSLATION NOTE: - * "zfs set|get" must not be localised this is the - * command name and arguments. 
- */ - (void) fprintf(fp, - gettext("\nFor the property list, run: zfs set|get\n")); - + gettext("\nFor the property list, run: %s\n"), + "zfs set|get"); (void) fprintf(fp, - gettext("\nFor the delegated permission list, run:" - " zfs allow|unallow\n")); + gettext("\nFor the delegated permission list, run: %s\n"), + "zfs allow|unallow"); } /* @@ -419,7 +447,27 @@ parseprop(nvlist_t *props) return (-1); } return (0); +} +static int +parse_depth(char *opt, int *flags) +{ + char *tmp; + int depth; + + depth = (int)strtol(opt, &tmp, 0); + if (*tmp) { + (void) fprintf(stderr, + gettext("%s is not an integer\n"), optarg); + usage(B_FALSE); + } + if (depth < 0) { + (void) fprintf(stderr, + gettext("Depth can not be negative.\n")); + usage(B_FALSE); + } + *flags |= (ZFS_ITER_DEPTH_LIMIT|ZFS_ITER_RECURSE); + return (depth); } /* @@ -666,6 +714,7 @@ zfs_do_create(int argc, char **argv) resv_prop = ZFS_PROP_REFRESERVATION; else resv_prop = ZFS_PROP_RESERVATION; + volsize = zvol_volsize_to_reservation(volsize, props); if (nvlist_lookup_string(props, zfs_prop_to_name(resv_prop), &strval) != 0) { @@ -736,11 +785,13 @@ zfs_do_create(int argc, char **argv) } /* - * zfs destroy [-rf] + * zfs destroy [-rRf] + * zfs destroy [-rRd] * - * -r Recursively destroy all children - * -R Recursively destroy all dependents, including clones - * -f Force unmounting of any dependents + * -r Recursively destroy all children + * -R Recursively destroy all dependents, including clones + * -f Force unmounting of any dependents + * -d If we can't destroy now, mark for deferred destruction * * Destroys the given dataset. By default, it will unmount any filesystems, * and refuse to destroy a dataset that has any dependents. A dependent can @@ -756,6 +807,7 @@ typedef struct destroy_cbdata { boolean_t cb_closezhp; zfs_handle_t *cb_target; char *cb_snapname; + boolean_t cb_defer_destroy; } destroy_cbdata_t; /* @@ -824,7 +876,7 @@ destroy_callback(zfs_handle_t *zhp, void *data) /* * Ignore pools (which we've already flagged as an error before getting - * here. + * here). */ if (strchr(zfs_get_name(zhp), '/') == NULL && zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) { @@ -836,7 +888,7 @@ destroy_callback(zfs_handle_t *zhp, void *data) * Bail out on the first error. */ if (zfs_unmount(zhp, NULL, cbp->cb_force ? MS_FORCE : 0) != 0 || - zfs_destroy(zhp) != 0) { + zfs_destroy(zhp, cbp->cb_defer_destroy) != 0) { zfs_close(zhp); return (-1); } @@ -888,10 +940,15 @@ zfs_do_destroy(int argc, char **argv) int c; zfs_handle_t *zhp; char *cp; + zfs_type_t type = ZFS_TYPE_DATASET; /* check options */ - while ((c = getopt(argc, argv, "frR")) != -1) { + while ((c = getopt(argc, argv, "dfrR")) != -1) { switch (c) { + case 'd': + cb.cb_defer_destroy = B_TRUE; + type = ZFS_TYPE_SNAPSHOT; + break; case 'f': cb.cb_force = 1; break; @@ -937,14 +994,22 @@ zfs_do_destroy(int argc, char **argv) cp++; if (cb.cb_doclones) { + boolean_t defer = cb.cb_defer_destroy; + + /* + * Temporarily ignore the defer_destroy setting since + * it's not supported for clones. 
+ */ + cb.cb_defer_destroy = B_FALSE; cb.cb_snapname = cp; if (destroy_snap_clones(zhp, &cb) != 0) { zfs_close(zhp); return (1); } + cb.cb_defer_destroy = defer; } - ret = zfs_destroy_snaps(zhp, cp); + ret = zfs_destroy_snaps(zhp, cp, cb.cb_defer_destroy); zfs_close(zhp); if (ret) { (void) fprintf(stderr, @@ -953,9 +1018,8 @@ zfs_do_destroy(int argc, char **argv) return (ret != 0); } - /* Open the given dataset */ - if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET)) == NULL) + if ((zhp = zfs_open(g_zfs, argv[0], type)) == NULL) return (1); cb.cb_target = zhp; @@ -981,15 +1045,15 @@ zfs_do_destroy(int argc, char **argv) * Check for any dependents and/or clones. */ cb.cb_first = B_TRUE; - if (!cb.cb_doclones && + if (!cb.cb_doclones && !cb.cb_defer_destroy && zfs_iter_dependents(zhp, B_TRUE, destroy_check_dependent, &cb) != 0) { zfs_close(zhp); return (1); } - if (cb.cb_error || - zfs_iter_dependents(zhp, B_FALSE, destroy_callback, &cb) != 0) { + if (cb.cb_error || (!cb.cb_defer_destroy && + (zfs_iter_dependents(zhp, B_FALSE, destroy_callback, &cb) != 0))) { zfs_close(zhp); return (1); } @@ -1002,22 +1066,35 @@ zfs_do_destroy(int argc, char **argv) if (destroy_callback(zhp, &cb) != 0) return (1); - return (0); } +static boolean_t +is_recvd_column(zprop_get_cbdata_t *cbp) +{ + int i; + zfs_get_column_t col; + + for (i = 0; i < ZFS_GET_NCOLS && + (col = cbp->cb_columns[i]) != GET_COL_NONE; i++) + if (col == GET_COL_RECVD) + return (B_TRUE); + return (B_FALSE); +} + /* - * zfs get [-rHp] [-o field[,field]...] [-s source[,source]...] - * < all | property[,property]... > < fs | snap | vol > ... + * zfs get [-rHp] [-o all | field[,field]...] [-s source[,source]...] + * < all | property[,property]... > < fs | snap | vol > ... * * -r recurse over any child datasets * -H scripted mode. Headers are stripped, and fields are separated * by tabs instead of spaces. - * -o Set of fields to display. One of "name,property,value,source". - * Default is all four. + * -o Set of fields to display. One of "name,property,value, + * received,source". Default is "name,property,value,source". + * "all" is an alias for all five. * -s Set of sources to allow. One of - * "local,default,inherited,temporary,none". Default is all - * five. + * "local,default,inherited,received,temporary,none". Default is + * all six. * -p Display values in parsable (literal) format. * * Prints properties for the given datasets. The user can control which @@ -1031,16 +1108,19 @@ static int get_callback(zfs_handle_t *zhp, void *data) { char buf[ZFS_MAXPROPLEN]; + char rbuf[ZFS_MAXPROPLEN]; zprop_source_t sourcetype; char source[ZFS_MAXNAMELEN]; zprop_get_cbdata_t *cbp = data; - nvlist_t *userprop = zfs_get_user_props(zhp); + nvlist_t *user_props = zfs_get_user_props(zhp); zprop_list_t *pl = cbp->cb_proplist; nvlist_t *propval; char *strval; char *sourceval; + boolean_t received = is_recvd_column(cbp); for (; pl != NULL; pl = pl->pl_next) { + char *recvdval = NULL; /* * Skip the special fake placeholder. This will also skip over * the name property when 'all' is specified. 
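
The destroy changes above thread one new flag, cb_defer_destroy, through the whole path: -d switches the expected type to a snapshot, skips the dependent walk, and passes the flag down to zfs_destroy()/zfs_destroy_snaps() so a busy snapshot is marked for deferred destruction instead of failing outright. A minimal sketch of that control flow, with fs_unmount()/fs_destroy()/fs_has_dependents() as hypothetical stand-ins for the libzfs calls:

#include <stdbool.h>

extern int fs_unmount(void *ds, int force);
extern int fs_destroy(void *ds, bool defer);
extern int fs_has_dependents(void *ds);

/*
 * Destroy one dataset.  With `defer' set (zfs destroy -d), the
 * dependent check is skipped and the destroy call is asked to mark
 * the snapshot for deferred destruction rather than fail while it is
 * still held or cloned.
 */
static int
destroy_one(void *ds, bool force, bool defer)
{
	if (!defer && fs_has_dependents(ds))
		return (-1);		/* caller reports the dependents */
	if (fs_unmount(ds, force) != 0)
		return (-1);
	return (fs_destroy(ds, defer));
}
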
@@ -1067,11 +1147,27 @@ get_callback(zfs_handle_t *zhp, void *data) (void) strlcpy(buf, "-", sizeof (buf)); } + if (received && (zfs_prop_get_recvd(zhp, + zfs_prop_to_name(pl->pl_prop), rbuf, sizeof (rbuf), + cbp->cb_literal) == 0)) + recvdval = rbuf; + zprop_print_one_property(zfs_get_name(zhp), cbp, zfs_prop_to_name(pl->pl_prop), - buf, sourcetype, source); + buf, sourcetype, source, recvdval); + } else if (zfs_prop_userquota(pl->pl_user_prop)) { + sourcetype = ZPROP_SRC_LOCAL; + + if (zfs_prop_get_userquota(zhp, pl->pl_user_prop, + buf, sizeof (buf), cbp->cb_literal) != 0) { + sourcetype = ZPROP_SRC_NONE; + (void) strlcpy(buf, "-", sizeof (buf)); + } + + zprop_print_one_property(zfs_get_name(zhp), cbp, + pl->pl_user_prop, buf, sourcetype, source, NULL); } else { - if (nvlist_lookup_nvlist(userprop, + if (nvlist_lookup_nvlist(user_props, pl->pl_user_prop, &propval) != 0) { if (pl->pl_all) continue; @@ -1086,6 +1182,9 @@ get_callback(zfs_handle_t *zhp, void *data) if (strcmp(sourceval, zfs_get_name(zhp)) == 0) { sourcetype = ZPROP_SRC_LOCAL; + } else if (strcmp(sourceval, + ZPROP_SOURCE_VAL_RECVD) == 0) { + sourcetype = ZPROP_SRC_RECEIVED; } else { sourcetype = ZPROP_SRC_INHERITED; (void) strlcpy(source, @@ -1093,9 +1192,14 @@ get_callback(zfs_handle_t *zhp, void *data) } } + if (received && (zfs_prop_get_recvd(zhp, + pl->pl_user_prop, rbuf, sizeof (rbuf), + cbp->cb_literal) == 0)) + recvdval = rbuf; + zprop_print_one_property(zfs_get_name(zhp), cbp, pl->pl_user_prop, strval, sourcetype, - source); + source, recvdval); } } @@ -1109,6 +1213,7 @@ zfs_do_get(int argc, char **argv) int i, c, flags = 0; char *value, *fields; int ret; + int limit = 0; zprop_list_t fake_name = { 0 }; /* @@ -1122,11 +1227,14 @@ zfs_do_get(int argc, char **argv) cb.cb_type = ZFS_TYPE_DATASET; /* check options */ - while ((c = getopt(argc, argv, ":o:s:rHp")) != -1) { + while ((c = getopt(argc, argv, ":d:o:s:rHp")) != -1) { switch (c) { case 'p': cb.cb_literal = B_TRUE; break; + case 'd': + limit = parse_depth(optarg, &flags); + break; case 'r': flags |= ZFS_ITER_RECURSE; break; @@ -1147,10 +1255,10 @@ zfs_do_get(int argc, char **argv) i = 0; while (*optarg != '\0') { static char *col_subopts[] = - { "name", "property", "value", "source", - NULL }; + { "name", "property", "value", "received", + "source", "all", NULL }; - if (i == 4) { + if (i == ZFS_GET_NCOLS) { (void) fprintf(stderr, gettext("too " "many fields given to -o " "option\n")); @@ -1169,8 +1277,28 @@ zfs_do_get(int argc, char **argv) cb.cb_columns[i++] = GET_COL_VALUE; break; case 3: + cb.cb_columns[i++] = GET_COL_RECVD; + flags |= ZFS_ITER_RECVD_PROPS; + break; + case 4: cb.cb_columns[i++] = GET_COL_SOURCE; break; + case 5: + if (i > 0) { + (void) fprintf(stderr, + gettext("\"all\" conflicts " + "with specific fields " + "given to -o option\n")); + usage(B_FALSE); + } + cb.cb_columns[0] = GET_COL_NAME; + cb.cb_columns[1] = GET_COL_PROPERTY; + cb.cb_columns[2] = GET_COL_VALUE; + cb.cb_columns[3] = GET_COL_RECVD; + cb.cb_columns[4] = GET_COL_SOURCE; + flags |= ZFS_ITER_RECVD_PROPS; + i = ZFS_GET_NCOLS; + break; default: (void) fprintf(stderr, gettext("invalid column name " @@ -1185,7 +1313,8 @@ zfs_do_get(int argc, char **argv) while (*optarg != '\0') { static char *source_subopts[] = { "local", "default", "inherited", - "temporary", "none", NULL }; + "received", "temporary", "none", + NULL }; switch (getsubopt(&optarg, source_subopts, &value)) { @@ -1199,9 +1328,12 @@ zfs_do_get(int argc, char **argv) cb.cb_sources |= ZPROP_SRC_INHERITED; break; case 3: 
- cb.cb_sources |= ZPROP_SRC_TEMPORARY; + cb.cb_sources |= ZPROP_SRC_RECEIVED; break; case 4: + cb.cb_sources |= ZPROP_SRC_TEMPORARY; + break; + case 5: cb.cb_sources |= ZPROP_SRC_NONE; break; default: @@ -1257,7 +1389,7 @@ zfs_do_get(int argc, char **argv) /* run for each object */ ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_DATASET, NULL, - &cb.cb_proplist, get_callback, &cb); + &cb.cb_proplist, limit, get_callback, &cb); if (cb.cb_proplist == &fake_name) zprop_free_list(fake_name.pl_next); @@ -1268,9 +1400,10 @@ zfs_do_get(int argc, char **argv) } /* - * inherit [-r] ... + * inherit [-rS] ... * - * -r Recurse over all children + * -r Recurse over all children + * -S Revert to received value, if any * * For each dataset specified on the command line, inherit the given property * from its parent. Inheriting a property at the pool level will cause it to @@ -1279,11 +1412,16 @@ zfs_do_get(int argc, char **argv) * local modifications for each dataset. */ +typedef struct inherit_cbdata { + const char *cb_propname; + boolean_t cb_received; +} inherit_cbdata_t; + static int inherit_recurse_cb(zfs_handle_t *zhp, void *data) { - char *propname = data; - zfs_prop_t prop = zfs_name_to_prop(propname); + inherit_cbdata_t *cb = data; + zfs_prop_t prop = zfs_name_to_prop(cb->cb_propname); /* * If we're doing it recursively, then ignore properties that @@ -1293,15 +1431,15 @@ inherit_recurse_cb(zfs_handle_t *zhp, void *data) !zfs_prop_valid_for_type(prop, zfs_get_type(zhp))) return (0); - return (zfs_prop_inherit(zhp, propname) != 0); + return (zfs_prop_inherit(zhp, cb->cb_propname, cb->cb_received) != 0); } static int inherit_cb(zfs_handle_t *zhp, void *data) { - char *propname = data; + inherit_cbdata_t *cb = data; - return (zfs_prop_inherit(zhp, propname) != 0); + return (zfs_prop_inherit(zhp, cb->cb_propname, cb->cb_received) != 0); } static int @@ -1309,16 +1447,21 @@ zfs_do_inherit(int argc, char **argv) { int c; zfs_prop_t prop; + inherit_cbdata_t cb = { 0 }; char *propname; int ret; int flags = 0; + boolean_t received = B_FALSE; /* check options */ - while ((c = getopt(argc, argv, "r")) != -1) { + while ((c = getopt(argc, argv, "rS")) != -1) { switch (c) { case 'r': flags |= ZFS_ITER_RECURSE; break; + case 'S': + received = B_TRUE; + break; case '?': default: (void) fprintf(stderr, gettext("invalid option '%c'\n"), @@ -1351,7 +1494,7 @@ zfs_do_inherit(int argc, char **argv) propname); return (1); } - if (!zfs_prop_inheritable(prop)) { + if (!zfs_prop_inheritable(prop) && !received) { (void) fprintf(stderr, gettext("'%s' property cannot " "be inherited\n"), propname); if (prop == ZFS_PROP_QUOTA || @@ -1362,18 +1505,27 @@ zfs_do_inherit(int argc, char **argv) "%s=none' to clear\n"), propname); return (1); } + if (received && (prop == ZFS_PROP_VOLSIZE || + prop == ZFS_PROP_VERSION)) { + (void) fprintf(stderr, gettext("'%s' property cannot " + "be reverted to a received value\n"), propname); + return (1); + } } else if (!zfs_prop_user(propname)) { (void) fprintf(stderr, gettext("invalid property '%s'\n"), propname); usage(B_FALSE); } + cb.cb_propname = propname; + cb.cb_received = received; + if (flags & ZFS_ITER_RECURSE) { ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_DATASET, - NULL, NULL, inherit_recurse_cb, propname); + NULL, NULL, 0, inherit_recurse_cb, &cb); } else { ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_DATASET, - NULL, NULL, inherit_cb, propname); + NULL, NULL, 0, inherit_cb, &cb); } return (ret); @@ -1442,21 +1594,30 @@ upgrade_set_callback(zfs_handle_t *zhp, void *data) { 
upgrade_cbdata_t *cb = data; int version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION); - - if (cb->cb_version >= ZPL_VERSION_FUID) { - int spa_version; - - if (zfs_spa_version(zhp, &spa_version) < 0) - return (-1); - - if (spa_version < SPA_VERSION_FUID) { - /* can't upgrade */ - (void) printf(gettext("%s: can not be upgraded; " - "the pool version needs to first be upgraded\nto " - "version %d\n\n"), - zfs_get_name(zhp), SPA_VERSION_FUID); - cb->cb_numfailed++; - return (0); + int i; + static struct { int zplver; int spaver; } table[] = { + {ZPL_VERSION_FUID, SPA_VERSION_FUID}, + {ZPL_VERSION_USERSPACE, SPA_VERSION_USERSPACE}, + {0, 0} + }; + + + for (i = 0; table[i].zplver; i++) { + if (cb->cb_version >= table[i].zplver) { + int spa_version; + + if (zfs_spa_version(zhp, &spa_version) < 0) + return (-1); + + if (spa_version < table[i].spaver) { + /* can't upgrade */ + (void) printf(gettext("%s: can not be " + "upgraded; the pool version needs to first " + "be upgraded\nto version %d\n\n"), + zfs_get_name(zhp), table[i].spaver); + cb->cb_numfailed++; + return (0); + } } } @@ -1556,7 +1717,9 @@ zfs_do_upgrade(int argc, char **argv) (void) printf(gettext(" 1 Initial ZFS filesystem version\n")); (void) printf(gettext(" 2 Enhanced directory entries\n")); (void) printf(gettext(" 3 Case insensitive and File system " - "unique identifer (FUID)\n")); + "unique identifier (FUID)\n")); + (void) printf(gettext(" 4 userquota, groupquota " + "properties\n")); (void) printf(gettext("\nFor more information on a particular " "version, including supported releases, see:\n\n")); (void) printf("http://www.opensolaris.org/os/community/zfs/" @@ -1568,7 +1731,7 @@ zfs_do_upgrade(int argc, char **argv) if (cb.cb_version == 0) cb.cb_version = ZPL_VERSION; ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_FILESYSTEM, - NULL, NULL, upgrade_set_callback, &cb); + NULL, NULL, 0, upgrade_set_callback, &cb); (void) printf(gettext("%llu filesystems upgraded\n"), cb.cb_numupgraded); if (cb.cb_numsamegraded) { @@ -1586,14 +1749,14 @@ zfs_do_upgrade(int argc, char **argv) flags |= ZFS_ITER_RECURSE; ret = zfs_for_each(0, NULL, flags, ZFS_TYPE_FILESYSTEM, - NULL, NULL, upgrade_list_callback, &cb); + NULL, NULL, 0, upgrade_list_callback, &cb); found = cb.cb_foundone; cb.cb_foundone = B_FALSE; cb.cb_newer = B_TRUE; ret = zfs_for_each(0, NULL, flags, ZFS_TYPE_FILESYSTEM, - NULL, NULL, upgrade_list_callback, &cb); + NULL, NULL, 0, upgrade_list_callback, &cb); if (!cb.cb_foundone && !found) { (void) printf(gettext("All filesystems are " @@ -1605,14 +1768,93 @@ zfs_do_upgrade(int argc, char **argv) } /* - * list [-rH] [-o property[,property]...] [-t type[,type]...] 
+ * zfs userspace + */ +static int +userspace_cb(void *arg, const char *domain, uid_t rid, uint64_t space) +{ + zfs_userquota_prop_t *typep = arg; + zfs_userquota_prop_t p = *typep; + char *name = NULL; + char *ug, *propname; + char namebuf[32]; + char sizebuf[32]; + + if (domain == NULL || domain[0] == '\0') { + if (p == ZFS_PROP_GROUPUSED || p == ZFS_PROP_GROUPQUOTA) { + struct group *g = getgrgid(rid); + if (g) + name = g->gr_name; + } else { + struct passwd *p = getpwuid(rid); + if (p) + name = p->pw_name; + } + } + + if (p == ZFS_PROP_GROUPUSED || p == ZFS_PROP_GROUPQUOTA) + ug = "group"; + else + ug = "user"; + + if (p == ZFS_PROP_USERUSED || p == ZFS_PROP_GROUPUSED) + propname = "used"; + else + propname = "quota"; + + if (name == NULL) { + (void) snprintf(namebuf, sizeof (namebuf), + "%llu", (longlong_t)rid); + name = namebuf; + } + zfs_nicenum(space, sizebuf, sizeof (sizebuf)); + + (void) printf("%s %s %s%c%s %s\n", propname, ug, domain, + domain[0] ? '-' : ' ', name, sizebuf); + + return (0); +} + +static int +zfs_do_userspace(int argc, char **argv) +{ + zfs_handle_t *zhp; + zfs_userquota_prop_t p; + int error; + + /* + * Try the python version. If the execv fails, we'll continue + * and do a simplistic implementation. + */ + (void) execv(pypath, argv-1); + + (void) printf("internal error: %s not found\n" + "falling back on built-in implementation, " + "some features will not work\n", pypath); + + if ((zhp = zfs_open(g_zfs, argv[argc-1], ZFS_TYPE_DATASET)) == NULL) + return (1); + + (void) printf("PROP TYPE NAME VALUE\n"); + + for (p = 0; p < ZFS_NUM_USERQUOTA_PROPS; p++) { + error = zfs_userspace(zhp, p, userspace_cb, &p); + if (error) + break; + } + return (error); +} + +/* + * list [-r][-d max] [-H] [-o property[,property]...] [-t type[,type]...] * [-s property [-s property]...] [-S property [-S property]...] * ... * - * -r Recurse over all children - * -H Scripted mode; elide headers and separate columns by tabs - * -o Control which fields to display. - * -t Control which object types to display. + * -r Recurse over all children + * -d Limit recursion by depth. + * -H Scripted mode; elide headers and separate columns by tabs + * -o Control which fields to display. + * -t Control which object types to display. * -s Specify sort columns, descending order. * -S Specify sort columns, ascending order. 
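
userspace_cb() above resolves a numeric id to a user or group name with getpwuid()/getgrgid() and falls back to printing the raw id when no local entry exists. That lookup-with-fallback is plain POSIX and can be factored into a small helper; the function below is a sketch, not part of the patch:

#include <sys/types.h>
#include <grp.h>
#include <pwd.h>
#include <stdio.h>

/*
 * Resolve a numeric uid/gid to a printable name, falling back to the
 * number itself when there is no local passwd/group entry (the same
 * fallback the built-in `zfs userspace' implementation uses).
 */
static const char *
id_to_name(uid_t rid, int is_group, char *buf, size_t buflen)
{
	if (is_group) {
		struct group *g = getgrgid(rid);
		if (g != NULL)
			return (g->gr_name);
	} else {
		struct passwd *p = getpwuid(rid);
		if (p != NULL)
			return (p->pw_name);
	}
	(void) snprintf(buf, buflen, "%llu", (unsigned long long)rid);
	return (buf);
}
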
* @@ -1692,7 +1934,6 @@ print_dataset(zfs_handle_t *zhp, zprop_list_t *pl, boolean_t scripted) first = B_FALSE; } - right_justify = B_FALSE; if (pl->pl_prop != ZPROP_INVAL) { if (zfs_prop_get(zhp, pl->pl_prop, property, sizeof (property), NULL, NULL, 0, B_FALSE) != 0) @@ -1701,6 +1942,13 @@ print_dataset(zfs_handle_t *zhp, zprop_list_t *pl, boolean_t scripted) propstr = property; right_justify = zfs_prop_align_right(pl->pl_prop); + } else if (zfs_prop_userquota(pl->pl_user_prop)) { + if (zfs_prop_get_userquota(zhp, pl->pl_user_prop, + property, sizeof (property), B_FALSE) != 0) + propstr = "-"; + else + propstr = property; + right_justify = B_TRUE; } else { if (nvlist_lookup_nvlist(userprops, pl->pl_user_prop, &propval) != 0) @@ -1708,6 +1956,7 @@ print_dataset(zfs_handle_t *zhp, zprop_list_t *pl, boolean_t scripted) else verify(nvlist_lookup_string(propval, ZPROP_VALUE, &propstr) == 0); + right_justify = B_FALSE; } width = pl->pl_width; @@ -1759,16 +2008,20 @@ zfs_do_list(int argc, char **argv) char *fields = NULL; list_cbdata_t cb = { 0 }; char *value; + int limit = 0; int ret; zfs_sort_column_t *sortcol = NULL; int flags = ZFS_ITER_PROP_LISTSNAPS | ZFS_ITER_ARGS_CAN_BE_PATHS; /* check options */ - while ((c = getopt(argc, argv, ":o:rt:Hs:S:")) != -1) { + while ((c = getopt(argc, argv, ":d:o:rt:Hs:S:")) != -1) { switch (c) { case 'o': fields = optarg; break; + case 'd': + limit = parse_depth(optarg, &flags); + break; case 'r': flags |= ZFS_ITER_RECURSE; break; @@ -1859,7 +2112,7 @@ zfs_do_list(int argc, char **argv) cb.cb_first = B_TRUE; ret = zfs_for_each(argc, argv, flags, types, sortcol, &cb.cb_proplist, - list_callback, &cb); + limit, list_callback, &cb); zprop_free_list(cb.cb_proplist); zfs_free_sort_columns(sortcol); @@ -1998,9 +2251,9 @@ zfs_do_promote(int argc, char **argv) /* * zfs rollback [-rRf] * - * -r Delete any intervening snapshots before doing rollback - * -R Delete any snapshots and their clones - * -f ignored for backwards compatability + * -r Delete any intervening snapshots before doing rollback + * -R Delete any snapshots and their clones + * -f ignored for backwards compatability * * Given a filesystem, rollback to a specific snapshot, discarding any changes * since then and making it the active dataset. If more recent snapshots exist, @@ -2242,7 +2495,7 @@ zfs_do_set(int argc, char **argv) } ret = zfs_for_each(argc - 2, argv + 2, NULL, - ZFS_TYPE_DATASET, NULL, NULL, set_callback, &cb); + ZFS_TYPE_DATASET, NULL, NULL, 0, set_callback, &cb); return (ret); } @@ -2310,8 +2563,8 @@ zfs_do_snapshot(int argc, char **argv) } /* - * zfs send [-v] -R [-i|-I <@snap>] - * zfs send [-v] [-i|-I <@snap>] + * zfs send [-vDp] -R [-i|-I <@snap>] + * zfs send [-vDp] [-i|-I <@snap>] * * Send a backup stream to stdout. 
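
The send usage above documents the new -D (dedup) and -p (properties) options; the following hunk replaces zfs_do_send()'s separate booleans with a single flags structure filled from getopt() and handed to zfs_send() as one argument. A reduced sketch of that option-collection pattern (the struct layout here is illustrative, not the real libzfs sendflags_t definition):

#include <unistd.h>

/* illustrative only; not the real libzfs sendflags_t layout */
typedef struct send_flags {
	int verbose;
	int replicate;	/* -R: full replication stream */
	int doall;	/* -I: send all intermediate snapshots */
	int dedup;	/* -D: deduplicated stream */
	int props;	/* -p: include properties */
} send_flags_t;

/* collect options into one struct; returns optind, or -1 on bad usage */
static int
parse_send_opts(int argc, char **argv, send_flags_t *fp, char **fromp)
{
	int c;

	while ((c = getopt(argc, argv, ":i:I:RDpv")) != -1) {
		switch (c) {
		case 'i':
			*fromp = optarg;
			break;
		case 'I':
			*fromp = optarg;
			fp->doall = 1;
			break;
		case 'R':
			fp->replicate = 1;
			break;
		case 'D':
			fp->dedup = 1;
			break;
		case 'p':
			fp->props = 1;
			break;
		case 'v':
			fp->verbose = 1;
			break;
		default:
			return (-1);
		}
	}
	return (optind);
}
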
*/ @@ -2322,14 +2575,11 @@ zfs_do_send(int argc, char **argv) char *toname = NULL; char *cp; zfs_handle_t *zhp; - boolean_t doall = B_FALSE; - boolean_t replicate = B_FALSE; - boolean_t fromorigin = B_FALSE; - boolean_t verbose = B_FALSE; + sendflags_t flags = { 0 }; int c, err; /* check options */ - while ((c = getopt(argc, argv, ":i:I:Rv")) != -1) { + while ((c = getopt(argc, argv, ":i:I:RDpv")) != -1) { switch (c) { case 'i': if (fromname) @@ -2340,13 +2590,19 @@ zfs_do_send(int argc, char **argv) if (fromname) usage(B_FALSE); fromname = optarg; - doall = B_TRUE; + flags.doall = B_TRUE; break; case 'R': - replicate = B_TRUE; + flags.replicate = B_TRUE; + break; + case 'p': + flags.props = B_TRUE; break; case 'v': - verbose = B_TRUE; + flags.verbose = B_TRUE; + break; + case 'D': + flags.dedup = B_TRUE; break; case ':': (void) fprintf(stderr, gettext("missing argument for " @@ -2406,7 +2662,7 @@ zfs_do_send(int argc, char **argv) if (strcmp(origin, fromname) == 0) { fromname = NULL; - fromorigin = B_TRUE; + flags.fromorigin = B_TRUE; } else { *cp = '\0'; if (cp != fromname && strcmp(argv[0], fromname)) { @@ -2424,18 +2680,17 @@ zfs_do_send(int argc, char **argv) } } - if (replicate && fromname == NULL) - doall = B_TRUE; + if (flags.replicate && fromname == NULL) + flags.doall = B_TRUE; - err = zfs_send(zhp, fromname, toname, replicate, doall, fromorigin, - verbose, STDOUT_FILENO); + err = zfs_send(zhp, fromname, toname, flags, STDOUT_FILENO, NULL, 0); zfs_close(zhp); return (err != 0); } /* - * zfs receive [-dnvF] + * zfs receive [-denvF] * * Restore a backup stream from stdin. */ @@ -2443,18 +2698,24 @@ static int zfs_do_receive(int argc, char **argv) { int c, err; - recvflags_t flags; + recvflags_t flags = { 0 }; - bzero(&flags, sizeof (recvflags_t)); /* check options */ - while ((c = getopt(argc, argv, ":dnvF")) != -1) { + while ((c = getopt(argc, argv, ":denuvF")) != -1) { switch (c) { case 'd': flags.isprefix = B_TRUE; break; + case 'e': + flags.isprefix = B_TRUE; + flags.istail = B_TRUE; + break; case 'n': flags.dryrun = B_TRUE; break; + case 'u': + flags.nomount = B_TRUE; + break; case 'v': flags.verbose = B_TRUE; break; @@ -2499,386 +2760,111 @@ zfs_do_receive(int argc, char **argv) return (err != 0); } -typedef struct allow_cb { - int a_permcnt; - size_t a_treeoffset; -} allow_cb_t; - -static void -zfs_print_perms(avl_tree_t *tree) -{ - zfs_perm_node_t *permnode; - - permnode = avl_first(tree); - while (permnode != NULL) { - (void) printf("%s", permnode->z_pname); - permnode = AVL_NEXT(tree, permnode); - if (permnode) - (void) printf(","); - else - (void) printf("\n"); - } -} - -/* - * Iterate over user/groups/everyone/... and the call perm_iter - * function to print actual permission when tree has >0 nodes. 
- */ -static void -zfs_iter_perms(avl_tree_t *tree, const char *banner, allow_cb_t *cb) -{ - zfs_allow_node_t *item; - avl_tree_t *ptree; - - item = avl_first(tree); - while (item) { - ptree = (void *)((char *)item + cb->a_treeoffset); - if (avl_numnodes(ptree)) { - if (cb->a_permcnt++ == 0) - (void) printf("%s\n", banner); - (void) printf("\t%s", item->z_key); - /* - * Avoid an extra space being printed - * for "everyone" which is keyed with a null - * string - */ - if (item->z_key[0] != '\0') - (void) printf(" "); - zfs_print_perms(ptree); - } - item = AVL_NEXT(tree, item); - } -} - -#define LINES "-------------------------------------------------------------\n" static int -zfs_print_allows(char *ds) -{ - zfs_allow_t *curperms, *perms; - zfs_handle_t *zhp; - allow_cb_t allowcb = { 0 }; - char banner[MAXPATHLEN]; - - if (ds[0] == '-') - usage(B_FALSE); - - if (strrchr(ds, '@')) { - (void) fprintf(stderr, gettext("Snapshots don't have 'allow'" - " permissions\n")); - return (1); - } - if ((zhp = zfs_open(g_zfs, ds, ZFS_TYPE_DATASET)) == NULL) - return (1); - - if (zfs_perm_get(zhp, &perms)) { - (void) fprintf(stderr, - gettext("Failed to retrieve 'allows' on %s\n"), ds); - zfs_close(zhp); - return (1); - } - - zfs_close(zhp); - - if (perms != NULL) - (void) printf("%s", LINES); - for (curperms = perms; curperms; curperms = curperms->z_next) { - - (void) snprintf(banner, sizeof (banner), - "Permission sets on (%s)", curperms->z_setpoint); - allowcb.a_treeoffset = - offsetof(zfs_allow_node_t, z_localdescend); - allowcb.a_permcnt = 0; - zfs_iter_perms(&curperms->z_sets, banner, &allowcb); - - (void) snprintf(banner, sizeof (banner), - "Create time permissions on (%s)", curperms->z_setpoint); - allowcb.a_treeoffset = - offsetof(zfs_allow_node_t, z_localdescend); - allowcb.a_permcnt = 0; - zfs_iter_perms(&curperms->z_crperms, banner, &allowcb); - - - (void) snprintf(banner, sizeof (banner), - "Local permissions on (%s)", curperms->z_setpoint); - allowcb.a_treeoffset = offsetof(zfs_allow_node_t, z_local); - allowcb.a_permcnt = 0; - zfs_iter_perms(&curperms->z_user, banner, &allowcb); - zfs_iter_perms(&curperms->z_group, banner, &allowcb); - zfs_iter_perms(&curperms->z_everyone, banner, &allowcb); - - (void) snprintf(banner, sizeof (banner), - "Descendent permissions on (%s)", curperms->z_setpoint); - allowcb.a_treeoffset = offsetof(zfs_allow_node_t, z_descend); - allowcb.a_permcnt = 0; - zfs_iter_perms(&curperms->z_user, banner, &allowcb); - zfs_iter_perms(&curperms->z_group, banner, &allowcb); - zfs_iter_perms(&curperms->z_everyone, banner, &allowcb); - - (void) snprintf(banner, sizeof (banner), - "Local+Descendent permissions on (%s)", - curperms->z_setpoint); - allowcb.a_treeoffset = - offsetof(zfs_allow_node_t, z_localdescend); - allowcb.a_permcnt = 0; - zfs_iter_perms(&curperms->z_user, banner, &allowcb); - zfs_iter_perms(&curperms->z_group, banner, &allowcb); - zfs_iter_perms(&curperms->z_everyone, banner, &allowcb); - - (void) printf("%s", LINES); - } - zfs_free_allows(perms); - return (0); -} - -#define ALLOWOPTIONS "ldcsu:g:e" -#define UNALLOWOPTIONS "ldcsu:g:er" - -/* - * Validate options, and build necessary datastructure to display/remove/add - * permissions. - * Returns 0 - If permissions should be added/removed - * Returns 1 - If permissions should be displayed. 
- * Returns -1 - on failure - */ -int -parse_allow_args(int *argc, char **argv[], boolean_t unallow, - char **ds, int *recurse, nvlist_t **zperms) +zfs_do_hold_rele_impl(int argc, char **argv, boolean_t holding) { + int errors = 0; + int i; + const char *tag; + boolean_t recursive = B_FALSE; + boolean_t temphold = B_FALSE; + const char *opts = holding ? "rt" : "r"; int c; - char *options = unallow ? UNALLOWOPTIONS : ALLOWOPTIONS; - zfs_deleg_inherit_t deleg_type = ZFS_DELEG_NONE; - zfs_deleg_who_type_t who_type = ZFS_DELEG_WHO_UNKNOWN; - char *who = NULL; - char *perms = NULL; - zfs_handle_t *zhp; - while ((c = getopt(*argc, *argv, options)) != -1) { + /* check options */ + while ((c = getopt(argc, argv, opts)) != -1) { switch (c) { - case 'l': - if (who_type == ZFS_DELEG_CREATE || - who_type == ZFS_DELEG_NAMED_SET) - usage(B_FALSE); - - deleg_type |= ZFS_DELEG_PERM_LOCAL; - break; - case 'd': - if (who_type == ZFS_DELEG_CREATE || - who_type == ZFS_DELEG_NAMED_SET) - usage(B_FALSE); - - deleg_type |= ZFS_DELEG_PERM_DESCENDENT; - break; case 'r': - *recurse = B_TRUE; - break; - case 'c': - if (who_type != ZFS_DELEG_WHO_UNKNOWN) - usage(B_FALSE); - if (deleg_type) - usage(B_FALSE); - who_type = ZFS_DELEG_CREATE; - break; - case 's': - if (who_type != ZFS_DELEG_WHO_UNKNOWN) - usage(B_FALSE); - if (deleg_type) - usage(B_FALSE); - who_type = ZFS_DELEG_NAMED_SET; - break; - case 'u': - if (who_type != ZFS_DELEG_WHO_UNKNOWN) - usage(B_FALSE); - who_type = ZFS_DELEG_USER; - who = optarg; - break; - case 'g': - if (who_type != ZFS_DELEG_WHO_UNKNOWN) - usage(B_FALSE); - who_type = ZFS_DELEG_GROUP; - who = optarg; + recursive = B_TRUE; break; - case 'e': - if (who_type != ZFS_DELEG_WHO_UNKNOWN) - usage(B_FALSE); - who_type = ZFS_DELEG_EVERYONE; + case 't': + temphold = B_TRUE; break; - default: + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); usage(B_FALSE); - break; } } - if (deleg_type == 0) - deleg_type = ZFS_DELEG_PERM_LOCALDESCENDENT; - - *argc -= optind; - *argv += optind; - - if (unallow == B_FALSE && *argc == 1) { - /* - * Only print permissions if no options were processed - */ - if (optind == 1) - return (1); - else - usage(B_FALSE); - } - - /* - * initialize variables for zfs_build_perms based on number - * of arguments. - * 3 arguments ==> zfs [un]allow joe perm,perm,perm or - * zfs [un]allow -s @set1 perm,perm - * 2 arguments ==> zfs [un]allow -c perm,perm or - * zfs [un]allow -u|-g perm or - * zfs [un]allow -e perm,perm - * zfs unallow joe - * zfs unallow -s @set1 - * 1 argument ==> zfs [un]allow -e or - * zfs [un]allow -c - */ - - switch (*argc) { - case 3: - perms = (*argv)[1]; - who = (*argv)[0]; - *ds = (*argv)[2]; - - /* - * advance argc/argv for do_allow cases. - * for do_allow case make sure who have a know who type - * and its not a permission set. 
- */ - if (unallow == B_TRUE) { - *argc -= 2; - *argv += 2; - } else if (who_type != ZFS_DELEG_WHO_UNKNOWN && - who_type != ZFS_DELEG_NAMED_SET) - usage(B_FALSE); - break; - - case 2: - if (unallow == B_TRUE && (who_type == ZFS_DELEG_EVERYONE || - who_type == ZFS_DELEG_CREATE || who != NULL)) { - perms = (*argv)[0]; - *ds = (*argv)[1]; - } else { - if (unallow == B_FALSE && - (who_type == ZFS_DELEG_WHO_UNKNOWN || - who_type == ZFS_DELEG_NAMED_SET)) - usage(B_FALSE); - else if (who_type == ZFS_DELEG_WHO_UNKNOWN || - who_type == ZFS_DELEG_NAMED_SET) - who = (*argv)[0]; - else if (who_type != ZFS_DELEG_NAMED_SET) - perms = (*argv)[0]; - *ds = (*argv)[1]; - } - if (unallow == B_TRUE) { - (*argc)--; - (*argv)++; - } - break; - - case 1: - if (unallow == B_FALSE) - usage(B_FALSE); - if (who == NULL && who_type != ZFS_DELEG_CREATE && - who_type != ZFS_DELEG_EVERYONE) - usage(B_FALSE); - *ds = (*argv)[0]; - break; + argc -= optind; + argv += optind; - default: + /* check number of arguments */ + if (argc < 2) usage(B_FALSE); - } - if (strrchr(*ds, '@')) { - (void) fprintf(stderr, - gettext("Can't set or remove 'allow' permissions " - "on snapshots.\n")); - return (-1); - } + tag = argv[0]; + --argc; + ++argv; - if ((zhp = zfs_open(g_zfs, *ds, ZFS_TYPE_DATASET)) == NULL) - return (-1); - - if ((zfs_build_perms(zhp, who, perms, - who_type, deleg_type, zperms)) != 0) { - zfs_close(zhp); - return (-1); + if (holding && tag[0] == '.') { + /* tags starting with '.' are reserved for libzfs */ + (void) fprintf(stderr, gettext("tag may not start with '.'\n")); + usage(B_FALSE); } - zfs_close(zhp); - return (0); -} -static int -zfs_do_allow(int argc, char **argv) -{ - char *ds; - nvlist_t *zperms = NULL; - zfs_handle_t *zhp; - int unused; - int ret; - - if ((ret = parse_allow_args(&argc, &argv, B_FALSE, &ds, - &unused, &zperms)) == -1) - return (1); - - if (ret == 1) - return (zfs_print_allows(argv[0])); + for (i = 0; i < argc; ++i) { + zfs_handle_t *zhp; + char parent[ZFS_MAXNAMELEN]; + const char *delim; + char *path = argv[i]; - if ((zhp = zfs_open(g_zfs, ds, ZFS_TYPE_DATASET)) == NULL) - return (1); + delim = strchr(path, '@'); + if (delim == NULL) { + (void) fprintf(stderr, + gettext("'%s' is not a snapshot\n"), path); + ++errors; + continue; + } + (void) strncpy(parent, path, delim - path); + parent[delim - path] = '\0'; - if (zfs_perm_set(zhp, zperms)) { + zhp = zfs_open(g_zfs, parent, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); + if (zhp == NULL) { + ++errors; + continue; + } + if (holding) { + if (zfs_hold(zhp, delim+1, tag, recursive, + temphold, B_FALSE) != 0) + ++errors; + } else { + if (zfs_release(zhp, delim+1, tag, recursive) != 0) + ++errors; + } zfs_close(zhp); - nvlist_free(zperms); - return (1); } - nvlist_free(zperms); - zfs_close(zhp); - return (0); + return (errors != 0); } +/* + * zfs hold [-r] [-t] ... + * + * -r Recursively hold + * -t Temporary hold (hidden option) + * + * Apply a user-hold with the given tag to the list of snapshots. + */ static int -unallow_callback(zfs_handle_t *zhp, void *data) +zfs_do_hold(int argc, char **argv) { - nvlist_t *nvp = (nvlist_t *)data; - int error; - - error = zfs_perm_remove(zhp, nvp); - if (error) { - (void) fprintf(stderr, gettext("Failed to remove permissions " - "on %s\n"), zfs_get_name(zhp)); - } - return (error); + return (zfs_do_hold_rele_impl(argc, argv, B_TRUE)); } +/* + * zfs release [-r] ... + * + * -r Recursively release + * + * Release a user-hold with the given tag from the list of snapshots. 
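Editorial note: zfs_do_hold_rele_impl() above takes snapshot arguments of the form "fs@snap", opens the dataset part and passes the text after '@' to zfs_hold()/zfs_release(). A minimal sketch of that argument splitting follows; split_snapshot_arg() is a hypothetical helper and the 256-byte buffer merely stands in for ZFS_MAXNAMELEN.

#include <stdio.h>
#include <string.h>

static int
split_snapshot_arg(const char *path, char *parent, size_t parentlen,
    const char **snapname)
{
	const char *delim = strchr(path, '@');

	/* Reject non-snapshots and names too long for the buffer. */
	if (delim == NULL || (size_t)(delim - path) >= parentlen)
		return (-1);

	(void) strncpy(parent, path, delim - path);
	parent[delim - path] = '\0';
	*snapname = delim + 1;	/* the hold/release APIs take the short name */
	return (0);
}

int
main(void)
{
	char parent[256];
	const char *snap;

	if (split_snapshot_arg("tank/home@backup", parent,
	    sizeof (parent), &snap) == 0)
		(void) printf("dataset=%s snapshot=%s\n", parent, snap);
	return (0);
}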
+ */ static int -zfs_do_unallow(int argc, char **argv) +zfs_do_release(int argc, char **argv) { - int recurse = B_FALSE; - char *ds; - int error; - nvlist_t *zperms = NULL; - int flags = 0; - - if (parse_allow_args(&argc, &argv, B_TRUE, - &ds, &recurse, &zperms) == -1) - return (1); - - if (recurse) - flags |= ZFS_ITER_RECURSE; - error = zfs_for_each(argc, argv, flags, - ZFS_TYPE_FILESYSTEM|ZFS_TYPE_VOLUME, NULL, - NULL, unallow_callback, (void *)zperms); - - if (zperms) - nvlist_free(zperms); - - return (error); + return (zfs_do_hold_rele_impl(argc, argv, B_FALSE)); } typedef struct get_all_cbdata { @@ -3071,7 +3057,6 @@ share_mount_one(zfs_handle_t *zhp, int op, int flags, char *protocol, sizeof (shareopts), NULL, NULL, 0, B_FALSE) == 0); verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, smbshareopts, sizeof (smbshareopts), NULL, NULL, 0, B_FALSE) == 0); - canmount = zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT); if (op == OP_SHARE && strcmp(shareopts, "off") == 0 && strcmp(smbshareopts, "off") == 0) { @@ -3081,7 +3066,8 @@ share_mount_one(zfs_handle_t *zhp, int op, int flags, char *protocol, (void) fprintf(stderr, gettext("cannot share '%s': " "legacy share\n"), zfs_get_name(zhp)); (void) fprintf(stderr, gettext("use share(1M) to " - "share this filesystem\n")); + "share this filesystem, or set " + "sharenfs property on\n")); return (1); } @@ -3119,6 +3105,7 @@ share_mount_one(zfs_handle_t *zhp, int op, int flags, char *protocol, * noauto no return 0 * noauto yes pass through */ + canmount = zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT); if (canmount == ZFS_CANMOUNT_OFF) { if (!explicit) return (0); @@ -3945,6 +3932,15 @@ zfs_do_unshare(int argc, char **argv) return (unshare_unmount(OP_SHARE, argc, argv)); } +/* ARGSUSED */ +static int +zfs_do_python(int argc, char **argv) +{ + (void) execv(pypath, argv-1); + (void) printf("internal error: %s not found\n", pypath); + return (-1); +} + /* * Called when invoked as /etc/fs/zfs/mount. Do the mount if the mountpoint is * 'legacy'. Otherwise, complain that use should be using 'zfs mount'. @@ -4080,27 +4076,6 @@ manual_unmount(int argc, char **argv) return (unshare_unmount_path(OP_MOUNT, argv[0], flags, B_TRUE)); } -static int -volcheck(zpool_handle_t *zhp, void *data) -{ - boolean_t isinit = *((boolean_t *)data); - - if (isinit) - return (zpool_create_zvol_links(zhp)); - else - return (zpool_remove_zvol_links(zhp)); -} - -/* - * Iterate over all pools in the system and either create or destroy /dev/zvol - * links, depending on the value of 'isinit'. - */ -static int -do_volcheck(boolean_t isinit) -{ - return (zpool_iter(g_zfs, volcheck, &isinit) ? 1 : 0); -} - static int find_command_idx(char *command, int *idx) { @@ -4186,18 +4161,10 @@ main(int argc, char **argv) if (strcmp(cmdname, "-?") == 0) usage(B_TRUE); - /* - * 'volinit' and 'volfini' do not appear in the usage message, - * so we have to special case them here. - */ - if (strcmp(cmdname, "volinit") == 0) - return (do_volcheck(B_TRUE)); - else if (strcmp(cmdname, "volfini") == 0) - return (do_volcheck(B_FALSE)); - /* * Run the appropriate command. 
*/ + libzfs_mnttab_cache(g_zfs, B_TRUE); if (find_command_idx(cmdname, &i) == 0) { current_command = &command_table[i]; ret = command_table[i].func(argc - 1, argv + 1); @@ -4210,6 +4177,7 @@ main(int argc, char **argv) "command '%s'\n"), cmdname); usage(B_FALSE); } + libzfs_mnttab_cache(g_zfs, B_FALSE); } (void) fclose(mnttab_file); diff --git a/external/cddl/osnet/dist/cmd/zpool/zpool_main.c b/external/cddl/osnet/dist/cmd/zpool/zpool_main.c index 54bba8645c669..96fba62d0f886 100644 --- a/external/cddl/osnet/dist/cmd/zpool/zpool_main.c +++ b/external/cddl/osnet/dist/cmd/zpool/zpool_main.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -50,6 +50,8 @@ #include "zpool_util.h" #include "zfs_comutil.h" +#include "statcommon.h" + static int zpool_do_create(int, char **); static int zpool_do_destroy(int, char **); @@ -67,6 +69,7 @@ static int zpool_do_clear(int, char **); static int zpool_do_attach(int, char **); static int zpool_do_detach(int, char **); static int zpool_do_replace(int, char **); +static int zpool_do_split(int, char **); static int zpool_do_scrub(int, char **); @@ -119,7 +122,8 @@ typedef enum { HELP_STATUS, HELP_UPGRADE, HELP_GET, - HELP_SET + HELP_SET, + HELP_SPLIT } zpool_help_t; @@ -156,6 +160,7 @@ static zpool_command_t command_table[] = { { "attach", zpool_do_attach, HELP_ATTACH }, { "detach", zpool_do_detach, HELP_DETACH }, { "replace", zpool_do_replace, HELP_REPLACE }, + { "split", zpool_do_split, HELP_SPLIT }, { NULL }, { "scrub", zpool_do_scrub, HELP_SCRUB }, { NULL }, @@ -173,6 +178,8 @@ static zpool_command_t command_table[] = { zpool_command_t *current_command; static char history_str[HIS_MAX_RECORD_LEN]; +static uint_t timestamp_fmt = NODATE; + static const char * get_usage(zpool_help_t idx) { switch (idx) { @@ -182,7 +189,7 @@ get_usage(zpool_help_t idx) { return (gettext("\tattach [-f] " "\n")); case HELP_CLEAR: - return (gettext("\tclear [device]\n")); + return (gettext("\tclear [-nF] [device]\n")); case HELP_CREATE: return (gettext("\tcreate [-fn] [-o property=value] ... \n" "\t [-O file-system-property=value] ... \n" @@ -197,13 +204,14 @@ get_usage(zpool_help_t idx) { return (gettext("\thistory [-il] [] ...\n")); case HELP_IMPORT: return (gettext("\timport [-d dir] [-D]\n" + "\timport [-d dir | -c cachefile] [-n] -F \n" "\timport [-o mntopts] [-o property=value] ... \n" "\t [-d dir | -c cachefile] [-D] [-f] [-R root] -a\n" "\timport [-o mntopts] [-o property=value] ... \n" "\t [-d dir | -c cachefile] [-D] [-f] [-R root] " " [newpool]\n")); case HELP_IOSTAT: - return (gettext("\tiostat [-v] [pool] ... [interval " + return (gettext("\tiostat [-v] [-T d|u] [pool] ... 
[interval " "[count]]\n")); case HELP_LIST: return (gettext("\tlist [-H] [-o property[,...]] " @@ -230,6 +238,10 @@ get_usage(zpool_help_t idx) { " ...\n")); case HELP_SET: return (gettext("\tset \n")); + case HELP_SPLIT: + return (gettext("\tsplit [-n] [-R altroot] [-o mntopts]\n" + "\t [-o property=value] " + "[ ...]\n")); } abort(); @@ -245,12 +257,12 @@ print_prop_cb(int prop, void *cb) { FILE *fp = cb; - (void) fprintf(fp, "\t%-13s ", zpool_prop_to_name(prop)); + (void) fprintf(fp, "\t%-15s ", zpool_prop_to_name(prop)); if (zpool_prop_readonly(prop)) (void) fprintf(fp, " NO "); else - (void) fprintf(fp, " YES "); + (void) fprintf(fp, " YES "); if (zpool_prop_values(prop) == NULL) (void) fprintf(fp, "-\n"); @@ -297,7 +309,7 @@ usage(boolean_t requested) (void) fprintf(fp, gettext("\nthe following properties are supported:\n")); - (void) fprintf(fp, "\n\t%-13s %s %s\n\n", + (void) fprintf(fp, "\n\t%-15s %s %s\n\n", "PROPERTY", "EDIT", "VALUES"); /* Iterate over all properties */ @@ -339,7 +351,7 @@ print_vdev_tree(zpool_handle_t *zhp, const char *name, nvlist_t *nv, int indent, if ((is_log && !print_logs) || (!is_log && print_logs)) continue; - vname = zpool_vdev_name(g_zfs, zhp, child[c]); + vname = zpool_vdev_name(g_zfs, zhp, child[c], B_FALSE); print_vdev_tree(zhp, vname, child[c], indent + 2, B_FALSE); free(vname); @@ -376,12 +388,11 @@ add_prop_list(const char *propname, char *propval, nvlist_t **props, } normnm = zpool_prop_to_name(prop); } else { - if ((fprop = zfs_name_to_prop(propname)) == ZPROP_INVAL) { - (void) fprintf(stderr, gettext("property '%s' is " - "not a valid file system property\n"), propname); - return (2); + if ((fprop = zfs_name_to_prop(propname)) != ZPROP_INVAL) { + normnm = zfs_prop_to_name(fprop); + } else { + normnm = propname; } - normnm = zfs_prop_to_name(fprop); } if (nvlist_lookup_string(proplist, normnm, &strval) == 0 && @@ -877,17 +888,21 @@ int zpool_do_export(int argc, char **argv) { boolean_t force = B_FALSE; + boolean_t hardforce = B_FALSE; int c; zpool_handle_t *zhp; int ret; int i; /* check options */ - while ((c = getopt(argc, argv, "f")) != -1) { + while ((c = getopt(argc, argv, "fF")) != -1) { switch (c) { case 'f': force = B_TRUE; break; + case 'F': + hardforce = B_TRUE; + break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); @@ -917,8 +932,12 @@ zpool_do_export(int argc, char **argv) continue; } - if (zpool_export(zhp, force) != 0) + if (hardforce) { + if (zpool_export_force(zhp) != 0) + ret = 1; + } else if (zpool_export(zhp, force) != 0) { ret = 1; + } zpool_close(zhp); } @@ -933,7 +952,7 @@ zpool_do_export(int argc, char **argv) static int max_width(zpool_handle_t *zhp, nvlist_t *nv, int depth, int max) { - char *name = zpool_vdev_name(g_zfs, zhp, nv); + char *name = zpool_vdev_name(g_zfs, zhp, nv, B_TRUE); nvlist_t **child; uint_t c, children; int ret; @@ -971,14 +990,199 @@ max_width(zpool_handle_t *zhp, nvlist_t *nv, int depth, int max) return (max); } +typedef struct spare_cbdata { + uint64_t cb_guid; + zpool_handle_t *cb_zhp; +} spare_cbdata_t; + +static boolean_t +find_vdev(nvlist_t *nv, uint64_t search) +{ + uint64_t guid; + nvlist_t **child; + uint_t c, children; + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0 && + search == guid) + return (B_TRUE); + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) == 0) { + for (c = 0; c < children; c++) + if (find_vdev(child[c], search)) + return (B_TRUE); + } + + return (B_FALSE); +} + +static int 
+find_spare(zpool_handle_t *zhp, void *data) +{ + spare_cbdata_t *cbp = data; + nvlist_t *config, *nvroot; + + config = zpool_get_config(zhp, NULL); + verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0); + + if (find_vdev(nvroot, cbp->cb_guid)) { + cbp->cb_zhp = zhp; + return (1); + } + + zpool_close(zhp); + return (0); +} + +/* + * Print out configuration state as requested by status_callback. + */ +void +print_status_config(zpool_handle_t *zhp, const char *name, nvlist_t *nv, + int namewidth, int depth, boolean_t isspare) +{ + nvlist_t **child; + uint_t c, children; + vdev_stat_t *vs; + char rbuf[6], wbuf[6], cbuf[6], repaired[7]; + char *vname; + uint64_t notpresent; + spare_cbdata_t cb; + char *state; + + verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_STATS, + (uint64_t **)&vs, &c) == 0); + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) + children = 0; + + state = zpool_state_to_name(vs->vs_state, vs->vs_aux); + if (isspare) { + /* + * For hot spares, we use the terms 'INUSE' and 'AVAILABLE' for + * online drives. + */ + if (vs->vs_aux == VDEV_AUX_SPARED) + state = "INUSE"; + else if (vs->vs_state == VDEV_STATE_HEALTHY) + state = "AVAIL"; + } + + (void) printf("\t%*s%-*s %-8s", depth, "", namewidth - depth, + name, state); + + if (!isspare) { + zfs_nicenum(vs->vs_read_errors, rbuf, sizeof (rbuf)); + zfs_nicenum(vs->vs_write_errors, wbuf, sizeof (wbuf)); + zfs_nicenum(vs->vs_checksum_errors, cbuf, sizeof (cbuf)); + (void) printf(" %5s %5s %5s", rbuf, wbuf, cbuf); + } + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, + &notpresent) == 0) { + char *path; + verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0); + (void) printf(" was %s", path); + } else if (vs->vs_aux != 0) { + (void) printf(" "); + + switch (vs->vs_aux) { + case VDEV_AUX_OPEN_FAILED: + (void) printf(gettext("cannot open")); + break; + + case VDEV_AUX_BAD_GUID_SUM: + (void) printf(gettext("missing device")); + break; + + case VDEV_AUX_NO_REPLICAS: + (void) printf(gettext("insufficient replicas")); + break; + + case VDEV_AUX_VERSION_NEWER: + (void) printf(gettext("newer version")); + break; + + case VDEV_AUX_SPARED: + verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, + &cb.cb_guid) == 0); + if (zpool_iter(g_zfs, find_spare, &cb) == 1) { + if (strcmp(zpool_get_name(cb.cb_zhp), + zpool_get_name(zhp)) == 0) + (void) printf(gettext("currently in " + "use")); + else + (void) printf(gettext("in use by " + "pool '%s'"), + zpool_get_name(cb.cb_zhp)); + zpool_close(cb.cb_zhp); + } else { + (void) printf(gettext("currently in use")); + } + break; + + case VDEV_AUX_ERR_EXCEEDED: + (void) printf(gettext("too many errors")); + break; + + case VDEV_AUX_IO_FAILURE: + (void) printf(gettext("experienced I/O failures")); + break; + + case VDEV_AUX_BAD_LOG: + (void) printf(gettext("bad intent log")); + break; + + case VDEV_AUX_EXTERNAL: + (void) printf(gettext("external device fault")); + break; + + case VDEV_AUX_SPLIT_POOL: + (void) printf(gettext("split into new pool")); + break; + + default: + (void) printf(gettext("corrupted data")); + break; + } + } else if (vs->vs_scrub_repaired != 0 && children == 0) { + /* + * Report bytes resilvered/repaired on leaf devices. + */ + zfs_nicenum(vs->vs_scrub_repaired, repaired, sizeof (repaired)); + (void) printf(gettext(" %s %s"), repaired, + (vs->vs_scrub_type == POOL_SCRUB_RESILVER) ?
+ "resilvered" : "repaired"); + } + + (void) printf("\n"); + + for (c = 0; c < children; c++) { + uint64_t islog = B_FALSE, ishole = B_FALSE; + + /* Don't print logs or holes here */ + (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, + &islog); + (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, + &ishole); + if (islog || ishole) + continue; + vname = zpool_vdev_name(g_zfs, zhp, child[c], B_TRUE); + print_status_config(zhp, vname, child[c], + namewidth, depth + 2, isspare); + free(vname); + } +} + /* * Print the configuration of an exported pool. Iterate over all vdevs in the * pool, printing out the name and status for each one. */ void -print_import_config(const char *name, nvlist_t *nv, int namewidth, int depth, - boolean_t print_logs) +print_import_config(const char *name, nvlist_t *nv, int namewidth, int depth) { nvlist_t **child; uint_t c, children; @@ -986,7 +1190,8 @@ print_import_config(const char *name, nvlist_t *nv, int namewidth, int depth, char *type, *vname; verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); - if (strcmp(type, VDEV_TYPE_MISSING) == 0) + if (strcmp(type, VDEV_TYPE_MISSING) == 0 || + strcmp(type, VDEV_TYPE_HOLE) == 0) return; verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_STATS, @@ -1035,12 +1240,11 @@ print_import_config(const char *name, nvlist_t *nv, int namewidth, int depth, (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &is_log); - if ((is_log && !print_logs) || (!is_log && print_logs)) + if (is_log) continue; - vname = zpool_vdev_name(g_zfs, NULL, child[c]); - print_import_config(vname, child[c], - namewidth, depth + 2, B_FALSE); + vname = zpool_vdev_name(g_zfs, NULL, child[c], B_TRUE); + print_import_config(vname, child[c], namewidth, depth + 2); free(vname); } @@ -1048,7 +1252,7 @@ print_import_config(const char *name, nvlist_t *nv, int namewidth, int depth, &child, &children) == 0) { (void) printf(gettext("\tcache\n")); for (c = 0; c < children; c++) { - vname = zpool_vdev_name(g_zfs, NULL, child[c]); + vname = zpool_vdev_name(g_zfs, NULL, child[c], B_FALSE); (void) printf("\t %s\n", vname); free(vname); } @@ -1058,13 +1262,51 @@ print_import_config(const char *name, nvlist_t *nv, int namewidth, int depth, &child, &children) == 0) { (void) printf(gettext("\tspares\n")); for (c = 0; c < children; c++) { - vname = zpool_vdev_name(g_zfs, NULL, child[c]); + vname = zpool_vdev_name(g_zfs, NULL, child[c], B_FALSE); (void) printf("\t %s\n", vname); free(vname); } } } +/* + * Print log vdevs. + * Logs are recorded as top level vdevs in the main pool child array + * but with "is_log" set to 1. 
We use either print_status_config() or + * print_import_config() to print the top level logs then any log + * children (eg mirrored slogs) are printed recursively - which + * works because only the top level vdev is marked "is_log" + */ +static void +print_logs(zpool_handle_t *zhp, nvlist_t *nv, int namewidth, boolean_t verbose) +{ + uint_t c, children; + nvlist_t **child; + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, + &children) != 0) + return; + + (void) printf(gettext("\tlogs\n")); + + for (c = 0; c < children; c++) { + uint64_t is_log = B_FALSE; + char *name; + + (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, + &is_log); + if (!is_log) + continue; + name = zpool_vdev_name(g_zfs, zhp, child[c], B_TRUE); + if (verbose) + print_status_config(zhp, name, child[c], namewidth, + 2, B_FALSE); + else + print_import_config(name, child[c], namewidth, 2); + free(name); + } +} + /* * Display the status for the given pool. */ @@ -1233,11 +1475,9 @@ show_import(nvlist_t *config) if (namewidth < 10) namewidth = 10; - print_import_config(name, nvroot, namewidth, 0, B_FALSE); - if (num_logs(nvroot) > 0) { - (void) printf(gettext("\tlogs\n")); - print_import_config(name, nvroot, namewidth, 0, B_TRUE); - } + print_import_config(name, nvroot, namewidth, 0); + if (num_logs(nvroot) > 0) + print_logs(NULL, nvroot, namewidth, B_FALSE); if (reason == ZPOOL_STATUS_BAD_GUID_SUM) { (void) printf(gettext("\n\tAdditional devices are known to " @@ -1253,13 +1493,12 @@ show_import(nvlist_t *config) */ static int do_import(nvlist_t *config, const char *newname, const char *mntopts, - int force, nvlist_t *props, boolean_t allowfaulted) + int force, nvlist_t *props, boolean_t do_verbatim) { zpool_handle_t *zhp; char *name; uint64_t state; uint64_t version; - int error = 0; verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, &name) == 0); @@ -1306,22 +1545,23 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts, } } - if (zpool_import_props(g_zfs, config, newname, props, - allowfaulted) != 0) + if (zpool_import_props(g_zfs, config, newname, props, do_verbatim) != 0) return (1); if (newname != NULL) name = (char *)newname; - verify((zhp = zpool_open_canfail(g_zfs, name)) != NULL); + if ((zhp = zpool_open_canfail(g_zfs, name)) == NULL) + return (1); - if (zpool_enable_datasets(zhp, mntopts, 0) != 0) { + if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL && + zpool_enable_datasets(zhp, mntopts, 0) != 0) { zpool_close(zhp); return (1); } zpool_close(zhp); - return (error); + return (0); } /* @@ -1329,7 +1569,7 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts, * import [-o mntopts] [-o prop=value] ... [-R root] [-D] * [-d dir | -c cachefile] [-f] -a * import [-o mntopts] [-o prop=value] ... [-R root] [-D] - * [-d dir | -c cachefile] [-f] [newpool] + * [-d dir | -c cachefile] [-f] [-n] [-F] [newpool] * * -c Read pool information from a cachefile instead of searching * devices. @@ -1344,12 +1584,17 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts, * the given root. The pool will remain exported when the machine * is rebooted. * - * -f Force import, even if it appears that the pool is active. - * - * -F Import even in the presence of faulted vdevs. This is an + * -V Import even in the presence of faulted vdevs. This is an * intentionally undocumented option for testing purposes, and * treats the pool configuration as complete, leaving any bad - * vdevs in the FAULTED state. + * vdevs in the FAULTED state. 
In other words, it does verbatim + * import. + * + * -f Force import, even if it appears that the pool is active. + * + * -F Attempt rewind if necessary. + * + * -n See if rewind would work, but don't actually rewind. * * -a Import all pools found. * @@ -1364,7 +1609,7 @@ zpool_do_import(int argc, char **argv) char **searchdirs = NULL; int nsearch = 0; int c; - int err; + int err = 0; nvlist_t *pools = NULL; boolean_t do_all = B_FALSE; boolean_t do_destroyed = B_FALSE; @@ -1376,14 +1621,20 @@ zpool_do_import(int argc, char **argv) char *searchname = NULL; char *propval; nvlist_t *found_config; + nvlist_t *policy = NULL; nvlist_t *props = NULL; boolean_t first; - boolean_t allow_faulted = B_FALSE; + boolean_t do_verbatim = B_FALSE; + uint32_t rewind_policy = ZPOOL_NO_REWIND; + boolean_t dryrun = B_FALSE; + boolean_t do_rewind = B_FALSE; + boolean_t xtreme_rewind = B_FALSE; uint64_t pool_state; char *cachefile = NULL; + importargs_t idata = { 0 }; /* check options */ - while ((c = getopt(argc, argv, ":ac:d:DfFo:p:R:")) != -1) { + while ((c = getopt(argc, argv, ":aCc:d:DEfFno:rR:VX")) != -1) { switch (c) { case 'a': do_all = B_TRUE; @@ -1411,7 +1662,10 @@ zpool_do_import(int argc, char **argv) do_force = B_TRUE; break; case 'F': - allow_faulted = B_TRUE; + do_rewind = B_TRUE; + break; + case 'n': + dryrun = B_TRUE; break; case 'o': if ((propval = strchr(optarg, '=')) != NULL) { @@ -1436,6 +1690,12 @@ zpool_do_import(int argc, char **argv) ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE)) goto error; break; + case 'V': + do_verbatim = B_TRUE; + break; + case 'X': + xtreme_rewind = B_TRUE; + break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); @@ -1456,6 +1716,23 @@ zpool_do_import(int argc, char **argv) usage(B_FALSE); } + if ((dryrun || xtreme_rewind) && !do_rewind) { + (void) fprintf(stderr, + gettext("-n or -X only meaningful with -F\n")); + usage(B_FALSE); + } + if (dryrun) + rewind_policy = ZPOOL_TRY_REWIND; + else if (do_rewind) + rewind_policy = ZPOOL_DO_REWIND; + if (xtreme_rewind) + rewind_policy |= ZPOOL_EXTREME_REWIND; + + /* In the future, we can capture further policy and include it here */ + if (nvlist_alloc(&policy, NV_UNIQUE_NAME, 0) != 0 || + nvlist_add_uint32(policy, ZPOOL_REWIND_REQUEST, rewind_policy) != 0) + goto error; + if (searchdirs == NULL) { searchdirs = safe_malloc(sizeof (char *)); searchdirs[0] = "/dev/dsk"; @@ -1483,6 +1760,7 @@ zpool_do_import(int argc, char **argv) (void) fprintf(stderr, gettext("cannot " "discover pools: permission denied\n")); free(searchdirs); + nvlist_free(policy); return (1); } } @@ -1508,28 +1786,49 @@ zpool_do_import(int argc, char **argv) if (errno != 0 || *endptr != '\0') searchname = argv[0]; found_config = NULL; - } - if (cachefile) { - pools = zpool_find_import_cached(g_zfs, cachefile, searchname, - searchguid); - } else if (searchname != NULL) { - pools = zpool_find_import_byname(g_zfs, nsearch, searchdirs, - searchname); - } else { /* - * It's OK to search by guid even if searchguid is 0. + * User specified a name or guid. Ensure it's unique. 
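Editorial note: as the surrounding import code shows, a bare argument is treated as a pool GUID only when it parses completely as an unsigned integer; anything else is taken as a pool name. A small self-contained sketch of that test follows; looks_like_guid() is a hypothetical helper and the sample values are made up for illustration.

#include <stdio.h>
#include <stdlib.h>
#include <errno.h>

static int
looks_like_guid(const char *arg, unsigned long long *guid)
{
	char *endptr;

	errno = 0;
	*guid = strtoull(arg, &endptr, 10);
	/* Reject empty strings, partial parses ("123abc"), and overflow. */
	return (errno == 0 && endptr != arg && *endptr == '\0');
}

int
main(void)
{
	const char *args[] = { "tank", "9675344459001849909" };
	unsigned long long guid;

	for (int i = 0; i < 2; i++) {
		if (looks_like_guid(args[i], &guid))
			(void) printf("%s -> search by guid %llu\n",
			    args[i], guid);
		else
			(void) printf("%s -> search by name\n", args[i]);
	}
	return (0);
}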
*/ - pools = zpool_find_import_byguid(g_zfs, nsearch, searchdirs, - searchguid); - } - - if (pools == NULL) { + idata.unique = B_TRUE; + } + + + idata.path = searchdirs; + idata.paths = nsearch; + idata.poolname = searchname; + idata.guid = searchguid; + idata.cachefile = cachefile; + + pools = zpool_search_import(g_zfs, &idata); + + if (pools != NULL && idata.exists && + (argc == 1 || strcmp(argv[0], argv[1]) == 0)) { + (void) fprintf(stderr, gettext("cannot import '%s': " + "a pool with that name already exists\n"), + argv[0]); + (void) fprintf(stderr, gettext("use the form '%s " + " ' to give it a new name\n"), + "zpool import"); + err = 1; + } else if (pools == NULL && idata.exists) { + (void) fprintf(stderr, gettext("cannot import '%s': " + "a pool with that name is already created/imported,\n"), + argv[0]); + (void) fprintf(stderr, gettext("and no additional pools " + "with that name were found\n")); + err = 1; + } else if (pools == NULL) { if (argc != 0) { (void) fprintf(stderr, gettext("cannot import '%s': " "no such pool available\n"), argv[0]); } + err = 1; + } + + if (err == 1) { free(searchdirs); + nvlist_free(policy); return (1); } @@ -1553,17 +1852,21 @@ zpool_do_import(int argc, char **argv) if (do_destroyed && pool_state != POOL_STATE_DESTROYED) continue; + verify(nvlist_add_nvlist(config, ZPOOL_REWIND_POLICY, + policy) == 0); + if (argc == 0) { if (first) first = B_FALSE; else if (!do_all) (void) printf("\n"); - if (do_all) + if (do_all) { err |= do_import(config, NULL, mntopts, - do_force, props, allow_faulted); - else + do_force, props, do_verbatim); + } else { show_import(config); + } } else if (searchname != NULL) { char *name; @@ -1609,7 +1912,7 @@ zpool_do_import(int argc, char **argv) err = B_TRUE; } else { err |= do_import(found_config, argc == 1 ? NULL : - argv[1], mntopts, do_force, props, allow_faulted); + argv[1], mntopts, do_force, props, do_verbatim); } } @@ -1624,6 +1927,7 @@ zpool_do_import(int argc, char **argv) error: nvlist_free(props); nvlist_free(pools); + nvlist_free(policy); free(searchdirs); return (err ? 1 : 0); @@ -1651,7 +1955,7 @@ print_iostat_header(iostat_cbdata_t *cb) { (void) printf("%*s capacity operations bandwidth\n", cb->cb_namewidth, ""); - (void) printf("%-*s used avail read write read write\n", + (void) printf("%-*s alloc free read write read write\n", cb->cb_namewidth, "pool"); print_iostat_separator(cb); } @@ -1742,7 +2046,7 @@ print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv, return; for (c = 0; c < children; c++) { - vname = zpool_vdev_name(g_zfs, zhp, newchild[c]); + vname = zpool_vdev_name(g_zfs, zhp, newchild[c], B_FALSE); print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL, newchild[c], cb, depth + 2); free(vname); @@ -1763,7 +2067,8 @@ print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv, (void) printf("%-*s - - - - - " "-\n", cb->cb_namewidth, "cache"); for (c = 0; c < children; c++) { - vname = zpool_vdev_name(g_zfs, zhp, newchild[c]); + vname = zpool_vdev_name(g_zfs, zhp, newchild[c], + B_FALSE); print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL, newchild[c], cb, depth + 2); free(vname); @@ -1852,8 +2157,9 @@ get_namewidth(zpool_handle_t *zhp, void *data) } /* - * zpool iostat [-v] [pool] ... [interval [count]] + * zpool iostat [-T d|u] [-v] [pool] ... 
[interval [count]] * + * -T Display a timestamp in date(1) or Unix format * -v Display statistics for individual vdevs * * This command can be tricky because we want to be able to deal with pool @@ -1874,8 +2180,20 @@ zpool_do_iostat(int argc, char **argv) iostat_cbdata_t cb; /* check options */ - while ((c = getopt(argc, argv, "v")) != -1) { + while ((c = getopt(argc, argv, "T:v")) != -1) { switch (c) { + case 'T': + if (optarg) { + if (*optarg == 'u') + timestamp_fmt = UDATE; + else if (*optarg == 'd') + timestamp_fmt = DDATE; + else + usage(B_FALSE); + } else { + usage(B_FALSE); + } + break; case 'v': verbose = B_TRUE; break; @@ -1992,6 +2310,9 @@ zpool_do_iostat(int argc, char **argv) cb.cb_namewidth = 0; (void) pool_list_iter(list, B_FALSE, get_namewidth, &cb); + if (timestamp_fmt != NODATE) + print_timestamp(timestamp_fmt); + /* * If it's the first time, or verbose mode, print the header. */ @@ -2148,7 +2469,7 @@ list_callback(zpool_handle_t *zhp, void *data) * -H Scripted mode. Don't display headers, and separate properties * by a single tab. * -o List of properties to display. Defaults to - * "name,size,used,available,capacity,health,altroot" + * "name,size,allocated,free,capacity,health,altroot" * * List all pools in the system, whether or not they're healthy. Output space * statistics for each one, as well as health status summary. @@ -2160,7 +2481,7 @@ zpool_do_list(int argc, char **argv) int ret; list_cbdata_t cb = { 0 }; static char default_props[] = - "name,size,used,available,capacity,health,altroot"; + "name,size,allocated,free,capacity,dedupratio,health,altroot"; char *props = default_props; /* check options */ @@ -2408,20 +2729,164 @@ zpool_do_detach(int argc, char **argv) } /* - * zpool online ... + * zpool split [-n] [-o prop=val] ... + * [-o mntopt] ... + * [-R altroot] [ ...] + * + * -n Do not split the pool, but display the resulting layout if + * it were to be split. + * -o Set property=value, or set mount options. + * -R Mount the split-off pool under an alternate root. + * + * Splits the named pool and gives it the new pool name. Devices to be split + * off may be listed, provided that no more than one device is specified + * per top-level vdev mirror. The newly split pool is left in an exported + * state unless -R is specified. + * + * Restrictions: the top-level of the pool must only be made up of + * mirrors; all devices in the pool must be healthy; no device may be + * undergoing a resilvering operation.
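Editorial note: the -o handling in zpool_do_split() below (and in zpool import) distinguishes "property=value" pairs from mount-option strings by whether the argument contains '='. A minimal illustration of that classification; classify_opt() is a hypothetical helper, not part of the patch.

#include <stdio.h>
#include <string.h>

static void
classify_opt(char *arg)
{
	char *propval = strchr(arg, '=');

	if (propval != NULL) {
		*propval++ = '\0';	/* terminate the name, step to value */
		(void) printf("property '%s' = '%s'\n", arg, propval);
	} else {
		(void) printf("mount options '%s'\n", arg);
	}
}

int
main(void)
{
	char a[] = "altroot=/mnt";
	char b[] = "ro,noatime";

	classify_opt(a);	/* treated as a property assignment */
	classify_opt(b);	/* treated as mount options */
	return (0);
}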
*/ int -zpool_do_online(int argc, char **argv) +zpool_do_split(int argc, char **argv) { - int c, i; - char *poolname; + char *srcpool, *newpool, *propval; + char *mntopts = NULL; + splitflags_t flags; + int c, ret = 0; zpool_handle_t *zhp; - int ret = 0; - vdev_state_t newstate; + nvlist_t *config, *props = NULL; + + flags.dryrun = B_FALSE; + flags.import = B_FALSE; /* check options */ - while ((c = getopt(argc, argv, "t")) != -1) { + while ((c = getopt(argc, argv, ":R:no:")) != -1) { switch (c) { + case 'R': + flags.import = B_TRUE; + if (add_prop_list( + zpool_prop_to_name(ZPOOL_PROP_ALTROOT), optarg, + &props, B_TRUE) != 0) { + if (props) + nvlist_free(props); + usage(B_FALSE); + } + break; + case 'n': + flags.dryrun = B_TRUE; + break; + case 'o': + if ((propval = strchr(optarg, '=')) != NULL) { + *propval = '\0'; + propval++; + if (add_prop_list(optarg, propval, + &props, B_TRUE) != 0) { + if (props) + nvlist_free(props); + usage(B_FALSE); + } + } else { + mntopts = optarg; + } + break; + case ':': + (void) fprintf(stderr, gettext("missing argument for " + "'%c' option\n"), optopt); + usage(B_FALSE); + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + break; + } + } + + if (!flags.import && mntopts != NULL) { + (void) fprintf(stderr, gettext("setting mntopts is only " + "valid when importing the pool\n")); + usage(B_FALSE); + } + + argc -= optind; + argv += optind; + + if (argc < 1) { + (void) fprintf(stderr, gettext("Missing pool name\n")); + usage(B_FALSE); + } + if (argc < 2) { + (void) fprintf(stderr, gettext("Missing new pool name\n")); + usage(B_FALSE); + } + + srcpool = argv[0]; + newpool = argv[1]; + + argc -= 2; + argv += 2; + + if ((zhp = zpool_open(g_zfs, srcpool)) == NULL) + return (1); + + config = split_mirror_vdev(zhp, newpool, props, flags, argc, argv); + if (config == NULL) { + ret = 1; + } else { + if (flags.dryrun) { + (void) printf(gettext("would create '%s' with the " + "following layout:\n\n"), newpool); + print_vdev_tree(NULL, newpool, config, 0, B_FALSE); + } + nvlist_free(config); + } + + zpool_close(zhp); + + if (ret != 0 || flags.dryrun || !flags.import) + return (ret); + + /* + * The split was successful. Now we need to open the new + * pool and import it. + */ + if ((zhp = zpool_open_canfail(g_zfs, newpool)) == NULL) + return (1); + if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL && + zpool_enable_datasets(zhp, mntopts, 0) != 0) { + ret = 1; + (void) fprintf(stderr, gettext("Split was successful, but " + "the datasets could not all be mounted\n")); + (void) fprintf(stderr, gettext("Try doing '%s' with a " + "different altroot\n"), "zpool import"); + } + zpool_close(zhp); + + return (ret); +} + + + +/* + * zpool online ...
+ */ +int +zpool_do_online(int argc, char **argv) +{ + int c, i; + char *poolname; + zpool_handle_t *zhp; + int ret = 0; + vdev_state_t newstate; + int flags = 0; + + /* check options */ + while ((c = getopt(argc, argv, "et")) != -1) { + switch (c) { + case 'e': + flags |= ZFS_ONLINE_EXPAND; + break; case 't': case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), @@ -2449,7 +2914,7 @@ zpool_do_online(int argc, char **argv) return (1); for (i = 1; i < argc; i++) { - if (zpool_vdev_online(zhp, argv[i], 0, &newstate) == 0) { + if (zpool_vdev_online(zhp, argv[i], flags, &newstate) == 0) { if (newstate != VDEV_STATE_HEALTHY) { (void) printf(gettext("warning: device '%s' " "onlined, but remains in faulted state\n"), @@ -2543,31 +3008,80 @@ zpool_do_offline(int argc, char **argv) int zpool_do_clear(int argc, char **argv) { + int c; int ret = 0; + boolean_t dryrun = B_FALSE; + boolean_t do_rewind = B_FALSE; + boolean_t xtreme_rewind = B_FALSE; + uint32_t rewind_policy = ZPOOL_NO_REWIND; + nvlist_t *policy = NULL; zpool_handle_t *zhp; char *pool, *device; - if (argc < 2) { + /* check options */ + while ((c = getopt(argc, argv, "FnX")) != -1) { + switch (c) { + case 'F': + do_rewind = B_TRUE; + break; + case 'n': + dryrun = B_TRUE; + break; + case 'X': + xtreme_rewind = B_TRUE; + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name\n")); usage(B_FALSE); } - if (argc > 3) { + if (argc > 2) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } - pool = argv[1]; - device = argc == 3 ? argv[2] : NULL; + if ((dryrun || xtreme_rewind) && !do_rewind) { + (void) fprintf(stderr, + gettext("-n or -X only meaningful with -F\n")); + usage(B_FALSE); + } + if (dryrun) + rewind_policy = ZPOOL_TRY_REWIND; + else if (do_rewind) + rewind_policy = ZPOOL_DO_REWIND; + if (xtreme_rewind) + rewind_policy |= ZPOOL_EXTREME_REWIND; + + /* In future, further rewind policy choices can be passed along here */ + if (nvlist_alloc(&policy, NV_UNIQUE_NAME, 0) != 0 || + nvlist_add_uint32(policy, ZPOOL_REWIND_REQUEST, rewind_policy) != 0) + return (1); + + pool = argv[0]; + device = argc == 2 ? 
argv[1] : NULL; - if ((zhp = zpool_open_canfail(g_zfs, pool)) == NULL) + if ((zhp = zpool_open_canfail(g_zfs, pool)) == NULL) { + nvlist_free(policy); return (1); + } - if (zpool_clear(zhp, device) != 0) + if (zpool_clear(zhp, device, policy) != 0) ret = 1; zpool_close(zhp); + nvlist_free(policy); + return (ret); } @@ -2642,6 +3156,7 @@ typedef struct status_cbdata { boolean_t cb_verbose; boolean_t cb_explain; boolean_t cb_first; + boolean_t cb_dedup_stats; } status_cbdata_t; /* @@ -2706,181 +3221,6 @@ print_scrub_status(nvlist_t *nvroot) (u_longlong_t)(minutes_left / 60), (uint_t)(minutes_left % 60)); } -typedef struct spare_cbdata { - uint64_t cb_guid; - zpool_handle_t *cb_zhp; -} spare_cbdata_t; - -static boolean_t -find_vdev(nvlist_t *nv, uint64_t search) -{ - uint64_t guid; - nvlist_t **child; - uint_t c, children; - - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0 && - search == guid) - return (B_TRUE); - - if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, - &child, &children) == 0) { - for (c = 0; c < children; c++) - if (find_vdev(child[c], search)) - return (B_TRUE); - } - - return (B_FALSE); -} - -static int -find_spare(zpool_handle_t *zhp, void *data) -{ - spare_cbdata_t *cbp = data; - nvlist_t *config, *nvroot; - - config = zpool_get_config(zhp, NULL); - verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, - &nvroot) == 0); - - if (find_vdev(nvroot, cbp->cb_guid)) { - cbp->cb_zhp = zhp; - return (1); - } - - zpool_close(zhp); - return (0); -} - -/* - * Print out configuration state as requested by status_callback. - */ -void -print_status_config(zpool_handle_t *zhp, const char *name, nvlist_t *nv, - int namewidth, int depth, boolean_t isspare, boolean_t print_logs) -{ - nvlist_t **child; - uint_t c, children; - vdev_stat_t *vs; - char rbuf[6], wbuf[6], cbuf[6], repaired[7]; - char *vname; - uint64_t notpresent; - spare_cbdata_t cb; - char *state; - - verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_STATS, - (uint64_t **)&vs, &c) == 0); - - if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, - &child, &children) != 0) - children = 0; - - state = zpool_state_to_name(vs->vs_state, vs->vs_aux); - if (isspare) { - /* - * For hot spares, we use the terms 'INUSE' and 'AVAILABLE' for - * online drives. 
- */ - if (vs->vs_aux == VDEV_AUX_SPARED) - state = "INUSE"; - else if (vs->vs_state == VDEV_STATE_HEALTHY) - state = "AVAIL"; - } - - (void) printf("\t%*s%-*s %-8s", depth, "", namewidth - depth, - name, state); - - if (!isspare) { - zfs_nicenum(vs->vs_read_errors, rbuf, sizeof (rbuf)); - zfs_nicenum(vs->vs_write_errors, wbuf, sizeof (wbuf)); - zfs_nicenum(vs->vs_checksum_errors, cbuf, sizeof (cbuf)); - (void) printf(" %5s %5s %5s", rbuf, wbuf, cbuf); - } - - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, - &notpresent) == 0) { - char *path; - verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0); - (void) printf(" was %s", path); - } else if (vs->vs_aux != 0) { - (void) printf(" "); - - switch (vs->vs_aux) { - case VDEV_AUX_OPEN_FAILED: - (void) printf(gettext("cannot open")); - break; - - case VDEV_AUX_BAD_GUID_SUM: - (void) printf(gettext("missing device")); - break; - - case VDEV_AUX_NO_REPLICAS: - (void) printf(gettext("insufficient replicas")); - break; - - case VDEV_AUX_VERSION_NEWER: - (void) printf(gettext("newer version")); - break; - - case VDEV_AUX_SPARED: - verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, - &cb.cb_guid) == 0); - if (zpool_iter(g_zfs, find_spare, &cb) == 1) { - if (strcmp(zpool_get_name(cb.cb_zhp), - zpool_get_name(zhp)) == 0) - (void) printf(gettext("currently in " - "use")); - else - (void) printf(gettext("in use by " - "pool '%s'"), - zpool_get_name(cb.cb_zhp)); - zpool_close(cb.cb_zhp); - } else { - (void) printf(gettext("currently in use")); - } - break; - - case VDEV_AUX_ERR_EXCEEDED: - (void) printf(gettext("too many errors")); - break; - - case VDEV_AUX_IO_FAILURE: - (void) printf(gettext("experienced I/O failures")); - break; - - case VDEV_AUX_BAD_LOG: - (void) printf(gettext("bad intent log")); - break; - - default: - (void) printf(gettext("corrupted data")); - break; - } - } else if (vs->vs_scrub_repaired != 0 && children == 0) { - /* - * Report bytes resilvered/repaired on leaf devices. - */ - zfs_nicenum(vs->vs_scrub_repaired, repaired, sizeof (repaired)); - (void) printf(gettext(" %s %s"), repaired, - (vs->vs_scrub_type == POOL_SCRUB_RESILVER) ?
- "resilvered" : "repaired"); - } - - (void) printf("\n"); - - for (c = 0; c < children; c++) { - uint64_t is_log = B_FALSE; - - (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, - &is_log); - if ((is_log && !print_logs) || (!is_log && print_logs)) - continue; - vname = zpool_vdev_name(g_zfs, zhp, child[c]); - print_status_config(zhp, vname, child[c], - namewidth, depth + 2, isspare, B_FALSE); - free(vname); - } -} - static void print_error_log(zpool_handle_t *zhp) { @@ -2929,9 +3269,9 @@ print_spares(zpool_handle_t *zhp, nvlist_t **spares, uint_t nspares, (void) printf(gettext("\tspares\n")); for (i = 0; i < nspares; i++) { - name = zpool_vdev_name(g_zfs, zhp, spares[i]); + name = zpool_vdev_name(g_zfs, zhp, spares[i], B_FALSE); print_status_config(zhp, name, spares[i], - namewidth, 2, B_TRUE, B_FALSE); + namewidth, 2, B_TRUE); free(name); } } @@ -2949,13 +3289,43 @@ print_l2cache(zpool_handle_t *zhp, nvlist_t **l2cache, uint_t nl2cache, (void) printf(gettext("\tcache\n")); for (i = 0; i < nl2cache; i++) { - name = zpool_vdev_name(g_zfs, zhp, l2cache[i]); + name = zpool_vdev_name(g_zfs, zhp, l2cache[i], B_FALSE); print_status_config(zhp, name, l2cache[i], - namewidth, 2, B_FALSE, B_FALSE); + namewidth, 2, B_FALSE); free(name); } } +static void +print_dedup_stats(nvlist_t *config) +{ + ddt_histogram_t *ddh; + ddt_stat_t *dds; + ddt_object_t *ddo; + uint_t c; + + /* + * If the pool was faulted then we may not have been able to + * obtain the config. Otherwise, if have anything in the dedup + * table continue processing the stats. + */ + if (nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_OBJ_STATS, + (uint64_t **)&ddo, &c) != 0 || ddo->ddo_count == 0) + return; + + (void) printf("\n"); + (void) printf("DDT entries %llu, size %llu on disk, %llu in core\n", + (u_longlong_t)ddo->ddo_count, + (u_longlong_t)ddo->ddo_dspace, + (u_longlong_t)ddo->ddo_mspace); + + verify(nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_STATS, + (uint64_t **)&dds, &c) == 0); + verify(nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_HISTOGRAM, + (uint64_t **)&ddh, &c) == 0); + zpool_dump_ddt(dds, ddh); +} + /* * Display a summary of pool status. Displays a summary such as: * @@ -3046,8 +3416,8 @@ status_callback(zpool_handle_t *zhp, void *data) "be used because the label is missing \n\tor invalid. " "There are insufficient replicas for the pool to " "continue\n\tfunctioning.\n")); - (void) printf(gettext("action: Destroy and re-create the pool " - "from a backup source.\n")); + zpool_explain_recover(zpool_get_handle(zhp), + zpool_get_name(zhp), reason, config); break; case ZPOOL_STATUS_FAILING_DEV: @@ -3071,6 +3441,17 @@ status_callback(zpool_handle_t *zhp, void *data) "replace'.\n")); break; + case ZPOOL_STATUS_REMOVED_DEV: + (void) printf(gettext("status: One or more devices has " + "been removed by the administrator.\n\tSufficient " + "replicas exist for the pool to continue functioning in " + "a\n\tdegraded state.\n")); + (void) printf(gettext("action: Online the device using " + "'zpool online' or replace the device with\n\t'zpool " + "replace'.\n")); + break; + + case ZPOOL_STATUS_RESILVERING: (void) printf(gettext("status: One or more devices is " "currently being resilvered. 
The pool will\n\tcontinue " @@ -3091,8 +3472,8 @@ status_callback(zpool_handle_t *zhp, void *data) case ZPOOL_STATUS_CORRUPT_POOL: (void) printf(gettext("status: The pool metadata is corrupted " "and the pool cannot be opened.\n")); - (void) printf(gettext("action: Destroy and re-create the pool " - "from a backup source.\n")); + zpool_explain_recover(zpool_get_handle(zhp), + zpool_get_name(zhp), reason, config); break; case ZPOOL_STATUS_VERSION_OLDER: @@ -3181,11 +3562,10 @@ status_callback(zpool_handle_t *zhp, void *data) (void) printf(gettext("\t%-*s %-8s %5s %5s %5s\n"), namewidth, "NAME", "STATE", "READ", "WRITE", "CKSUM"); print_status_config(zhp, zpool_get_name(zhp), nvroot, - namewidth, 0, B_FALSE, B_FALSE); - if (num_logs(nvroot) > 0) - print_status_config(zhp, "logs", nvroot, namewidth, 0, - B_FALSE, B_TRUE); + namewidth, 0, B_FALSE); + if (num_logs(nvroot) > 0) + print_logs(zhp, nvroot, namewidth, B_TRUE); if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) print_l2cache(zhp, l2cache, nl2cache, namewidth); @@ -3228,6 +3608,9 @@ status_callback(zpool_handle_t *zhp, void *data) else print_error_log(zhp); } + + if (cbp->cb_dedup_stats) + print_dedup_stats(config); } else { (void) printf(gettext("config: The configuration cannot be " "determined.\n")); @@ -3241,6 +3624,7 @@ status_callback(zpool_handle_t *zhp, void *data) * * -v Display complete error logs * -x Display only pools with potential problems + * -D Display dedup status (undocumented) * * Describes the health status of all pools or some subset. */ @@ -3252,7 +3636,7 @@ zpool_do_status(int argc, char **argv) status_cbdata_t cb = { 0 }; /* check options */ - while ((c = getopt(argc, argv, "vx")) != -1) { + while ((c = getopt(argc, argv, "vxD")) != -1) { switch (c) { case 'v': cb.cb_verbose = B_TRUE; @@ -3260,6 +3644,9 @@ zpool_do_status(int argc, char **argv) case 'x': cb.cb_explain = B_TRUE; break; + case 'D': + cb.cb_dedup_stats = B_TRUE; + break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); @@ -3409,7 +3796,7 @@ zpool_do_upgrade(int argc, char **argv) /* check options */ - while ((c = getopt(argc, argv, "avV:")) != -1) { + while ((c = getopt(argc, argv, ":avV:")) != -1) { switch (c) { case 'a': cb.cb_all = B_TRUE; @@ -3426,6 +3813,11 @@ zpool_do_upgrade(int argc, char **argv) usage(B_FALSE); } break; + case ':': + (void) fprintf(stderr, gettext("missing argument for " + "'%c' option\n"), optopt); + usage(B_FALSE); + break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); @@ -3486,9 +3878,18 @@ zpool_do_upgrade(int argc, char **argv) (void) printf(gettext(" 11 Improved scrub performance\n")); (void) printf(gettext(" 12 Snapshot properties\n")); (void) printf(gettext(" 13 snapused property\n")); - (void) printf(gettext(" 14 passthrough-x aclinherit " - "support\n")); - (void) printf(gettext("For more information on a particular " + (void) printf(gettext(" 14 passthrough-x aclinherit\n")); + (void) printf(gettext(" 15 user/group space accounting\n")); + (void) printf(gettext(" 16 stmf property support\n")); + (void) printf(gettext(" 17 Triple-parity RAID-Z\n")); + (void) printf(gettext(" 18 Snapshot user holds\n")); + (void) printf(gettext(" 19 Log device removal\n")); + (void) printf(gettext(" 20 Compression using zle " + "(zero-length encoding)\n")); + (void) printf(gettext(" 21 Deduplication\n")); + (void) printf(gettext(" 22 Received properties\n")); + (void) printf(gettext(" 23 Slim ZIL\n")); + (void) printf(gettext("\nFor 
more information on a particular " "version, including supported releases, see:\n\n")); (void) printf("http://www.opensolaris.org/os/community/zfs/" "version/N\n\n"); @@ -3534,47 +3935,6 @@ typedef struct hist_cbdata { int internal; } hist_cbdata_t; -char *hist_event_table[LOG_END] = { - "invalid event", - "pool create", - "vdev add", - "pool remove", - "pool destroy", - "pool export", - "pool import", - "vdev attach", - "vdev replace", - "vdev detach", - "vdev online", - "vdev offline", - "vdev upgrade", - "pool clear", - "pool scrub", - "pool property set", - "create", - "clone", - "destroy", - "destroy_begin_sync", - "inherit", - "property set", - "quota set", - "permission update", - "permission remove", - "permission who remove", - "promote", - "receive", - "rename", - "reservation set", - "replay_inc_sync", - "replay_full_sync", - "rollback", - "snapshot", - "filesystem version upgrade", - "refquota set", - "refreservation set", - "pool scrub done", -}; - /* * Print out the command history for a specific pool. */ @@ -3744,7 +4104,8 @@ get_callback(zpool_handle_t *zhp, void *data) continue; zprop_print_one_property(zpool_get_name(zhp), cbp, - zpool_prop_to_name(pl->pl_prop), value, srctype, NULL); + zpool_prop_to_name(pl->pl_prop), value, srctype, NULL, + NULL); } return (0); } diff --git a/external/cddl/osnet/dist/cmd/zpool/zpool_util.c b/external/cddl/osnet/dist/cmd/zpool/zpool_util.c index f44da4ff60f53..c7a002efb17cf 100644 --- a/external/cddl/osnet/dist/cmd/zpool/zpool_util.c +++ b/external/cddl/osnet/dist/cmd/zpool/zpool_util.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -50,22 +48,6 @@ safe_malloc(size_t size) return (data); } -/* - * Same as above, but for strdup() - */ -char * -safe_strdup(const char *str) -{ - char *ret; - - if ((ret = strdup(str)) == NULL) { - (void) fprintf(stderr, "internal error: out of memory\n"); - exit(1); - } - - return (ret); -} - /* * Display an out of memory error message and abort the current program. */ diff --git a/external/cddl/osnet/dist/cmd/zpool/zpool_util.h b/external/cddl/osnet/dist/cmd/zpool/zpool_util.h index e82f3202af2ab..a18b8b705fd9a 100644 --- a/external/cddl/osnet/dist/cmd/zpool/zpool_util.h +++ b/external/cddl/osnet/dist/cmd/zpool/zpool_util.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -37,7 +37,6 @@ extern "C" { * Basic utility functions */ void *safe_malloc(size_t); -char *safe_strdup(const char *); void zpool_no_memory(void); uint_t num_logs(nvlist_t *nv); @@ -47,6 +46,8 @@ uint_t num_logs(nvlist_t *nv); nvlist_t *make_root_vdev(zpool_handle_t *zhp, int force, int check_rep, boolean_t isreplace, boolean_t dryrun, int argc, char **argv); +nvlist_t *split_mirror_vdev(zpool_handle_t *zhp, char *newname, + nvlist_t *props, splitflags_t flags, int argc, char **argv); /* * Pool list functions diff --git a/external/cddl/osnet/dist/cmd/zpool/zpool_vdev.c b/external/cddl/osnet/dist/cmd/zpool/zpool_vdev.c index 10007c14927f6..3c725d232c77c 100644 --- a/external/cddl/osnet/dist/cmd/zpool/zpool_vdev.c +++ b/external/cddl/osnet/dist/cmd/zpool/zpool_vdev.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -67,6 +67,7 @@ #include #include #include +#include #include #include #include @@ -1093,20 +1094,35 @@ check_in_use(nvlist_t *config, nvlist_t *nv, int force, int isreplacing, } static const char * -is_grouping(const char *type, int *mindev) +is_grouping(const char *type, int *mindev, int *maxdev) { - if (strcmp(type, "raidz") == 0 || strcmp(type, "raidz1") == 0) { - if (mindev != NULL) - *mindev = 2; - return (VDEV_TYPE_RAIDZ); - } + if (strncmp(type, "raidz", 5) == 0) { + const char *p = type + 5; + char *end; + long nparity; + + if (*p == '\0') { + nparity = 1; + } else if (*p == '0') { + return (NULL); /* no zero prefixes allowed */ + } else { + errno = 0; + nparity = strtol(p, &end, 10); + if (errno != 0 || nparity < 1 || nparity >= 255 || + *end != '\0') + return (NULL); + } - if (strcmp(type, "raidz2") == 0) { if (mindev != NULL) - *mindev = 3; + *mindev = nparity + 1; + if (maxdev != NULL) + *maxdev = 255; return (VDEV_TYPE_RAIDZ); } + if (maxdev != NULL) + *maxdev = INT_MAX; + if (strcmp(type, "mirror") == 0) { if (mindev != NULL) *mindev = 2; @@ -1144,7 +1160,7 @@ nvlist_t * construct_spec(int argc, char **argv) { nvlist_t *nvroot, *nv, **top, **spares, **l2cache; - int t, toplevels, mindev, nspares, nlogs, nl2cache; + int t, toplevels, mindev, maxdev, nspares, nlogs, nl2cache; const char *type; uint64_t is_log; boolean_t seen_logs; @@ -1166,7 +1182,7 @@ construct_spec(int argc, char **argv) * If it's a mirror or raidz, the subsequent arguments are * its leaves -- until we encounter the next mirror or raidz. */ - if ((type = is_grouping(argv[0], &mindev)) != NULL) { + if ((type = is_grouping(argv[0], &mindev, &maxdev)) != NULL) { nvlist_t **child = NULL; int c, children = 0; @@ -1223,7 +1239,7 @@ construct_spec(int argc, char **argv) } for (c = 1; c < argc; c++) { - if (is_grouping(argv[c], NULL) != NULL) + if (is_grouping(argv[c], NULL, NULL) != NULL) break; children++; child = realloc(child, @@ -1243,6 +1259,13 @@ construct_spec(int argc, char **argv) return (NULL); } + if (children > maxdev) { + (void) fprintf(stderr, gettext("invalid vdev " + "specification: %s supports no more than " + "%d devices\n"), argv[0], maxdev); + return (NULL); + } + argc -= c; argv += c; @@ -1337,6 +1360,52 @@ construct_spec(int argc, char **argv) return (nvroot); } +nvlist_t * +split_mirror_vdev(zpool_handle_t *zhp, char *newname, nvlist_t *props, + splitflags_t flags, int argc, char **argv) +{ + nvlist_t *newroot = NULL, **child; + uint_t c, children; + + if (argc > 0) { + if ((newroot = construct_spec(argc, argv)) == NULL) { + (void) fprintf(stderr, gettext("Unable to build a " + "pool from the specified devices\n")); + return (NULL); + } + + if (!flags.dryrun && make_disks(zhp, newroot) != 0) { + nvlist_free(newroot); + return (NULL); + } + + /* avoid any tricks in the spec */ + verify(nvlist_lookup_nvlist_array(newroot, + ZPOOL_CONFIG_CHILDREN, &child, &children) == 0); + for (c = 0; c < children; c++) { + char *path; + const char *type; + int min, max; + + verify(nvlist_lookup_string(child[c], + ZPOOL_CONFIG_PATH, &path) == 0); + if ((type = is_grouping(path, &min, &max)) != NULL) { + (void) fprintf(stderr, gettext("Cannot use " + "'%s' as a device for splitting\n"), type); + nvlist_free(newroot); + return (NULL); + } + } + } + + if (zpool_vdev_split(zhp, newname, &newroot, props, flags) != 0) { + if (newroot != NULL) + nvlist_free(newroot); + return (NULL); + } + + return (newroot); +} /* 
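(Aside: the rewritten is_grouping() above accepts any "raidzN" spelling, treating bare "raidz" as single parity, rejecting a leading zero, and bounding parity to [1, 254], with the minimum device count then being parity + 1. The following self-contained sketch, using the hypothetical helper name raidz_parity(), distills the same parsing rules outside of zpool:)

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Return the parity level for a "raidz[N]" type string, or -1 if invalid. */
static long
raidz_parity(const char *type)
{
	const char *p;
	char *end;
	long nparity;

	if (strncmp(type, "raidz", 5) != 0)
		return (-1);
	p = type + 5;
	if (*p == '\0')
		return (1);		/* plain "raidz" means parity 1 */
	if (*p == '0')
		return (-1);		/* no zero prefixes allowed */
	errno = 0;
	nparity = strtol(p, &end, 10);
	if (errno != 0 || nparity < 1 || nparity >= 255 || *end != '\0')
		return (-1);
	return (nparity);
}

int
main(void)
{
	const char *samples[] =
	    { "raidz", "raidz1", "raidz3", "raidz03", "raidzx", "mirror" };

	for (size_t i = 0; i < sizeof (samples) / sizeof (samples[0]); i++)
		(void) printf("%-8s -> %ld\n", samples[i],
		    raidz_parity(samples[i]));
	return (0);
}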
* Get and validate the contents of the given vdev specification. This ensures diff --git a/external/cddl/osnet/dist/cmd/ztest/ztest.c b/external/cddl/osnet/dist/cmd/ztest/ztest.c index 53cc6c7093b72..24464b4594b9e 100644 --- a/external/cddl/osnet/dist/cmd/ztest/ztest.c +++ b/external/cddl/osnet/dist/cmd/ztest/ztest.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -76,6 +76,7 @@ #include #include #include +#include #include #include #include @@ -85,13 +86,14 @@ #include #include #include -#include -#include #include +#include #include #include #include +#include #include +#include #include #include #include @@ -103,6 +105,7 @@ #include #include #include +#include static char cmdname[] = "ztest"; static char *zopt_pool = cmdname; @@ -122,41 +125,103 @@ static int zopt_verbose = 0; static int zopt_init = 1; static char *zopt_dir = "/tmp"; static uint64_t zopt_time = 300; /* 5 minutes */ -static int zopt_maxfaults; + +#define BT_MAGIC 0x123456789abcdefULL +#define MAXFAULTS() (MAX(zs->zs_mirrors, 1) * (zopt_raidz_parity + 1) - 1) + +enum ztest_io_type { + ZTEST_IO_WRITE_TAG, + ZTEST_IO_WRITE_PATTERN, + ZTEST_IO_WRITE_ZEROES, + ZTEST_IO_TRUNCATE, + ZTEST_IO_SETATTR, + ZTEST_IO_TYPES +}; typedef struct ztest_block_tag { + uint64_t bt_magic; uint64_t bt_objset; uint64_t bt_object; uint64_t bt_offset; + uint64_t bt_gen; uint64_t bt_txg; - uint64_t bt_thread; - uint64_t bt_seq; + uint64_t bt_crtxg; } ztest_block_tag_t; -typedef struct ztest_args { - char za_pool[MAXNAMELEN]; - spa_t *za_spa; - objset_t *za_os; - zilog_t *za_zilog; - thread_t za_thread; - uint64_t za_instance; - uint64_t za_random; - uint64_t za_diroff; - uint64_t za_diroff_shared; - uint64_t za_zil_seq; - hrtime_t za_start; - hrtime_t za_stop; - hrtime_t za_kill; - /* - * Thread-local variables can go here to aid debugging. - */ - ztest_block_tag_t za_rbt; - ztest_block_tag_t za_wbt; - dmu_object_info_t za_doi; - dmu_buf_t *za_dbuf; -} ztest_args_t; - -typedef void ztest_func_t(ztest_args_t *); +typedef struct bufwad { + uint64_t bw_index; + uint64_t bw_txg; + uint64_t bw_data; +} bufwad_t; + +/* + * XXX -- fix zfs range locks to be generic so we can use them here. + */ +typedef enum { + RL_READER, + RL_WRITER, + RL_APPEND +} rl_type_t; + +typedef struct rll { + void *rll_writer; + int rll_readers; + mutex_t rll_lock; + cond_t rll_cv; +} rll_t; + +typedef struct rl { + uint64_t rl_object; + uint64_t rl_offset; + uint64_t rl_size; + rll_t *rl_lock; +} rl_t; + +#define ZTEST_RANGE_LOCKS 64 +#define ZTEST_OBJECT_LOCKS 64 + +/* + * Object descriptor. Used as a template for object lookup/create/remove. + */ +typedef struct ztest_od { + uint64_t od_dir; + uint64_t od_object; + dmu_object_type_t od_type; + dmu_object_type_t od_crtype; + uint64_t od_blocksize; + uint64_t od_crblocksize; + uint64_t od_gen; + uint64_t od_crgen; + char od_name[MAXNAMELEN]; +} ztest_od_t; + +/* + * Per-dataset state. + */ +typedef struct ztest_ds { + objset_t *zd_os; + zilog_t *zd_zilog; + uint64_t zd_seq; + ztest_od_t *zd_od; /* debugging aid */ + char zd_name[MAXNAMELEN]; + mutex_t zd_dirobj_lock; + rll_t zd_object_lock[ZTEST_OBJECT_LOCKS]; + rll_t zd_range_lock[ZTEST_RANGE_LOCKS]; +} ztest_ds_t; + +/* + * Per-iteration state. 
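(Aside: rll_t above is a small home-grown reader/writer lock built on Solaris mutex_t/cond_t, used as a stand-in until the ZFS range locks become generic. A rough POSIX-threads rendition, shown only to illustrate the admission rules that ztest_rll_lock()/ztest_rll_unlock() implement further down, might look like this:)

#include <pthread.h>

/* Readers enter when no writer holds the lock; a writer waits for a drain. */
typedef struct rwlock_lite {
	pthread_t	writer;
	int		has_writer;
	int		readers;
	pthread_mutex_t	lock;
	pthread_cond_t	cv;
} rwlock_lite_t;

static void
rwl_init(rwlock_lite_t *r)
{
	r->has_writer = 0;
	r->readers = 0;
	(void) pthread_mutex_init(&r->lock, NULL);
	(void) pthread_cond_init(&r->cv, NULL);
}

static void
rwl_lock(rwlock_lite_t *r, int as_writer)
{
	(void) pthread_mutex_lock(&r->lock);
	if (as_writer) {
		while (r->has_writer || r->readers != 0)
			(void) pthread_cond_wait(&r->cv, &r->lock);
		r->has_writer = 1;
		r->writer = pthread_self();
	} else {
		while (r->has_writer)
			(void) pthread_cond_wait(&r->cv, &r->lock);
		r->readers++;
	}
	(void) pthread_mutex_unlock(&r->lock);
}

static void
rwl_unlock(rwlock_lite_t *r)
{
	(void) pthread_mutex_lock(&r->lock);
	if (r->has_writer)
		r->has_writer = 0;
	else
		r->readers--;
	/* Last holder out wakes everyone, as ztest_rll_unlock() does. */
	if (!r->has_writer && r->readers == 0)
		(void) pthread_cond_broadcast(&r->cv);
	(void) pthread_mutex_unlock(&r->lock);
}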
+ */ +typedef void ztest_func_t(ztest_ds_t *zd, uint64_t id); + +typedef struct ztest_info { + ztest_func_t *zi_func; /* test function */ + uint64_t zi_iters; /* iterations per execution */ + uint64_t *zi_interval; /* execute every seconds */ + uint64_t zi_call_count; /* per-pass count */ + uint64_t zi_call_time; /* per-pass time */ + uint64_t zi_call_next; /* next time to call this function */ +} ztest_info_t; /* * Note: these aren't static because we want dladdr() to work. @@ -164,94 +229,126 @@ typedef void ztest_func_t(ztest_args_t *); ztest_func_t ztest_dmu_read_write; ztest_func_t ztest_dmu_write_parallel; ztest_func_t ztest_dmu_object_alloc_free; +ztest_func_t ztest_dmu_commit_callbacks; ztest_func_t ztest_zap; ztest_func_t ztest_zap_parallel; -ztest_func_t ztest_traverse; -ztest_func_t ztest_dsl_prop_get_set; +ztest_func_t ztest_zil_commit; +ztest_func_t ztest_dmu_read_write_zcopy; ztest_func_t ztest_dmu_objset_create_destroy; +ztest_func_t ztest_dmu_prealloc; +ztest_func_t ztest_fzap; ztest_func_t ztest_dmu_snapshot_create_destroy; +ztest_func_t ztest_dsl_prop_get_set; +ztest_func_t ztest_spa_prop_get_set; ztest_func_t ztest_spa_create_destroy; ztest_func_t ztest_fault_inject; +ztest_func_t ztest_ddt_repair; +ztest_func_t ztest_dmu_snapshot_hold; ztest_func_t ztest_spa_rename; +ztest_func_t ztest_scrub; +ztest_func_t ztest_dsl_dataset_promote_busy; ztest_func_t ztest_vdev_attach_detach; ztest_func_t ztest_vdev_LUN_growth; ztest_func_t ztest_vdev_add_remove; ztest_func_t ztest_vdev_aux_add_remove; -ztest_func_t ztest_scrub; - -typedef struct ztest_info { - ztest_func_t *zi_func; /* test function */ - uint64_t zi_iters; /* iterations per execution */ - uint64_t *zi_interval; /* execute every seconds */ - uint64_t zi_calls; /* per-pass count */ - uint64_t zi_call_time; /* per-pass time */ - uint64_t zi_call_total; /* cumulative total */ - uint64_t zi_call_target; /* target cumulative total */ -} ztest_info_t; +ztest_func_t ztest_split_pool; -uint64_t zopt_always = 0; /* all the time */ -uint64_t zopt_often = 1; /* every second */ -uint64_t zopt_sometimes = 10; /* every 10 seconds */ -uint64_t zopt_rarely = 60; /* every 60 seconds */ +uint64_t zopt_always = 0ULL * NANOSEC; /* all the time */ +uint64_t zopt_incessant = 1ULL * NANOSEC / 10; /* every 1/10 second */ +uint64_t zopt_often = 1ULL * NANOSEC; /* every second */ +uint64_t zopt_sometimes = 10ULL * NANOSEC; /* every 10 seconds */ +uint64_t zopt_rarely = 60ULL * NANOSEC; /* every 60 seconds */ ztest_info_t ztest_info[] = { { ztest_dmu_read_write, 1, &zopt_always }, - { ztest_dmu_write_parallel, 30, &zopt_always }, + { ztest_dmu_write_parallel, 10, &zopt_always }, { ztest_dmu_object_alloc_free, 1, &zopt_always }, + { ztest_dmu_commit_callbacks, 1, &zopt_always }, { ztest_zap, 30, &zopt_always }, { ztest_zap_parallel, 100, &zopt_always }, - { ztest_dsl_prop_get_set, 1, &zopt_sometimes }, - { ztest_dmu_objset_create_destroy, 1, &zopt_sometimes }, - { ztest_dmu_snapshot_create_destroy, 1, &zopt_sometimes }, - { ztest_spa_create_destroy, 1, &zopt_sometimes }, + { ztest_split_pool, 1, &zopt_always }, + { ztest_zil_commit, 1, &zopt_incessant }, + { ztest_dmu_read_write_zcopy, 1, &zopt_often }, + { ztest_dmu_objset_create_destroy, 1, &zopt_often }, + { ztest_dsl_prop_get_set, 1, &zopt_often }, + { ztest_spa_prop_get_set, 1, &zopt_sometimes }, +#if 0 + { ztest_dmu_prealloc, 1, &zopt_sometimes }, +#endif + { ztest_fzap, 1, &zopt_sometimes }, + { ztest_dmu_snapshot_create_destroy, 1, &zopt_sometimes }, + { ztest_spa_create_destroy, 1, 
&zopt_sometimes }, { ztest_fault_inject, 1, &zopt_sometimes }, + { ztest_ddt_repair, 1, &zopt_sometimes }, + { ztest_dmu_snapshot_hold, 1, &zopt_sometimes }, { ztest_spa_rename, 1, &zopt_rarely }, + { ztest_scrub, 1, &zopt_rarely }, + { ztest_dsl_dataset_promote_busy, 1, &zopt_rarely }, { ztest_vdev_attach_detach, 1, &zopt_rarely }, { ztest_vdev_LUN_growth, 1, &zopt_rarely }, { ztest_vdev_add_remove, 1, &zopt_vdevtime }, { ztest_vdev_aux_add_remove, 1, &zopt_vdevtime }, - { ztest_scrub, 1, &zopt_vdevtime }, }; #define ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t)) -#define ZTEST_SYNC_LOCKS 16 +/* + * The following struct is used to hold a list of uncalled commit callbacks. + * The callbacks are ordered by txg number. + */ +typedef struct ztest_cb_list { + mutex_t zcl_callbacks_lock; + list_t zcl_callbacks; +} ztest_cb_list_t; /* * Stuff we need to share writably between parent and child. */ typedef struct ztest_shared { - mutex_t zs_vdev_lock; - rwlock_t zs_name_lock; - uint64_t zs_vdev_primaries; - uint64_t zs_vdev_aux; + char *zs_pool; + spa_t *zs_spa; + hrtime_t zs_proc_start; + hrtime_t zs_proc_stop; + hrtime_t zs_thread_start; + hrtime_t zs_thread_stop; + hrtime_t zs_thread_kill; uint64_t zs_enospc_count; - hrtime_t zs_start_time; - hrtime_t zs_stop_time; + uint64_t zs_vdev_next_leaf; + uint64_t zs_vdev_aux; uint64_t zs_alloc; uint64_t zs_space; + mutex_t zs_vdev_lock; + rwlock_t zs_name_lock; ztest_info_t zs_info[ZTEST_FUNCS]; - mutex_t zs_sync_lock[ZTEST_SYNC_LOCKS]; - uint64_t zs_seq[ZTEST_SYNC_LOCKS]; + uint64_t zs_splits; + uint64_t zs_mirrors; + ztest_ds_t zs_zd[]; } ztest_shared_t; +#define ID_PARALLEL -1ULL + static char ztest_dev_template[] = "%s/%s.%llua"; static char ztest_aux_template[] = "%s/%s.%s.%llu"; -static ztest_shared_t *ztest_shared; +ztest_shared_t *ztest_shared; +uint64_t *ztest_seq; static int ztest_random_fd; static int ztest_dump_core = 1; static boolean_t ztest_exiting; -extern uint64_t metaslab_gang_bang; +/* Global commit callback list */ +static ztest_cb_list_t zcl; -#define ZTEST_DIROBJ 1 -#define ZTEST_MICROZAP_OBJ 2 -#define ZTEST_FATZAP_OBJ 3 +extern uint64_t metaslab_gang_bang; +extern uint64_t metaslab_df_alloc_threshold; +static uint64_t metaslab_sz; -#define ZTEST_DIROBJ_BLOCKSIZE (1 << 10) -#define ZTEST_DIRSIZE 256 +enum ztest_object { + ZTEST_META_DNODE = 0, + ZTEST_DIROBJ, + ZTEST_OBJECTS +}; static void usage(boolean_t) __NORETURN; @@ -405,27 +502,6 @@ usage(boolean_t requested) exit(requested ? 0 : 1); } -static uint64_t -ztest_random(uint64_t range) -{ - uint64_t r; - - if (range == 0) - return (0); - - if (read(ztest_random_fd, &r, sizeof (r)) != sizeof (r)) - fatal(1, "short read from /dev/urandom"); - - return (r % range); -} - -static void -ztest_record_enospc(char *s) -{ - dprintf("ENOSPC doing: %s\n", s ? s : ""); - ztest_shared->zs_enospc_count++; -} - static void process_options(int argc, char **argv) { @@ -471,7 +547,7 @@ process_options(int argc, char **argv) zopt_raidz = MAX(1, value); break; case 'R': - zopt_raidz_parity = MIN(MAX(value, 1), 2); + zopt_raidz_parity = MIN(MAX(value, 1), 3); break; case 'd': zopt_datasets = MAX(1, value); @@ -518,8 +594,37 @@ process_options(int argc, char **argv) zopt_raidz_parity = MIN(zopt_raidz_parity, zopt_raidz - 1); - zopt_vdevtime = (zopt_vdevs > 0 ? zopt_time / zopt_vdevs : UINT64_MAX); - zopt_maxfaults = MAX(zopt_mirrors, 1) * (zopt_raidz_parity + 1) - 1; + zopt_vdevtime = (zopt_vdevs > 0 ? 
zopt_time * NANOSEC / zopt_vdevs : + UINT64_MAX >> 2); +} + +static void +ztest_kill(ztest_shared_t *zs) +{ + zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(zs->zs_spa)); + zs->zs_space = metaslab_class_get_space(spa_normal_class(zs->zs_spa)); + (void) kill(getpid(), SIGKILL); +} + +static uint64_t +ztest_random(uint64_t range) +{ + uint64_t r; + + if (range == 0) + return (0); + + if (read(ztest_random_fd, &r, sizeof (r)) != sizeof (r)) + fatal(1, "short read from /dev/urandom"); + + return (r % range); +} + +/* ARGSUSED */ +static void +ztest_record_enospc(const char *s) +{ + ztest_shared->zs_enospc_count++; } static uint64_t @@ -548,7 +653,7 @@ make_vdev_file(char *path, char *aux, size_t size, uint64_t ashift) (void) sprintf(path, ztest_aux_template, zopt_dir, zopt_pool, aux, vdev); } else { - vdev = ztest_shared->zs_vdev_primaries++; + vdev = ztest_shared->zs_vdev_next_leaf++; (void) sprintf(path, ztest_dev_template, zopt_dir, zopt_pool, vdev); } @@ -659,270 +764,1479 @@ make_vdev_root(char *path, char *aux, size_t size, uint64_t ashift, return (root); } -static void -ztest_set_random_blocksize(objset_t *os, uint64_t object, dmu_tx_t *tx) +static int +ztest_random_blocksize(void) { - int bs = SPA_MINBLOCKSHIFT + - ztest_random(SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1); - int ibs = DN_MIN_INDBLKSHIFT + - ztest_random(DN_MAX_INDBLKSHIFT - DN_MIN_INDBLKSHIFT + 1); - int error; + return (1 << (SPA_MINBLOCKSHIFT + + ztest_random(SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1))); +} - error = dmu_object_set_blocksize(os, object, 1ULL << bs, ibs, tx); - if (error) { - char osname[300]; - dmu_objset_name(os, osname); - fatal(0, "dmu_object_set_blocksize('%s', %llu, %d, %d) = %d", - osname, object, 1 << bs, ibs, error); - } +static int +ztest_random_ibshift(void) +{ + return (DN_MIN_INDBLKSHIFT + + ztest_random(DN_MAX_INDBLKSHIFT - DN_MIN_INDBLKSHIFT + 1)); } -static uint8_t -ztest_random_checksum(void) +static uint64_t +ztest_random_vdev_top(spa_t *spa, boolean_t log_ok) { - uint8_t checksum; + uint64_t top; + vdev_t *rvd = spa->spa_root_vdev; + vdev_t *tvd; - do { - checksum = ztest_random(ZIO_CHECKSUM_FUNCTIONS); - } while (zio_checksum_table[checksum].ci_zbt); + ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); - if (checksum == ZIO_CHECKSUM_OFF) - checksum = ZIO_CHECKSUM_ON; + do { + top = ztest_random(rvd->vdev_children); + tvd = rvd->vdev_child[top]; + } while (tvd->vdev_ishole || (tvd->vdev_islog && !log_ok) || + tvd->vdev_mg == NULL || tvd->vdev_mg->mg_class == NULL); - return (checksum); + return (top); } -static uint8_t -ztest_random_compress(void) +static uint64_t +ztest_random_dsl_prop(zfs_prop_t prop) { - return ((uint8_t)ztest_random(ZIO_COMPRESS_FUNCTIONS)); -} + uint64_t value; -typedef struct ztest_replay { - objset_t *zr_os; - uint64_t zr_assign; -} ztest_replay_t; + do { + value = zfs_prop_random_value(prop, ztest_random(-1ULL)); + } while (prop == ZFS_PROP_CHECKSUM && value == ZIO_CHECKSUM_OFF); + + return (value); +} static int -ztest_replay_create(ztest_replay_t *zr, lr_create_t *lr, boolean_t byteswap) +ztest_dsl_prop_set_uint64(char *osname, zfs_prop_t prop, uint64_t value, + boolean_t inherit) { - objset_t *os = zr->zr_os; - dmu_tx_t *tx; + const char *propname = zfs_prop_to_name(prop); + const char *valname; + char setpoint[MAXPATHLEN]; + uint64_t curval; int error; - if (byteswap) - byteswap_uint64_array(lr, sizeof (*lr)); + error = dsl_prop_set(osname, propname, + (inherit ? 
ZPROP_SRC_NONE : ZPROP_SRC_LOCAL), + sizeof (value), 1, &value); - tx = dmu_tx_create(os); - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - error = dmu_tx_assign(tx, zr->zr_assign); - if (error) { - dmu_tx_abort(tx); + if (error == ENOSPC) { + ztest_record_enospc(FTAG); return (error); } - - error = dmu_object_claim(os, lr->lr_doid, lr->lr_mode, 0, - DMU_OT_NONE, 0, tx); ASSERT3U(error, ==, 0); - dmu_tx_commit(tx); - if (zopt_verbose >= 5) { - char osname[MAXNAMELEN]; - dmu_objset_name(os, osname); - (void) printf("replay create of %s object %llu" - " in txg %llu = %d\n", - osname, (u_longlong_t)lr->lr_doid, - (u_longlong_t)zr->zr_assign, error); + VERIFY3U(dsl_prop_get(osname, propname, sizeof (curval), + 1, &curval, setpoint), ==, 0); + + if (zopt_verbose >= 6) { + VERIFY(zfs_prop_index_to_string(prop, curval, &valname) == 0); + (void) printf("%s %s = %s at '%s'\n", + osname, propname, valname, setpoint); } return (error); } static int -ztest_replay_remove(ztest_replay_t *zr, lr_remove_t *lr, boolean_t byteswap) +ztest_spa_prop_set_uint64(ztest_shared_t *zs, zpool_prop_t prop, uint64_t value) { - objset_t *os = zr->zr_os; - dmu_tx_t *tx; + spa_t *spa = zs->zs_spa; + nvlist_t *props = NULL; int error; - if (byteswap) - byteswap_uint64_array(lr, sizeof (*lr)); + VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0); + VERIFY(nvlist_add_uint64(props, zpool_prop_to_name(prop), value) == 0); - tx = dmu_tx_create(os); - dmu_tx_hold_free(tx, lr->lr_doid, 0, DMU_OBJECT_END); - error = dmu_tx_assign(tx, zr->zr_assign); - if (error) { - dmu_tx_abort(tx); + error = spa_prop_set(spa, props); + + nvlist_free(props); + + if (error == ENOSPC) { + ztest_record_enospc(FTAG); return (error); } - - error = dmu_object_free(os, lr->lr_doid, tx); - dmu_tx_commit(tx); + ASSERT3U(error, ==, 0); return (error); } -zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = { - NULL, /* 0 no such transaction type */ - ztest_replay_create, /* TX_CREATE */ - NULL, /* TX_MKDIR */ - NULL, /* TX_MKXATTR */ - NULL, /* TX_SYMLINK */ - ztest_replay_remove, /* TX_REMOVE */ - NULL, /* TX_RMDIR */ - NULL, /* TX_LINK */ - NULL, /* TX_RENAME */ - NULL, /* TX_WRITE */ - NULL, /* TX_TRUNCATE */ - NULL, /* TX_SETATTR */ - NULL, /* TX_ACL */ -}; +static void +ztest_rll_init(rll_t *rll) +{ + rll->rll_writer = NULL; + rll->rll_readers = 0; + VERIFY(_mutex_init(&rll->rll_lock, USYNC_THREAD, NULL) == 0); + VERIFY(cond_init(&rll->rll_cv, USYNC_THREAD, NULL) == 0); +} -/* - * Verify that we can't destroy an active pool, create an existing pool, - * or create a pool with a bad vdev spec. - */ -void -ztest_spa_create_destroy(ztest_args_t *za) +static void +ztest_rll_destroy(rll_t *rll) { - int error; - spa_t *spa; - nvlist_t *nvroot; + ASSERT(rll->rll_writer == NULL); + ASSERT(rll->rll_readers == 0); + VERIFY(_mutex_destroy(&rll->rll_lock) == 0); + VERIFY(cond_destroy(&rll->rll_cv) == 0); +} - /* - * Attempt to create using a bad file. - */ - nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 0, 1); - error = spa_create("ztest_bad_file", nvroot, NULL, NULL, NULL); - nvlist_free(nvroot); - if (error != ENOENT) - fatal(0, "spa_create(bad_file) = %d", error); +static void +ztest_rll_lock(rll_t *rll, rl_type_t type) +{ + VERIFY(mutex_lock(&rll->rll_lock) == 0); - /* - * Attempt to create using a bad mirror. 
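(Aside: the new ztest_spa_prop_set_uint64() above packages a single pool property as a name/uint64 pair in an NV_UNIQUE_NAME nvlist before calling spa_prop_set(). A minimal sketch of that packaging, assuming libnvpair is available; "autoexpand" is only an example property name, not taken from the patch:)

#include <stdio.h>
#include <libnvpair.h>

int
main(void)
{
	nvlist_t *props = NULL;

	if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
		return (1);
	if (nvlist_add_uint64(props, "autoexpand", 1) != 0) {
		nvlist_free(props);
		return (1);
	}
	nvlist_print(stdout, props);	/* dump the name/value pair */
	nvlist_free(props);
	return (0);
}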
- */ - nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 2, 1); - error = spa_create("ztest_bad_mirror", nvroot, NULL, NULL, NULL); - nvlist_free(nvroot); - if (error != ENOENT) - fatal(0, "spa_create(bad_mirror) = %d", error); + if (type == RL_READER) { + while (rll->rll_writer != NULL) + (void) cond_wait(&rll->rll_cv, &rll->rll_lock); + rll->rll_readers++; + } else { + while (rll->rll_writer != NULL || rll->rll_readers) + (void) cond_wait(&rll->rll_cv, &rll->rll_lock); + rll->rll_writer = curthread; + } - /* - * Attempt to create an existing pool. It shouldn't matter - * what's in the nvroot; we should fail with EEXIST. - */ - (void) rw_rdlock(&ztest_shared->zs_name_lock); - nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 0, 1); - error = spa_create(za->za_pool, nvroot, NULL, NULL, NULL); - nvlist_free(nvroot); - if (error != EEXIST) - fatal(0, "spa_create(whatever) = %d", error); + VERIFY(mutex_unlock(&rll->rll_lock) == 0); +} - error = spa_open(za->za_pool, &spa, FTAG); - if (error) - fatal(0, "spa_open() = %d", error); +static void +ztest_rll_unlock(rll_t *rll) +{ + VERIFY(mutex_lock(&rll->rll_lock) == 0); - error = spa_destroy(za->za_pool); - if (error != EBUSY) - fatal(0, "spa_destroy() = %d", error); + if (rll->rll_writer) { + ASSERT(rll->rll_readers == 0); + rll->rll_writer = NULL; + } else { + ASSERT(rll->rll_readers != 0); + ASSERT(rll->rll_writer == NULL); + rll->rll_readers--; + } - spa_close(spa, FTAG); - (void) rw_unlock(&ztest_shared->zs_name_lock); + if (rll->rll_writer == NULL && rll->rll_readers == 0) + VERIFY(cond_broadcast(&rll->rll_cv) == 0); + + VERIFY(mutex_unlock(&rll->rll_lock) == 0); } -static vdev_t * -vdev_lookup_by_path(vdev_t *vd, const char *path) +static void +ztest_object_lock(ztest_ds_t *zd, uint64_t object, rl_type_t type) { - vdev_t *mvd; + rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)]; - if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0) - return (vd); + ztest_rll_lock(rll, type); +} - for (int c = 0; c < vd->vdev_children; c++) - if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) != - NULL) - return (mvd); +static void +ztest_object_unlock(ztest_ds_t *zd, uint64_t object) +{ + rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)]; - return (NULL); + ztest_rll_unlock(rll); } -/* - * Verify that vdev_add() works as expected. - */ -void -ztest_vdev_add_remove(ztest_args_t *za) +static rl_t * +ztest_range_lock(ztest_ds_t *zd, uint64_t object, uint64_t offset, + uint64_t size, rl_type_t type) { - spa_t *spa = za->za_spa; - uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz; - nvlist_t *nvroot; - int error; + uint64_t hash = object ^ (offset % (ZTEST_RANGE_LOCKS + 1)); + rll_t *rll = &zd->zd_range_lock[hash & (ZTEST_RANGE_LOCKS - 1)]; + rl_t *rl; - (void) mutex_lock(&ztest_shared->zs_vdev_lock); + rl = umem_alloc(sizeof (*rl), UMEM_NOFAIL); + rl->rl_object = object; + rl->rl_offset = offset; + rl->rl_size = size; + rl->rl_lock = rll; - spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + ztest_rll_lock(rll, type); - ztest_shared->zs_vdev_primaries = - spa->spa_root_vdev->vdev_children * leaves; + return (rl); +} - spa_config_exit(spa, SCL_VDEV, FTAG); +static void +ztest_range_unlock(rl_t *rl) +{ + rll_t *rll = rl->rl_lock; - /* - * Make 1/4 of the devices be log devices. 
- */ - nvroot = make_vdev_root(NULL, NULL, zopt_vdev_size, 0, - ztest_random(4) == 0, zopt_raidz, zopt_mirrors, 1); + ztest_rll_unlock(rll); - error = spa_vdev_add(spa, nvroot); - nvlist_free(nvroot); + umem_free(rl, sizeof (*rl)); +} + +static void +ztest_zd_init(ztest_ds_t *zd, objset_t *os) +{ + zd->zd_os = os; + zd->zd_zilog = dmu_objset_zil(os); + zd->zd_seq = 0; + dmu_objset_name(os, zd->zd_name); - (void) mutex_unlock(&ztest_shared->zs_vdev_lock); + VERIFY(_mutex_init(&zd->zd_dirobj_lock, USYNC_THREAD, NULL) == 0); - if (error == ENOSPC) - ztest_record_enospc("spa_vdev_add"); - else if (error != 0) - fatal(0, "spa_vdev_add() = %d", error); + for (int l = 0; l < ZTEST_OBJECT_LOCKS; l++) + ztest_rll_init(&zd->zd_object_lock[l]); + + for (int l = 0; l < ZTEST_RANGE_LOCKS; l++) + ztest_rll_init(&zd->zd_range_lock[l]); } -/* - * Verify that adding/removing aux devices (l2arc, hot spare) works as expected. - */ -void -ztest_vdev_aux_add_remove(ztest_args_t *za) +static void +ztest_zd_fini(ztest_ds_t *zd) { - spa_t *spa = za->za_spa; - vdev_t *rvd = spa->spa_root_vdev; - spa_aux_vdev_t *sav; - char *aux; - uint64_t guid = 0; - int error; + VERIFY(_mutex_destroy(&zd->zd_dirobj_lock) == 0); - if (ztest_random(2) == 0) { - sav = &spa->spa_spares; - aux = ZPOOL_CONFIG_SPARES; - } else { - sav = &spa->spa_l2cache; - aux = ZPOOL_CONFIG_L2CACHE; - } + for (int l = 0; l < ZTEST_OBJECT_LOCKS; l++) + ztest_rll_destroy(&zd->zd_object_lock[l]); - (void) mutex_lock(&ztest_shared->zs_vdev_lock); + for (int l = 0; l < ZTEST_RANGE_LOCKS; l++) + ztest_rll_destroy(&zd->zd_range_lock[l]); +} - spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); +#define TXG_MIGHTWAIT (ztest_random(10) == 0 ? TXG_NOWAIT : TXG_WAIT) - if (sav->sav_count != 0 && ztest_random(4) == 0) { - /* - * Pick a random device to remove. - */ - guid = sav->sav_vdevs[ztest_random(sav->sav_count)]->vdev_guid; - } else { - /* - * Find an unused device we can add. - */ - ztest_shared->zs_vdev_aux = 0; - for (;;) { - char path[MAXPATHLEN]; - int c; +static uint64_t +ztest_tx_assign(dmu_tx_t *tx, uint64_t txg_how, const char *tag) +{ + uint64_t txg; + int error; + + /* + * Attempt to assign tx to some transaction group. 
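(Aside: ztest_object_lock() and ztest_range_lock() above each pick one of 64 lock buckets per dataset; object locks hash on the object number alone, while range locks fold the offset in exactly as shown. A tiny standalone check of that bucket arithmetic, with arbitrary example values:)

#include <stdio.h>
#include <inttypes.h>

#define ZTEST_OBJECT_LOCKS	64
#define ZTEST_RANGE_LOCKS	64

int
main(void)
{
	uint64_t object = 7, offset = 3 << 20;
	uint64_t obucket = object & (ZTEST_OBJECT_LOCKS - 1);
	uint64_t rbucket = (object ^ (offset % (ZTEST_RANGE_LOCKS + 1))) &
	    (ZTEST_RANGE_LOCKS - 1);

	(void) printf("object lock bucket %" PRIu64
	    ", range lock bucket %" PRIu64 "\n", obucket, rbucket);
	return (0);
}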
+ */ + error = dmu_tx_assign(tx, txg_how); + if (error) { + if (error == ERESTART) { + ASSERT(txg_how == TXG_NOWAIT); + dmu_tx_wait(tx); + } else { + ASSERT3U(error, ==, ENOSPC); + ztest_record_enospc(tag); + } + dmu_tx_abort(tx); + return (0); + } + txg = dmu_tx_get_txg(tx); + ASSERT(txg != 0); + return (txg); +} + +static void +ztest_pattern_set(void *buf, uint64_t size, uint64_t value) +{ + uint64_t *ip = buf; + uint64_t *ip_end = (uint64_t *)((uintptr_t)buf + (uintptr_t)size); + + while (ip < ip_end) + *ip++ = value; +} + +static boolean_t +ztest_pattern_match(void *buf, uint64_t size, uint64_t value) +{ + uint64_t *ip = buf; + uint64_t *ip_end = (uint64_t *)((uintptr_t)buf + (uintptr_t)size); + uint64_t diff = 0; + + while (ip < ip_end) + diff |= (value - *ip++); + + return (diff == 0); +} + +static void +ztest_bt_generate(ztest_block_tag_t *bt, objset_t *os, uint64_t object, + uint64_t offset, uint64_t gen, uint64_t txg, uint64_t crtxg) +{ + bt->bt_magic = BT_MAGIC; + bt->bt_objset = dmu_objset_id(os); + bt->bt_object = object; + bt->bt_offset = offset; + bt->bt_gen = gen; + bt->bt_txg = txg; + bt->bt_crtxg = crtxg; +} + +static void +ztest_bt_verify(ztest_block_tag_t *bt, objset_t *os, uint64_t object, + uint64_t offset, uint64_t gen, uint64_t txg, uint64_t crtxg) +{ + ASSERT(bt->bt_magic == BT_MAGIC); + ASSERT(bt->bt_objset == dmu_objset_id(os)); + ASSERT(bt->bt_object == object); + ASSERT(bt->bt_offset == offset); + ASSERT(bt->bt_gen <= gen); + ASSERT(bt->bt_txg <= txg); + ASSERT(bt->bt_crtxg == crtxg); +} + +static ztest_block_tag_t * +ztest_bt_bonus(dmu_buf_t *db) +{ + dmu_object_info_t doi; + ztest_block_tag_t *bt; + + dmu_object_info_from_db(db, &doi); + ASSERT3U(doi.doi_bonus_size, <=, db->db_size); + ASSERT3U(doi.doi_bonus_size, >=, sizeof (*bt)); + bt = (void *)((char *)db->db_data + doi.doi_bonus_size - sizeof (*bt)); + + return (bt); +} + +/* + * ZIL logging ops + */ + +#define lrz_type lr_mode +#define lrz_blocksize lr_uid +#define lrz_ibshift lr_gid +#define lrz_bonustype lr_rdev +#define lrz_bonuslen lr_crtime[1] + +static uint64_t +ztest_log_create(ztest_ds_t *zd, dmu_tx_t *tx, lr_create_t *lr) +{ + char *name = (void *)(lr + 1); /* name follows lr */ + size_t namesize = strlen(name) + 1; + itx_t *itx; + + if (zil_replaying(zd->zd_zilog, tx)) + return (0); + + itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize); + bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, + sizeof (*lr) + namesize - sizeof (lr_t)); + + return (zil_itx_assign(zd->zd_zilog, itx, tx)); +} + +static uint64_t +ztest_log_remove(ztest_ds_t *zd, dmu_tx_t *tx, lr_remove_t *lr) +{ + char *name = (void *)(lr + 1); /* name follows lr */ + size_t namesize = strlen(name) + 1; + itx_t *itx; + + if (zil_replaying(zd->zd_zilog, tx)) + return (0); + + itx = zil_itx_create(TX_REMOVE, sizeof (*lr) + namesize); + bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, + sizeof (*lr) + namesize - sizeof (lr_t)); + + return (zil_itx_assign(zd->zd_zilog, itx, tx)); +} + +static uint64_t +ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr) +{ + itx_t *itx; + itx_wr_state_t write_state = ztest_random(WR_NUM_STATES); + + if (zil_replaying(zd->zd_zilog, tx)) + return (0); + + if (lr->lr_length > ZIL_MAX_LOG_DATA) + write_state = WR_INDIRECT; + + itx = zil_itx_create(TX_WRITE, + sizeof (*lr) + (write_state == WR_COPIED ? 
lr->lr_length : 0)); + + if (write_state == WR_COPIED && + dmu_read(zd->zd_os, lr->lr_foid, lr->lr_offset, lr->lr_length, + ((lr_write_t *)&itx->itx_lr) + 1, DMU_READ_NO_PREFETCH) != 0) { + zil_itx_destroy(itx); + itx = zil_itx_create(TX_WRITE, sizeof (*lr)); + write_state = WR_NEED_COPY; + } + itx->itx_private = zd; + itx->itx_wr_state = write_state; + itx->itx_sync = (ztest_random(8) == 0); + itx->itx_sod += (write_state == WR_NEED_COPY ? lr->lr_length : 0); + + bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, + sizeof (*lr) - sizeof (lr_t)); + + return (zil_itx_assign(zd->zd_zilog, itx, tx)); +} + +static uint64_t +ztest_log_truncate(ztest_ds_t *zd, dmu_tx_t *tx, lr_truncate_t *lr) +{ + itx_t *itx; + + if (zil_replaying(zd->zd_zilog, tx)) + return (0); + + itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr)); + bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, + sizeof (*lr) - sizeof (lr_t)); + + return (zil_itx_assign(zd->zd_zilog, itx, tx)); +} + +static uint64_t +ztest_log_setattr(ztest_ds_t *zd, dmu_tx_t *tx, lr_setattr_t *lr) +{ + itx_t *itx; + + if (zil_replaying(zd->zd_zilog, tx)) + return (0); + + itx = zil_itx_create(TX_SETATTR, sizeof (*lr)); + bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, + sizeof (*lr) - sizeof (lr_t)); + + return (zil_itx_assign(zd->zd_zilog, itx, tx)); +} + +/* + * ZIL replay ops + */ +static int +ztest_replay_create(ztest_ds_t *zd, lr_create_t *lr, boolean_t byteswap) +{ + char *name = (void *)(lr + 1); /* name follows lr */ + objset_t *os = zd->zd_os; + ztest_block_tag_t *bbt; + dmu_buf_t *db; + dmu_tx_t *tx; + uint64_t txg; + int error = 0; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + ASSERT(lr->lr_doid == ZTEST_DIROBJ); + ASSERT(name[0] != '\0'); + + tx = dmu_tx_create(os); + + dmu_tx_hold_zap(tx, lr->lr_doid, B_TRUE, name); + + if (lr->lrz_type == DMU_OT_ZAP_OTHER) { + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); + } else { + dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); + } + + txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); + if (txg == 0) + return (ENOSPC); + + ASSERT(dmu_objset_zil(os)->zl_replay == !!lr->lr_foid); + + if (lr->lrz_type == DMU_OT_ZAP_OTHER) { + if (lr->lr_foid == 0) { + lr->lr_foid = zap_create(os, + lr->lrz_type, lr->lrz_bonustype, + lr->lrz_bonuslen, tx); + } else { + error = zap_create_claim(os, lr->lr_foid, + lr->lrz_type, lr->lrz_bonustype, + lr->lrz_bonuslen, tx); + } + } else { + if (lr->lr_foid == 0) { + lr->lr_foid = dmu_object_alloc(os, + lr->lrz_type, 0, lr->lrz_bonustype, + lr->lrz_bonuslen, tx); + } else { + error = dmu_object_claim(os, lr->lr_foid, + lr->lrz_type, 0, lr->lrz_bonustype, + lr->lrz_bonuslen, tx); + } + } + + if (error) { + ASSERT3U(error, ==, EEXIST); + ASSERT(zd->zd_zilog->zl_replay); + dmu_tx_commit(tx); + return (error); + } + + ASSERT(lr->lr_foid != 0); + + if (lr->lrz_type != DMU_OT_ZAP_OTHER) + VERIFY3U(0, ==, dmu_object_set_blocksize(os, lr->lr_foid, + lr->lrz_blocksize, lr->lrz_ibshift, tx)); + + VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); + bbt = ztest_bt_bonus(db); + dmu_buf_will_dirty(db, tx); + ztest_bt_generate(bbt, os, lr->lr_foid, -1ULL, lr->lr_gen, txg, txg); + dmu_buf_rele(db, FTAG); + + VERIFY3U(0, ==, zap_add(os, lr->lr_doid, name, sizeof (uint64_t), 1, + &lr->lr_foid, tx)); + + (void) ztest_log_create(zd, tx, lr); + + dmu_tx_commit(tx); + + return (0); +} + +static int +ztest_replay_remove(ztest_ds_t *zd, lr_remove_t *lr, boolean_t byteswap) +{ + char *name = (void *)(lr + 1); /* name follows lr */ + objset_t *os = zd->zd_os; + dmu_object_info_t doi; + dmu_tx_t *tx; + 
uint64_t object, txg; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + ASSERT(lr->lr_doid == ZTEST_DIROBJ); + ASSERT(name[0] != '\0'); + + VERIFY3U(0, ==, + zap_lookup(os, lr->lr_doid, name, sizeof (object), 1, &object)); + ASSERT(object != 0); + + ztest_object_lock(zd, object, RL_WRITER); + + VERIFY3U(0, ==, dmu_object_info(os, object, &doi)); + + tx = dmu_tx_create(os); + + dmu_tx_hold_zap(tx, lr->lr_doid, B_FALSE, name); + dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END); + + txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); + if (txg == 0) { + ztest_object_unlock(zd, object); + return (ENOSPC); + } + + if (doi.doi_type == DMU_OT_ZAP_OTHER) { + VERIFY3U(0, ==, zap_destroy(os, object, tx)); + } else { + VERIFY3U(0, ==, dmu_object_free(os, object, tx)); + } + + VERIFY3U(0, ==, zap_remove(os, lr->lr_doid, name, tx)); + + (void) ztest_log_remove(zd, tx, lr); + + dmu_tx_commit(tx); + + ztest_object_unlock(zd, object); + + return (0); +} + +static int +ztest_replay_write(ztest_ds_t *zd, lr_write_t *lr, boolean_t byteswap) +{ + objset_t *os = zd->zd_os; + void *data = lr + 1; /* data follows lr */ + uint64_t offset, length; + ztest_block_tag_t *bt = data; + ztest_block_tag_t *bbt; + uint64_t gen, txg, lrtxg, crtxg; + dmu_object_info_t doi; + dmu_tx_t *tx; + dmu_buf_t *db; + arc_buf_t *abuf = NULL; + rl_t *rl; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + offset = lr->lr_offset; + length = lr->lr_length; + + /* If it's a dmu_sync() block, write the whole block */ + if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { + uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr); + if (length < blocksize) { + offset -= offset % blocksize; + length = blocksize; + } + } + + if (bt->bt_magic == BSWAP_64(BT_MAGIC)) + byteswap_uint64_array(bt, sizeof (*bt)); + + if (bt->bt_magic != BT_MAGIC) + bt = NULL; + + ztest_object_lock(zd, lr->lr_foid, RL_READER); + rl = ztest_range_lock(zd, lr->lr_foid, offset, length, RL_WRITER); + + VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); + + dmu_object_info_from_db(db, &doi); + + bbt = ztest_bt_bonus(db); + ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); + gen = bbt->bt_gen; + crtxg = bbt->bt_crtxg; + lrtxg = lr->lr_common.lrc_txg; + + tx = dmu_tx_create(os); + + dmu_tx_hold_write(tx, lr->lr_foid, offset, length); + + if (ztest_random(8) == 0 && length == doi.doi_data_block_size && + P2PHASE(offset, length) == 0) + abuf = dmu_request_arcbuf(db, length); + + txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); + if (txg == 0) { + if (abuf != NULL) + dmu_return_arcbuf(abuf); + dmu_buf_rele(db, FTAG); + ztest_range_unlock(rl); + ztest_object_unlock(zd, lr->lr_foid); + return (ENOSPC); + } + + if (bt != NULL) { + /* + * Usually, verify the old data before writing new data -- + * but not always, because we also want to verify correct + * behavior when the data was not recently read into cache. + */ + ASSERT(offset % doi.doi_data_block_size == 0); + if (ztest_random(4) != 0) { + int prefetch = ztest_random(2) ? + DMU_READ_PREFETCH : DMU_READ_NO_PREFETCH; + ztest_block_tag_t rbt; + + VERIFY(dmu_read(os, lr->lr_foid, offset, + sizeof (rbt), &rbt, prefetch) == 0); + if (rbt.bt_magic == BT_MAGIC) { + ztest_bt_verify(&rbt, os, lr->lr_foid, + offset, gen, txg, crtxg); + } + } + + /* + * Writes can appear to be newer than the bonus buffer because + * the ztest_get_data() callback does a dmu_read() of the + * open-context data, which may be different than the data + * as it was when the write was generated. 
+ */ + if (zd->zd_zilog->zl_replay) { + ztest_bt_verify(bt, os, lr->lr_foid, offset, + MAX(gen, bt->bt_gen), MAX(txg, lrtxg), + bt->bt_crtxg); + } + + /* + * Set the bt's gen/txg to the bonus buffer's gen/txg + * so that all of the usual ASSERTs will work. + */ + ztest_bt_generate(bt, os, lr->lr_foid, offset, gen, txg, crtxg); + } + + if (abuf == NULL) { + dmu_write(os, lr->lr_foid, offset, length, data, tx); + } else { + bcopy(data, abuf->b_data, length); + dmu_assign_arcbuf(db, offset, abuf, tx); + } + + (void) ztest_log_write(zd, tx, lr); + + dmu_buf_rele(db, FTAG); + + dmu_tx_commit(tx); + + ztest_range_unlock(rl); + ztest_object_unlock(zd, lr->lr_foid); + + return (0); +} + +static int +ztest_replay_truncate(ztest_ds_t *zd, lr_truncate_t *lr, boolean_t byteswap) +{ + objset_t *os = zd->zd_os; + dmu_tx_t *tx; + uint64_t txg; + rl_t *rl; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + ztest_object_lock(zd, lr->lr_foid, RL_READER); + rl = ztest_range_lock(zd, lr->lr_foid, lr->lr_offset, lr->lr_length, + RL_WRITER); + + tx = dmu_tx_create(os); + + dmu_tx_hold_free(tx, lr->lr_foid, lr->lr_offset, lr->lr_length); + + txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); + if (txg == 0) { + ztest_range_unlock(rl); + ztest_object_unlock(zd, lr->lr_foid); + return (ENOSPC); + } + + VERIFY(dmu_free_range(os, lr->lr_foid, lr->lr_offset, + lr->lr_length, tx) == 0); + + (void) ztest_log_truncate(zd, tx, lr); + + dmu_tx_commit(tx); + + ztest_range_unlock(rl); + ztest_object_unlock(zd, lr->lr_foid); + + return (0); +} + +static int +ztest_replay_setattr(ztest_ds_t *zd, lr_setattr_t *lr, boolean_t byteswap) +{ + objset_t *os = zd->zd_os; + dmu_tx_t *tx; + dmu_buf_t *db; + ztest_block_tag_t *bbt; + uint64_t txg, lrtxg, crtxg; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + ztest_object_lock(zd, lr->lr_foid, RL_WRITER); + + VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); + + tx = dmu_tx_create(os); + dmu_tx_hold_bonus(tx, lr->lr_foid); + + txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); + if (txg == 0) { + dmu_buf_rele(db, FTAG); + ztest_object_unlock(zd, lr->lr_foid); + return (ENOSPC); + } + + bbt = ztest_bt_bonus(db); + ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); + crtxg = bbt->bt_crtxg; + lrtxg = lr->lr_common.lrc_txg; + + if (zd->zd_zilog->zl_replay) { + ASSERT(lr->lr_size != 0); + ASSERT(lr->lr_mode != 0); + ASSERT(lrtxg != 0); + } else { + /* + * Randomly change the size and increment the generation. + */ + lr->lr_size = (ztest_random(db->db_size / sizeof (*bbt)) + 1) * + sizeof (*bbt); + lr->lr_mode = bbt->bt_gen + 1; + ASSERT(lrtxg == 0); + } + + /* + * Verify that the current bonus buffer is not newer than our txg. 
+ */ + ztest_bt_verify(bbt, os, lr->lr_foid, -1ULL, lr->lr_mode, + MAX(txg, lrtxg), crtxg); + + dmu_buf_will_dirty(db, tx); + + ASSERT3U(lr->lr_size, >=, sizeof (*bbt)); + ASSERT3U(lr->lr_size, <=, db->db_size); + VERIFY3U(dmu_set_bonus(db, lr->lr_size, tx), ==, 0); + bbt = ztest_bt_bonus(db); + + ztest_bt_generate(bbt, os, lr->lr_foid, -1ULL, lr->lr_mode, txg, crtxg); + + dmu_buf_rele(db, FTAG); + + (void) ztest_log_setattr(zd, tx, lr); + + dmu_tx_commit(tx); + + ztest_object_unlock(zd, lr->lr_foid); + + return (0); +} + +zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = { + NULL, /* 0 no such transaction type */ + ztest_replay_create, /* TX_CREATE */ + NULL, /* TX_MKDIR */ + NULL, /* TX_MKXATTR */ + NULL, /* TX_SYMLINK */ + ztest_replay_remove, /* TX_REMOVE */ + NULL, /* TX_RMDIR */ + NULL, /* TX_LINK */ + NULL, /* TX_RENAME */ + ztest_replay_write, /* TX_WRITE */ + ztest_replay_truncate, /* TX_TRUNCATE */ + ztest_replay_setattr, /* TX_SETATTR */ + NULL, /* TX_ACL */ + NULL, /* TX_CREATE_ACL */ + NULL, /* TX_CREATE_ATTR */ + NULL, /* TX_CREATE_ACL_ATTR */ + NULL, /* TX_MKDIR_ACL */ + NULL, /* TX_MKDIR_ATTR */ + NULL, /* TX_MKDIR_ACL_ATTR */ + NULL, /* TX_WRITE2 */ +}; + +/* + * ZIL get_data callbacks + */ + +static void +ztest_get_done(zgd_t *zgd, int error) +{ + ztest_ds_t *zd = zgd->zgd_private; + uint64_t object = zgd->zgd_rl->rl_object; + + if (zgd->zgd_db) + dmu_buf_rele(zgd->zgd_db, zgd); + + ztest_range_unlock(zgd->zgd_rl); + ztest_object_unlock(zd, object); + + if (error == 0 && zgd->zgd_bp) + zil_add_block(zgd->zgd_zilog, zgd->zgd_bp); + + umem_free(zgd, sizeof (*zgd)); +} + +static int +ztest_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) +{ + ztest_ds_t *zd = arg; + objset_t *os = zd->zd_os; + uint64_t object = lr->lr_foid; + uint64_t offset = lr->lr_offset; + uint64_t size = lr->lr_length; + blkptr_t *bp = &lr->lr_blkptr; + uint64_t txg = lr->lr_common.lrc_txg; + uint64_t crtxg; + dmu_object_info_t doi; + dmu_buf_t *db; + zgd_t *zgd; + int error; + + ztest_object_lock(zd, object, RL_READER); + error = dmu_bonus_hold(os, object, FTAG, &db); + if (error) { + ztest_object_unlock(zd, object); + return (error); + } + + crtxg = ztest_bt_bonus(db)->bt_crtxg; + + if (crtxg == 0 || crtxg > txg) { + dmu_buf_rele(db, FTAG); + ztest_object_unlock(zd, object); + return (ENOENT); + } + + dmu_object_info_from_db(db, &doi); + dmu_buf_rele(db, FTAG); + db = NULL; + + zgd = umem_zalloc(sizeof (*zgd), UMEM_NOFAIL); + zgd->zgd_zilog = zd->zd_zilog; + zgd->zgd_private = zd; + + if (buf != NULL) { /* immediate write */ + zgd->zgd_rl = ztest_range_lock(zd, object, offset, size, + RL_READER); + + error = dmu_read(os, object, offset, size, buf, + DMU_READ_NO_PREFETCH); + ASSERT(error == 0); + } else { + size = doi.doi_data_block_size; + if (ISP2(size)) { + offset = P2ALIGN(offset, size); + } else { + ASSERT(offset < size); + offset = 0; + } + + zgd->zgd_rl = ztest_range_lock(zd, object, offset, size, + RL_READER); + + error = dmu_buf_hold(os, object, offset, zgd, &db); + + if (error == 0) { + zgd->zgd_db = db; + zgd->zgd_bp = bp; + + ASSERT(db->db_offset == offset); + ASSERT(db->db_size == size); + + error = dmu_sync(zio, lr->lr_common.lrc_txg, + ztest_get_done, zgd); + + if (error == 0) + return (0); + } + } + + ztest_get_done(zgd, error); + + return (error); +} + +static void * +ztest_lr_alloc(size_t lrsize, char *name) +{ + char *lr; + size_t namesize = name ? 
strlen(name) + 1 : 0; + + lr = umem_zalloc(lrsize + namesize, UMEM_NOFAIL); + + if (name) + bcopy(name, lr + lrsize, namesize); + + return (lr); +} + +void +ztest_lr_free(void *lr, size_t lrsize, char *name) +{ + size_t namesize = name ? strlen(name) + 1 : 0; + + umem_free(lr, lrsize + namesize); +} + +/* + * Lookup a bunch of objects. Returns the number of objects not found. + */ +static int +ztest_lookup(ztest_ds_t *zd, ztest_od_t *od, int count) +{ + int missing = 0; + int error; + + ASSERT(_mutex_held(&zd->zd_dirobj_lock)); + + for (int i = 0; i < count; i++, od++) { + od->od_object = 0; + error = zap_lookup(zd->zd_os, od->od_dir, od->od_name, + sizeof (uint64_t), 1, &od->od_object); + if (error) { + ASSERT(error == ENOENT); + ASSERT(od->od_object == 0); + missing++; + } else { + dmu_buf_t *db; + ztest_block_tag_t *bbt; + dmu_object_info_t doi; + + ASSERT(od->od_object != 0); + ASSERT(missing == 0); /* there should be no gaps */ + + ztest_object_lock(zd, od->od_object, RL_READER); + VERIFY3U(0, ==, dmu_bonus_hold(zd->zd_os, + od->od_object, FTAG, &db)); + dmu_object_info_from_db(db, &doi); + bbt = ztest_bt_bonus(db); + ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); + od->od_type = doi.doi_type; + od->od_blocksize = doi.doi_data_block_size; + od->od_gen = bbt->bt_gen; + dmu_buf_rele(db, FTAG); + ztest_object_unlock(zd, od->od_object); + } + } + + return (missing); +} + +static int +ztest_create(ztest_ds_t *zd, ztest_od_t *od, int count) +{ + int missing = 0; + + ASSERT(_mutex_held(&zd->zd_dirobj_lock)); + + for (int i = 0; i < count; i++, od++) { + if (missing) { + od->od_object = 0; + missing++; + continue; + } + + lr_create_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name); + + lr->lr_doid = od->od_dir; + lr->lr_foid = 0; /* 0 to allocate, > 0 to claim */ + lr->lrz_type = od->od_crtype; + lr->lrz_blocksize = od->od_crblocksize; + lr->lrz_ibshift = ztest_random_ibshift(); + lr->lrz_bonustype = DMU_OT_UINT64_OTHER; + lr->lrz_bonuslen = dmu_bonus_max(); + lr->lr_gen = od->od_crgen; + lr->lr_crtime[0] = time(NULL); + + if (ztest_replay_create(zd, lr, B_FALSE) != 0) { + ASSERT(missing == 0); + od->od_object = 0; + missing++; + } else { + od->od_object = lr->lr_foid; + od->od_type = od->od_crtype; + od->od_blocksize = od->od_crblocksize; + od->od_gen = od->od_crgen; + ASSERT(od->od_object != 0); + } + + ztest_lr_free(lr, sizeof (*lr), od->od_name); + } + + return (missing); +} + +static int +ztest_remove(ztest_ds_t *zd, ztest_od_t *od, int count) +{ + int missing = 0; + int error; + + ASSERT(_mutex_held(&zd->zd_dirobj_lock)); + + od += count - 1; + + for (int i = count - 1; i >= 0; i--, od--) { + if (missing) { + missing++; + continue; + } + + if (od->od_object == 0) + continue; + + lr_remove_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name); + + lr->lr_doid = od->od_dir; + + if ((error = ztest_replay_remove(zd, lr, B_FALSE)) != 0) { + ASSERT3U(error, ==, ENOSPC); + missing++; + } else { + od->od_object = 0; + } + ztest_lr_free(lr, sizeof (*lr), od->od_name); + } + + return (missing); +} + +static int +ztest_write(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size, + void *data) +{ + lr_write_t *lr; + int error; + + lr = ztest_lr_alloc(sizeof (*lr) + size, NULL); + + lr->lr_foid = object; + lr->lr_offset = offset; + lr->lr_length = size; + lr->lr_blkoff = 0; + BP_ZERO(&lr->lr_blkptr); + + bcopy(data, lr + 1, size); + + error = ztest_replay_write(zd, lr, B_FALSE); + + ztest_lr_free(lr, sizeof (*lr) + size, NULL); + + return (error); +} + +static int +ztest_truncate(ztest_ds_t *zd, 
uint64_t object, uint64_t offset, uint64_t size) +{ + lr_truncate_t *lr; + int error; + + lr = ztest_lr_alloc(sizeof (*lr), NULL); + + lr->lr_foid = object; + lr->lr_offset = offset; + lr->lr_length = size; + + error = ztest_replay_truncate(zd, lr, B_FALSE); + + ztest_lr_free(lr, sizeof (*lr), NULL); + + return (error); +} + +static int +ztest_setattr(ztest_ds_t *zd, uint64_t object) +{ + lr_setattr_t *lr; + int error; + + lr = ztest_lr_alloc(sizeof (*lr), NULL); + + lr->lr_foid = object; + lr->lr_size = 0; + lr->lr_mode = 0; + + error = ztest_replay_setattr(zd, lr, B_FALSE); + + ztest_lr_free(lr, sizeof (*lr), NULL); + + return (error); +} + +static void +ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) +{ + objset_t *os = zd->zd_os; + dmu_tx_t *tx; + uint64_t txg; + rl_t *rl; + + txg_wait_synced(dmu_objset_pool(os), 0); + + ztest_object_lock(zd, object, RL_READER); + rl = ztest_range_lock(zd, object, offset, size, RL_WRITER); + + tx = dmu_tx_create(os); + + dmu_tx_hold_write(tx, object, offset, size); + + txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); + + if (txg != 0) { + dmu_prealloc(os, object, offset, size, tx); + dmu_tx_commit(tx); + txg_wait_synced(dmu_objset_pool(os), txg); + } else { + (void) dmu_free_long_range(os, object, offset, size); + } + + ztest_range_unlock(rl); + ztest_object_unlock(zd, object); +} + +static void +ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset) +{ + ztest_block_tag_t wbt; + dmu_object_info_t doi; + enum ztest_io_type io_type; + uint64_t blocksize; + void *data; + + VERIFY(dmu_object_info(zd->zd_os, object, &doi) == 0); + blocksize = doi.doi_data_block_size; + data = umem_alloc(blocksize, UMEM_NOFAIL); + + /* + * Pick an i/o type at random, biased toward writing block tags. + */ + io_type = ztest_random(ZTEST_IO_TYPES); + if (ztest_random(2) == 0) + io_type = ZTEST_IO_WRITE_TAG; + + switch (io_type) { + + case ZTEST_IO_WRITE_TAG: + ztest_bt_generate(&wbt, zd->zd_os, object, offset, 0, 0, 0); + (void) ztest_write(zd, object, offset, sizeof (wbt), &wbt); + break; + + case ZTEST_IO_WRITE_PATTERN: + (void) memset(data, 'a' + (object + offset) % 5, blocksize); + if (ztest_random(2) == 0) { + /* + * Induce fletcher2 collisions to ensure that + * zio_ddt_collision() detects and resolves them + * when using fletcher2-verify for deduplication. + */ + ((uint64_t *)data)[0] ^= 1ULL << 63; + ((uint64_t *)data)[4] ^= 1ULL << 63; + } + (void) ztest_write(zd, object, offset, blocksize, data); + break; + + case ZTEST_IO_WRITE_ZEROES: + bzero(data, blocksize); + (void) ztest_write(zd, object, offset, blocksize, data); + break; + + case ZTEST_IO_TRUNCATE: + (void) ztest_truncate(zd, object, offset, blocksize); + break; + + case ZTEST_IO_SETATTR: + (void) ztest_setattr(zd, object); + break; + } + + umem_free(data, blocksize); +} + +/* + * Initialize an object description template. + */ +static void +ztest_od_init(ztest_od_t *od, uint64_t id, char *tag, uint64_t index, + dmu_object_type_t type, uint64_t blocksize, uint64_t gen) +{ + od->od_dir = ZTEST_DIROBJ; + od->od_object = 0; + + od->od_crtype = type; + od->od_crblocksize = blocksize ? blocksize : ztest_random_blocksize(); + od->od_crgen = gen; + + od->od_type = DMU_OT_NONE; + od->od_blocksize = 0; + od->od_gen = 0; + + (void) snprintf(od->od_name, sizeof (od->od_name), "%s(%lld)[%llu]", + tag, (int64_t)id, index); +} + +/* + * Lookup or create the objects for a test using the od template. 
+ * If the objects do not all exist, or if 'remove' is specified, + * remove any existing objects and create new ones. Otherwise, + * use the existing objects. + */ +static int +ztest_object_init(ztest_ds_t *zd, ztest_od_t *od, size_t size, boolean_t remove) +{ + int count = size / sizeof (*od); + int rv = 0; + + VERIFY(mutex_lock(&zd->zd_dirobj_lock) == 0); + if ((ztest_lookup(zd, od, count) != 0 || remove) && + (ztest_remove(zd, od, count) != 0 || + ztest_create(zd, od, count) != 0)) + rv = -1; + zd->zd_od = od; + VERIFY(mutex_unlock(&zd->zd_dirobj_lock) == 0); + + return (rv); +} + +/* ARGSUSED */ +void +ztest_zil_commit(ztest_ds_t *zd, uint64_t id) +{ + zilog_t *zilog = zd->zd_zilog; + + zil_commit(zilog, UINT64_MAX, ztest_random(ZTEST_OBJECTS)); + + /* + * Remember the committed values in zd, which is in parent/child + * shared memory. If we die, the next iteration of ztest_run() + * will verify that the log really does contain this record. + */ + mutex_enter(&zilog->zl_lock); + ASSERT(zd->zd_seq <= zilog->zl_commit_lr_seq); + zd->zd_seq = zilog->zl_commit_lr_seq; + mutex_exit(&zilog->zl_lock); +} + +/* + * Verify that we can't destroy an active pool, create an existing pool, + * or create a pool with a bad vdev spec. + */ +/* ARGSUSED */ +void +ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id) +{ + ztest_shared_t *zs = ztest_shared; + spa_t *spa; + nvlist_t *nvroot; + + /* + * Attempt to create using a bad file. + */ + nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 0, 1); + VERIFY3U(ENOENT, ==, + spa_create("ztest_bad_file", nvroot, NULL, NULL, NULL)); + nvlist_free(nvroot); + + /* + * Attempt to create using a bad mirror. + */ + nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 2, 1); + VERIFY3U(ENOENT, ==, + spa_create("ztest_bad_mirror", nvroot, NULL, NULL, NULL)); + nvlist_free(nvroot); + + /* + * Attempt to create an existing pool. It shouldn't matter + * what's in the nvroot; we should fail with EEXIST. + */ + (void) rw_rdlock(&zs->zs_name_lock); + nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 0, 1); + VERIFY3U(EEXIST, ==, spa_create(zs->zs_pool, nvroot, NULL, NULL, NULL)); + nvlist_free(nvroot); + VERIFY3U(0, ==, spa_open(zs->zs_pool, &spa, FTAG)); + VERIFY3U(EBUSY, ==, spa_destroy(zs->zs_pool)); + spa_close(spa, FTAG); + + (void) rw_unlock(&zs->zs_name_lock); +} + +static vdev_t * +vdev_lookup_by_path(vdev_t *vd, const char *path) +{ + vdev_t *mvd; + + if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0) + return (vd); + + for (int c = 0; c < vd->vdev_children; c++) + if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) != + NULL) + return (mvd); + + return (NULL); +} + +/* + * Find the first available hole which can be used as a top-level. + */ +int +find_vdev_hole(spa_t *spa) +{ + vdev_t *rvd = spa->spa_root_vdev; + int c; + + ASSERT(spa_config_held(spa, SCL_VDEV, RW_READER) == SCL_VDEV); + + for (c = 0; c < rvd->vdev_children; c++) { + vdev_t *cvd = rvd->vdev_child[c]; + + if (cvd->vdev_ishole) + break; + } + return (c); +} + +/* + * Verify that vdev_add() works as expected. 
+ */ +/* ARGSUSED */ +void +ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) +{ + ztest_shared_t *zs = ztest_shared; + spa_t *spa = zs->zs_spa; + uint64_t leaves; + uint64_t guid; + nvlist_t *nvroot; + int error; + + VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0); + leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * zopt_raidz; + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + + ztest_shared->zs_vdev_next_leaf = find_vdev_hole(spa) * leaves; + + /* + * If we have slogs then remove them 1/4 of the time. + */ + if (spa_has_slogs(spa) && ztest_random(4) == 0) { + /* + * Grab the guid from the head of the log class rotor. + */ + guid = spa_log_class(spa)->mc_rotor->mg_vd->vdev_guid; + + spa_config_exit(spa, SCL_VDEV, FTAG); + + /* + * We have to grab the zs_name_lock as writer to + * prevent a race between removing a slog (dmu_objset_find) + * and destroying a dataset. Removing the slog will + * grab a reference on the dataset which may cause + * dmu_objset_destroy() to fail with EBUSY thus + * leaving the dataset in an inconsistent state. + */ + VERIFY(rw_wrlock(&ztest_shared->zs_name_lock) == 0); + error = spa_vdev_remove(spa, guid, B_FALSE); + VERIFY(rw_unlock(&ztest_shared->zs_name_lock) == 0); + + if (error && error != EEXIST) + fatal(0, "spa_vdev_remove() = %d", error); + } else { + spa_config_exit(spa, SCL_VDEV, FTAG); + + /* + * Make 1/4 of the devices be log devices. + */ + nvroot = make_vdev_root(NULL, NULL, zopt_vdev_size, 0, + ztest_random(4) == 0, zopt_raidz, zs->zs_mirrors, 1); + + error = spa_vdev_add(spa, nvroot); + nvlist_free(nvroot); + + if (error == ENOSPC) + ztest_record_enospc("spa_vdev_add"); + else if (error != 0) + fatal(0, "spa_vdev_add() = %d", error); + } + + VERIFY(mutex_unlock(&ztest_shared->zs_vdev_lock) == 0); +} + +/* + * Verify that adding/removing aux devices (l2arc, hot spare) works as expected. + */ +/* ARGSUSED */ +void +ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id) +{ + ztest_shared_t *zs = ztest_shared; + spa_t *spa = zs->zs_spa; + vdev_t *rvd = spa->spa_root_vdev; + spa_aux_vdev_t *sav; + char *aux; + uint64_t guid = 0; + int error; + + if (ztest_random(2) == 0) { + sav = &spa->spa_spares; + aux = ZPOOL_CONFIG_SPARES; + } else { + sav = &spa->spa_l2cache; + aux = ZPOOL_CONFIG_L2CACHE; + } + + VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0); + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + + if (sav->sav_count != 0 && ztest_random(4) == 0) { + /* + * Pick a random device to remove. + */ + guid = sav->sav_vdevs[ztest_random(sav->sav_count)]->vdev_guid; + } else { + /* + * Find an unused device we can add. + */ + zs->zs_vdev_aux = 0; + for (;;) { + char path[MAXPATHLEN]; + int c; (void) sprintf(path, ztest_aux_template, zopt_dir, - zopt_pool, aux, ztest_shared->zs_vdev_aux); + zopt_pool, aux, zs->zs_vdev_aux); for (c = 0; c < sav->sav_count; c++) if (strcmp(sav->sav_vdevs[c]->vdev_path, path) == 0) @@ -930,7 +2244,7 @@ ztest_vdev_aux_add_remove(ztest_args_t *za) if (c == sav->sav_count && vdev_lookup_by_path(rvd, path) == NULL) break; - ztest_shared->zs_vdev_aux++; + zs->zs_vdev_aux++; } } @@ -953,31 +2267,126 @@ ztest_vdev_aux_add_remove(ztest_args_t *za) * of devices that have pending state changes. 
*/ if (ztest_random(2) == 0) - (void) vdev_online(spa, guid, B_FALSE, NULL); + (void) vdev_online(spa, guid, 0, NULL); error = spa_vdev_remove(spa, guid, B_FALSE); if (error != 0 && error != EBUSY) fatal(0, "spa_vdev_remove(%llu) = %d", guid, error); } - (void) mutex_unlock(&ztest_shared->zs_vdev_lock); + VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); +} + +/* + * split a pool if it has mirror tlvdevs + */ +/* ARGSUSED */ +void +ztest_split_pool(ztest_ds_t *zd, uint64_t id) +{ + ztest_shared_t *zs = ztest_shared; + spa_t *spa = zs->zs_spa; + vdev_t *rvd = spa->spa_root_vdev; + nvlist_t *tree, **child, *config, *split, **schild; + uint_t c, children, schildren = 0, lastlogid = 0; + int error = 0; + + VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0); + + /* ensure we have a useable config; mirrors of raidz aren't supported */ + if (zs->zs_mirrors < 3 || zopt_raidz > 1) { + VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); + return; + } + + /* clean up the old pool, if any */ + (void) spa_destroy("splitp"); + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + + /* generate a config from the existing config */ + VERIFY(nvlist_lookup_nvlist(spa->spa_config, ZPOOL_CONFIG_VDEV_TREE, + &tree) == 0); + VERIFY(nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &child, + &children) == 0); + + schild = malloc(rvd->vdev_children * sizeof (nvlist_t *)); + for (c = 0; c < children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + nvlist_t **mchild; + uint_t mchildren; + + if (tvd->vdev_islog || tvd->vdev_ops == &vdev_hole_ops) { + VERIFY(nvlist_alloc(&schild[schildren], NV_UNIQUE_NAME, + 0) == 0); + VERIFY(nvlist_add_string(schild[schildren], + ZPOOL_CONFIG_TYPE, VDEV_TYPE_HOLE) == 0); + VERIFY(nvlist_add_uint64(schild[schildren], + ZPOOL_CONFIG_IS_HOLE, 1) == 0); + if (lastlogid == 0) + lastlogid = schildren; + ++schildren; + continue; + } + lastlogid = 0; + VERIFY(nvlist_lookup_nvlist_array(child[c], + ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren) == 0); + VERIFY(nvlist_dup(mchild[0], &schild[schildren++], 0) == 0); + } + + /* OK, create a config that can be used to split */ + VERIFY(nvlist_alloc(&split, NV_UNIQUE_NAME, 0) == 0); + VERIFY(nvlist_add_string(split, ZPOOL_CONFIG_TYPE, + VDEV_TYPE_ROOT) == 0); + VERIFY(nvlist_add_nvlist_array(split, ZPOOL_CONFIG_CHILDREN, schild, + lastlogid != 0 ? lastlogid : schildren) == 0); + + VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, 0) == 0); + VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, split) == 0); + + for (c = 0; c < schildren; c++) + nvlist_free(schild[c]); + free(schild); + nvlist_free(split); + + spa_config_exit(spa, SCL_VDEV, FTAG); + + (void) rw_wrlock(&zs->zs_name_lock); + error = spa_vdev_split_mirror(spa, "splitp", config, NULL, B_FALSE); + (void) rw_unlock(&zs->zs_name_lock); + + nvlist_free(config); + + if (error == 0) { + (void) printf("successful split - results:\n"); + mutex_enter(&spa_namespace_lock); + show_pool_stats(spa); + show_pool_stats(spa_lookup("splitp")); + mutex_exit(&spa_namespace_lock); + ++zs->zs_splits; + --zs->zs_mirrors; + } + VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); + } /* * Verify that we can attach and detach devices. 
*/ +/* ARGSUSED */ void -ztest_vdev_attach_detach(ztest_args_t *za) +ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) { - spa_t *spa = za->za_spa; + ztest_shared_t *zs = ztest_shared; + spa_t *spa = zs->zs_spa; spa_aux_vdev_t *sav = &spa->spa_spares; vdev_t *rvd = spa->spa_root_vdev; vdev_t *oldvd, *newvd, *pvd; nvlist_t *root; - uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz; + uint64_t leaves; uint64_t leaf, top; uint64_t ashift = ztest_get_ashift(); - uint64_t oldguid; + uint64_t oldguid, pguid; size_t oldsize, newsize; char oldpath[MAXPATHLEN], newpath[MAXPATHLEN]; int replacing; @@ -986,7 +2395,8 @@ ztest_vdev_attach_detach(ztest_args_t *za) int oldvd_is_log; int error, expected_error; - (void) mutex_lock(&ztest_shared->zs_vdev_lock); + VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0); + leaves = MAX(zs->zs_mirrors, 1) * zopt_raidz; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); @@ -998,7 +2408,7 @@ ztest_vdev_attach_detach(ztest_args_t *za) /* * Pick a random top-level vdev. */ - top = ztest_random(rvd->vdev_children); + top = ztest_random_vdev_top(spa, B_TRUE); /* * Pick a random leaf within it. @@ -1009,10 +2419,16 @@ ztest_vdev_attach_detach(ztest_args_t *za) * Locate this vdev. */ oldvd = rvd->vdev_child[top]; - if (zopt_mirrors >= 1) + if (zs->zs_mirrors >= 1) { + ASSERT(oldvd->vdev_ops == &vdev_mirror_ops); + ASSERT(oldvd->vdev_children >= zs->zs_mirrors); oldvd = oldvd->vdev_child[leaf / zopt_raidz]; - if (zopt_raidz > 1) + } + if (zopt_raidz > 1) { + ASSERT(oldvd->vdev_ops == &vdev_raidz_ops); + ASSERT(oldvd->vdev_children == zopt_raidz); oldvd = oldvd->vdev_child[leaf % zopt_raidz]; + } /* * If we're already doing an attach or replace, oldvd may be a @@ -1020,26 +2436,27 @@ ztest_vdev_attach_detach(ztest_args_t *za) */ while (oldvd->vdev_children != 0) { oldvd_has_siblings = B_TRUE; - ASSERT(oldvd->vdev_children == 2); - oldvd = oldvd->vdev_child[ztest_random(2)]; + ASSERT(oldvd->vdev_children >= 2); + oldvd = oldvd->vdev_child[ztest_random(oldvd->vdev_children)]; } oldguid = oldvd->vdev_guid; - oldsize = vdev_get_rsize(oldvd); + oldsize = vdev_get_min_asize(oldvd); oldvd_is_log = oldvd->vdev_top->vdev_islog; (void) strcpy(oldpath, oldvd->vdev_path); pvd = oldvd->vdev_parent; + pguid = pvd->vdev_guid; /* * If oldvd has siblings, then half of the time, detach it. */ if (oldvd_has_siblings && ztest_random(2) == 0) { spa_config_exit(spa, SCL_VDEV, FTAG); - error = spa_vdev_detach(spa, oldguid, B_FALSE); - if (error != 0 && error != ENODEV && error != EBUSY) - fatal(0, "detach (%s) returned %d", - oldpath, error); - (void) mutex_unlock(&ztest_shared->zs_vdev_lock); + error = spa_vdev_detach(spa, oldguid, pguid, B_FALSE); + if (error != 0 && error != ENODEV && error != EBUSY && + error != ENOTSUP) + fatal(0, "detach (%s) returned %d", oldpath, error); + VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); return; } @@ -1060,7 +2477,7 @@ ztest_vdev_attach_detach(ztest_args_t *za) } if (newvd) { - newsize = vdev_get_rsize(newvd); + newsize = vdev_get_min_asize(newvd); } else { /* * Make newsize a little bigger or smaller than oldsize. @@ -1132,7 +2549,117 @@ ztest_vdev_attach_detach(ztest_args_t *za) (longlong_t)newsize, replacing, error, expected_error); } - (void) mutex_unlock(&ztest_shared->zs_vdev_lock); + VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); +} + +/* + * Callback function which expands the physical size of the vdev. 
+ */ +vdev_t * +grow_vdev(vdev_t *vd, void *arg) +{ + spa_t *spa = vd->vdev_spa; + size_t *newsize = arg; + size_t fsize; + int fd; + + ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE); + ASSERT(vd->vdev_ops->vdev_op_leaf); + + if ((fd = open(vd->vdev_path, O_RDWR)) == -1) + return (vd); + + fsize = lseek(fd, 0, SEEK_END); + (void) ftruncate(fd, *newsize); + + if (zopt_verbose >= 6) { + (void) printf("%s grew from %lu to %lu bytes\n", + vd->vdev_path, (ulong_t)fsize, (ulong_t)*newsize); + } + (void) close(fd); + return (NULL); +} + +/* + * Callback function which expands a given vdev by calling vdev_online(). + */ +/* ARGSUSED */ +vdev_t * +online_vdev(vdev_t *vd, void *arg) +{ + spa_t *spa = vd->vdev_spa; + vdev_t *tvd = vd->vdev_top; + uint64_t guid = vd->vdev_guid; + uint64_t generation = spa->spa_config_generation + 1; + vdev_state_t newstate = VDEV_STATE_UNKNOWN; + int error; + + ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE); + ASSERT(vd->vdev_ops->vdev_op_leaf); + + /* Calling vdev_online will initialize the new metaslabs */ + spa_config_exit(spa, SCL_STATE, spa); + error = vdev_online(spa, guid, ZFS_ONLINE_EXPAND, &newstate); + spa_config_enter(spa, SCL_STATE, spa, RW_READER); + + /* + * If vdev_online returned an error or the underlying vdev_open + * failed then we abort the expand. The only way to know that + * vdev_open fails is by checking the returned newstate. + */ + if (error || newstate != VDEV_STATE_HEALTHY) { + if (zopt_verbose >= 5) { + (void) printf("Unable to expand vdev, state %llu, " + "error %d\n", (u_longlong_t)newstate, error); + } + return (vd); + } + ASSERT3U(newstate, ==, VDEV_STATE_HEALTHY); + + /* + * Since we dropped the lock we need to ensure that we're + * still talking to the original vdev. It's possible this + * vdev may have been detached/replaced while we were + * trying to online it. + */ + if (generation != spa->spa_config_generation) { + if (zopt_verbose >= 5) { + (void) printf("vdev configuration has changed, " + "guid %llu, state %llu, expected gen %llu, " + "got gen %llu\n", + (u_longlong_t)guid, + (u_longlong_t)tvd->vdev_state, + (u_longlong_t)generation, + (u_longlong_t)spa->spa_config_generation); + } + return (vd); + } + return (NULL); +} + +/* + * Traverse the vdev tree calling the supplied function. + * We continue to walk the tree until we either have walked all + * children or we receive a non-NULL return from the callback. + * If a NULL callback is passed, then we just return back the first + * leaf vdev we encounter. 
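/*
 * Sketch of the grow-by-truncation idea used by grow_vdev() above, reduced
 * to plain POSIX calls on an ordinary file; grow_file() is a hypothetical
 * helper, not a ztest function.  It returns -1 if the file cannot be opened,
 * echoing grow_vdev()'s "non-NULL means stop the walk" convention.
 */
#include <sys/types.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static int
grow_file(const char *path, off_t newsize)
{
	off_t oldsize;
	int fd;

	if ((fd = open(path, O_RDWR)) == -1)
		return (-1);

	oldsize = lseek(fd, 0, SEEK_END);	/* current physical size */
	(void) ftruncate(fd, newsize);		/* extend the backing file */

	(void) printf("%s grew from %lld to %lld bytes\n",
	    path, (long long)oldsize, (long long)newsize);
	(void) close(fd);
	return (0);
}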
+ */ +vdev_t * +vdev_walk_tree(vdev_t *vd, vdev_t *(*func)(vdev_t *, void *), void *arg) +{ + if (vd->vdev_ops->vdev_op_leaf) { + if (func == NULL) + return (vd); + else + return (func(vd, arg)); + } + + for (uint_t c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + if ((cvd = vdev_walk_tree(cvd, func, arg)) != NULL) + return (cvd); + } + return (NULL); } /* @@ -1140,164 +2667,240 @@ ztest_vdev_attach_detach(ztest_args_t *za) */ /* ARGSUSED */ void -ztest_vdev_LUN_growth(ztest_args_t *za) +ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id) { - spa_t *spa = za->za_spa; - char dev_name[MAXPATHLEN]; - uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz; - uint64_t vdev; - size_t fsize; - int fd; + ztest_shared_t *zs = ztest_shared; + spa_t *spa = zs->zs_spa; + vdev_t *vd, *tvd; + metaslab_class_t *mc; + metaslab_group_t *mg; + size_t psize, newsize; + uint64_t top; + uint64_t old_class_space, new_class_space, old_ms_count, new_ms_count; + + VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0); + spa_config_enter(spa, SCL_STATE, spa, RW_READER); + + top = ztest_random_vdev_top(spa, B_TRUE); - (void) mutex_lock(&ztest_shared->zs_vdev_lock); + tvd = spa->spa_root_vdev->vdev_child[top]; + mg = tvd->vdev_mg; + mc = mg->mg_class; + old_ms_count = tvd->vdev_ms_count; + old_class_space = metaslab_class_get_space(mc); /* - * Pick a random leaf vdev. + * Determine the size of the first leaf vdev associated with + * our top-level device. */ - spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); - vdev = ztest_random(spa->spa_root_vdev->vdev_children * leaves); - spa_config_exit(spa, SCL_VDEV, FTAG); + vd = vdev_walk_tree(tvd, NULL, NULL); + ASSERT3P(vd, !=, NULL); + ASSERT(vd->vdev_ops->vdev_op_leaf); - (void) sprintf(dev_name, ztest_dev_template, zopt_dir, zopt_pool, vdev); + psize = vd->vdev_psize; - if ((fd = open(dev_name, O_RDWR)) != -1) { - /* - * Determine the size. - */ - fsize = lseek(fd, 0, SEEK_END); + /* + * We only try to expand the vdev if it's healthy, less than 4x its + * original size, and it has a valid psize. + */ + if (tvd->vdev_state != VDEV_STATE_HEALTHY || + psize == 0 || psize >= 4 * zopt_vdev_size) { + spa_config_exit(spa, SCL_STATE, spa); + VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); + return; + } + ASSERT(psize > 0); + newsize = psize + psize / 8; + ASSERT3U(newsize, >, psize); - /* - * If it's less than 2x the original size, grow by around 3%. - */ - if (fsize < 2 * zopt_vdev_size) { - size_t newsize = fsize + ztest_random(fsize / 32); - (void) ftruncate(fd, newsize); - if (zopt_verbose >= 6) { - (void) printf("%s grew from %lu to %lu bytes\n", - dev_name, (ulong_t)fsize, (ulong_t)newsize); - } + if (zopt_verbose >= 6) { + (void) printf("Expanding LUN %s from %lu to %lu\n", + vd->vdev_path, (ulong_t)psize, (ulong_t)newsize); + } + + /* + * Growing the vdev is a two step process: + * 1). expand the physical size (i.e. relabel) + * 2). 
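/*
 * Sketch of the traversal contract implemented by vdev_walk_tree() above,
 * restated over a generic n-ary tree (struct node and walk_cb_t are
 * hypothetical).  A NULL callback returns the first leaf encountered; a
 * callback that returns non-NULL stops the walk and propagates that value.
 */
#include <stddef.h>

struct node {
	struct node	**n_child;
	unsigned	n_children;	/* 0 means this node is a leaf */
};

typedef struct node *(*walk_cb_t)(struct node *, void *);

static struct node *
walk_tree(struct node *n, walk_cb_t func, void *arg)
{
	if (n->n_children == 0)
		return (func == NULL ? n : func(n, arg));

	for (unsigned c = 0; c < n->n_children; c++) {
		struct node *hit = walk_tree(n->n_child[c], func, arg);
		if (hit != NULL)
			return (hit);
	}
	return (NULL);
}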
online the vdev to create the new metaslabs + */ + if (vdev_walk_tree(tvd, grow_vdev, &newsize) != NULL || + vdev_walk_tree(tvd, online_vdev, NULL) != NULL || + tvd->vdev_state != VDEV_STATE_HEALTHY) { + if (zopt_verbose >= 5) { + (void) printf("Could not expand LUN because " + "the vdev configuration changed.\n"); } - (void) close(fd); + spa_config_exit(spa, SCL_STATE, spa); + VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); + return; + } + + spa_config_exit(spa, SCL_STATE, spa); + + /* + * Expanding the LUN will update the config asynchronously, + * thus we must wait for the async thread to complete any + * pending tasks before proceeding. + */ + for (;;) { + boolean_t done; + mutex_enter(&spa->spa_async_lock); + done = (spa->spa_async_thread == NULL && !spa->spa_async_tasks); + mutex_exit(&spa->spa_async_lock); + if (done) + break; + txg_wait_synced(spa_get_dsl(spa), 0); + (void) poll(NULL, 0, 100); + } + + spa_config_enter(spa, SCL_STATE, spa, RW_READER); + + tvd = spa->spa_root_vdev->vdev_child[top]; + new_ms_count = tvd->vdev_ms_count; + new_class_space = metaslab_class_get_space(mc); + + if (tvd->vdev_mg != mg || mg->mg_class != mc) { + if (zopt_verbose >= 5) { + (void) printf("Could not verify LUN expansion due to " + "intervening vdev offline or remove.\n"); + } + spa_config_exit(spa, SCL_STATE, spa); + VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); + return; + } + + /* + * Make sure we were able to grow the vdev. + */ + if (new_ms_count <= old_ms_count) + fatal(0, "LUN expansion failed: ms_count %llu <= %llu\n", + old_ms_count, new_ms_count); + + /* + * Make sure we were able to grow the pool. + */ + if (new_class_space <= old_class_space) + fatal(0, "LUN expansion failed: class_space %llu <= %llu\n", + old_class_space, new_class_space); + + if (zopt_verbose >= 5) { + char oldnumbuf[6], newnumbuf[6]; + + nicenum(old_class_space, oldnumbuf); + nicenum(new_class_space, newnumbuf); + (void) printf("%s grew from %s to %s\n", + spa->spa_name, oldnumbuf, newnumbuf); } - (void) mutex_unlock(&ztest_shared->zs_vdev_lock); + spa_config_exit(spa, SCL_STATE, spa); + VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); } +/* + * Verify that dmu_objset_{create,destroy,open,close} work as expected. + */ /* ARGSUSED */ static void -ztest_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) +ztest_objset_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) { /* - * Create the directory object. + * Create the objects common to all ztest datasets. */ - VERIFY(dmu_object_claim(os, ZTEST_DIROBJ, - DMU_OT_UINT64_OTHER, ZTEST_DIROBJ_BLOCKSIZE, - DMU_OT_UINT64_OTHER, 5 * sizeof (ztest_block_tag_t), tx) == 0); - - VERIFY(zap_create_claim(os, ZTEST_MICROZAP_OBJ, - DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx) == 0); - - VERIFY(zap_create_claim(os, ZTEST_FATZAP_OBJ, + VERIFY(zap_create_claim(os, ZTEST_DIROBJ, DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx) == 0); } +/* ARGSUSED */ static int -ztest_destroy_cb(char *name, void *arg) +ztest_objset_destroy_cb(const char *name, void *arg) { - ztest_args_t *za = arg; objset_t *os; - dmu_object_info_t *doi = &za->za_doi; + dmu_object_info_t doi; int error; /* * Verify that the dataset contains a directory object. 
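/*
 * Minimal sketch of the wait loop used above while the pool's async thread
 * drains: poll a completion predicate and sleep roughly 100 ms between
 * checks (poll() with no descriptors serves as a portable sub-second sleep).
 * The done_fn_t type and wait_for() name are hypothetical; ztest also nudges
 * the pool forward with txg_wait_synced() on each pass.
 */
#include <poll.h>

typedef int (*done_fn_t)(void *);

static void
wait_for(done_fn_t done, void *arg)
{
	while (!done(arg))
		(void) poll(NULL, 0, 100);	/* sleep roughly 100 ms */
}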
*/ - error = dmu_objset_open(name, DMU_OST_OTHER, - DS_MODE_USER | DS_MODE_READONLY, &os); - ASSERT3U(error, ==, 0); - error = dmu_object_info(os, ZTEST_DIROBJ, doi); + VERIFY3U(0, ==, dmu_objset_hold(name, FTAG, &os)); + error = dmu_object_info(os, ZTEST_DIROBJ, &doi); if (error != ENOENT) { /* We could have crashed in the middle of destroying it */ ASSERT3U(error, ==, 0); - ASSERT3U(doi->doi_type, ==, DMU_OT_UINT64_OTHER); - ASSERT3S(doi->doi_physical_blks, >=, 0); + ASSERT3U(doi.doi_type, ==, DMU_OT_ZAP_OTHER); + ASSERT3S(doi.doi_physical_blocks_512, >=, 0); } - dmu_objset_close(os); + dmu_objset_rele(os, FTAG); /* * Destroy the dataset. */ - error = dmu_objset_destroy(name); - if (error) { - (void) dmu_objset_open(name, DMU_OST_OTHER, - DS_MODE_USER | DS_MODE_READONLY, &os); - fatal(0, "dmu_objset_destroy(os=%p) = %d\n", &os, error); - } + VERIFY3U(0, ==, dmu_objset_destroy(name, B_FALSE)); return (0); } -/* - * Verify that dmu_objset_{create,destroy,open,close} work as expected. - */ -static uint64_t -ztest_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t object, int mode) +static boolean_t +ztest_snapshot_create(char *osname, uint64_t id) { - itx_t *itx; - lr_create_t *lr; - size_t namesize; - char name[24]; - - (void) sprintf(name, "ZOBJ_%llu", (u_longlong_t)object); - namesize = strlen(name) + 1; - - itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize + - ztest_random(ZIL_MAX_BLKSZ)); - lr = (lr_create_t *)&itx->itx_lr; - bzero(lr + 1, lr->lr_common.lrc_reclen - sizeof (*lr)); - lr->lr_doid = object; - lr->lr_foid = 0; - lr->lr_mode = mode; - lr->lr_uid = 0; - lr->lr_gid = 0; - lr->lr_gen = dmu_tx_get_txg(tx); - lr->lr_crtime[0] = time(NULL); - lr->lr_crtime[1] = 0; - lr->lr_rdev = 0; - bcopy(name, (char *)(lr + 1), namesize); - - return (zil_itx_assign(zilog, itx, tx)); + char snapname[MAXNAMELEN]; + int error; + + (void) snprintf(snapname, MAXNAMELEN, "%s@%llu", osname, + (u_longlong_t)id); + + error = dmu_objset_snapshot(osname, strchr(snapname, '@') + 1, + NULL, B_FALSE); + if (error == ENOSPC) { + ztest_record_enospc(FTAG); + return (B_FALSE); + } + if (error != 0 && error != EEXIST) + fatal(0, "ztest_snapshot_create(%s) = %d", snapname, error); + return (B_TRUE); +} + +static boolean_t +ztest_snapshot_destroy(char *osname, uint64_t id) +{ + char snapname[MAXNAMELEN]; + int error; + + (void) snprintf(snapname, MAXNAMELEN, "%s@%llu", osname, + (u_longlong_t)id); + + error = dmu_objset_destroy(snapname, B_FALSE); + if (error != 0 && error != ENOENT) + fatal(0, "ztest_snapshot_destroy(%s) = %d", snapname, error); + return (B_TRUE); } +/* ARGSUSED */ void -ztest_dmu_objset_create_destroy(ztest_args_t *za) +ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id) { + ztest_shared_t *zs = ztest_shared; + ztest_ds_t zdtmp; + int iters; int error; objset_t *os, *os2; - char name[100]; - int basemode, expected_error; + char name[MAXNAMELEN]; zilog_t *zilog; - uint64_t seq; - uint64_t objects; - ztest_replay_t zr; - (void) rw_rdlock(&ztest_shared->zs_name_lock); - (void) snprintf(name, 100, "%s/%s_temp_%llu", za->za_pool, za->za_pool, - (u_longlong_t)za->za_instance); + (void) rw_rdlock(&zs->zs_name_lock); - basemode = DS_MODE_TYPE(za->za_instance); - if (basemode != DS_MODE_USER && basemode != DS_MODE_OWNER) - basemode = DS_MODE_USER; + (void) snprintf(name, MAXNAMELEN, "%s/temp_%llu", + zs->zs_pool, (u_longlong_t)id); /* * If this dataset exists from a previous run, process its replay log * half of the time. 
If we don't replay it, then dmu_objset_destroy() - * (invoked from ztest_destroy_cb() below) should just throw it away. + * (invoked from ztest_objset_destroy_cb()) should just throw it away. */ if (ztest_random(2) == 0 && - dmu_objset_open(name, DMU_OST_OTHER, DS_MODE_OWNER, &os) == 0) { - zr.zr_os = os; - zil_replay(os, &zr, &zr.zr_assign, ztest_replay_vector, NULL); - dmu_objset_close(os); + dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os) == 0) { + ztest_zd_init(&zdtmp, os); + zil_replay(os, &zdtmp, ztest_replay_vector); + ztest_zd_fini(&zdtmp); + dmu_objset_disown(os, FTAG); } /* @@ -1305,372 +2908,262 @@ ztest_dmu_objset_create_destroy(ztest_args_t *za) * create lying around from a previous run. If so, destroy it * and all of its snapshots. */ - (void) dmu_objset_find(name, ztest_destroy_cb, za, + (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); /* * Verify that the destroyed dataset is no longer in the namespace. */ - error = dmu_objset_open(name, DMU_OST_OTHER, basemode, &os); - if (error != ENOENT) - fatal(1, "dmu_objset_open(%s) found destroyed dataset %p", - name, os); + VERIFY3U(ENOENT, ==, dmu_objset_hold(name, FTAG, &os)); /* * Verify that we can create a new dataset. */ - error = dmu_objset_create(name, DMU_OST_OTHER, NULL, 0, - ztest_create_cb, NULL); + error = dmu_objset_create(name, DMU_OST_OTHER, 0, + ztest_objset_create_cb, NULL); if (error) { if (error == ENOSPC) { - ztest_record_enospc("dmu_objset_create"); - (void) rw_unlock(&ztest_shared->zs_name_lock); + ztest_record_enospc(FTAG); + (void) rw_unlock(&zs->zs_name_lock); return; } fatal(0, "dmu_objset_create(%s) = %d", name, error); } - error = dmu_objset_open(name, DMU_OST_OTHER, basemode, &os); - if (error) { - fatal(0, "dmu_objset_open(%s) = %d", name, error); - } + VERIFY3U(0, ==, + dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os)); + + ztest_zd_init(&zdtmp, os); /* * Open the intent log for it. */ - zilog = zil_open(os, NULL); + zilog = zil_open(os, ztest_get_data); /* - * Put a random number of objects in there. + * Put some objects in there, do a little I/O to them, + * and randomly take a couple of snapshots along the way. */ - objects = ztest_random(20); - seq = 0; - while (objects-- != 0) { - uint64_t object; - dmu_tx_t *tx = dmu_tx_create(os); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, sizeof (name)); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - } else { - object = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0, - DMU_OT_NONE, 0, tx); - ztest_set_random_blocksize(os, object, tx); - seq = ztest_log_create(zilog, tx, object, - DMU_OT_UINT64_OTHER); - dmu_write(os, object, 0, sizeof (name), name, tx); - dmu_tx_commit(tx); - } - if (ztest_random(5) == 0) { - zil_commit(zilog, seq, object); - } - if (ztest_random(100) == 0) { - error = zil_suspend(zilog); - if (error == 0) { - zil_resume(zilog); - } - } + iters = ztest_random(5); + for (int i = 0; i < iters; i++) { + ztest_dmu_object_alloc_free(&zdtmp, id); + if (ztest_random(iters) == 0) + (void) ztest_snapshot_create(name, i); } /* * Verify that we cannot create an existing dataset. */ - error = dmu_objset_create(name, DMU_OST_OTHER, NULL, 0, NULL, NULL); - if (error != EEXIST) - fatal(0, "created existing dataset, error = %d", error); + VERIFY3U(EEXIST, ==, + dmu_objset_create(name, DMU_OST_OTHER, 0, NULL, NULL)); /* - * Verify that multiple dataset holds are allowed, but only when - * the new access mode is compatible with the base mode. 
+ * Verify that we can hold an objset that is also owned. */ - if (basemode == DS_MODE_OWNER) { - error = dmu_objset_open(name, DMU_OST_OTHER, DS_MODE_USER, - &os2); - if (error) - fatal(0, "dmu_objset_open('%s') = %d", name, error); - else - dmu_objset_close(os2); - } - error = dmu_objset_open(name, DMU_OST_OTHER, DS_MODE_OWNER, &os2); - expected_error = (basemode == DS_MODE_OWNER) ? EBUSY : 0; - if (error != expected_error) - fatal(0, "dmu_objset_open('%s') = %d, expected %d", - name, error, expected_error); - if (error == 0) - dmu_objset_close(os2); + VERIFY3U(0, ==, dmu_objset_hold(name, FTAG, &os2)); + dmu_objset_rele(os2, FTAG); - zil_close(zilog); - dmu_objset_close(os); + /* + * Verify that we cannot own an objset that is already owned. + */ + VERIFY3U(EBUSY, ==, + dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os2)); - error = dmu_objset_destroy(name); - if (error) - fatal(0, "dmu_objset_destroy(%s) = %d", name, error); + zil_close(zilog); + dmu_objset_disown(os, FTAG); + ztest_zd_fini(&zdtmp); - (void) rw_unlock(&ztest_shared->zs_name_lock); + (void) rw_unlock(&zs->zs_name_lock); } /* * Verify that dmu_snapshot_{create,destroy,open,close} work as expected. */ void -ztest_dmu_snapshot_create_destroy(ztest_args_t *za) +ztest_dmu_snapshot_create_destroy(ztest_ds_t *zd, uint64_t id) { - int error; - objset_t *os = za->za_os; - char snapname[100]; - char osname[MAXNAMELEN]; + ztest_shared_t *zs = ztest_shared; - (void) rw_rdlock(&ztest_shared->zs_name_lock); - dmu_objset_name(os, osname); - (void) snprintf(snapname, 100, "%s@%llu", osname, - (u_longlong_t)za->za_instance); + (void) rw_rdlock(&zs->zs_name_lock); + (void) ztest_snapshot_destroy(zd->zd_name, id); + (void) ztest_snapshot_create(zd->zd_name, id); + (void) rw_unlock(&zs->zs_name_lock); +} - error = dmu_objset_destroy(snapname); - if (error != 0 && error != ENOENT) - fatal(0, "dmu_objset_destroy() = %d", error); - error = dmu_objset_snapshot(osname, strchr(snapname, '@')+1, FALSE); - if (error == ENOSPC) - ztest_record_enospc("dmu_take_snapshot"); - else if (error != 0 && error != EEXIST) - fatal(0, "dmu_take_snapshot() = %d", error); - (void) rw_unlock(&ztest_shared->zs_name_lock); +/* + * Cleanup non-standard snapshots and clones. 
+ */ +void +ztest_dsl_dataset_cleanup(char *osname, uint64_t id) +{ + char snap1name[MAXNAMELEN]; + char clone1name[MAXNAMELEN]; + char snap2name[MAXNAMELEN]; + char clone2name[MAXNAMELEN]; + char snap3name[MAXNAMELEN]; + int error; + + (void) snprintf(snap1name, MAXNAMELEN, "%s@s1_%llu", osname, id); + (void) snprintf(clone1name, MAXNAMELEN, "%s/c1_%llu", osname, id); + (void) snprintf(snap2name, MAXNAMELEN, "%s@s2_%llu", clone1name, id); + (void) snprintf(clone2name, MAXNAMELEN, "%s/c2_%llu", osname, id); + (void) snprintf(snap3name, MAXNAMELEN, "%s@s3_%llu", clone1name, id); + + error = dmu_objset_destroy(clone2name, B_FALSE); + if (error && error != ENOENT) + fatal(0, "dmu_objset_destroy(%s) = %d", clone2name, error); + error = dmu_objset_destroy(snap3name, B_FALSE); + if (error && error != ENOENT) + fatal(0, "dmu_objset_destroy(%s) = %d", snap3name, error); + error = dmu_objset_destroy(snap2name, B_FALSE); + if (error && error != ENOENT) + fatal(0, "dmu_objset_destroy(%s) = %d", snap2name, error); + error = dmu_objset_destroy(clone1name, B_FALSE); + if (error && error != ENOENT) + fatal(0, "dmu_objset_destroy(%s) = %d", clone1name, error); + error = dmu_objset_destroy(snap1name, B_FALSE); + if (error && error != ENOENT) + fatal(0, "dmu_objset_destroy(%s) = %d", snap1name, error); } /* - * Verify that dmu_object_{alloc,free} work as expected. + * Verify dsl_dataset_promote handles EBUSY */ void -ztest_dmu_object_alloc_free(ztest_args_t *za) +ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id) { - objset_t *os = za->za_os; - dmu_buf_t *db; - dmu_tx_t *tx; - uint64_t batchobj, object, batchsize, endoff, temp; - int b, c, error, bonuslen; - dmu_object_info_t *doi = &za->za_doi; - char osname[MAXNAMELEN]; + ztest_shared_t *zs = ztest_shared; + objset_t *clone; + dsl_dataset_t *ds; + char snap1name[MAXNAMELEN]; + char clone1name[MAXNAMELEN]; + char snap2name[MAXNAMELEN]; + char clone2name[MAXNAMELEN]; + char snap3name[MAXNAMELEN]; + char *osname = zd->zd_name; + int error; - dmu_objset_name(os, osname); + (void) rw_rdlock(&zs->zs_name_lock); - endoff = -8ULL; - batchsize = 2; + ztest_dsl_dataset_cleanup(osname, id); - /* - * Create a batch object if necessary, and record it in the directory. - */ - VERIFY3U(0, ==, dmu_read(os, ZTEST_DIROBJ, za->za_diroff, - sizeof (uint64_t), &batchobj)); - if (batchobj == 0) { - tx = dmu_tx_create(os); - dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff, - sizeof (uint64_t)); - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - ztest_record_enospc("create a batch object"); - dmu_tx_abort(tx); - return; - } - batchobj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0, - DMU_OT_NONE, 0, tx); - ztest_set_random_blocksize(os, batchobj, tx); - dmu_write(os, ZTEST_DIROBJ, za->za_diroff, - sizeof (uint64_t), &batchobj, tx); - dmu_tx_commit(tx); - } + (void) snprintf(snap1name, MAXNAMELEN, "%s@s1_%llu", osname, id); + (void) snprintf(clone1name, MAXNAMELEN, "%s/c1_%llu", osname, id); + (void) snprintf(snap2name, MAXNAMELEN, "%s@s2_%llu", clone1name, id); + (void) snprintf(clone2name, MAXNAMELEN, "%s/c2_%llu", osname, id); + (void) snprintf(snap3name, MAXNAMELEN, "%s@s3_%llu", clone1name, id); - /* - * Destroy the previous batch of objects. - */ - for (b = 0; b < batchsize; b++) { - VERIFY3U(0, ==, dmu_read(os, batchobj, b * sizeof (uint64_t), - sizeof (uint64_t), &object)); - if (object == 0) - continue; - /* - * Read and validate contents. - * We expect the nth byte of the bonus buffer to be n. 
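/*
 * Sketch of the naming and teardown pattern in ztest_dsl_dataset_cleanup()
 * above: the five datasets form a snapshot/clone chain
 * (snap1 -> clone1 -> snap2/snap3 -> clone2), so they are destroyed in
 * reverse dependency order and a missing entry (ENOENT) is tolerated.
 * build_chain_names(), destroy_chain() and SKETCH_MAXNAMELEN are
 * hypothetical names, not part of the ztest code.
 */
#include <errno.h>
#include <stdio.h>

#define	SKETCH_MAXNAMELEN	256

static void
build_chain_names(const char *osname, unsigned long long id,
    char names[5][SKETCH_MAXNAMELEN])
{
	(void) snprintf(names[0], SKETCH_MAXNAMELEN, "%s@s1_%llu", osname, id);
	(void) snprintf(names[1], SKETCH_MAXNAMELEN, "%s/c1_%llu", osname, id);
	(void) snprintf(names[2], SKETCH_MAXNAMELEN, "%s@s2_%llu", names[1], id);
	(void) snprintf(names[3], SKETCH_MAXNAMELEN, "%s/c2_%llu", osname, id);
	(void) snprintf(names[4], SKETCH_MAXNAMELEN, "%s@s3_%llu", names[1], id);
}

/* Tear down clone2, snap3, snap2, clone1, snap1 -- children before parents. */
static void
destroy_chain(char names[5][SKETCH_MAXNAMELEN],
    int (*destroy_cb)(const char *))
{
	static const int order[] = { 3, 4, 2, 1, 0 };

	for (unsigned i = 0; i < 5; i++) {
		int error = destroy_cb(names[order[i]]);
		if (error != 0 && error != ENOENT)
			(void) fprintf(stderr, "destroy(%s) = %d\n",
			    names[order[i]], error);
	}
}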
- */ - VERIFY(0 == dmu_bonus_hold(os, object, FTAG, &db)); - za->za_dbuf = db; - - dmu_object_info_from_db(db, doi); - ASSERT(doi->doi_type == DMU_OT_UINT64_OTHER); - ASSERT(doi->doi_bonus_type == DMU_OT_PLAIN_OTHER); - ASSERT3S(doi->doi_physical_blks, >=, 0); - - bonuslen = doi->doi_bonus_size; - - for (c = 0; c < bonuslen; c++) { - if (((uint8_t *)db->db_data)[c] != - (uint8_t)(c + bonuslen)) { - fatal(0, - "bad bonus: %s, obj %llu, off %d: %u != %u", - osname, object, c, - ((uint8_t *)db->db_data)[c], - (uint8_t)(c + bonuslen)); - } + error = dmu_objset_snapshot(osname, strchr(snap1name, '@')+1, + NULL, B_FALSE); + if (error && error != EEXIST) { + if (error == ENOSPC) { + ztest_record_enospc(FTAG); + goto out; } + fatal(0, "dmu_take_snapshot(%s) = %d", snap1name, error); + } - dmu_buf_rele(db, FTAG); - za->za_dbuf = NULL; - - /* - * We expect the word at endoff to be our object number. - */ - VERIFY(0 == dmu_read(os, object, endoff, - sizeof (uint64_t), &temp)); + error = dmu_objset_hold(snap1name, FTAG, &clone); + if (error) + fatal(0, "dmu_open_snapshot(%s) = %d", snap1name, error); - if (temp != object) { - fatal(0, "bad data in %s, got %llu, expected %llu", - osname, temp, object); + error = dmu_objset_clone(clone1name, dmu_objset_ds(clone), 0); + dmu_objset_rele(clone, FTAG); + if (error) { + if (error == ENOSPC) { + ztest_record_enospc(FTAG); + goto out; } + fatal(0, "dmu_objset_create(%s) = %d", clone1name, error); + } - /* - * Destroy old object and clear batch entry. - */ - tx = dmu_tx_create(os); - dmu_tx_hold_write(tx, batchobj, - b * sizeof (uint64_t), sizeof (uint64_t)); - dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - ztest_record_enospc("free object"); - dmu_tx_abort(tx); - return; - } - error = dmu_object_free(os, object, tx); - if (error) { - fatal(0, "dmu_object_free('%s', %llu) = %d", - osname, object, error); + error = dmu_objset_snapshot(clone1name, strchr(snap2name, '@')+1, + NULL, B_FALSE); + if (error && error != EEXIST) { + if (error == ENOSPC) { + ztest_record_enospc(FTAG); + goto out; } - object = 0; - - dmu_object_set_checksum(os, batchobj, - ztest_random_checksum(), tx); - dmu_object_set_compress(os, batchobj, - ztest_random_compress(), tx); - - dmu_write(os, batchobj, b * sizeof (uint64_t), - sizeof (uint64_t), &object, tx); - - dmu_tx_commit(tx); + fatal(0, "dmu_open_snapshot(%s) = %d", snap2name, error); } - /* - * Before creating the new batch of objects, generate a bunch of churn. - */ - for (b = ztest_random(100); b > 0; b--) { - tx = dmu_tx_create(os); - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - ztest_record_enospc("churn objects"); - dmu_tx_abort(tx); - return; - } - object = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0, - DMU_OT_NONE, 0, tx); - ztest_set_random_blocksize(os, object, tx); - error = dmu_object_free(os, object, tx); - if (error) { - fatal(0, "dmu_object_free('%s', %llu) = %d", - osname, object, error); + error = dmu_objset_snapshot(clone1name, strchr(snap3name, '@')+1, + NULL, B_FALSE); + if (error && error != EEXIST) { + if (error == ENOSPC) { + ztest_record_enospc(FTAG); + goto out; } - dmu_tx_commit(tx); + fatal(0, "dmu_open_snapshot(%s) = %d", snap3name, error); } - /* - * Create a new batch of objects with randomly chosen - * blocksizes and record them in the batch directory. 
- */ - for (b = 0; b < batchsize; b++) { - uint32_t va_blksize; - u_longlong_t va_nblocks; + error = dmu_objset_hold(snap3name, FTAG, &clone); + if (error) + fatal(0, "dmu_open_snapshot(%s) = %d", snap3name, error); - tx = dmu_tx_create(os); - dmu_tx_hold_write(tx, batchobj, b * sizeof (uint64_t), - sizeof (uint64_t)); - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, endoff, - sizeof (uint64_t)); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - ztest_record_enospc("create batchobj"); - dmu_tx_abort(tx); - return; + error = dmu_objset_clone(clone2name, dmu_objset_ds(clone), 0); + dmu_objset_rele(clone, FTAG); + if (error) { + if (error == ENOSPC) { + ztest_record_enospc(FTAG); + goto out; } - bonuslen = (int)ztest_random(dmu_bonus_max()) + 1; - - object = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0, - DMU_OT_PLAIN_OTHER, bonuslen, tx); - - ztest_set_random_blocksize(os, object, tx); - - dmu_object_set_checksum(os, object, - ztest_random_checksum(), tx); - dmu_object_set_compress(os, object, - ztest_random_compress(), tx); - - dmu_write(os, batchobj, b * sizeof (uint64_t), - sizeof (uint64_t), &object, tx); + fatal(0, "dmu_objset_create(%s) = %d", clone2name, error); + } - /* - * Write to both the bonus buffer and the regular data. - */ - VERIFY(dmu_bonus_hold(os, object, FTAG, &db) == 0); - za->za_dbuf = db; - ASSERT3U(bonuslen, <=, db->db_size); + error = dsl_dataset_own(snap1name, B_FALSE, FTAG, &ds); + if (error) + fatal(0, "dsl_dataset_own(%s) = %d", snap1name, error); + error = dsl_dataset_promote(clone2name, NULL); + if (error != EBUSY) + fatal(0, "dsl_dataset_promote(%s), %d, not EBUSY", clone2name, + error); + dsl_dataset_disown(ds, FTAG); - dmu_object_size_from_db(db, &va_blksize, &va_nblocks); - ASSERT3S(va_nblocks, >=, 0); +out: + ztest_dsl_dataset_cleanup(osname, id); - dmu_buf_will_dirty(db, tx); + (void) rw_unlock(&zs->zs_name_lock); +} - /* - * See comments above regarding the contents of - * the bonus buffer and the word at endoff. - */ - for (c = 0; c < bonuslen; c++) - ((uint8_t *)db->db_data)[c] = (uint8_t)(c + bonuslen); +/* + * Verify that dmu_object_{alloc,free} work as expected. + */ +void +ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id) +{ + ztest_od_t od[4]; + int batchsize = sizeof (od) / sizeof (od[0]); - dmu_buf_rele(db, FTAG); - za->za_dbuf = NULL; + for (int b = 0; b < batchsize; b++) + ztest_od_init(&od[b], id, FTAG, b, DMU_OT_UINT64_OTHER, 0, 0); - /* - * Write to a large offset to increase indirection. - */ - dmu_write(os, object, endoff, sizeof (uint64_t), &object, tx); + /* + * Destroy the previous batch of objects, create a new batch, + * and do some I/O on the new objects. + */ + if (ztest_object_init(zd, od, sizeof (od), B_TRUE) != 0) + return; - dmu_tx_commit(tx); - } + while (ztest_random(4 * batchsize) != 0) + ztest_io(zd, od[ztest_random(batchsize)].od_object, + ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); } /* * Verify that dmu_{read,write} work as expected. 
*/ -typedef struct bufwad { - uint64_t bw_index; - uint64_t bw_txg; - uint64_t bw_data; -} bufwad_t; - -typedef struct dmu_read_write_dir { - uint64_t dd_packobj; - uint64_t dd_bigobj; - uint64_t dd_chunk; -} dmu_read_write_dir_t; - void -ztest_dmu_read_write(ztest_args_t *za) +ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id) { - objset_t *os = za->za_os; - dmu_read_write_dir_t dd; + objset_t *os = zd->zd_os; + ztest_od_t od[2]; dmu_tx_t *tx; int i, freeit, error; uint64_t n, s, txg; bufwad_t *packbuf, *bigbuf, *pack, *bigH, *bigT; - uint64_t packoff, packsize, bigoff, bigsize; + uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize; + uint64_t chunksize = (1000 + ztest_random(1000)) * sizeof (uint64_t); uint64_t regions = 997; uint64_t stride = 123456789ULL; uint64_t width = 40; @@ -1703,34 +3196,16 @@ ztest_dmu_read_write(ztest_args_t *za) /* * Read the directory info. If it's the first time, set things up. */ - VERIFY(0 == dmu_read(os, ZTEST_DIROBJ, za->za_diroff, - sizeof (dd), &dd)); - if (dd.dd_chunk == 0) { - ASSERT(dd.dd_packobj == 0); - ASSERT(dd.dd_bigobj == 0); - tx = dmu_tx_create(os); - dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff, sizeof (dd)); - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - ztest_record_enospc("create r/w directory"); - dmu_tx_abort(tx); - return; - } - - dd.dd_packobj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0, - DMU_OT_NONE, 0, tx); - dd.dd_bigobj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0, - DMU_OT_NONE, 0, tx); - dd.dd_chunk = (1000 + ztest_random(1000)) * sizeof (uint64_t); + ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, chunksize); + ztest_od_init(&od[1], id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, chunksize); - ztest_set_random_blocksize(os, dd.dd_packobj, tx); - ztest_set_random_blocksize(os, dd.dd_bigobj, tx); + if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) + return; - dmu_write(os, ZTEST_DIROBJ, za->za_diroff, sizeof (dd), &dd, - tx); - dmu_tx_commit(tx); - } + bigobj = od[0].od_object; + packobj = od[1].od_object; + chunksize = od[0].od_gen; + ASSERT(chunksize == od[1].od_gen); /* * Prefetch a random chunk of the big object. @@ -1740,7 +3215,7 @@ ztest_dmu_read_write(ztest_args_t *za) */ n = ztest_random(regions) * stride + ztest_random(width); s = 1 + ztest_random(2 * width - 1); - dmu_prefetch(os, dd.dd_bigobj, n * dd.dd_chunk, s * dd.dd_chunk); + dmu_prefetch(os, bigobj, n * chunksize, s * chunksize); /* * Pick a random index and compute the offsets into packobj and bigobj. @@ -1751,8 +3226,8 @@ ztest_dmu_read_write(ztest_args_t *za) packoff = n * sizeof (bufwad_t); packsize = s * sizeof (bufwad_t); - bigoff = n * dd.dd_chunk; - bigsize = s * dd.dd_chunk; + bigoff = n * chunksize; + bigsize = s * chunksize; packbuf = umem_alloc(packsize, UMEM_NOFAIL); bigbuf = umem_alloc(bigsize, UMEM_NOFAIL); @@ -1766,9 +3241,11 @@ ztest_dmu_read_write(ztest_args_t *za) /* * Read the current contents of our objects. 
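/*
 * Sketch of the index-to-offset arithmetic used just above: a run of s
 * consecutive bufwad indices starting at n maps to a dense byte range in
 * packobj and a chunk-per-index sparse range in bigobj.  The bufwad layout
 * matches the one ztest uses; pick_ranges() itself is a hypothetical helper
 * and rand() stands in for ztest_random().
 */
#include <stdint.h>
#include <stdlib.h>

typedef struct bufwad {
	uint64_t	bw_index;
	uint64_t	bw_txg;
	uint64_t	bw_data;
} bufwad_t;

static void
pick_ranges(uint64_t chunksize, uint64_t *packoff, uint64_t *packsize,
    uint64_t *bigoff, uint64_t *bigsize)
{
	const uint64_t regions = 997, stride = 123456789ULL, width = 40;
	uint64_t n = (rand() % regions) * stride + (rand() % width);
	uint64_t s = 1 + rand() % (2 * width - 1);

	*packoff = n * sizeof (bufwad_t);	/* dense array of bufwads */
	*packsize = s * sizeof (bufwad_t);
	*bigoff = n * chunksize;		/* one whole chunk per index */
	*bigsize = s * chunksize;
}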
*/ - error = dmu_read(os, dd.dd_packobj, packoff, packsize, packbuf); + error = dmu_read(os, packobj, packoff, packsize, packbuf, + DMU_READ_PREFETCH); ASSERT3U(error, ==, 0); - error = dmu_read(os, dd.dd_bigobj, bigoff, bigsize, bigbuf); + error = dmu_read(os, bigobj, bigoff, bigsize, bigbuf, + DMU_READ_PREFETCH); ASSERT3U(error, ==, 0); /* @@ -1776,24 +3253,25 @@ ztest_dmu_read_write(ztest_args_t *za) */ tx = dmu_tx_create(os); - dmu_tx_hold_write(tx, dd.dd_packobj, packoff, packsize); + dmu_tx_hold_write(tx, packobj, packoff, packsize); if (freeit) - dmu_tx_hold_free(tx, dd.dd_bigobj, bigoff, bigsize); + dmu_tx_hold_free(tx, bigobj, bigoff, bigsize); else - dmu_tx_hold_write(tx, dd.dd_bigobj, bigoff, bigsize); - - error = dmu_tx_assign(tx, TXG_WAIT); + dmu_tx_hold_write(tx, bigobj, bigoff, bigsize); - if (error) { - ztest_record_enospc("dmu r/w range"); - dmu_tx_abort(tx); + txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); + if (txg == 0) { umem_free(packbuf, packsize); umem_free(bigbuf, bigsize); return; } - txg = dmu_tx_get_txg(tx); + dmu_object_set_checksum(os, bigobj, + (enum zio_checksum)ztest_random_dsl_prop(ZFS_PROP_CHECKSUM), tx); + + dmu_object_set_compress(os, bigobj, + (enum zio_compress)ztest_random_dsl_prop(ZFS_PROP_COMPRESSION), tx); /* * For each index from n to n + s, verify that the existing bufwad @@ -1805,9 +3283,9 @@ ztest_dmu_read_write(ztest_args_t *za) /* LINTED */ pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t)); /* LINTED */ - bigH = (bufwad_t *)((char *)bigbuf + i * dd.dd_chunk); + bigH = (bufwad_t *)((char *)bigbuf + i * chunksize); /* LINTED */ - bigT = (bufwad_t *)((char *)bigH + dd.dd_chunk) - 1; + bigT = (bufwad_t *)((char *)bigH + chunksize) - 1; ASSERT((uintptr_t)bigH - (uintptr_t)bigbuf < bigsize); ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize); @@ -1841,27 +3319,26 @@ ztest_dmu_read_write(ztest_args_t *za) * We've verified all the old bufwads, and made new ones. * Now write them out. 
*/ - dmu_write(os, dd.dd_packobj, packoff, packsize, packbuf, tx); + dmu_write(os, packobj, packoff, packsize, packbuf, tx); if (freeit) { - if (zopt_verbose >= 6) { + if (zopt_verbose >= 7) { (void) printf("freeing offset %llx size %llx" " txg %llx\n", (u_longlong_t)bigoff, (u_longlong_t)bigsize, (u_longlong_t)txg); } - VERIFY(0 == dmu_free_range(os, dd.dd_bigobj, bigoff, - bigsize, tx)); + VERIFY(0 == dmu_free_range(os, bigobj, bigoff, bigsize, tx)); } else { - if (zopt_verbose >= 6) { + if (zopt_verbose >= 7) { (void) printf("writing offset %llx size %llx" " txg %llx\n", (u_longlong_t)bigoff, (u_longlong_t)bigsize, (u_longlong_t)txg); } - dmu_write(os, dd.dd_bigobj, bigoff, bigsize, bigbuf, tx); + dmu_write(os, bigobj, bigoff, bigsize, bigbuf, tx); } dmu_tx_commit(tx); @@ -1873,10 +3350,10 @@ ztest_dmu_read_write(ztest_args_t *za) void *packcheck = umem_alloc(packsize, UMEM_NOFAIL); void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); - VERIFY(0 == dmu_read(os, dd.dd_packobj, packoff, - packsize, packcheck)); - VERIFY(0 == dmu_read(os, dd.dd_bigobj, bigoff, - bigsize, bigcheck)); + VERIFY(0 == dmu_read(os, packobj, packoff, + packsize, packcheck, DMU_READ_PREFETCH)); + VERIFY(0 == dmu_read(os, bigobj, bigoff, + bigsize, bigcheck, DMU_READ_PREFETCH)); ASSERT(bcmp(packbuf, packcheck, packsize) == 0); ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0); @@ -1890,240 +3367,335 @@ ztest_dmu_read_write(ztest_args_t *za) } void -ztest_dmu_check_future_leak(ztest_args_t *za) +compare_and_update_pbbufs(uint64_t s, bufwad_t *packbuf, bufwad_t *bigbuf, + uint64_t bigsize, uint64_t n, uint64_t chunksize, uint64_t txg) { - objset_t *os = za->za_os; - dmu_buf_t *db; - ztest_block_tag_t *bt; - dmu_object_info_t *doi = &za->za_doi; - - /* - * Make sure that, if there is a write record in the bonus buffer - * of the ZTEST_DIROBJ, that the txg for this record is <= the - * last synced txg of the pool. - */ - VERIFY(dmu_bonus_hold(os, ZTEST_DIROBJ, FTAG, &db) == 0); - za->za_dbuf = db; - VERIFY(dmu_object_info(os, ZTEST_DIROBJ, doi) == 0); - ASSERT3U(doi->doi_bonus_size, >=, sizeof (*bt)); - ASSERT3U(doi->doi_bonus_size, <=, db->db_size); - ASSERT3U(doi->doi_bonus_size % sizeof (*bt), ==, 0); - bt = (void *)((char *)db->db_data + doi->doi_bonus_size - sizeof (*bt)); - if (bt->bt_objset != 0) { - ASSERT3U(bt->bt_objset, ==, dmu_objset_id(os)); - ASSERT3U(bt->bt_object, ==, ZTEST_DIROBJ); - ASSERT3U(bt->bt_offset, ==, -1ULL); - ASSERT3U(bt->bt_txg, <, spa_first_txg(za->za_spa)); + uint64_t i; + bufwad_t *pack; + bufwad_t *bigH; + bufwad_t *bigT; + + /* + * For each index from n to n + s, verify that the existing bufwad + * in packobj matches the bufwads at the head and tail of the + * corresponding chunk in bigobj. Then update all three bufwads + * with the new values we want to write out. 
+ */ + for (i = 0; i < s; i++) { + /* LINTED */ + pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t)); + /* LINTED */ + bigH = (bufwad_t *)((char *)bigbuf + i * chunksize); + /* LINTED */ + bigT = (bufwad_t *)((char *)bigH + chunksize) - 1; + + ASSERT((uintptr_t)bigH - (uintptr_t)bigbuf < bigsize); + ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize); + + if (pack->bw_txg > txg) + fatal(0, "future leak: got %llx, open txg is %llx", + pack->bw_txg, txg); + + if (pack->bw_data != 0 && pack->bw_index != n + i) + fatal(0, "wrong index: got %llx, wanted %llx+%llx", + pack->bw_index, n, i); + + if (bcmp(pack, bigH, sizeof (bufwad_t)) != 0) + fatal(0, "pack/bigH mismatch in %p/%p", pack, bigH); + + if (bcmp(pack, bigT, sizeof (bufwad_t)) != 0) + fatal(0, "pack/bigT mismatch in %p/%p", pack, bigT); + + pack->bw_index = n + i; + pack->bw_txg = txg; + pack->bw_data = 1 + ztest_random(-2ULL); + + *bigH = *pack; + *bigT = *pack; } - dmu_buf_rele(db, FTAG); - za->za_dbuf = NULL; } void -ztest_dmu_write_parallel(ztest_args_t *za) +ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) { - objset_t *os = za->za_os; - ztest_block_tag_t *rbt = &za->za_rbt; - ztest_block_tag_t *wbt = &za->za_wbt; - const size_t btsize = sizeof (ztest_block_tag_t); - dmu_buf_t *db; - int b, error; - int bs = ZTEST_DIROBJ_BLOCKSIZE; - int do_free = 0; - uint64_t off, txg, txg_how; - mutex_t *lp; - char osname[MAXNAMELEN]; - char iobuf[SPA_MAXBLOCKSIZE]; - blkptr_t blk = { 0 }; - uint64_t blkoff; - zbookmark_t zb; - dmu_tx_t *tx = dmu_tx_create(os); + objset_t *os = zd->zd_os; + ztest_od_t od[2]; + dmu_tx_t *tx; + uint64_t i; + int error; + uint64_t n, s, txg; + bufwad_t *packbuf, *bigbuf; + uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize; + uint64_t blocksize = ztest_random_blocksize(); + uint64_t chunksize = blocksize; + uint64_t regions = 997; + uint64_t stride = 123456789ULL; + uint64_t width = 9; + dmu_buf_t *bonus_db; + arc_buf_t **bigbuf_arcbufs; + dmu_object_info_t doi; - dmu_objset_name(os, osname); + /* + * This test uses two objects, packobj and bigobj, that are always + * updated together (i.e. in the same tx) so that their contents are + * in sync and can be compared. Their contents relate to each other + * in a simple way: packobj is a dense array of 'bufwad' structures, + * while bigobj is a sparse array of the same bufwads. Specifically, + * for any index n, there are three bufwads that should be identical: + * + * packobj, at offset n * sizeof (bufwad_t) + * bigobj, at the head of the nth chunk + * bigobj, at the tail of the nth chunk + * + * The chunk size is set equal to bigobj block size so that + * dmu_assign_arcbuf() can be tested for object updates. + */ /* - * Have multiple threads write to large offsets in ZTEST_DIROBJ - * to verify that having multiple threads writing to the same object - * in parallel doesn't cause any trouble. + * Read the directory info. If it's the first time, set things up. */ - if (ztest_random(4) == 0) { - /* - * Do the bonus buffer instead of a regular block. - * We need a lock to serialize resize vs. others, - * so we hash on the objset ID. 
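/*
 * Sketch of the invariant described in the comment above: for any index i
 * within the window, the bufwad stored densely in packobj must equal both
 * the bufwad at the head and the one at the tail of the i-th chunk of
 * bigobj.  This check operates on plain in-memory buffers;
 * bufwads_consistent() is a hypothetical helper, not a ztest function.
 */
#include <stdint.h>
#include <string.h>

typedef struct bufwad {
	uint64_t	bw_index;
	uint64_t	bw_txg;
	uint64_t	bw_data;
} bufwad_t;

static int
bufwads_consistent(const char *packbuf, const char *bigbuf,
    uint64_t chunksize, uint64_t i)
{
	const bufwad_t *pack =
	    (const bufwad_t *)(packbuf + i * sizeof (bufwad_t));
	const bufwad_t *head =
	    (const bufwad_t *)(bigbuf + i * chunksize);
	const bufwad_t *tail =
	    (const bufwad_t *)(bigbuf + (i + 1) * chunksize) - 1;

	return (memcmp(pack, head, sizeof (bufwad_t)) == 0 &&
	    memcmp(pack, tail, sizeof (bufwad_t)) == 0);
}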
- */ - b = dmu_objset_id(os) % ZTEST_SYNC_LOCKS; - off = -1ULL; - dmu_tx_hold_bonus(tx, ZTEST_DIROBJ); - } else { - b = ztest_random(ZTEST_SYNC_LOCKS); - off = za->za_diroff_shared + (b << SPA_MAXBLOCKSHIFT); - if (ztest_random(4) == 0) { - do_free = 1; - dmu_tx_hold_free(tx, ZTEST_DIROBJ, off, bs); - } else { - dmu_tx_hold_write(tx, ZTEST_DIROBJ, off, bs); - } - } + ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0); + ztest_od_init(&od[1], id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, chunksize); - txg_how = ztest_random(2) == 0 ? TXG_WAIT : TXG_NOWAIT; - error = dmu_tx_assign(tx, txg_how); - if (error) { - if (error == ERESTART) { - ASSERT(txg_how == TXG_NOWAIT); - dmu_tx_wait(tx); - } else { - ztest_record_enospc("dmu write parallel"); - } - dmu_tx_abort(tx); + if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) return; - } - txg = dmu_tx_get_txg(tx); - lp = &ztest_shared->zs_sync_lock[b]; - (void) mutex_lock(lp); - - wbt->bt_objset = dmu_objset_id(os); - wbt->bt_object = ZTEST_DIROBJ; - wbt->bt_offset = off; - wbt->bt_txg = txg; - wbt->bt_thread = za->za_instance; - wbt->bt_seq = ztest_shared->zs_seq[b]++; /* protected by lp */ - - /* - * Occasionally, write an all-zero block to test the behavior - * of blocks that compress into holes. - */ - if (off != -1ULL && ztest_random(8) == 0) - bzero(wbt, btsize); - - if (off == -1ULL) { - dmu_object_info_t *doi = &za->za_doi; - char *dboff; - - VERIFY(dmu_bonus_hold(os, ZTEST_DIROBJ, FTAG, &db) == 0); - za->za_dbuf = db; - dmu_object_info_from_db(db, doi); - ASSERT3U(doi->doi_bonus_size, <=, db->db_size); - ASSERT3U(doi->doi_bonus_size, >=, btsize); - ASSERT3U(doi->doi_bonus_size % btsize, ==, 0); - dboff = (char *)db->db_data + doi->doi_bonus_size - btsize; - bcopy(dboff, rbt, btsize); - if (rbt->bt_objset != 0) { - ASSERT3U(rbt->bt_objset, ==, wbt->bt_objset); - ASSERT3U(rbt->bt_object, ==, wbt->bt_object); - ASSERT3U(rbt->bt_offset, ==, wbt->bt_offset); - ASSERT3U(rbt->bt_txg, <=, wbt->bt_txg); - } - if (ztest_random(10) == 0) { - int newsize = (ztest_random(db->db_size / - btsize) + 1) * btsize; - - ASSERT3U(newsize, >=, btsize); - ASSERT3U(newsize, <=, db->db_size); - VERIFY3U(dmu_set_bonus(db, newsize, tx), ==, 0); - dboff = (char *)db->db_data + newsize - btsize; - } - dmu_buf_will_dirty(db, tx); - bcopy(wbt, dboff, btsize); - dmu_buf_rele(db, FTAG); - za->za_dbuf = NULL; - } else if (do_free) { - VERIFY(dmu_free_range(os, ZTEST_DIROBJ, off, bs, tx) == 0); - } else { - dmu_write(os, ZTEST_DIROBJ, off, btsize, wbt, tx); - } + bigobj = od[0].od_object; + packobj = od[1].od_object; + blocksize = od[0].od_blocksize; + chunksize = blocksize; + ASSERT(chunksize == od[1].od_gen); + + VERIFY(dmu_object_info(os, bigobj, &doi) == 0); + VERIFY(ISP2(doi.doi_data_block_size)); + VERIFY(chunksize == doi.doi_data_block_size); + VERIFY(chunksize >= 2 * sizeof (bufwad_t)); + + /* + * Pick a random index and compute the offsets into packobj and bigobj. 
+ */ + n = ztest_random(regions) * stride + ztest_random(width); + s = 1 + ztest_random(width - 1); - (void) mutex_unlock(lp); + packoff = n * sizeof (bufwad_t); + packsize = s * sizeof (bufwad_t); - if (ztest_random(1000) == 0) - (void) poll(NULL, 0, 1); /* open dn_notxholds window */ + bigoff = n * chunksize; + bigsize = s * chunksize; - dmu_tx_commit(tx); + packbuf = umem_zalloc(packsize, UMEM_NOFAIL); + bigbuf = umem_zalloc(bigsize, UMEM_NOFAIL); - if (ztest_random(10000) == 0) - txg_wait_synced(dmu_objset_pool(os), txg); + VERIFY3U(0, ==, dmu_bonus_hold(os, bigobj, FTAG, &bonus_db)); - if (off == -1ULL || do_free) - return; + bigbuf_arcbufs = umem_zalloc(2 * s * sizeof (arc_buf_t *), UMEM_NOFAIL); - if (ztest_random(2) != 0) - return; + /* + * Iteration 0 test zcopy for DB_UNCACHED dbufs. + * Iteration 1 test zcopy to already referenced dbufs. + * Iteration 2 test zcopy to dirty dbuf in the same txg. + * Iteration 3 test zcopy to dbuf dirty in previous txg. + * Iteration 4 test zcopy when dbuf is no longer dirty. + * Iteration 5 test zcopy when it can't be done. + * Iteration 6 one more zcopy write. + */ + for (i = 0; i < 7; i++) { + uint64_t j; + uint64_t off; + + /* + * In iteration 5 (i == 5) use arcbufs + * that don't match bigobj blksz to test + * dmu_assign_arcbuf() when it can't directly + * assign an arcbuf to a dbuf. + */ + for (j = 0; j < s; j++) { + if (i != 5) { + bigbuf_arcbufs[j] = + dmu_request_arcbuf(bonus_db, chunksize); + } else { + bigbuf_arcbufs[2 * j] = + dmu_request_arcbuf(bonus_db, chunksize / 2); + bigbuf_arcbufs[2 * j + 1] = + dmu_request_arcbuf(bonus_db, chunksize / 2); + } + } + + /* + * Get a tx for the mods to both packobj and bigobj. + */ + tx = dmu_tx_create(os); + + dmu_tx_hold_write(tx, packobj, packoff, packsize); + dmu_tx_hold_write(tx, bigobj, bigoff, bigsize); + + txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); + if (txg == 0) { + umem_free(packbuf, packsize); + umem_free(bigbuf, bigsize); + for (j = 0; j < s; j++) { + if (i != 5) { + dmu_return_arcbuf(bigbuf_arcbufs[j]); + } else { + dmu_return_arcbuf( + bigbuf_arcbufs[2 * j]); + dmu_return_arcbuf( + bigbuf_arcbufs[2 * j + 1]); + } + } + umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *)); + dmu_buf_rele(bonus_db, FTAG); + return; + } + + /* + * 50% of the time don't read objects in the 1st iteration to + * test dmu_assign_arcbuf() for the case when there're no + * existing dbufs for the specified offsets. + */ + if (i != 0 || ztest_random(2) != 0) { + error = dmu_read(os, packobj, packoff, + packsize, packbuf, DMU_READ_PREFETCH); + ASSERT3U(error, ==, 0); + error = dmu_read(os, bigobj, bigoff, bigsize, + bigbuf, DMU_READ_PREFETCH); + ASSERT3U(error, ==, 0); + } + compare_and_update_pbbufs(s, packbuf, bigbuf, bigsize, + n, chunksize, txg); + + /* + * We've verified all the old bufwads, and made new ones. + * Now write them out. 
+ */ + dmu_write(os, packobj, packoff, packsize, packbuf, tx); + if (zopt_verbose >= 7) { + (void) printf("writing offset %llx size %llx" + " txg %llx\n", + (u_longlong_t)bigoff, + (u_longlong_t)bigsize, + (u_longlong_t)txg); + } + for (off = bigoff, j = 0; j < s; j++, off += chunksize) { + dmu_buf_t *dbt; + if (i != 5) { + bcopy((caddr_t)bigbuf + (off - bigoff), + bigbuf_arcbufs[j]->b_data, chunksize); + } else { + bcopy((caddr_t)bigbuf + (off - bigoff), + bigbuf_arcbufs[2 * j]->b_data, + chunksize / 2); + bcopy((caddr_t)bigbuf + (off - bigoff) + + chunksize / 2, + bigbuf_arcbufs[2 * j + 1]->b_data, + chunksize / 2); + } + + if (i == 1) { + VERIFY(dmu_buf_hold(os, bigobj, off, + FTAG, &dbt) == 0); + } + if (i != 5) { + dmu_assign_arcbuf(bonus_db, off, + bigbuf_arcbufs[j], tx); + } else { + dmu_assign_arcbuf(bonus_db, off, + bigbuf_arcbufs[2 * j], tx); + dmu_assign_arcbuf(bonus_db, + off + chunksize / 2, + bigbuf_arcbufs[2 * j + 1], tx); + } + if (i == 1) { + dmu_buf_rele(dbt, FTAG); + } + } + dmu_tx_commit(tx); - /* - * dmu_sync() the block we just wrote. - */ - (void) mutex_lock(lp); + /* + * Sanity check the stuff we just wrote. + */ + { + void *packcheck = umem_alloc(packsize, UMEM_NOFAIL); + void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); - blkoff = P2ALIGN_TYPED(off, bs, uint64_t); - error = dmu_buf_hold(os, ZTEST_DIROBJ, blkoff, FTAG, &db); - za->za_dbuf = db; - if (error) { - dprintf("dmu_buf_hold(%s, %d, %llx) = %d\n", - osname, ZTEST_DIROBJ, blkoff, error); - (void) mutex_unlock(lp); - return; - } - blkoff = off - blkoff; - error = dmu_sync(NULL, db, &blk, txg, NULL, NULL); - dmu_buf_rele(db, FTAG); - za->za_dbuf = NULL; + VERIFY(0 == dmu_read(os, packobj, packoff, + packsize, packcheck, DMU_READ_PREFETCH)); + VERIFY(0 == dmu_read(os, bigobj, bigoff, + bigsize, bigcheck, DMU_READ_PREFETCH)); - (void) mutex_unlock(lp); + ASSERT(bcmp(packbuf, packcheck, packsize) == 0); + ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0); - if (error) { - dprintf("dmu_sync(%s, %d, %llx) = %d\n", - osname, ZTEST_DIROBJ, off, error); - return; + umem_free(packcheck, packsize); + umem_free(bigcheck, bigsize); + } + if (i == 2) { + txg_wait_open(dmu_objset_pool(os), 0); + } else if (i == 3) { + txg_wait_synced(dmu_objset_pool(os), 0); + } } - if (blk.blk_birth == 0) /* concurrent free */ - return; - - txg_suspend(dmu_objset_pool(os)); + dmu_buf_rele(bonus_db, FTAG); + umem_free(packbuf, packsize); + umem_free(bigbuf, bigsize); + umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *)); +} - ASSERT(blk.blk_fill == 1); - ASSERT3U(BP_GET_TYPE(&blk), ==, DMU_OT_UINT64_OTHER); - ASSERT3U(BP_GET_LEVEL(&blk), ==, 0); - ASSERT3U(BP_GET_LSIZE(&blk), ==, bs); +/* ARGSUSED */ +void +ztest_dmu_write_parallel(ztest_ds_t *zd, uint64_t id) +{ + ztest_od_t od[1]; + uint64_t offset = (1ULL << (ztest_random(20) + 43)) + + (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); /* - * Read the block that dmu_sync() returned to make sure its contents - * match what we wrote. We do this while still txg_suspend()ed - * to ensure that the block can't be reused before we read it. + * Have multiple threads write to large offsets in an object + * to verify that parallel writes to an object -- even to the + * same blocks within the object -- doesn't cause any trouble. 
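/*
 * Sketch of how the parallel-write offset above is composed: a very large
 * power-of-two base (2^43 .. 2^62) plus a block-aligned slot drawn from a
 * small set, so concurrent writers frequently land on the same blocks.  The
 * constants below (64 range-lock slots, 128K max block shift) are assumed
 * values typical of ztest/SPA, and parallel_offset() is hypothetical.
 */
#include <stdint.h>
#include <stdlib.h>

#define	SKETCH_RANGE_LOCKS	64
#define	SKETCH_MAXBLOCKSHIFT	17	/* 128K blocks */

static uint64_t
parallel_offset(void)
{
	uint64_t base = 1ULL << (rand() % 20 + 43);
	uint64_t slot = (uint64_t)(rand() % SKETCH_RANGE_LOCKS);

	return (base + (slot << SKETCH_MAXBLOCKSHIFT));
}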
*/ - zb.zb_objset = dmu_objset_id(os); - zb.zb_object = ZTEST_DIROBJ; - zb.zb_level = 0; - zb.zb_blkid = off / bs; - error = zio_wait(zio_read(NULL, za->za_spa, &blk, iobuf, bs, - NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_MUSTSUCCEED, &zb)); - ASSERT3U(error, ==, 0); + ztest_od_init(&od[0], ID_PARALLEL, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0); + + if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) + return; + + while (ztest_random(10) != 0) + ztest_io(zd, od[0].od_object, offset); +} - txg_resume(dmu_objset_pool(os)); +void +ztest_dmu_prealloc(ztest_ds_t *zd, uint64_t id) +{ + ztest_od_t od[1]; + uint64_t offset = (1ULL << (ztest_random(4) + SPA_MAXBLOCKSHIFT)) + + (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); + uint64_t count = ztest_random(20) + 1; + uint64_t blocksize = ztest_random_blocksize(); + void *data; - bcopy(&iobuf[blkoff], rbt, btsize); + ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0); - if (rbt->bt_objset == 0) /* concurrent free */ + if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0) return; - if (wbt->bt_objset == 0) /* all-zero overwrite */ + if (ztest_truncate(zd, od[0].od_object, offset, count * blocksize) != 0) return; - ASSERT3U(rbt->bt_objset, ==, wbt->bt_objset); - ASSERT3U(rbt->bt_object, ==, wbt->bt_object); - ASSERT3U(rbt->bt_offset, ==, wbt->bt_offset); + ztest_prealloc(zd, od[0].od_object, offset, count * blocksize); - /* - * The semantic of dmu_sync() is that we always push the most recent - * version of the data, so in the face of concurrent updates we may - * see a newer version of the block. That's OK. - */ - ASSERT3U(rbt->bt_txg, >=, wbt->bt_txg); - if (rbt->bt_thread == wbt->bt_thread) - ASSERT3U(rbt->bt_seq, ==, wbt->bt_seq); - else - ASSERT3U(rbt->bt_seq, >, wbt->bt_seq); + data = umem_zalloc(blocksize, UMEM_NOFAIL); + + while (ztest_random(count) != 0) { + uint64_t randoff = offset + (ztest_random(count) * blocksize); + if (ztest_write(zd, od[0].od_object, randoff, blocksize, + data) != 0) + break; + while (ztest_random(4) != 0) + ztest_io(zd, od[0].od_object, randoff); + } + + umem_free(data, blocksize); } /* @@ -2134,9 +3706,10 @@ ztest_dmu_write_parallel(ztest_args_t *za) #define ZTEST_ZAP_MAX_PROPS 1000 void -ztest_zap(ztest_args_t *za) +ztest_zap(ztest_ds_t *zd, uint64_t id) { - objset_t *os = za->za_os; + objset_t *os = zd->zd_os; + ztest_od_t od[1]; uint64_t object; uint64_t txg, last_txg; uint64_t value[ZTEST_ZAP_MAX_INTS]; @@ -2145,64 +3718,45 @@ ztest_zap(ztest_args_t *za) dmu_tx_t *tx; char propname[100], txgname[100]; int error; - char osname[MAXNAMELEN]; char *hc[2] = { "s.acl.h", ".s.open.h.hyLZlg" }; - dmu_objset_name(os, osname); + ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0); - /* - * Create a new object if necessary, and record it in the directory. 
- */ - VERIFY(0 == dmu_read(os, ZTEST_DIROBJ, za->za_diroff, - sizeof (uint64_t), &object)); + if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0) + return; - if (object == 0) { - tx = dmu_tx_create(os); - dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff, - sizeof (uint64_t)); - dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, TRUE, NULL); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - ztest_record_enospc("create zap test obj"); - dmu_tx_abort(tx); - return; - } - object = zap_create(os, DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx); - if (error) { - fatal(0, "zap_create('%s', %llu) = %d", - osname, object, error); - } - ASSERT(object != 0); - dmu_write(os, ZTEST_DIROBJ, za->za_diroff, - sizeof (uint64_t), &object, tx); - /* - * Generate a known hash collision, and verify that - * we can lookup and remove both entries. - */ - for (i = 0; i < 2; i++) { - value[i] = i; - error = zap_add(os, object, hc[i], sizeof (uint64_t), - 1, &value[i], tx); - ASSERT3U(error, ==, 0); - } - for (i = 0; i < 2; i++) { - error = zap_add(os, object, hc[i], sizeof (uint64_t), - 1, &value[i], tx); - ASSERT3U(error, ==, EEXIST); - error = zap_length(os, object, hc[i], - &zl_intsize, &zl_ints); - ASSERT3U(error, ==, 0); - ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); - ASSERT3U(zl_ints, ==, 1); - } - for (i = 0; i < 2; i++) { - error = zap_remove(os, object, hc[i], tx); - ASSERT3U(error, ==, 0); - } + object = od[0].od_object; - dmu_tx_commit(tx); + /* + * Generate a known hash collision, and verify that + * we can lookup and remove both entries. + */ + tx = dmu_tx_create(os); + dmu_tx_hold_zap(tx, object, B_TRUE, NULL); + txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); + if (txg == 0) + return; + for (i = 0; i < 2; i++) { + value[i] = i; + VERIFY3U(0, ==, zap_add(os, object, hc[i], sizeof (uint64_t), + 1, &value[i], tx)); + } + for (i = 0; i < 2; i++) { + VERIFY3U(EEXIST, ==, zap_add(os, object, hc[i], + sizeof (uint64_t), 1, &value[i], tx)); + VERIFY3U(0, ==, + zap_length(os, object, hc[i], &zl_intsize, &zl_ints)); + ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); + ASSERT3U(zl_ints, ==, 1); + } + for (i = 0; i < 2; i++) { + VERIFY3U(0, ==, zap_remove(os, object, hc[i], tx)); } + dmu_tx_commit(tx); + /* + * Generate a buch of random entries. + */ ints = MAX(ZTEST_ZAP_MIN_INTS, object % ZTEST_ZAP_MAX_INTS); prop = ztest_random(ZTEST_ZAP_MAX_PROPS); @@ -2246,14 +3800,10 @@ ztest_zap(ztest_args_t *za) * should be txg + object + n. 
*/ tx = dmu_tx_create(os); - dmu_tx_hold_zap(tx, object, TRUE, NULL); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - ztest_record_enospc("create zap entry"); - dmu_tx_abort(tx); + dmu_tx_hold_zap(tx, object, B_TRUE, NULL); + txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); + if (txg == 0) return; - } - txg = dmu_tx_get_txg(tx); if (last_txg > txg) fatal(0, "zap future leak: old %llu new %llu", last_txg, txg); @@ -2261,16 +3811,10 @@ ztest_zap(ztest_args_t *za) for (i = 0; i < ints; i++) value[i] = txg + object + i; - error = zap_update(os, object, txgname, sizeof (uint64_t), 1, &txg, tx); - if (error) - fatal(0, "zap_update('%s', %llu, '%s') = %d", - osname, object, txgname, error); - - error = zap_update(os, object, propname, sizeof (uint64_t), - ints, value, tx); - if (error) - fatal(0, "zap_update('%s', %llu, '%s') = %d", - osname, object, propname, error); + VERIFY3U(0, ==, zap_update(os, object, txgname, sizeof (uint64_t), + 1, &txg, tx)); + VERIFY3U(0, ==, zap_update(os, object, propname, sizeof (uint64_t), + ints, value, tx)); dmu_tx_commit(tx); @@ -2289,231 +3833,558 @@ ztest_zap(ztest_args_t *za) ASSERT3U(error, ==, 0); tx = dmu_tx_create(os); - dmu_tx_hold_zap(tx, object, TRUE, NULL); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - ztest_record_enospc("remove zap entry"); - dmu_tx_abort(tx); + dmu_tx_hold_zap(tx, object, B_TRUE, NULL); + txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); + if (txg == 0) + return; + VERIFY3U(0, ==, zap_remove(os, object, txgname, tx)); + VERIFY3U(0, ==, zap_remove(os, object, propname, tx)); + dmu_tx_commit(tx); +} + +/* + * Testcase to test the upgrading of a microzap to fatzap. + */ +void +ztest_fzap(ztest_ds_t *zd, uint64_t id) +{ + objset_t *os = zd->zd_os; + ztest_od_t od[1]; + uint64_t object, txg; + + ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0); + + if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0) return; + + object = od[0].od_object; + + /* + * Add entries to this ZAP and make sure it spills over + * and gets upgraded to a fatzap. Also, since we are adding + * 2050 entries we should see ptrtbl growth and leaf-block split. + */ + for (int i = 0; i < 2050; i++) { + char name[MAXNAMELEN]; + uint64_t value = i; + dmu_tx_t *tx; + int error; + + (void) snprintf(name, sizeof (name), "fzap-%llu-%llu", + id, value); + + tx = dmu_tx_create(os); + dmu_tx_hold_zap(tx, object, B_TRUE, name); + txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); + if (txg == 0) + return; + error = zap_add(os, object, name, sizeof (uint64_t), 1, + &value, tx); + ASSERT(error == 0 || error == EEXIST); + dmu_tx_commit(tx); } - error = zap_remove(os, object, txgname, tx); - if (error) - fatal(0, "zap_remove('%s', %llu, '%s') = %d", - osname, object, txgname, error); +} - error = zap_remove(os, object, propname, tx); - if (error) - fatal(0, "zap_remove('%s', %llu, '%s') = %d", - osname, object, propname, error); +/* ARGSUSED */ +void +ztest_zap_parallel(ztest_ds_t *zd, uint64_t id) +{ + objset_t *os = zd->zd_os; + ztest_od_t od[1]; + uint64_t txg, object, count, wsize, wc, zl_wsize, zl_wc; + dmu_tx_t *tx; + int i, namelen, error; + int micro = ztest_random(2); + char name[20], string_value[20]; + void *data; - dmu_tx_commit(tx); + ztest_od_init(&od[0], ID_PARALLEL, FTAG, micro, DMU_OT_ZAP_OTHER, 0, 0); + + if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) + return; + + object = od[0].od_object; + + /* + * Generate a random name of the form 'xxx.....' where each + * x is a random printable character and the dots are dots. 
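/*
 * Sketch of the naming scheme described in the surrounding comment: three
 * random printable characters ('!' through '~', 94 choices each) followed
 * by dots, with a total length of 6 to 20, giving 94^3 * 15 = 12,458,760
 * possible names.  random_zap_name() is a hypothetical helper and rand()
 * stands in for ztest_random().
 */
#include <stdlib.h>

static void
random_zap_name(char name[20])
{
	int namelen = rand() % 15 + 6;	/* 6 .. 20, including the NUL */
	int i;

	for (i = 0; i < 3; i++)
		name[i] = '!' + rand() % ('~' - '!' + 1);
	for (; i < namelen - 1; i++)
		name[i] = '.';
	name[i] = '\0';
}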
+ * There are 94 such characters, and the name length goes from + * 6 to 20, so there are 94^3 * 15 = 12,458,760 possible names. + */ + namelen = ztest_random(sizeof (name) - 5) + 5 + 1; + + for (i = 0; i < 3; i++) + name[i] = '!' + ztest_random('~' - '!' + 1); + for (; i < namelen - 1; i++) + name[i] = '.'; + name[i] = '\0'; + + if ((namelen & 1) || micro) { + wsize = sizeof (txg); + wc = 1; + data = &txg; + } else { + wsize = 1; + wc = namelen; + data = string_value; + } + + count = -1ULL; + VERIFY(zap_count(os, object, &count) == 0); + ASSERT(count != -1ULL); /* - * Once in a while, destroy the object. + * Select an operation: length, lookup, add, update, remove. */ - if (ztest_random(1000) != 0) + i = ztest_random(5); + + if (i >= 2) { + tx = dmu_tx_create(os); + dmu_tx_hold_zap(tx, object, B_TRUE, NULL); + txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); + if (txg == 0) + return; + bcopy(name, string_value, namelen); + } else { + tx = NULL; + txg = 0; + bzero(string_value, namelen); + } + + switch (i) { + + case 0: + error = zap_length(os, object, name, &zl_wsize, &zl_wc); + if (error == 0) { + ASSERT3U(wsize, ==, zl_wsize); + ASSERT3U(wc, ==, zl_wc); + } else { + ASSERT3U(error, ==, ENOENT); + } + break; + + case 1: + error = zap_lookup(os, object, name, wsize, wc, data); + if (error == 0) { + if (data == string_value && + bcmp(name, data, namelen) != 0) + fatal(0, "name '%s' != val '%s' len %d", + name, data, namelen); + } else { + ASSERT3U(error, ==, ENOENT); + } + break; + + case 2: + error = zap_add(os, object, name, wsize, wc, data, tx); + ASSERT(error == 0 || error == EEXIST); + break; + + case 3: + VERIFY(zap_update(os, object, name, wsize, wc, data, tx) == 0); + break; + + case 4: + error = zap_remove(os, object, name, tx); + ASSERT(error == 0 || error == ENOENT); + break; + } + + if (tx != NULL) + dmu_tx_commit(tx); +} + +/* + * Commit callback data. + */ +typedef struct ztest_cb_data { + list_node_t zcd_node; + uint64_t zcd_txg; + int zcd_expected_err; + boolean_t zcd_added; + boolean_t zcd_called; + spa_t *zcd_spa; +} ztest_cb_data_t; + +/* This is the actual commit callback function */ +static void +ztest_commit_callback(void *arg, int error) +{ + ztest_cb_data_t *data = arg; + uint64_t synced_txg; + + VERIFY(data != NULL); + VERIFY3S(data->zcd_expected_err, ==, error); + VERIFY(!data->zcd_called); + + synced_txg = spa_last_synced_txg(data->zcd_spa); + if (data->zcd_txg > synced_txg) + fatal(0, "commit callback of txg %" PRIu64 " called prematurely" + ", last synced txg = %" PRIu64 "\n", data->zcd_txg, + synced_txg); + + data->zcd_called = B_TRUE; + + if (error == ECANCELED) { + ASSERT3U(data->zcd_txg, ==, 0); + ASSERT(!data->zcd_added); + + /* + * The private callback data should be destroyed here, but + * since we are going to check the zcd_called field after + * dmu_tx_abort(), we will destroy it there. + */ + return; + } + + /* Was this callback added to the global callback list? 
*/ + if (!data->zcd_added) + goto out; + + ASSERT3U(data->zcd_txg, !=, 0); + + /* Remove our callback from the list */ + (void) mutex_lock(&zcl.zcl_callbacks_lock); + list_remove(&zcl.zcl_callbacks, data); + (void) mutex_unlock(&zcl.zcl_callbacks_lock); + +out: + umem_free(data, sizeof (ztest_cb_data_t)); +} + +/* Allocate and initialize callback data structure */ +static ztest_cb_data_t * +ztest_create_cb_data(objset_t *os, uint64_t txg) +{ + ztest_cb_data_t *cb_data; + + cb_data = umem_zalloc(sizeof (ztest_cb_data_t), UMEM_NOFAIL); + + cb_data->zcd_txg = txg; + cb_data->zcd_spa = dmu_objset_spa(os); + + return (cb_data); +} + +/* + * If a number of txgs equal to this threshold have been created after a commit + * callback has been registered but not called, then we assume there is an + * implementation bug. + */ +#define ZTEST_COMMIT_CALLBACK_THRESH (TXG_CONCURRENT_STATES + 2) + +/* + * Commit callback test. + */ +void +ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id) +{ + objset_t *os = zd->zd_os; + ztest_od_t od[1]; + dmu_tx_t *tx; + ztest_cb_data_t *cb_data[3], *tmp_cb; + uint64_t old_txg, txg; + int i, error; + + ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0); + + if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) return; tx = dmu_tx_create(os); - dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff, sizeof (uint64_t)); - dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END); - error = dmu_tx_assign(tx, TXG_WAIT); + + cb_data[0] = ztest_create_cb_data(os, 0); + dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[0]); + + dmu_tx_hold_write(tx, od[0].od_object, 0, sizeof (uint64_t)); + + /* Every once in a while, abort the transaction on purpose */ + if (ztest_random(100) == 0) + error = -1; + + if (!error) + error = dmu_tx_assign(tx, TXG_NOWAIT); + + txg = error ? 0 : dmu_tx_get_txg(tx); + + cb_data[0]->zcd_txg = txg; + cb_data[1] = ztest_create_cb_data(os, txg); + dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[1]); + if (error) { - ztest_record_enospc("destroy zap object"); + /* + * It's not a strict requirement to call the registered + * callbacks from inside dmu_tx_abort(), but that's what + * it's supposed to happen in the current implementation + * so we will check for that. + */ + for (i = 0; i < 2; i++) { + cb_data[i]->zcd_expected_err = ECANCELED; + VERIFY(!cb_data[i]->zcd_called); + } + dmu_tx_abort(tx); + + for (i = 0; i < 2; i++) { + VERIFY(cb_data[i]->zcd_called); + umem_free(cb_data[i], sizeof (ztest_cb_data_t)); + } + return; } - error = zap_destroy(os, object, tx); - if (error) - fatal(0, "zap_destroy('%s', %llu) = %d", - osname, object, error); - object = 0; - dmu_write(os, ZTEST_DIROBJ, za->za_diroff, sizeof (uint64_t), - &object, tx); - dmu_tx_commit(tx); -} -void -ztest_zap_parallel(ztest_args_t *za) -{ - objset_t *os = za->za_os; - uint64_t txg, object, count, wsize, wc, zl_wsize, zl_wc; - dmu_tx_t *tx; - int i, namelen, error; - char name[20], string_value[20]; - void *data; + cb_data[2] = ztest_create_cb_data(os, txg); + dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[2]); /* - * Generate a random name of the form 'xxx.....' where each - * x is a random printable character and the dots are dots. - * There are 94 such characters, and the name length goes from - * 6 to 20, so there are 94^3 * 15 = 12,458,760 possible names. + * Read existing data to make sure there isn't a future leak. 
*/ - namelen = ztest_random(sizeof (name) - 5) + 5 + 1; + VERIFY(0 == dmu_read(os, od[0].od_object, 0, sizeof (uint64_t), + &old_txg, DMU_READ_PREFETCH)); - for (i = 0; i < 3; i++) - name[i] = '!' + ztest_random('~' - '!' + 1); - for (; i < namelen - 1; i++) - name[i] = '.'; - name[i] = '\0'; + if (old_txg > txg) + fatal(0, "future leak: got %" PRIu64 ", open txg is %" PRIu64, + old_txg, txg); - if (ztest_random(2) == 0) - object = ZTEST_MICROZAP_OBJ; - else - object = ZTEST_FATZAP_OBJ; + dmu_write(os, od[0].od_object, 0, sizeof (uint64_t), &txg, tx); - if ((namelen & 1) || object == ZTEST_MICROZAP_OBJ) { - wsize = sizeof (txg); - wc = 1; - data = &txg; - } else { - wsize = 1; - wc = namelen; - data = string_value; - } + (void) mutex_lock(&zcl.zcl_callbacks_lock); - count = -1ULL; - VERIFY(zap_count(os, object, &count) == 0); - ASSERT(count != -1ULL); + /* + * Since commit callbacks don't have any ordering requirement and since + * it is theoretically possible for a commit callback to be called + * after an arbitrary amount of time has elapsed since its txg has been + * synced, it is difficult to reliably determine whether a commit + * callback hasn't been called due to high load or due to a flawed + * implementation. + * + * In practice, we will assume that if after a certain number of txgs a + * commit callback hasn't been called, then most likely there's an + * implementation bug.. + */ + tmp_cb = list_head(&zcl.zcl_callbacks); + if (tmp_cb != NULL && + tmp_cb->zcd_txg > txg - ZTEST_COMMIT_CALLBACK_THRESH) { + fatal(0, "Commit callback threshold exceeded, oldest txg: %" + PRIu64 ", open txg: %" PRIu64 "\n", tmp_cb->zcd_txg, txg); + } /* - * Select an operation: length, lookup, add, update, remove. + * Let's find the place to insert our callbacks. + * + * Even though the list is ordered by txg, it is possible for the + * insertion point to not be the end because our txg may already be + * quiescing at this point and other callbacks in the open txg + * (from other objsets) may have sneaked in. 
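As an aside (not part of the change itself), the tail-first search that follows is a general pattern for inserting into a doubly-linked list kept sorted by a monotonically increasing key. A minimal stand-alone sketch in plain C, using hypothetical names rather than the ztest or list(9F) types, might look like this:

#include <stddef.h>
#include <stdint.h>

struct cb_entry {
	uint64_t	ce_txg;		/* sort key */
	struct cb_entry	*ce_prev;
	struct cb_entry	*ce_next;
};

struct cb_list {
	struct cb_entry	*cl_head;
	struct cb_entry	*cl_tail;
};

/*
 * Insert 'new' so the list stays sorted by ce_txg.  New keys are almost
 * always >= the current tail's key, so scanning backward from the tail
 * usually terminates after a step or two.
 */
static void
cb_insert_sorted(struct cb_list *l, struct cb_entry *new)
{
	struct cb_entry *p = l->cl_tail;

	while (p != NULL && p->ce_txg > new->ce_txg)
		p = p->ce_prev;

	new->ce_prev = p;
	if (p == NULL) {			/* becomes the new head */
		new->ce_next = l->cl_head;
		l->cl_head = new;
	} else {				/* goes right after p */
		new->ce_next = p->ce_next;
		p->ce_next = new;
	}
	if (new->ce_next == NULL)		/* becomes the new tail */
		l->cl_tail = new;
	else
		new->ce_next->ce_prev = new;
}

The actual ztest code below does the same thing with list_tail()/list_prev() and list_insert_head()/list_insert_after().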
*/ - i = ztest_random(5); + tmp_cb = list_tail(&zcl.zcl_callbacks); + while (tmp_cb != NULL && tmp_cb->zcd_txg > txg) + tmp_cb = list_prev(&zcl.zcl_callbacks, tmp_cb); + + /* Add the 3 callbacks to the list */ + for (i = 0; i < 3; i++) { + if (tmp_cb == NULL) + list_insert_head(&zcl.zcl_callbacks, cb_data[i]); + else + list_insert_after(&zcl.zcl_callbacks, tmp_cb, + cb_data[i]); - if (i >= 2) { - tx = dmu_tx_create(os); - dmu_tx_hold_zap(tx, object, TRUE, NULL); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - ztest_record_enospc("zap parallel"); - dmu_tx_abort(tx); - return; - } - txg = dmu_tx_get_txg(tx); - bcopy(name, string_value, namelen); - } else { - tx = NULL; - txg = 0; - bzero(string_value, namelen); + cb_data[i]->zcd_added = B_TRUE; + VERIFY(!cb_data[i]->zcd_called); + + tmp_cb = cb_data[i]; } - switch (i) { + (void) mutex_unlock(&zcl.zcl_callbacks_lock); - case 0: - error = zap_length(os, object, name, &zl_wsize, &zl_wc); - if (error == 0) { - ASSERT3U(wsize, ==, zl_wsize); - ASSERT3U(wc, ==, zl_wc); - } else { - ASSERT3U(error, ==, ENOENT); - } - break; + dmu_tx_commit(tx); +} - case 1: - error = zap_lookup(os, object, name, wsize, wc, data); - if (error == 0) { - if (data == string_value && - bcmp(name, data, namelen) != 0) - fatal(0, "name '%s' != val '%s' len %d", - name, data, namelen); - } else { - ASSERT3U(error, ==, ENOENT); - } - break; +/* ARGSUSED */ +void +ztest_dsl_prop_get_set(ztest_ds_t *zd, uint64_t id) +{ + zfs_prop_t proplist[] = { + ZFS_PROP_CHECKSUM, + ZFS_PROP_COMPRESSION, + ZFS_PROP_COPIES, + ZFS_PROP_DEDUP + }; + ztest_shared_t *zs = ztest_shared; - case 2: - error = zap_add(os, object, name, wsize, wc, data, tx); - ASSERT(error == 0 || error == EEXIST); - break; + (void) rw_rdlock(&zs->zs_name_lock); - case 3: - VERIFY(zap_update(os, object, name, wsize, wc, data, tx) == 0); - break; + for (int p = 0; p < sizeof (proplist) / sizeof (proplist[0]); p++) + (void) ztest_dsl_prop_set_uint64(zd->zd_name, proplist[p], + ztest_random_dsl_prop(proplist[p]), (int)ztest_random(2)); - case 4: - error = zap_remove(os, object, name, tx); - ASSERT(error == 0 || error == ENOENT); - break; - } + (void) rw_unlock(&zs->zs_name_lock); +} - if (tx != NULL) - dmu_tx_commit(tx); +/* ARGSUSED */ +void +ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id) +{ + ztest_shared_t *zs = ztest_shared; + nvlist_t *props = NULL; + + (void) rw_rdlock(&zs->zs_name_lock); + + (void) ztest_spa_prop_set_uint64(zs, ZPOOL_PROP_DEDUPDITTO, + ZIO_DEDUPDITTO_MIN + ztest_random(ZIO_DEDUPDITTO_MIN)); + + VERIFY3U(spa_prop_get(zs->zs_spa, &props), ==, 0); + + if (zopt_verbose >= 6) + dump_nvlist(props, 4); + + nvlist_free(props); + + (void) rw_unlock(&zs->zs_name_lock); } +/* + * Test snapshot hold/release and deferred destroy. 
+ */ void -ztest_dsl_prop_get_set(ztest_args_t *za) +ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) { - objset_t *os = za->za_os; - int i, inherit; - uint64_t value; - const char *prop, *valname; - char setpoint[MAXPATHLEN]; - char osname[MAXNAMELEN]; int error; + objset_t *os = zd->zd_os; + objset_t *origin; + char snapname[100]; + char fullname[100]; + char clonename[100]; + char tag[100]; + char osname[MAXNAMELEN]; (void) rw_rdlock(&ztest_shared->zs_name_lock); dmu_objset_name(os, osname); - for (i = 0; i < 2; i++) { - if (i == 0) { - prop = "checksum"; - value = ztest_random_checksum(); - inherit = (value == ZIO_CHECKSUM_INHERIT); - } else { - prop = "compression"; - value = ztest_random_compress(); - inherit = (value == ZIO_COMPRESS_INHERIT); + (void) snprintf(snapname, 100, "sh1_%llu", id); + (void) snprintf(fullname, 100, "%s@%s", osname, snapname); + (void) snprintf(clonename, 100, "%s/ch1_%llu", osname, id); + (void) snprintf(tag, 100, "%tag_%llu", id); + + /* + * Clean up from any previous run. + */ + (void) dmu_objset_destroy(clonename, B_FALSE); + (void) dsl_dataset_user_release(osname, snapname, tag, B_FALSE); + (void) dmu_objset_destroy(fullname, B_FALSE); + + /* + * Create snapshot, clone it, mark snap for deferred destroy, + * destroy clone, verify snap was also destroyed. + */ + error = dmu_objset_snapshot(osname, snapname, NULL, FALSE); + if (error) { + if (error == ENOSPC) { + ztest_record_enospc("dmu_objset_snapshot"); + goto out; } + fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error); + } - error = dsl_prop_set(osname, prop, sizeof (value), - !inherit, &value); + error = dmu_objset_hold(fullname, FTAG, &origin); + if (error) + fatal(0, "dmu_objset_hold(%s) = %d", fullname, error); + error = dmu_objset_clone(clonename, dmu_objset_ds(origin), 0); + dmu_objset_rele(origin, FTAG); + if (error) { if (error == ENOSPC) { - ztest_record_enospc("dsl_prop_set"); - break; + ztest_record_enospc("dmu_objset_clone"); + goto out; } + fatal(0, "dmu_objset_clone(%s) = %d", clonename, error); + } - ASSERT3U(error, ==, 0); + error = dmu_objset_destroy(fullname, B_TRUE); + if (error) { + fatal(0, "dmu_objset_destroy(%s, B_TRUE) = %d", + fullname, error); + } - VERIFY3U(dsl_prop_get(osname, prop, sizeof (value), - 1, &value, setpoint), ==, 0); + error = dmu_objset_destroy(clonename, B_FALSE); + if (error) + fatal(0, "dmu_objset_destroy(%s) = %d", clonename, error); - if (i == 0) - valname = zio_checksum_table[value].ci_name; - else - valname = zio_compress_table[value].ci_name; + error = dmu_objset_hold(fullname, FTAG, &origin); + if (error != ENOENT) + fatal(0, "dmu_objset_hold(%s) = %d", fullname, error); - if (zopt_verbose >= 6) { - (void) printf("%s %s = %s for '%s'\n", - osname, prop, valname, setpoint); + /* + * Create snapshot, add temporary hold, verify that we can't + * destroy a held snapshot, mark for deferred destroy, + * release hold, verify snapshot was destroyed. 
+ */ + error = dmu_objset_snapshot(osname, snapname, NULL, FALSE); + if (error) { + if (error == ENOSPC) { + ztest_record_enospc("dmu_objset_snapshot"); + goto out; } + fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error); + } + + error = dsl_dataset_user_hold(osname, snapname, tag, B_FALSE, B_TRUE); + if (error) + fatal(0, "dsl_dataset_user_hold(%s)", fullname, tag); + + error = dmu_objset_destroy(fullname, B_FALSE); + if (error != EBUSY) { + fatal(0, "dmu_objset_destroy(%s, B_FALSE) = %d", + fullname, error); + } + + error = dmu_objset_destroy(fullname, B_TRUE); + if (error) { + fatal(0, "dmu_objset_destroy(%s, B_TRUE) = %d", + fullname, error); } + error = dsl_dataset_user_release(osname, snapname, tag, B_FALSE); + if (error) + fatal(0, "dsl_dataset_user_release(%s)", fullname, tag); + + VERIFY(dmu_objset_hold(fullname, FTAG, &origin) == ENOENT); + +out: (void) rw_unlock(&ztest_shared->zs_name_lock); } /* * Inject random faults into the on-disk data. */ +/* ARGSUSED */ void -ztest_fault_inject(ztest_args_t *za) +ztest_fault_inject(ztest_ds_t *zd, uint64_t id) { + ztest_shared_t *zs = ztest_shared; + spa_t *spa = zs->zs_spa; int fd; uint64_t offset; - uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz; + uint64_t leaves; uint64_t bad = 0x1990c0ffeedecade; uint64_t top, leaf; char path0[MAXPATHLEN]; char pathrand[MAXPATHLEN]; size_t fsize; - spa_t *spa = za->za_spa; int bshift = SPA_MAXBLOCKSHIFT + 2; /* don't scrog all labels */ int iters = 1000; - int maxfaults = zopt_maxfaults; + int maxfaults; + int mirror_save; vdev_t *vd0 = NULL; uint64_t guid0 = 0; + boolean_t islog = B_FALSE; + + VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0); + maxfaults = MAXFAULTS(); + leaves = MAX(zs->zs_mirrors, 1) * zopt_raidz; + mirror_save = zs->zs_mirrors; + VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); ASSERT(leaves >= 1); @@ -2524,10 +4395,10 @@ ztest_fault_inject(ztest_args_t *za) if (ztest_random(2) == 0) { /* - * Inject errors on a normal data device. + * Inject errors on a normal data device or slog device. */ - top = ztest_random(spa->spa_root_vdev->vdev_children); - leaf = ztest_random(leaves); + top = ztest_random_vdev_top(spa, B_TRUE); + leaf = ztest_random(leaves) + zs->zs_splits; /* * Generate paths to the first leaf in this top-level vdev, @@ -2536,11 +4407,14 @@ ztest_fault_inject(ztest_args_t *za) * and we'll write random garbage to the randomly chosen leaf. */ (void) snprintf(path0, sizeof (path0), ztest_dev_template, - zopt_dir, zopt_pool, top * leaves + 0); + zopt_dir, zopt_pool, top * leaves + zs->zs_splits); (void) snprintf(pathrand, sizeof (pathrand), ztest_dev_template, zopt_dir, zopt_pool, top * leaves + leaf); vd0 = vdev_lookup_by_path(spa->spa_root_vdev, path0); + if (vd0 != NULL && vd0->vdev_top->vdev_islog) + islog = B_TRUE; + if (vd0 != NULL && maxfaults != 1) { /* * Make vd0 explicitly claim to be unreadable, @@ -2584,23 +4458,40 @@ ztest_fault_inject(ztest_args_t *za) maxfaults = INT_MAX; /* no limit on cache devices */ } - dprintf("damaging %s and %s\n", path0, pathrand); - spa_config_exit(spa, SCL_STATE, FTAG); - if (maxfaults == 0) - return; - /* - * If we can tolerate two or more faults, randomly online/offline vd0. + * If we can tolerate two or more faults, or we're dealing + * with a slog, randomly online/offline vd0. 
*/ - if (maxfaults >= 2 && guid0 != 0) { - if (ztest_random(10) < 6) - (void) vdev_offline(spa, guid0, B_TRUE); - else - (void) vdev_online(spa, guid0, B_FALSE, NULL); + if ((maxfaults >= 2 || islog) && guid0 != 0) { + if (ztest_random(10) < 6) { + int flags = (ztest_random(2) == 0 ? + ZFS_OFFLINE_TEMPORARY : 0); + + /* + * We have to grab the zs_name_lock as writer to + * prevent a race between offlining a slog and + * destroying a dataset. Offlining the slog will + * grab a reference on the dataset which may cause + * dmu_objset_destroy() to fail with EBUSY thus + * leaving the dataset in an inconsistent state. + */ + if (islog) + (void) rw_wrlock(&ztest_shared->zs_name_lock); + + VERIFY(vdev_offline(spa, guid0, flags) != EBUSY); + + if (islog) + (void) rw_unlock(&ztest_shared->zs_name_lock); + } else { + (void) vdev_online(spa, guid0, 0, NULL); + } } + if (maxfaults == 0) + return; + /* * We have at least single-fault tolerance, so inject data corruption. */ @@ -2619,173 +4510,196 @@ ztest_fault_inject(ztest_args_t *za) if (offset >= fsize) continue; - if (zopt_verbose >= 6) - (void) printf("injecting bad word into %s," - " offset 0x%llx\n", pathrand, (u_longlong_t)offset); + VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0); + if (mirror_save != zs->zs_mirrors) { + VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); + (void) close(fd); + return; + } if (pwrite(fd, &bad, sizeof (bad), offset) != sizeof (bad)) fatal(1, "can't inject bad word at 0x%llx in %s", offset, pathrand); + + VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); + + if (zopt_verbose >= 7) + (void) printf("injected bad word into %s," + " offset 0x%llx\n", pathrand, (u_longlong_t)offset); } (void) close(fd); } /* - * Scrub the pool. + * Verify that DDT repair works as expected. */ void -ztest_scrub(ztest_args_t *za) +ztest_ddt_repair(ztest_ds_t *zd, uint64_t id) { - spa_t *spa = za->za_spa; - - (void) spa_scrub(spa, POOL_SCRUB_EVERYTHING); - (void) poll(NULL, 0, 1000); /* wait a second, then force a restart */ - (void) spa_scrub(spa, POOL_SCRUB_EVERYTHING); -} + ztest_shared_t *zs = ztest_shared; + spa_t *spa = zs->zs_spa; + objset_t *os = zd->zd_os; + ztest_od_t od[1]; + uint64_t object, blocksize, txg, pattern, psize; + enum zio_checksum checksum = spa_dedup_checksum(spa); + dmu_buf_t *db; + dmu_tx_t *tx; + void *buf; + blkptr_t blk; + int copies = 2 * ZIO_DEDUPDITTO_MIN; -/* - * Rename the pool to a different name and then rename it back. - */ -void -ztest_spa_rename(ztest_args_t *za) -{ - char *oldname, *newname; - int error; - spa_t *spa; + blocksize = ztest_random_blocksize(); + blocksize = MIN(blocksize, 2048); /* because we write so many */ - (void) rw_wrlock(&ztest_shared->zs_name_lock); + ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0); - oldname = za->za_pool; - newname = umem_alloc(strlen(oldname) + 5, UMEM_NOFAIL); - (void) strcpy(newname, oldname); - (void) strcat(newname, "_tmp"); + if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) + return; /* - * Do the rename + * Take the name lock as writer to prevent anyone else from changing + * the pool and dataset properies we need to maintain during this test. 
*/ - error = spa_rename(oldname, newname); - if (error) - fatal(0, "spa_rename('%s', '%s') = %d", oldname, - newname, error); + (void) rw_wrlock(&zs->zs_name_lock); - /* - * Try to open it under the old name, which shouldn't exist - */ - error = spa_open(oldname, &spa, FTAG); - if (error != ENOENT) - fatal(0, "spa_open('%s') = %d", oldname, error); + if (ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_DEDUP, checksum, + B_FALSE) != 0 || + ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_COPIES, 1, + B_FALSE) != 0) { + (void) rw_unlock(&zs->zs_name_lock); + return; + } + + object = od[0].od_object; + blocksize = od[0].od_blocksize; + pattern = spa_guid(spa) ^ dmu_objset_fsid_guid(os); + + ASSERT(object != 0); + + tx = dmu_tx_create(os); + dmu_tx_hold_write(tx, object, 0, copies * blocksize); + txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); + if (txg == 0) { + (void) rw_unlock(&zs->zs_name_lock); + return; + } /* - * Open it under the new name and make sure it's still the same spa_t. + * Write all the copies of our block. */ - error = spa_open(newname, &spa, FTAG); - if (error != 0) - fatal(0, "spa_open('%s') = %d", newname, error); + for (int i = 0; i < copies; i++) { + uint64_t offset = i * blocksize; + VERIFY(dmu_buf_hold(os, object, offset, FTAG, &db) == 0); + ASSERT(db->db_offset == offset); + ASSERT(db->db_size == blocksize); + ASSERT(ztest_pattern_match(db->db_data, db->db_size, pattern) || + ztest_pattern_match(db->db_data, db->db_size, 0ULL)); + dmu_buf_will_fill(db, tx); + ztest_pattern_set(db->db_data, db->db_size, pattern); + dmu_buf_rele(db, FTAG); + } - ASSERT(spa == za->za_spa); - spa_close(spa, FTAG); + dmu_tx_commit(tx); + txg_wait_synced(spa_get_dsl(spa), txg); /* - * Rename it back to the original + * Find out what block we got. */ - error = spa_rename(newname, oldname); - if (error) - fatal(0, "spa_rename('%s', '%s') = %d", newname, - oldname, error); + VERIFY(dmu_buf_hold(os, object, 0, FTAG, &db) == 0); + blk = *((dmu_buf_impl_t *)db)->db_blkptr; + dmu_buf_rele(db, FTAG); /* - * Make sure it can still be opened + * Damage the block. Dedup-ditto will save us when we read it later. */ - error = spa_open(oldname, &spa, FTAG); - if (error != 0) - fatal(0, "spa_open('%s') = %d", oldname, error); + psize = BP_GET_PSIZE(&blk); + buf = zio_buf_alloc(psize); + ztest_pattern_set(buf, psize, ~pattern); - ASSERT(spa == za->za_spa); - spa_close(spa, FTAG); + (void) zio_wait(zio_rewrite(NULL, spa, 0, &blk, + buf, psize, NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, + ZIO_FLAG_CANFAIL | ZIO_FLAG_INDUCE_DAMAGE, NULL)); - umem_free(newname, strlen(newname) + 1); + zio_buf_free(buf, psize); - (void) rw_unlock(&ztest_shared->zs_name_lock); + (void) rw_unlock(&zs->zs_name_lock); } - /* - * Completely obliterate one disk. + * Scrub the pool. */ -static void -ztest_obliterate_one_disk(uint64_t vdev) +/* ARGSUSED */ +void +ztest_scrub(ztest_ds_t *zd, uint64_t id) { - int fd; - char dev_name[MAXPATHLEN], copy_name[MAXPATHLEN]; - size_t fsize; + ztest_shared_t *zs = ztest_shared; + spa_t *spa = zs->zs_spa; - if (zopt_maxfaults < 2) - return; + (void) spa_scrub(spa, POOL_SCRUB_EVERYTHING); + (void) poll(NULL, 0, 100); /* wait a moment, then force a restart */ + (void) spa_scrub(spa, POOL_SCRUB_EVERYTHING); +} - (void) sprintf(dev_name, ztest_dev_template, zopt_dir, zopt_pool, vdev); - (void) snprintf(copy_name, MAXPATHLEN, "%s.old", dev_name); +/* + * Rename the pool to a different name and then rename it back. 
+ */ +/* ARGSUSED */ +void +ztest_spa_rename(ztest_ds_t *zd, uint64_t id) +{ + ztest_shared_t *zs = ztest_shared; + char *oldname, *newname; + spa_t *spa; - fd = open(dev_name, O_RDWR); + (void) rw_wrlock(&zs->zs_name_lock); - if (fd == -1) - fatal(1, "can't open %s", dev_name); + oldname = zs->zs_pool; + newname = umem_alloc(strlen(oldname) + 5, UMEM_NOFAIL); + (void) strcpy(newname, oldname); + (void) strcat(newname, "_tmp"); /* - * Determine the size. + * Do the rename */ - fsize = lseek(fd, 0, SEEK_END); - - (void) close(fd); + VERIFY3U(0, ==, spa_rename(oldname, newname)); /* - * Rename the old device to dev_name.old (useful for debugging). + * Try to open it under the old name, which shouldn't exist */ - VERIFY(rename(dev_name, copy_name) == 0); + VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG)); /* - * Create a new one. + * Open it under the new name and make sure it's still the same spa_t. */ - VERIFY((fd = open(dev_name, O_RDWR | O_CREAT | O_TRUNC, 0666)) >= 0); - VERIFY(ftruncate(fd, fsize) == 0); - (void) close(fd); -} + VERIFY3U(0, ==, spa_open(newname, &spa, FTAG)); -static void -ztest_replace_one_disk(spa_t *spa, uint64_t vdev) -{ - char dev_name[MAXPATHLEN]; - nvlist_t *root; - int error; - uint64_t guid; - vdev_t *vd; + ASSERT(spa == zs->zs_spa); + spa_close(spa, FTAG); - (void) sprintf(dev_name, ztest_dev_template, zopt_dir, zopt_pool, vdev); + /* + * Rename it back to the original + */ + VERIFY3U(0, ==, spa_rename(newname, oldname)); /* - * Build the nvlist describing dev_name. + * Make sure it can still be opened */ - root = make_vdev_root(dev_name, NULL, 0, 0, 0, 0, 0, 1); + VERIFY3U(0, ==, spa_open(oldname, &spa, FTAG)); - spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); - if ((vd = vdev_lookup_by_path(spa->spa_root_vdev, dev_name)) == NULL) - guid = 0; - else - guid = vd->vdev_guid; - spa_config_exit(spa, SCL_VDEV, FTAG); - error = spa_vdev_attach(spa, guid, root, B_TRUE); - if (error != 0 && - error != EBUSY && - error != ENOTSUP && - error != ENODEV && - error != EDOM) - fatal(0, "spa_vdev_attach(in-place) = %d", error); + ASSERT(spa == zs->zs_spa); + spa_close(spa, FTAG); - nvlist_free(root); + umem_free(newname, strlen(newname) + 1); + + (void) rw_unlock(&zs->zs_name_lock); } +/* + * Verify pool integrity by running zdb. + */ static void -ztest_verify_blocks(char *pool) +ztest_run_zdb(char *pool) { int status; char zdb[MAXPATHLEN + MAXNAMELEN + 20]; @@ -2806,7 +4720,7 @@ ztest_verify_blocks(char *pool) isa = strdup(isa); /* LINTED */ (void) sprintf(bin, - "/usr/sbin%.*s/zdb -bc%s%s -U /tmp/zpool.cache %s", + "/usr/sbin%.*s/zdb -bcc%s%s -U /tmp/zpool.cache %s", isalen, isa, zopt_verbose >= 3 ? "s" : "", @@ -2853,10 +4767,9 @@ ztest_walk_pool_directory(char *header) static void ztest_spa_import_export(char *oldname, char *newname) { - nvlist_t *config; + nvlist_t *config, *newconfig; uint64_t pool_guid; spa_t *spa; - int error; if (zopt_verbose >= 4) { (void) printf("import/export: old = %s, new = %s\n", @@ -2871,9 +4784,13 @@ ztest_spa_import_export(char *oldname, char *newname) /* * Get the pool's configuration and guid. */ - error = spa_open(oldname, &spa, FTAG); - if (error) - fatal(0, "spa_open('%s') = %d", oldname, error); + VERIFY3U(0, ==, spa_open(oldname, &spa, FTAG)); + + /* + * Kick off a scrub to tickle scrub/export races. + */ + if (ztest_random(2) == 0) + (void) spa_scrub(spa, POOL_SCRUB_EVERYTHING); pool_guid = spa_guid(spa); spa_close(spa, FTAG); @@ -2883,225 +4800,337 @@ ztest_spa_import_export(char *oldname, char *newname) /* * Export it. 
*/ - error = spa_export(oldname, &config, B_FALSE); - if (error) - fatal(0, "spa_export('%s') = %d", oldname, error); + VERIFY3U(0, ==, spa_export(oldname, &config, B_FALSE, B_FALSE)); ztest_walk_pool_directory("pools after export"); + /* + * Try to import it. + */ + newconfig = spa_tryimport(config); + ASSERT(newconfig != NULL); + nvlist_free(newconfig); + /* * Import it under the new name. */ - error = spa_import(newname, config, NULL); - if (error) - fatal(0, "spa_import('%s') = %d", newname, error); + VERIFY3U(0, ==, spa_import(newname, config, NULL)); ztest_walk_pool_directory("pools after import"); /* * Try to import it again -- should fail with EEXIST. */ - error = spa_import(newname, config, NULL); - if (error != EEXIST) - fatal(0, "spa_import('%s') twice", newname); + VERIFY3U(EEXIST, ==, spa_import(newname, config, NULL)); /* * Try to import it under a different name -- should fail with EEXIST. */ - error = spa_import(oldname, config, NULL); - if (error != EEXIST) - fatal(0, "spa_import('%s') under multiple names", newname); + VERIFY3U(EEXIST, ==, spa_import(oldname, config, NULL)); /* * Verify that the pool is no longer visible under the old name. */ - error = spa_open(oldname, &spa, FTAG); - if (error != ENOENT) - fatal(0, "spa_open('%s') = %d", newname, error); + VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG)); /* * Verify that we can open and close the pool using the new name. */ - error = spa_open(newname, &spa, FTAG); - if (error) - fatal(0, "spa_open('%s') = %d", newname, error); + VERIFY3U(0, ==, spa_open(newname, &spa, FTAG)); ASSERT(pool_guid == spa_guid(spa)); spa_close(spa, FTAG); nvlist_free(config); } +static void +ztest_resume(spa_t *spa) +{ + if (spa_suspended(spa) && zopt_verbose >= 6) + (void) printf("resuming from suspended state\n"); + spa_vdev_state_enter(spa, SCL_NONE); + vdev_clear(spa, NULL); + (void) spa_vdev_state_exit(spa, NULL, 0); + (void) zio_resume(spa); +} + static void * -ztest_resume(void *arg) +ztest_resume_thread(void *arg) { spa_t *spa = arg; while (!ztest_exiting) { - (void) poll(NULL, 0, 1000); + if (spa_suspended(spa)) + ztest_resume(spa); + (void) poll(NULL, 0, 100); + } + return (NULL); +} - if (!spa_suspended(spa)) - continue; +static void * +ztest_deadman_thread(void *arg) +{ + ztest_shared_t *zs = arg; + int grace = 300; + hrtime_t delta; - spa_vdev_state_enter(spa); - vdev_clear(spa, NULL); - (void) spa_vdev_state_exit(spa, NULL, 0); + delta = (zs->zs_thread_stop - zs->zs_thread_start) / NANOSEC + grace; + + (void) poll(NULL, 0, (int)(1000 * delta)); + + fatal(0, "failed to complete within %d seconds of deadline", grace); - zio_resume(spa); - } return (NULL); } +static void +ztest_execute(ztest_info_t *zi, uint64_t id) +{ + ztest_shared_t *zs = ztest_shared; + ztest_ds_t *zd = &zs->zs_zd[id % zopt_datasets]; + hrtime_t functime = gethrtime(); + + for (int i = 0; i < zi->zi_iters; i++) + zi->zi_func(zd, id); + + functime = gethrtime() - functime; + + atomic_add_64(&zi->zi_call_count, 1); + atomic_add_64(&zi->zi_call_time, functime); + + if (zopt_verbose >= 4) { + Dl_info dli; + (void) dladdr((void *)zi->zi_func, &dli); + (void) printf("%6.2f sec in %s\n", + (double)functime / NANOSEC, dli.dli_sname); + } +} + static void * ztest_thread(void *arg) { - ztest_args_t *za = arg; + uint64_t id = (uintptr_t)arg; ztest_shared_t *zs = ztest_shared; - hrtime_t now, functime; + uint64_t call_next; + hrtime_t now; ztest_info_t *zi; - int f, i; - while ((now = gethrtime()) < za->za_stop) { + while ((now = gethrtime()) < zs->zs_thread_stop) { /* 
* See if it's time to force a crash. */ - if (now > za->za_kill) { - zs->zs_alloc = spa_get_alloc(za->za_spa); - zs->zs_space = spa_get_space(za->za_spa); - (void) kill(getpid(), SIGKILL); - } + if (now > zs->zs_thread_kill) + ztest_kill(zs); /* - * Pick a random function. + * If we're getting ENOSPC with some regularity, stop. */ - f = ztest_random(ZTEST_FUNCS); - zi = &zs->zs_info[f]; + if (zs->zs_enospc_count > 10) + break; /* - * Decide whether to call it, based on the requested frequency. + * Pick a random function to execute. */ - if (zi->zi_call_target == 0 || - (double)zi->zi_call_total / zi->zi_call_target > - (double)(now - zs->zs_start_time) / (zopt_time * NANOSEC)) - continue; + zi = &zs->zs_info[ztest_random(ZTEST_FUNCS)]; + call_next = zi->zi_call_next; - atomic_add_64(&zi->zi_calls, 1); - atomic_add_64(&zi->zi_call_total, 1); + if (now >= call_next && + atomic_cas_64(&zi->zi_call_next, call_next, call_next + + ztest_random(2 * zi->zi_interval[0] + 1)) == call_next) + ztest_execute(zi, id); + } - za->za_diroff = (za->za_instance * ZTEST_FUNCS + f) * - ZTEST_DIRSIZE; - za->za_diroff_shared = (1ULL << 63); + return (NULL); +} - for (i = 0; i < zi->zi_iters; i++) - zi->zi_func(za); +static void +ztest_dataset_name(char *dsname, char *pool, int d) +{ + (void) snprintf(dsname, MAXNAMELEN, "%s/ds_%d", pool, d); +} - functime = gethrtime() - now; +static void +ztest_dataset_destroy(ztest_shared_t *zs, int d) +{ + char name[MAXNAMELEN]; - atomic_add_64(&zi->zi_call_time, functime); + ztest_dataset_name(name, zs->zs_pool, d); - if (zopt_verbose >= 4) { - Dl_info dli; - (void) dladdr((void *)zi->zi_func, &dli); - (void) printf("%6.2f sec in %s\n", - (double)functime / NANOSEC, dli.dli_sname); - } + if (zopt_verbose >= 3) + (void) printf("Destroying %s to free up space\n", name); - /* - * If we're getting ENOSPC with some regularity, stop. - */ - if (zs->zs_enospc_count > 10) - break; + /* + * Cleanup any non-standard clones and snapshots. In general, + * ztest thread t operates on dataset (t % zopt_datasets), + * so there may be more than one thing to clean up. + */ + for (int t = d; t < zopt_threads; t += zopt_datasets) + ztest_dsl_dataset_cleanup(name, t); + + (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL, + DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); +} + +static void +ztest_dataset_dirobj_verify(ztest_ds_t *zd) +{ + uint64_t usedobjs, dirobjs, scratch; + + /* + * ZTEST_DIROBJ is the object directory for the entire dataset. + * Therefore, the number of objects in use should equal the + * number of ZTEST_DIROBJ entries, +1 for ZTEST_DIROBJ itself. + * If not, we have an object leak. + * + * Note that we can only check this in ztest_dataset_open(), + * when the open-context and syncing-context values agree. + * That's because zap_count() returns the open-context value, + * while dmu_objset_space() returns the rootbp fill count. 
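As an aside on the ztest_thread() loop above: the atomic_cas_64() on zi_call_next is what lets many threads share one schedule without a lock, since only the thread whose compare-and-swap succeeds gets to execute the function for that due time. A stand-alone sketch of the same claim-and-reschedule pattern, using C11 atomics and hypothetical names (this is an illustration, not the ztest code), could look like:

#include <stdatomic.h>
#include <stdint.h>

typedef struct sched_slot {
	_Atomic uint64_t ss_next_due;	/* absolute time of the next call */
	uint64_t ss_interval;		/* nominal spacing between calls */
} sched_slot_t;

/*
 * Return 1 if the calling thread won the right to run this slot now,
 * 0 otherwise.  Only one CAS against the observed due time can succeed,
 * so each due slot is executed by exactly one thread.
 */
static int
sched_try_claim(sched_slot_t *ss, uint64_t now, uint64_t jitter)
{
	uint64_t due = atomic_load(&ss->ss_next_due);

	if (now < due)
		return (0);		/* not due yet */

	/* Claim the slot and push the next due time forward atomically. */
	return (atomic_compare_exchange_strong(&ss->ss_next_due, &due,
	    due + ss->ss_interval + jitter));
}

The random jitter plays the role of ztest_random(2 * zi_interval[0] + 1) above, keeping the average call rate near the requested interval while spreading the calls out in time.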
+ */ + VERIFY3U(0, ==, zap_count(zd->zd_os, ZTEST_DIROBJ, &dirobjs)); + dmu_objset_space(zd->zd_os, &scratch, &scratch, &usedobjs, &scratch); + ASSERT3U(dirobjs + 1, ==, usedobjs); +} + +static int +ztest_dataset_open(ztest_shared_t *zs, int d) +{ + ztest_ds_t *zd = &zs->zs_zd[d]; + uint64_t committed_seq = zd->zd_seq; + objset_t *os; + zilog_t *zilog; + char name[MAXNAMELEN]; + int error; + + ztest_dataset_name(name, zs->zs_pool, d); + + (void) rw_rdlock(&zs->zs_name_lock); + + error = dmu_objset_create(name, DMU_OST_OTHER, 0, + ztest_objset_create_cb, NULL); + if (error == ENOSPC) { + (void) rw_unlock(&zs->zs_name_lock); + ztest_record_enospc(FTAG); + return (error); } + ASSERT(error == 0 || error == EEXIST); - return (NULL); + VERIFY3U(dmu_objset_hold(name, zd, &os), ==, 0); + (void) rw_unlock(&zs->zs_name_lock); + + ztest_zd_init(zd, os); + + zilog = zd->zd_zilog; + + if (zilog->zl_header->zh_claim_lr_seq != 0 && + zilog->zl_header->zh_claim_lr_seq < committed_seq) + fatal(0, "missing log records: claimed %llu < committed %llu", + zilog->zl_header->zh_claim_lr_seq, committed_seq); + + ztest_dataset_dirobj_verify(zd); + + zil_replay(os, zd, ztest_replay_vector); + + ztest_dataset_dirobj_verify(zd); + + if (zopt_verbose >= 6) + (void) printf("%s replay %llu blocks, %llu records, seq %llu\n", + zd->zd_name, + (u_longlong_t)zilog->zl_parse_blk_count, + (u_longlong_t)zilog->zl_parse_lr_count, + (u_longlong_t)zilog->zl_replaying_seq); + + zilog = zil_open(os, ztest_get_data); + + if (zilog->zl_replaying_seq != 0 && + zilog->zl_replaying_seq < committed_seq) + fatal(0, "missing log records: replayed %llu < committed %llu", + zilog->zl_replaying_seq, committed_seq); + + return (0); +} + +static void +ztest_dataset_close(ztest_shared_t *zs, int d) +{ + ztest_ds_t *zd = &zs->zs_zd[d]; + + zil_close(zd->zd_zilog); + dmu_objset_rele(zd->zd_os, zd); + + ztest_zd_fini(zd); } /* * Kick off threads to run tests on all datasets in parallel. */ static void -ztest_run(char *pool) +ztest_run(ztest_shared_t *zs) { - int t, d, error; - ztest_shared_t *zs = ztest_shared; - ztest_args_t *za; + thread_t *tid; spa_t *spa; - char name[100]; thread_t resume_tid; + int error; ztest_exiting = B_FALSE; - (void) _mutex_init(&zs->zs_vdev_lock, USYNC_THREAD, NULL); - (void) rwlock_init(&zs->zs_name_lock, USYNC_THREAD, NULL); - - for (t = 0; t < ZTEST_SYNC_LOCKS; t++) - (void) _mutex_init(&zs->zs_sync_lock[t], USYNC_THREAD, NULL); - /* - * Destroy one disk before we even start. - * It's mirrored, so everything should work just fine. - * This makes us exercise fault handling very early in spa_load(). + * Initialize parent/child shared state. */ - ztest_obliterate_one_disk(0); + VERIFY(_mutex_init(&zs->zs_vdev_lock, USYNC_THREAD, NULL) == 0); + VERIFY(rwlock_init(&zs->zs_name_lock, USYNC_THREAD, NULL) == 0); - /* - * Verify that the sum of the sizes of all blocks in the pool - * equals the SPA's allocated space total. - */ - ztest_verify_blocks(pool); + zs->zs_thread_start = gethrtime(); + zs->zs_thread_stop = zs->zs_thread_start + zopt_passtime * NANOSEC; + zs->zs_thread_stop = MIN(zs->zs_thread_stop, zs->zs_proc_stop); + zs->zs_thread_kill = zs->zs_thread_stop; + if (ztest_random(100) < zopt_killrate) + zs->zs_thread_kill -= ztest_random(zopt_passtime * NANOSEC); - /* - * Kick off a replacement of the disk we just obliterated. 
- */ - kernel_init(FREAD | FWRITE); - VERIFY(spa_open(pool, &spa, FTAG) == 0); - ztest_replace_one_disk(spa, 0); - if (zopt_verbose >= 5) - show_pool_stats(spa); - spa_close(spa, FTAG); - kernel_fini(); + (void) _mutex_init(&zcl.zcl_callbacks_lock, USYNC_THREAD, NULL); - kernel_init(FREAD | FWRITE); + list_create(&zcl.zcl_callbacks, sizeof (ztest_cb_data_t), + offsetof(ztest_cb_data_t, zcd_node)); /* - * Verify that we can export the pool and reimport it under a - * different name. + * Open our pool. */ - if (ztest_random(2) == 0) { - (void) snprintf(name, 100, "%s_import", pool); - ztest_spa_import_export(pool, name); - ztest_spa_import_export(name, pool); - } + kernel_init(FREAD | FWRITE); + VERIFY(spa_open(zs->zs_pool, &spa, FTAG) == 0); + zs->zs_spa = spa; - /* - * Verify that we can loop over all pools. - */ - mutex_enter(&spa_namespace_lock); - for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa)) { - if (zopt_verbose > 3) { - (void) printf("spa_next: found %s\n", spa_name(spa)); - } - } - mutex_exit(&spa_namespace_lock); + spa->spa_dedup_ditto = 2 * ZIO_DEDUPDITTO_MIN; /* - * Open our pool. + * We don't expect the pool to suspend unless maxfaults == 0, + * in which case ztest_fault_inject() temporarily takes away + * the only valid replica. */ - VERIFY(spa_open(pool, &spa, FTAG) == 0); + if (MAXFAULTS() == 0) + spa->spa_failmode = ZIO_FAILURE_MODE_WAIT; + else + spa->spa_failmode = ZIO_FAILURE_MODE_PANIC; /* * Create a thread to periodically resume suspended I/O. */ - VERIFY(thr_create(0, 0, ztest_resume, spa, THR_BOUND, + VERIFY(thr_create(0, 0, ztest_resume_thread, spa, THR_BOUND, &resume_tid) == 0); + /* + * Create a deadman thread to abort() if we hang. + */ + VERIFY(thr_create(0, 0, ztest_deadman_thread, zs, THR_BOUND, + NULL) == 0); + /* * Verify that we can safely inquire about about any object, * whether it's allocated or not. To make it interesting, * we probe a 5-wide window around each power of two. * This hits all edge cases, including zero and the max. */ - for (t = 0; t < 64; t++) { - for (d = -5; d <= 5; d++) { + for (int t = 0; t < 64; t++) { + for (int d = -5; d <= 5; d++) { error = dmu_object_info(spa->spa_meta_objset, (1ULL << t) + d, NULL); ASSERT(error == 0 || error == ENOENT || @@ -3110,118 +5139,156 @@ ztest_run(char *pool) } /* - * Now kick off all the tests that run in parallel. + * If we got any ENOSPC errors on the previous run, destroy something. 
*/ + if (zs->zs_enospc_count != 0) { + int d = ztest_random(zopt_datasets); + ztest_dataset_destroy(zs, d); + } zs->zs_enospc_count = 0; - za = umem_zalloc(zopt_threads * sizeof (ztest_args_t), UMEM_NOFAIL); + tid = umem_zalloc(zopt_threads * sizeof (thread_t), UMEM_NOFAIL); if (zopt_verbose >= 4) (void) printf("starting main threads...\n"); - za[0].za_start = gethrtime(); - za[0].za_stop = za[0].za_start + zopt_passtime * NANOSEC; - za[0].za_stop = MIN(za[0].za_stop, zs->zs_stop_time); - za[0].za_kill = za[0].za_stop; - if (ztest_random(100) < zopt_killrate) - za[0].za_kill -= ztest_random(zopt_passtime * NANOSEC); - - for (t = 0; t < zopt_threads; t++) { - d = t % zopt_datasets; - - (void) strcpy(za[t].za_pool, pool); - za[t].za_os = za[d].za_os; - za[t].za_spa = spa; - za[t].za_zilog = za[d].za_zilog; - za[t].za_instance = t; - za[t].za_random = ztest_random(-1ULL); - za[t].za_start = za[0].za_start; - za[t].za_stop = za[0].za_stop; - za[t].za_kill = za[0].za_kill; - - if (t < zopt_datasets) { - ztest_replay_t zr; - int test_future = FALSE; - (void) rw_rdlock(&ztest_shared->zs_name_lock); - (void) snprintf(name, 100, "%s/%s_%d", pool, pool, d); - error = dmu_objset_create(name, DMU_OST_OTHER, NULL, 0, - ztest_create_cb, NULL); - if (error == EEXIST) { - test_future = TRUE; - } else if (error == ENOSPC) { - zs->zs_enospc_count++; - (void) rw_unlock(&ztest_shared->zs_name_lock); - break; - } else if (error != 0) { - fatal(0, "dmu_objset_create(%s) = %d", - name, error); - } - error = dmu_objset_open(name, DMU_OST_OTHER, - DS_MODE_USER, &za[d].za_os); - if (error) - fatal(0, "dmu_objset_open('%s') = %d", - name, error); - (void) rw_unlock(&ztest_shared->zs_name_lock); - if (test_future) - ztest_dmu_check_future_leak(&za[t]); - zr.zr_os = za[d].za_os; - zil_replay(zr.zr_os, &zr, &zr.zr_assign, - ztest_replay_vector, NULL); - za[d].za_zilog = zil_open(za[d].za_os, NULL); - } + /* + * Kick off all the tests that run in parallel. + */ + for (int t = 0; t < zopt_threads; t++) { + if (t < zopt_datasets && ztest_dataset_open(zs, t) != 0) + return; + VERIFY(thr_create(0, 0, ztest_thread, (void *)(uintptr_t)t, + THR_BOUND, &tid[t]) == 0); + } - VERIFY(thr_create(0, 0, ztest_thread, &za[t], THR_BOUND, - &za[t].za_thread) == 0); + /* + * Wait for all of the tests to complete. We go in reverse order + * so we don't close datasets while threads are still using them. + */ + for (int t = zopt_threads - 1; t >= 0; t--) { + VERIFY(thr_join(tid[t], NULL, NULL) == 0); + if (t < zopt_datasets) + ztest_dataset_close(zs, t); } - while (--t >= 0) { - VERIFY(thr_join(za[t].za_thread, NULL, NULL) == 0); - if (t < zopt_datasets) { - zil_close(za[t].za_zilog); - dmu_objset_close(za[t].za_os); - } + txg_wait_synced(spa_get_dsl(spa), 0); + + zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); + zs->zs_space = metaslab_class_get_space(spa_normal_class(spa)); + + umem_free(tid, zopt_threads * sizeof (thread_t)); + + /* Kill the resume thread */ + ztest_exiting = B_TRUE; + VERIFY(thr_join(resume_tid, NULL, NULL) == 0); + ztest_resume(spa); + + /* + * Right before closing the pool, kick off a bunch of async I/O; + * spa_close() should wait for it to complete. + */ + for (uint64_t object = 1; object < 50; object++) + dmu_prefetch(spa->spa_meta_objset, object, 0, 1ULL << 20); + + spa_close(spa, FTAG); + + /* + * Verify that we can loop over all pools. 
+ */ + mutex_enter(&spa_namespace_lock); + for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa)) + if (zopt_verbose > 3) + (void) printf("spa_next: found %s\n", spa_name(spa)); + mutex_exit(&spa_namespace_lock); + + /* + * Verify that we can export the pool and reimport it under a + * different name. + */ + if (ztest_random(2) == 0) { + char name[MAXNAMELEN]; + (void) snprintf(name, MAXNAMELEN, "%s_import", zs->zs_pool); + ztest_spa_import_export(zs->zs_pool, name); + ztest_spa_import_export(name, zs->zs_pool); } - if (zopt_verbose >= 3) - show_pool_stats(spa); + kernel_fini(); +} - txg_wait_synced(spa_get_dsl(spa), 0); +static void +ztest_freeze(ztest_shared_t *zs) +{ + ztest_ds_t *zd = &zs->zs_zd[0]; + spa_t *spa; + + if (zopt_verbose >= 3) + (void) printf("testing spa_freeze()...\n"); - zs->zs_alloc = spa_get_alloc(spa); - zs->zs_space = spa_get_space(spa); + kernel_init(FREAD | FWRITE); + VERIFY3U(0, ==, spa_open(zs->zs_pool, &spa, FTAG)); + VERIFY3U(0, ==, ztest_dataset_open(zs, 0)); /* - * If we had out-of-space errors, destroy a random objset. + * Force the first log block to be transactionally allocated. + * We have to do this before we freeze the pool -- otherwise + * the log chain won't be anchored. */ - if (zs->zs_enospc_count != 0) { - (void) rw_rdlock(&ztest_shared->zs_name_lock); - d = (int)ztest_random(zopt_datasets); - (void) snprintf(name, 100, "%s/%s_%d", pool, pool, d); - if (zopt_verbose >= 3) - (void) printf("Destroying %s to free up space\n", name); - (void) dmu_objset_find(name, ztest_destroy_cb, &za[d], - DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); - (void) rw_unlock(&ztest_shared->zs_name_lock); + while (BP_IS_HOLE(&zd->zd_zilog->zl_header->zh_log)) { + ztest_dmu_object_alloc_free(zd, 0); + zil_commit(zd->zd_zilog, UINT64_MAX, 0); } txg_wait_synced(spa_get_dsl(spa), 0); - umem_free(za, zopt_threads * sizeof (ztest_args_t)); + /* + * Freeze the pool. This stops spa_sync() from doing anything, + * so that the only way to record changes from now on is the ZIL. + */ + spa_freeze(spa); - /* Kill the resume thread */ - ztest_exiting = B_TRUE; - VERIFY(thr_join(resume_tid, NULL, NULL) == 0); + /* + * Run tests that generate log records but don't alter the pool config + * or depend on DSL sync tasks (snapshots, objset create/destroy, etc). + * We do a txg_wait_synced() after each iteration to force the txg + * to increase well beyond the last synced value in the uberblock. + * The ZIL should be OK with that. + */ + while (ztest_random(20) != 0) { + ztest_dmu_write_parallel(zd, 0); + ztest_dmu_object_alloc_free(zd, 0); + txg_wait_synced(spa_get_dsl(spa), 0); + } /* - * Right before closing the pool, kick off a bunch of async I/O; - * spa_close() should wait for it to complete. + * Commit all of the changes we just generated. */ - for (t = 1; t < 50; t++) - dmu_prefetch(spa->spa_meta_objset, t, 0, 1 << 15); + zil_commit(zd->zd_zilog, UINT64_MAX, 0); + txg_wait_synced(spa_get_dsl(spa), 0); + /* + * Close our dataset and close the pool. + */ + ztest_dataset_close(zs, 0); spa_close(spa, FTAG); + kernel_fini(); + /* + * Open and close the pool and dataset to induce log replay. 
+ */ + kernel_init(FREAD | FWRITE); + VERIFY3U(0, ==, spa_open(zs->zs_pool, &spa, FTAG)); + VERIFY3U(0, ==, ztest_dataset_open(zs, 0)); + ztest_dataset_close(zs, 0); + spa_close(spa, FTAG); kernel_fini(); + + list_destroy(&zcl.zcl_callbacks); + + (void) _mutex_destroy(&zcl.zcl_callbacks_lock); + + (void) rwlock_destroy(&zs->zs_name_lock); + (void) _mutex_destroy(&zs->zs_vdev_lock); } void @@ -3249,41 +5316,62 @@ print_time(hrtime_t t, char *timebuf) (void) sprintf(timebuf, "%llus", s); } +static nvlist_t * +make_random_props() +{ + nvlist_t *props; + + if (ztest_random(2) == 0) + return (NULL); + + VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0); + VERIFY(nvlist_add_uint64(props, "autoreplace", 1) == 0); + + (void) printf("props:\n"); + dump_nvlist(props, 4); + + return (props); +} + /* * Create a storage pool with the given name and initial vdev size. - * Then create the specified number of datasets in the pool. + * Then test spa_freeze() functionality. */ static void -ztest_init(char *pool) +ztest_init(ztest_shared_t *zs) { spa_t *spa; - int error; - nvlist_t *nvroot; + nvlist_t *nvroot, *props; + + VERIFY(_mutex_init(&zs->zs_vdev_lock, USYNC_THREAD, NULL) == 0); + VERIFY(rwlock_init(&zs->zs_name_lock, USYNC_THREAD, NULL) == 0); kernel_init(FREAD | FWRITE); /* * Create the storage pool. */ - (void) spa_destroy(pool); - ztest_shared->zs_vdev_primaries = 0; + (void) spa_destroy(zs->zs_pool); + ztest_shared->zs_vdev_next_leaf = 0; + zs->zs_splits = 0; + zs->zs_mirrors = zopt_mirrors; nvroot = make_vdev_root(NULL, NULL, zopt_vdev_size, 0, - 0, zopt_raidz, zopt_mirrors, 1); - error = spa_create(pool, nvroot, NULL, NULL, NULL); + 0, zopt_raidz, zs->zs_mirrors, 1); + props = make_random_props(); + VERIFY3U(0, ==, spa_create(zs->zs_pool, nvroot, props, NULL, NULL)); nvlist_free(nvroot); - if (error) - fatal(0, "spa_create() = %d", error); - error = spa_open(pool, &spa, FTAG); - if (error) - fatal(0, "spa_open() = %d", error); - - if (zopt_verbose >= 3) - show_pool_stats(spa); - + VERIFY3U(0, ==, spa_open(zs->zs_pool, &spa, FTAG)); + metaslab_sz = 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; spa_close(spa, FTAG); kernel_fini(); + + ztest_run_zdb(zs->zs_pool); + + ztest_freeze(zs); + + ztest_run_zdb(zs->zs_pool); } int @@ -3291,11 +5379,12 @@ main(int argc, char **argv) { int kills = 0; int iters = 0; - int i, f; ztest_shared_t *zs; + size_t shared_size; ztest_info_t *zi; char timebuf[100]; char numbuf[6]; + spa_t *spa; (void) setvbuf(stdout, NULL, _IOLBF, 0); @@ -3306,19 +5395,16 @@ main(int argc, char **argv) process_options(argc, argv); - argc -= optind; - argv += optind; - - dprintf_setup(&argc, argv); - /* * Blow away any existing copy of zpool.cache */ if (zopt_init != 0) (void) remove("/tmp/zpool.cache"); + shared_size = sizeof (*zs) + zopt_datasets * sizeof (ztest_ds_t); + zs = ztest_shared = (void *)mmap(0, - P2ROUNDUP(sizeof (ztest_shared_t), getpagesize()), + P2ROUNDUP(shared_size, getpagesize()), PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0); if (zopt_verbose >= 1) { @@ -3331,49 +5417,49 @@ main(int argc, char **argv) /* * Create and initialize our storage pool. */ - for (i = 1; i <= zopt_init; i++) { + for (int i = 1; i <= zopt_init; i++) { bzero(zs, sizeof (ztest_shared_t)); if (zopt_verbose >= 3 && zopt_init != 1) (void) printf("ztest_init(), pass %d\n", i); - ztest_init(zopt_pool); + zs->zs_pool = zopt_pool; + ztest_init(zs); } - /* - * Initialize the call targets for each function. 
- */ - for (f = 0; f < ZTEST_FUNCS; f++) { - zi = &zs->zs_info[f]; + zs->zs_pool = zopt_pool; + zs->zs_proc_start = gethrtime(); + zs->zs_proc_stop = zs->zs_proc_start + zopt_time * NANOSEC; + for (int f = 0; f < ZTEST_FUNCS; f++) { + zi = &zs->zs_info[f]; *zi = ztest_info[f]; - - if (*zi->zi_interval == 0) - zi->zi_call_target = UINT64_MAX; + if (zs->zs_proc_start + zi->zi_interval[0] > zs->zs_proc_stop) + zi->zi_call_next = UINT64_MAX; else - zi->zi_call_target = zopt_time / *zi->zi_interval; + zi->zi_call_next = zs->zs_proc_start + + ztest_random(2 * zi->zi_interval[0] + 1); } - zs->zs_start_time = gethrtime(); - zs->zs_stop_time = zs->zs_start_time + zopt_time * NANOSEC; - /* * Run the tests in a loop. These tests include fault injection * to verify that self-healing data works, and forced crashes * to verify that we never lose on-disk consistency. */ - while (gethrtime() < zs->zs_stop_time) { + while (gethrtime() < zs->zs_proc_stop) { int status; pid_t pid; - char *tmp; /* * Initialize the workload counters for each function. */ - for (f = 0; f < ZTEST_FUNCS; f++) { + for (int f = 0; f < ZTEST_FUNCS; f++) { zi = &zs->zs_info[f]; - zi->zi_calls = 0; + zi->zi_call_count = 0; zi->zi_call_time = 0; } + /* Set the allocation switch size */ + metaslab_df_alloc_threshold = ztest_random(metaslab_sz / 4) + 1; + pid = fork(); if (pid == -1) @@ -3383,7 +5469,7 @@ main(int argc, char **argv) struct rlimit rl = { 1024, 1024 }; (void) setrlimit(RLIMIT_NOFILE, &rl); (void) enable_extended_FILE_stdio(-1, -1); - ztest_run(zopt_pool); + ztest_run(zs); exit(0); } @@ -3416,8 +5502,8 @@ main(int argc, char **argv) if (zopt_verbose >= 1) { hrtime_t now = gethrtime(); - now = MIN(now, zs->zs_stop_time); - print_time(zs->zs_stop_time - now, timebuf); + now = MIN(now, zs->zs_proc_stop); + print_time(zs->zs_proc_stop - now, timebuf); nicenum(zs->zs_space, numbuf); (void) printf("Pass %3d, %8s, %3llu ENOSPC, " @@ -3427,7 +5513,7 @@ main(int argc, char **argv) (u_longlong_t)zs->zs_enospc_count, 100.0 * zs->zs_alloc / zs->zs_space, numbuf, - 100.0 * (now - zs->zs_start_time) / + 100.0 * (now - zs->zs_proc_start) / (zopt_time * NANOSEC), timebuf); } @@ -3437,34 +5523,39 @@ main(int argc, char **argv) "Calls", "Time", "Function"); (void) printf("%7s %9s %s\n", "-----", "----", "--------"); - for (f = 0; f < ZTEST_FUNCS; f++) { + for (int f = 0; f < ZTEST_FUNCS; f++) { Dl_info dli; zi = &zs->zs_info[f]; print_time(zi->zi_call_time, timebuf); (void) dladdr((void *)zi->zi_func, &dli); (void) printf("%7llu %9s %s\n", - (u_longlong_t)zi->zi_calls, timebuf, + (u_longlong_t)zi->zi_call_count, timebuf, dli.dli_sname); } (void) printf("\n"); } /* - * It's possible that we killed a child during a rename test, in - * which case we'll have a 'ztest_tmp' pool lying around instead - * of 'ztest'. Do a blind rename in case this happened. + * It's possible that we killed a child during a rename test, + * in which case we'll have a 'ztest_tmp' pool lying around + * instead of 'ztest'. Do a blind rename in case this happened. 
*/ - tmp = umem_alloc(strlen(zopt_pool) + 5, UMEM_NOFAIL); - (void) strcpy(tmp, zopt_pool); - (void) strcat(tmp, "_tmp"); - kernel_init(FREAD | FWRITE); - (void) spa_rename(tmp, zopt_pool); + kernel_init(FREAD); + if (spa_open(zopt_pool, &spa, FTAG) == 0) { + spa_close(spa, FTAG); + } else { + char tmpname[MAXNAMELEN]; + kernel_fini(); + kernel_init(FREAD | FWRITE); + (void) snprintf(tmpname, sizeof (tmpname), "%s_tmp", + zopt_pool); + (void) spa_rename(tmpname, zopt_pool); + } kernel_fini(); - umem_free(tmp, strlen(tmp) + 1); - } - ztest_verify_blocks(zopt_pool); + ztest_run_zdb(zopt_pool); + } if (zopt_verbose >= 1) { (void) printf("%d killed, %d completed, %.0f%% kill rate\n", diff --git a/external/cddl/osnet/dist/common/avl/avl.c b/external/cddl/osnet/dist/common/avl/avl.c index c9727c643b962..dd39c12d215e9 100644 --- a/external/cddl/osnet/dist/common/avl/avl.c +++ b/external/cddl/osnet/dist/common/avl/avl.c @@ -19,13 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - - /* * AVL - generic AVL tree implementation for kernel use * @@ -243,7 +240,7 @@ avl_nearest(avl_tree_t *tree, avl_index_t where, int direction) * "void *" of the found tree node */ void * -avl_find(avl_tree_t *tree, void *value, avl_index_t *where) +avl_find(avl_tree_t *tree, const void *value, avl_index_t *where) { avl_node_t *node; avl_node_t *prev = NULL; diff --git a/external/cddl/osnet/dist/common/nvpair/nvpair.c b/external/cddl/osnet/dist/common/nvpair/nvpair.c index 77891bf776445..8115091ab9a9a 100644 --- a/external/cddl/osnet/dist/common/nvpair/nvpair.c +++ b/external/cddl/osnet/dist/common/nvpair/nvpair.c @@ -20,12 +20,10 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -692,6 +690,18 @@ nvlist_remove(nvlist_t *nvl, const char *name, data_type_t type) return (ENOENT); } +int +nvlist_remove_nvpair(nvlist_t *nvl, nvpair_t *nvp) +{ + if (nvl == NULL || nvp == NULL) + return (EINVAL); + + nvp_buf_unlink(nvl, nvp); + nvpair_free(nvp); + nvp_buf_free(nvl, nvp); + return (0); +} + /* * This function calculates the size of an nvpair value. * @@ -1162,6 +1172,42 @@ nvlist_next_nvpair(nvlist_t *nvl, nvpair_t *nvp) return (curr != NULL ? &curr->nvi_nvp : NULL); } +nvpair_t * +nvlist_prev_nvpair(nvlist_t *nvl, nvpair_t *nvp) +{ + nvpriv_t *priv; + i_nvp_t *curr; + + if (nvl == NULL || + (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) + return (NULL); + + curr = NVPAIR2I_NVP(nvp); + + if (nvp == NULL) + curr = priv->nvp_last; + else if (priv->nvp_curr == curr || nvlist_contains_nvp(nvl, nvp)) + curr = curr->nvi_prev; + else + curr = NULL; + + priv->nvp_curr = curr; + + return (curr != NULL ? 
&curr->nvi_nvp : NULL); +} + +boolean_t +nvlist_empty(nvlist_t *nvl) +{ + nvpriv_t *priv; + + if (nvl == NULL || + (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) + return (B_TRUE); + + return (priv->nvp_list == NULL); +} + char * nvpair_name(nvpair_t *nvp) { diff --git a/external/cddl/osnet/dist/common/zfs/zfs_comutil.c b/external/cddl/osnet/dist/common/zfs/zfs_comutil.c index 74517a3f6920d..53f485c0b6663 100644 --- a/external/cddl/osnet/dist/common/zfs/zfs_comutil.c +++ b/external/cddl/osnet/dist/common/zfs/zfs_comutil.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * This file is intended for functions that ought to be common between user * land (libzfs) and the kernel. When many common routines need to be shared @@ -33,10 +31,13 @@ #if defined(_KERNEL) #include +#else +#include #endif #include #include +#include #include /* @@ -63,3 +64,42 @@ zfs_allocatable_devs(nvlist_t *nv) } return (B_FALSE); } + +void +zpool_get_rewind_policy(nvlist_t *nvl, zpool_rewind_policy_t *zrpp) +{ + nvlist_t *policy; + nvpair_t *elem; + char *nm; + + /* Defaults */ + zrpp->zrp_request = ZPOOL_NO_REWIND; + zrpp->zrp_maxmeta = 0; + zrpp->zrp_maxdata = UINT64_MAX; + zrpp->zrp_txg = UINT64_MAX; + + if (nvl == NULL) + return; + + elem = NULL; + while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) { + nm = nvpair_name(elem); + if (strcmp(nm, ZPOOL_REWIND_POLICY) == 0) { + if (nvpair_value_nvlist(elem, &policy) == 0) + zpool_get_rewind_policy(policy, zrpp); + return; + } else if (strcmp(nm, ZPOOL_REWIND_REQUEST) == 0) { + if (nvpair_value_uint32(elem, &zrpp->zrp_request) == 0) + if (zrpp->zrp_request & ~ZPOOL_REWIND_POLICIES) + zrpp->zrp_request = ZPOOL_NO_REWIND; + } else if (strcmp(nm, ZPOOL_REWIND_REQUEST_TXG) == 0) { + (void) nvpair_value_uint64(elem, &zrpp->zrp_txg); + } else if (strcmp(nm, ZPOOL_REWIND_META_THRESH) == 0) { + (void) nvpair_value_uint64(elem, &zrpp->zrp_maxmeta); + } else if (strcmp(nm, ZPOOL_REWIND_DATA_THRESH) == 0) { + (void) nvpair_value_uint64(elem, &zrpp->zrp_maxdata); + } + } + if (zrpp->zrp_request == 0) + zrpp->zrp_request = ZPOOL_NO_REWIND; +} diff --git a/external/cddl/osnet/dist/common/zfs/zfs_comutil.h b/external/cddl/osnet/dist/common/zfs/zfs_comutil.h index f517044a80a00..748a79a5c9818 100644 --- a/external/cddl/osnet/dist/common/zfs/zfs_comutil.h +++ b/external/cddl/osnet/dist/common/zfs/zfs_comutil.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _ZFS_COMUTIL_H #define _ZFS_COMUTIL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include @@ -35,7 +33,8 @@ extern "C" { #endif -extern boolean_t zfs_allocatable_devs(nvlist_t *nv); +extern boolean_t zfs_allocatable_devs(nvlist_t *); +extern void zpool_get_rewind_policy(nvlist_t *, zpool_rewind_policy_t *); #ifdef __cplusplus } diff --git a/external/cddl/osnet/dist/common/zfs/zfs_deleg.c b/external/cddl/osnet/dist/common/zfs/zfs_deleg.c index 0fd5800a84dc5..35f81b584641a 100644 --- a/external/cddl/osnet/dist/common/zfs/zfs_deleg.c +++ b/external/cddl/osnet/dist/common/zfs/zfs_deleg.c @@ -19,13 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 
* Use is subject to license terms. */ - -#pragma ident "%Z%%M% %I% %E% SMI" - #if defined(_KERNEL) #include #include @@ -66,6 +63,12 @@ zfs_deleg_perm_tab_t zfs_deleg_perm_tab[] = { {ZFS_DELEG_PERM_SHARE, ZFS_DELEG_NOTE_SHARE }, {ZFS_DELEG_PERM_SEND, ZFS_DELEG_NOTE_NONE }, {ZFS_DELEG_PERM_USERPROP, ZFS_DELEG_NOTE_USERPROP }, + {ZFS_DELEG_PERM_USERQUOTA, ZFS_DELEG_NOTE_USERQUOTA }, + {ZFS_DELEG_PERM_GROUPQUOTA, ZFS_DELEG_NOTE_GROUPQUOTA }, + {ZFS_DELEG_PERM_USERUSED, ZFS_DELEG_NOTE_USERUSED }, + {ZFS_DELEG_PERM_GROUPUSED, ZFS_DELEG_NOTE_GROUPUSED }, + {ZFS_DELEG_PERM_HOLD, ZFS_DELEG_NOTE_HOLD }, + {ZFS_DELEG_PERM_RELEASE, ZFS_DELEG_NOTE_RELEASE }, {NULL, ZFS_DELEG_NOTE_NONE } }; diff --git a/external/cddl/osnet/dist/common/zfs/zfs_deleg.h b/external/cddl/osnet/dist/common/zfs/zfs_deleg.h index 561b73e63df4a..e90cd0d5f4ba9 100644 --- a/external/cddl/osnet/dist/common/zfs/zfs_deleg.h +++ b/external/cddl/osnet/dist/common/zfs/zfs_deleg.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _ZFS_DELEG_H #define _ZFS_DELEG_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #ifdef __cplusplus @@ -59,6 +57,12 @@ typedef enum { ZFS_DELEG_NOTE_USERPROP, ZFS_DELEG_NOTE_MOUNT, ZFS_DELEG_NOTE_SHARE, + ZFS_DELEG_NOTE_USERQUOTA, + ZFS_DELEG_NOTE_GROUPQUOTA, + ZFS_DELEG_NOTE_USERUSED, + ZFS_DELEG_NOTE_GROUPUSED, + ZFS_DELEG_NOTE_HOLD, + ZFS_DELEG_NOTE_RELEASE, ZFS_DELEG_NOTE_NONE } zfs_deleg_note_t; diff --git a/external/cddl/osnet/dist/common/zfs/zfs_fletcher.c b/external/cddl/osnet/dist/common/zfs/zfs_fletcher.c new file mode 100644 index 0000000000000..fa43ce6bdb5dd --- /dev/null +++ b/external/cddl/osnet/dist/common/zfs/zfs_fletcher.c @@ -0,0 +1,246 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Fletcher Checksums + * ------------------ + * + * ZFS's 2nd and 4th order Fletcher checksums are defined by the following + * recurrence relations: + * + * a = a + f + * i i-1 i-1 + * + * b = b + a + * i i-1 i + * + * c = c + b (fletcher-4 only) + * i i-1 i + * + * d = d + c (fletcher-4 only) + * i i-1 i + * + * Where + * a_0 = b_0 = c_0 = d_0 = 0 + * and + * f_0 .. f_(n-1) are the input data. 
+ * + * Using standard techniques, these translate into the following series: + * + * __n_ __n_ + * \ | \ | + * a = > f b = > i * f + * n /___| n - i n /___| n - i + * i = 1 i = 1 + * + * + * __n_ __n_ + * \ | i*(i+1) \ | i*(i+1)*(i+2) + * c = > ------- f d = > ------------- f + * n /___| 2 n - i n /___| 6 n - i + * i = 1 i = 1 + * + * For fletcher-2, the f_is are 64-bit, and [ab]_i are 64-bit accumulators. + * Since the additions are done mod (2^64), errors in the high bits may not + * be noticed. For this reason, fletcher-2 is deprecated. + * + * For fletcher-4, the f_is are 32-bit, and [abcd]_i are 64-bit accumulators. + * A conservative estimate of how big the buffer can get before we overflow + * can be estimated using f_i = 0xffffffff for all i: + * + * % bc + * f=2^32-1;d=0; for (i = 1; d<2^64; i++) { d += f*i*(i+1)*(i+2)/6 }; (i-1)*4 + * 2264 + * quit + * % + * + * So blocks of up to 2k will not overflow. Our largest block size is + * 128k, which has 32k 4-byte words, so we can compute the largest possible + * accumulators, then divide by 2^64 to figure the max amount of overflow: + * + * % bc + * a=b=c=d=0; f=2^32-1; for (i=1; i<=32*1024; i++) { a+=f; b+=a; c+=b; d+=c } + * a/2^64;b/2^64;c/2^64;d/2^64 + * 0 + * 0 + * 1365 + * 11186858 + * quit + * % + * + * So a and b cannot overflow. To make sure each bit of input has some + * effect on the contents of c and d, we can look at what the factors of + * the coefficients in the equations for c_n and d_n are. The number of 2s + * in the factors determines the lowest set bit in the multiplier. Running + * through the cases for n*(n+1)/2 reveals that the highest power of 2 is + * 2^14, and for n*(n+1)*(n+2)/6 it is 2^15. So while some data may overflow + * the 64-bit accumulators, every bit of every f_i effects every accumulator, + * even for 128k blocks. + * + * If we wanted to make a stronger version of fletcher4 (fletcher4c?), + * we could do our calculations mod (2^32 - 1) by adding in the carries + * periodically, and store the number of carries in the top 32-bits. + * + * -------------------- + * Checksum Performance + * -------------------- + * + * There are two interesting components to checksum performance: cached and + * uncached performance. With cached data, fletcher-2 is about four times + * faster than fletcher-4. With uncached data, the performance difference is + * negligible, since the cost of a cache fill dominates the processing time. + * Even though fletcher-4 is slower than fletcher-2, it is still a pretty + * efficient pass over the data. + * + * In normal operation, the data which is being checksummed is in a buffer + * which has been filled either by: + * + * 1. a compression step, which will be mostly cached, or + * 2. a bcopy() or copyin(), which will be uncached (because the + * copy is cache-bypassing). + * + * For both cached and uncached data, both fletcher checksums are much faster + * than sha-256, and slower than 'off', which doesn't touch the data at all. 
+ */ + +#include +#include +#include +#include +#include + +void +fletcher_2_native(const void *buf, uint64_t size, zio_cksum_t *zcp) +{ + const uint64_t *ip = buf; + const uint64_t *ipend = ip + (size / sizeof (uint64_t)); + uint64_t a0, b0, a1, b1; + + for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) { + a0 += ip[0]; + a1 += ip[1]; + b0 += a0; + b1 += a1; + } + + ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1); +} + +void +fletcher_2_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp) +{ + const uint64_t *ip = buf; + const uint64_t *ipend = ip + (size / sizeof (uint64_t)); + uint64_t a0, b0, a1, b1; + + for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) { + a0 += BSWAP_64(ip[0]); + a1 += BSWAP_64(ip[1]); + b0 += a0; + b1 += a1; + } + + ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1); +} + +void +fletcher_4_native(const void *buf, uint64_t size, zio_cksum_t *zcp) +{ + const uint32_t *ip = buf; + const uint32_t *ipend = ip + (size / sizeof (uint32_t)); + uint64_t a, b, c, d; + + for (a = b = c = d = 0; ip < ipend; ip++) { + a += ip[0]; + b += a; + c += b; + d += c; + } + + ZIO_SET_CHECKSUM(zcp, a, b, c, d); +} + +void +fletcher_4_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp) +{ + const uint32_t *ip = buf; + const uint32_t *ipend = ip + (size / sizeof (uint32_t)); + uint64_t a, b, c, d; + + for (a = b = c = d = 0; ip < ipend; ip++) { + a += BSWAP_32(ip[0]); + b += a; + c += b; + d += c; + } + + ZIO_SET_CHECKSUM(zcp, a, b, c, d); +} + +void +fletcher_4_incremental_native(const void *buf, uint64_t size, + zio_cksum_t *zcp) +{ + const uint32_t *ip = buf; + const uint32_t *ipend = ip + (size / sizeof (uint32_t)); + uint64_t a, b, c, d; + + a = zcp->zc_word[0]; + b = zcp->zc_word[1]; + c = zcp->zc_word[2]; + d = zcp->zc_word[3]; + + for (; ip < ipend; ip++) { + a += ip[0]; + b += a; + c += b; + d += c; + } + + ZIO_SET_CHECKSUM(zcp, a, b, c, d); +} + +void +fletcher_4_incremental_byteswap(const void *buf, uint64_t size, + zio_cksum_t *zcp) +{ + const uint32_t *ip = buf; + const uint32_t *ipend = ip + (size / sizeof (uint32_t)); + uint64_t a, b, c, d; + + a = zcp->zc_word[0]; + b = zcp->zc_word[1]; + c = zcp->zc_word[2]; + d = zcp->zc_word[3]; + + for (; ip < ipend; ip++) { + a += BSWAP_32(ip[0]); + b += a; + c += b; + d += c; + } + + ZIO_SET_CHECKSUM(zcp, a, b, c, d); +} diff --git a/external/cddl/osnet/dist/common/zfs/zfs_fletcher.h b/external/cddl/osnet/dist/common/zfs/zfs_fletcher.h new file mode 100644 index 0000000000000..b49df0cf4f0fd --- /dev/null +++ b/external/cddl/osnet/dist/common/zfs/zfs_fletcher.h @@ -0,0 +1,53 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
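The block comment at the top of zfs_fletcher.c derives the worst-case accumulator growth with bc. The same figures can be reproduced with a standalone C program; this sketch is not part of the patch and assumes a compiler providing the unsigned __int128 extension (gcc or clang):

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	unsigned __int128 a = 0, b = 0, c = 0, d = 0;
	const uint64_t f = 0xffffffffULL;	/* worst-case 32-bit input word */
	int i;

	/* a 128k block is 32k 4-byte words; feed the maximum value each time */
	for (i = 1; i <= 32 * 1024; i++) {
		a += f;
		b += a;
		c += b;
		d += c;
	}

	/* how many times each accumulator wraps a 64-bit register */
	(void) printf("a: %llu\n", (unsigned long long)(a >> 64));	/* 0 */
	(void) printf("b: %llu\n", (unsigned long long)(b >> 64));	/* 0 */
	(void) printf("c: %llu\n", (unsigned long long)(c >> 64));	/* 1365 */
	(void) printf("d: %llu\n", (unsigned long long)(d >> 64));	/* 11186858 */
	return (0);
}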
+ */ + +#ifndef _ZFS_FLETCHER_H +#define _ZFS_FLETCHER_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * fletcher checksum functions + */ + +void fletcher_2_native(const void *, uint64_t, zio_cksum_t *); +void fletcher_2_byteswap(const void *, uint64_t, zio_cksum_t *); +void fletcher_4_native(const void *, uint64_t, zio_cksum_t *); +void fletcher_4_byteswap(const void *, uint64_t, zio_cksum_t *); +void fletcher_4_incremental_native(const void *, uint64_t, + zio_cksum_t *); +void fletcher_4_incremental_byteswap(const void *, uint64_t, + zio_cksum_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _ZFS_FLETCHER_H */ diff --git a/external/cddl/osnet/dist/common/zfs/zfs_namecheck.c b/external/cddl/osnet/dist/common/zfs/zfs_namecheck.c index a9d109be20ab7..5cfafea471b3d 100644 --- a/external/cddl/osnet/dist/common/zfs/zfs_namecheck.c +++ b/external/cddl/osnet/dist/common/zfs/zfs_namecheck.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Common name validation routines for ZFS. These routines are shared by the * userland code as well as the ioctl() layer to ensure that we don't @@ -61,7 +59,7 @@ valid_char(char c) * Snapshot names must be made up of alphanumeric characters plus the following * characters: * - * [-_.:] + * [-_.: ] */ int snapshot_namecheck(const char *path, namecheck_err_t *why, char *what) @@ -345,19 +343,3 @@ pool_namecheck(const char *pool, namecheck_err_t *why, char *what) return (0); } - -/* - * Check if the dataset name is private for internal usage. - * '$' is reserved for internal dataset names. e.g. "$MOS" - * - * Return 1 if the given name is used internally. - * Return 0 if it is not. - */ -int -dataset_name_hidden(const char *name) -{ - if (strchr(name, '$') != NULL) - return (1); - - return (0); -} diff --git a/external/cddl/osnet/dist/common/zfs/zfs_namecheck.h b/external/cddl/osnet/dist/common/zfs/zfs_namecheck.h index ec85e62f72e81..7711da099be98 100644 --- a/external/cddl/osnet/dist/common/zfs/zfs_namecheck.h +++ b/external/cddl/osnet/dist/common/zfs/zfs_namecheck.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _ZFS_NAMECHECK_H #define _ZFS_NAMECHECK_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -50,7 +48,6 @@ typedef enum { int pool_namecheck(const char *, namecheck_err_t *, char *); int dataset_namecheck(const char *, namecheck_err_t *, char *); int mountpoint_namecheck(const char *, namecheck_err_t *); -int dataset_name_hidden(const char *); int snapshot_namecheck(const char *, namecheck_err_t *, char *); int permset_namecheck(const char *, namecheck_err_t *, char *); diff --git a/external/cddl/osnet/dist/common/zfs/zfs_prop.c b/external/cddl/osnet/dist/common/zfs/zfs_prop.c index effd2dba70922..b6f80614f8faf 100644 --- a/external/cddl/osnet/dist/common/zfs/zfs_prop.c +++ b/external/cddl/osnet/dist/common/zfs/zfs_prop.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
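zfs_fletcher.h above exposes both one-shot and incremental entry points; the incremental variants expect the caller to zero the zio_cksum_t once and then carry it across calls. A hedged usage sketch (the sys/spa.h location of zio_cksum_t and ZIO_SET_CHECKSUM is assumed, and the helper below is invented for illustration):

#include <sys/spa.h>		/* zio_cksum_t, ZIO_SET_CHECKSUM (header assumed) */
#include <zfs_fletcher.h>

/*
 * Checksumming a buffer in two pieces with the incremental interface yields
 * the same digest as a single fletcher_4_native() call over the whole buffer,
 * provided the split point (and total size) stay 4-byte aligned, since the
 * loops consume whole 32-bit words.
 */
static void
checksum_in_two_parts(const char *buf, uint64_t size, zio_cksum_t *zcp)
{
	uint64_t half = (size / 2) & ~(uint64_t)3;	/* keep 4-byte alignment */

	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);		/* start from the all-zero state */
	fletcher_4_incremental_native(buf, half, zcp);
	fletcher_4_incremental_native(buf + half, size - half, zcp);
}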
*/ @@ -43,6 +43,14 @@ static zprop_desc_t zfs_prop_table[ZFS_NUM_PROPS]; +/* Note this is indexed by zfs_userquota_prop_t, keep the order the same */ +const char *zfs_userquota_prop_prefixes[] = { + "userused@", + "userquota@", + "groupused@", + "groupquota@" +}; + zprop_desc_t * zfs_prop_get_table(void) { @@ -61,6 +69,16 @@ zfs_prop_init(void) { NULL } }; + static zprop_index_t dedup_table[] = { + { "on", ZIO_CHECKSUM_ON }, + { "off", ZIO_CHECKSUM_OFF }, + { "verify", ZIO_CHECKSUM_ON | ZIO_CHECKSUM_VERIFY }, + { "sha256", ZIO_CHECKSUM_SHA256 }, + { "sha256,verify", + ZIO_CHECKSUM_SHA256 | ZIO_CHECKSUM_VERIFY }, + { NULL } + }; + static zprop_index_t compress_table[] = { { "on", ZIO_COMPRESS_ON }, { "off", ZIO_COMPRESS_OFF }, @@ -75,6 +93,7 @@ zfs_prop_init(void) { "gzip-7", ZIO_COMPRESS_GZIP_7 }, { "gzip-8", ZIO_COMPRESS_GZIP_8 }, { "gzip-9", ZIO_COMPRESS_GZIP_9 }, + { "zle", ZIO_COMPRESS_ZLE }, { NULL } }; @@ -133,6 +152,7 @@ zfs_prop_init(void) { "1", 1 }, { "2", 2 }, { "3", 3 }, + { "4", 4 }, { "current", ZPL_VERSION }, { NULL } }; @@ -143,6 +163,12 @@ zfs_prop_init(void) { NULL } }; + static zprop_index_t logbias_table[] = { + { "latency", ZFS_LOGBIAS_LATENCY }, + { "throughput", ZFS_LOGBIAS_THROUGHPUT }, + { NULL } + }; + static zprop_index_t canmount_table[] = { { "off", ZFS_CANMOUNT_OFF }, { "on", ZFS_CANMOUNT_ON }, @@ -162,10 +188,15 @@ zfs_prop_init(void) PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "on | off | fletcher2 | fletcher4 | sha256", "CHECKSUM", checksum_table); + register_index(ZFS_PROP_DEDUP, "dedup", ZIO_CHECKSUM_OFF, + PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, + "on | off | verify | sha256[,verify]", "DEDUP", + dedup_table); register_index(ZFS_PROP_COMPRESSION, "compression", ZIO_COMPRESS_DEFAULT, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, - "on | off | lzjb | gzip | gzip-[1-9]", "COMPRESS", compress_table); + "on | off | lzjb | gzip | gzip-[1-9] | zle", "COMPRESS", + compress_table); register_index(ZFS_PROP_SNAPDIR, "snapdir", ZFS_SNAPDIR_HIDDEN, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "hidden | visible", "SNAPDIR", snapdir_table); @@ -187,6 +218,9 @@ zfs_prop_init(void) ZFS_CACHE_ALL, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME, "all | none | metadata", "SECONDARYCACHE", cache_table); + register_index(ZFS_PROP_LOGBIAS, "logbias", ZFS_LOGBIAS_LATENCY, + PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, + "latency | throughput", "LOGBIAS", logbias_table); /* inherit index (boolean) properties */ register_index(ZFS_PROP_ATIME, "atime", 1, PROP_INHERIT, @@ -218,7 +252,7 @@ zfs_prop_init(void) /* default index properties */ register_index(ZFS_PROP_VERSION, "version", 0, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, - "1 | 2 | 3 | current", "VERSION", version_table); + "1 | 2 | 3 | 4 | current", "VERSION", version_table); register_index(ZFS_PROP_CANMOUNT, "canmount", ZFS_CANMOUNT_ON, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM, "on | off | noauto", "CANMOUNT", canmount_table); @@ -226,6 +260,9 @@ zfs_prop_init(void) /* readonly index (boolean) properties */ register_index(ZFS_PROP_MOUNTED, "mounted", 0, PROP_READONLY, ZFS_TYPE_FILESYSTEM, "yes | no", "MOUNTED", boolean_table); + register_index(ZFS_PROP_DEFER_DESTROY, "defer_destroy", 0, + PROP_READONLY, ZFS_TYPE_SNAPSHOT, "yes | no", "DEFER_DESTROY", + boolean_table); /* set once index properties */ register_index(ZFS_PROP_NORMALIZE, "normalization", 0, @@ -254,6 +291,8 @@ zfs_prop_init(void) ZFS_TYPE_DATASET, "filesystem | volume | snapshot", "TYPE"); 
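The new dedup_table above feeds the same string-to-index machinery as the existing checksum and compression tables, so a composite setting such as "sha256,verify" round-trips through a single table entry. An illustrative fragment, assuming the property tables have already been initialized via zfs_prop_init() and that the ZIO_CHECKSUM_* constants come from the usual zio header:

#include <stdio.h>
#include <sys/fs/zfs.h>		/* zfs_prop_index_to_string(), ZFS_PROP_DEDUP (header assumed) */
#include <sys/zio.h>		/* ZIO_CHECKSUM_SHA256, ZIO_CHECKSUM_VERIFY (header assumed) */

static void
show_dedup_name(void)
{
	const char *name;

	/* the first dedup_table entry whose value matches the index is returned */
	if (zfs_prop_index_to_string(ZFS_PROP_DEDUP,
	    ZIO_CHECKSUM_SHA256 | ZIO_CHECKSUM_VERIFY, &name) == 0)
		(void) printf("dedup=%s\n", name);	/* prints "sha256,verify" */
}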
register_string(ZFS_PROP_SHARESMB, "sharesmb", "off", PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "on | off | sharemgr(1M) options", "SHARESMB"); + register_string(ZFS_PROP_MLSLABEL, "mlslabel", ZFS_MLSLABEL_DEFAULT, + PROP_INHERIT, ZFS_TYPE_DATASET, "", "MLSLABEL"); /* readonly number properties */ register_number(ZFS_PROP_USED, "used", 0, PROP_READONLY, @@ -265,8 +304,8 @@ zfs_prop_init(void) register_number(ZFS_PROP_COMPRESSRATIO, "compressratio", 0, PROP_READONLY, ZFS_TYPE_DATASET, "<1.00x or higher if compressed>", "RATIO"); - register_number(ZFS_PROP_VOLBLOCKSIZE, "volblocksize", 8192, - PROP_ONETIME, + register_number(ZFS_PROP_VOLBLOCKSIZE, "volblocksize", + ZVOL_DEFAULT_BLOCKSIZE, PROP_ONETIME, ZFS_TYPE_VOLUME, "512 to 128k, power of 2", "VOLBLOCK"); register_number(ZFS_PROP_USEDSNAP, "usedbysnapshots", 0, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "USEDSNAP"); @@ -277,6 +316,8 @@ zfs_prop_init(void) register_number(ZFS_PROP_USEDREFRESERV, "usedbyrefreservation", 0, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "USEDREFRESERV"); + register_number(ZFS_PROP_USERREFS, "userrefs", 0, PROP_READONLY, + ZFS_TYPE_SNAPSHOT, "", "USERREFS"); /* default number properties */ register_number(ZFS_PROP_QUOTA, "quota", 0, PROP_DEFAULT, @@ -298,15 +339,25 @@ zfs_prop_init(void) /* hidden properties */ register_hidden(ZFS_PROP_CREATETXG, "createtxg", PROP_TYPE_NUMBER, - PROP_READONLY, ZFS_TYPE_DATASET, NULL); + PROP_READONLY, ZFS_TYPE_DATASET, "CREATETXG"); register_hidden(ZFS_PROP_NUMCLONES, "numclones", PROP_TYPE_NUMBER, - PROP_READONLY, ZFS_TYPE_SNAPSHOT, NULL); + PROP_READONLY, ZFS_TYPE_SNAPSHOT, "NUMCLONES"); register_hidden(ZFS_PROP_NAME, "name", PROP_TYPE_STRING, PROP_READONLY, ZFS_TYPE_DATASET, "NAME"); register_hidden(ZFS_PROP_ISCSIOPTIONS, "iscsioptions", PROP_TYPE_STRING, PROP_INHERIT, ZFS_TYPE_VOLUME, "ISCSIOPTIONS"); + register_hidden(ZFS_PROP_STMF_SHAREINFO, "stmf_sbd_lu", + PROP_TYPE_STRING, PROP_INHERIT, ZFS_TYPE_VOLUME, + "STMF_SBD_LU"); register_hidden(ZFS_PROP_GUID, "guid", PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, "GUID"); + register_hidden(ZFS_PROP_USERACCOUNTING, "useraccounting", + PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, + "USERACCOUNTING"); + register_hidden(ZFS_PROP_UNIQUE, "unique", PROP_TYPE_NUMBER, + PROP_READONLY, ZFS_TYPE_DATASET, "UNIQUE"); + register_hidden(ZFS_PROP_OBJSETID, "objsetid", PROP_TYPE_NUMBER, + PROP_READONLY, ZFS_TYPE_DATASET, "OBJSETID"); /* oddball properties */ register_impl(ZFS_PROP_CREATION, "creation", PROP_TYPE_NUMBER, 0, NULL, @@ -318,6 +369,11 @@ boolean_t zfs_prop_delegatable(zfs_prop_t prop) { zprop_desc_t *pd = &zfs_prop_table[prop]; + + /* The mlslabel property is never delegatable. */ + if (prop == ZFS_PROP_MLSLABEL) + return (B_FALSE); + return (pd->pd_attr != PROP_READONLY); } @@ -330,7 +386,6 @@ zfs_name_to_prop(const char *propname) return (zprop_name_to_prop(propname, ZFS_TYPE_DATASET)); } - /* * For user property names, we allow all lowercase alphanumeric characters, plus * a few useful punctuation characters. @@ -367,6 +422,26 @@ zfs_prop_user(const char *name) return (B_TRUE); } +/* + * Returns true if this is a valid userspace-type property (one with a '@'). + * Note that after the @, any character is valid (eg, another @, for SID + * user@domain). 
+ */ +boolean_t +zfs_prop_userquota(const char *name) +{ + zfs_userquota_prop_t prop; + + for (prop = 0; prop < ZFS_NUM_USERQUOTA_PROPS; prop++) { + if (strncmp(name, zfs_userquota_prop_prefixes[prop], + strlen(zfs_userquota_prop_prefixes[prop])) == 0) { + return (B_TRUE); + } + } + + return (B_FALSE); +} + /* * Tables of index types, plus functions to convert between the user view * (strings) and internal representation (uint64_t). @@ -383,6 +458,12 @@ zfs_prop_index_to_string(zfs_prop_t prop, uint64_t index, const char **string) return (zprop_index_to_string(prop, index, string, ZFS_TYPE_DATASET)); } +uint64_t +zfs_prop_random_value(zfs_prop_t prop, uint64_t seed) +{ + return (zprop_random_value(prop, seed, ZFS_TYPE_DATASET)); +} + /* * Returns TRUE if the property applies to any of the given dataset types. */ diff --git a/external/cddl/osnet/dist/common/zfs/zfs_prop.h b/external/cddl/osnet/dist/common/zfs/zfs_prop.h index da5ae43093e54..38d429aa84c90 100644 --- a/external/cddl/osnet/dist/common/zfs/zfs_prop.h +++ b/external/cddl/osnet/dist/common/zfs/zfs_prop.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _ZFS_PROP_H #define _ZFS_PROP_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include @@ -79,6 +77,7 @@ typedef struct { /* "zfs get" help message */ const zprop_index_t *pd_table; /* for index properties, a table */ /* defining the possible values */ + size_t pd_table_size; /* number of entries in pd_table[] */ } zprop_desc_t; /* @@ -118,6 +117,7 @@ int zprop_iter_common(zprop_func, void *, boolean_t, boolean_t, zfs_type_t); int zprop_name_to_prop(const char *, zfs_type_t); int zprop_string_to_index(int, const char *, uint64_t *, zfs_type_t); int zprop_index_to_string(int, uint64_t, const char **, zfs_type_t); +uint64_t zprop_random_value(int, uint64_t, zfs_type_t); const char *zprop_values(int, zfs_type_t); size_t zprop_width(int, boolean_t *, zfs_type_t); boolean_t zprop_valid_for_type(int, zfs_type_t); diff --git a/external/cddl/osnet/dist/common/zfs/zpool_prop.c b/external/cddl/osnet/dist/common/zfs/zpool_prop.c index f5efe18d248b1..c8a3ca205f42a 100644 --- a/external/cddl/osnet/dist/common/zfs/zpool_prop.c +++ b/external/cddl/osnet/dist/common/zfs/zpool_prop.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
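The zfs_prop_userquota() check above is a plain prefix match against zfs_userquota_prop_prefixes[], so everything after the '@' is accepted. A small illustrative test (the sys/fs/zfs.h prototype location is assumed):

#include <assert.h>
#include <sys/fs/zfs.h>		/* zfs_prop_userquota() prototype (header assumed) */

int
main(void)
{
	assert(zfs_prop_userquota("userquota@alice") == B_TRUE);
	/* anything after the '@' is legal, including SID-style user@domain */
	assert(zfs_prop_userquota("groupused@staff@example.com") == B_TRUE);
	assert(zfs_prop_userquota("quota") == B_FALSE);	/* no recognized prefix */
	return (0);
}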
*/ @@ -74,20 +74,24 @@ zpool_prop_init(void) /* readonly number properties */ register_number(ZPOOL_PROP_SIZE, "size", 0, PROP_READONLY, ZFS_TYPE_POOL, "", "SIZE"); - register_number(ZPOOL_PROP_USED, "used", 0, PROP_READONLY, - ZFS_TYPE_POOL, "", "USED"); - register_number(ZPOOL_PROP_AVAILABLE, "available", 0, PROP_READONLY, - ZFS_TYPE_POOL, "", "AVAIL"); + register_number(ZPOOL_PROP_FREE, "free", 0, PROP_READONLY, + ZFS_TYPE_POOL, "", "FREE"); + register_number(ZPOOL_PROP_ALLOCATED, "allocated", 0, PROP_READONLY, + ZFS_TYPE_POOL, "", "ALLOC"); register_number(ZPOOL_PROP_CAPACITY, "capacity", 0, PROP_READONLY, ZFS_TYPE_POOL, "", "CAP"); register_number(ZPOOL_PROP_GUID, "guid", 0, PROP_READONLY, ZFS_TYPE_POOL, "", "GUID"); register_number(ZPOOL_PROP_HEALTH, "health", 0, PROP_READONLY, ZFS_TYPE_POOL, "", "HEALTH"); + register_number(ZPOOL_PROP_DEDUPRATIO, "dedupratio", 0, PROP_READONLY, + ZFS_TYPE_POOL, "<1.00x or higher if deduped>", "DEDUP"); /* default number properties */ register_number(ZPOOL_PROP_VERSION, "version", SPA_VERSION, PROP_DEFAULT, ZFS_TYPE_POOL, "", "VERSION"); + register_number(ZPOOL_PROP_DEDUPDITTO, "dedupditto", 0, + PROP_DEFAULT, ZFS_TYPE_POOL, "", "DEDUPDITTO"); /* default index (boolean) properties */ register_index(ZPOOL_PROP_DELEGATION, "delegation", 1, PROP_DEFAULT, @@ -96,6 +100,8 @@ zpool_prop_init(void) ZFS_TYPE_POOL, "on | off", "REPLACE", boolean_table); register_index(ZPOOL_PROP_LISTSNAPS, "listsnapshots", 0, PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "LISTSNAPS", boolean_table); + register_index(ZPOOL_PROP_AUTOEXPAND, "autoexpand", 0, PROP_DEFAULT, + ZFS_TYPE_POOL, "on | off", "EXPAND", boolean_table); /* default index properties */ register_index(ZPOOL_PROP_FAILUREMODE, "failmode", @@ -164,6 +170,12 @@ zpool_prop_index_to_string(zpool_prop_t prop, uint64_t index, return (zprop_index_to_string(prop, index, string, ZFS_TYPE_POOL)); } +uint64_t +zpool_prop_random_value(zpool_prop_t prop, uint64_t seed) +{ + return (zprop_random_value(prop, seed, ZFS_TYPE_POOL)); +} + #ifndef _KERNEL const char * diff --git a/external/cddl/osnet/dist/common/zfs/zprop_common.c b/external/cddl/osnet/dist/common/zfs/zprop_common.c index bd267e2e61cac..992fe5e71603f 100644 --- a/external/cddl/osnet/dist/common/zfs/zprop_common.c +++ b/external/cddl/osnet/dist/common/zfs/zprop_common.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Common routines used by zfs and zpool property management. 
*/ @@ -78,6 +76,8 @@ register_impl(int prop, const char *name, zprop_type_t type, pd = &prop_tbl[prop]; ASSERT(pd->pd_name == NULL || pd->pd_name == name); + ASSERT(name != NULL); + ASSERT(colname != NULL); pd->pd_name = name; pd->pd_propnum = prop; @@ -91,6 +91,9 @@ register_impl(int prop, const char *name, zprop_type_t type, pd->pd_rightalign = rightalign; pd->pd_visible = visible; pd->pd_table = idx_tbl; + pd->pd_table_size = 0; + while (idx_tbl && (idx_tbl++)->pi_name != NULL) + pd->pd_table_size++; } void @@ -205,9 +208,6 @@ propname_match(const char *p, size_t len, zprop_desc_t *prop_entry) #ifndef _KERNEL const char *colname = prop_entry->pd_colname; int c; - - if (colname == NULL) - return (B_FALSE); #endif if (len == strlen(propname) && @@ -215,7 +215,7 @@ propname_match(const char *p, size_t len, zprop_desc_t *prop_entry) return (B_TRUE); #ifndef _KERNEL - if (len != strlen(colname)) + if (colname == NULL || len != strlen(colname)) return (B_FALSE); for (c = 0; c < len; c++) @@ -312,6 +312,25 @@ zprop_index_to_string(int prop, uint64_t index, const char **string, return (-1); } +/* + * Return a random valid property value. Used by ztest. + */ +uint64_t +zprop_random_value(int prop, uint64_t seed, zfs_type_t type) +{ + zprop_desc_t *prop_tbl; + const zprop_index_t *idx_tbl; + + ASSERT((uint_t)prop < zprop_get_numprops(type)); + prop_tbl = zprop_get_proptable(type); + idx_tbl = prop_tbl[prop].pd_table; + + if (idx_tbl == NULL) + return (seed); + + return (idx_tbl[seed % prop_tbl[prop].pd_table_size].pi_value); +} + const char * zprop_values(int prop, zfs_type_t type) { diff --git a/external/cddl/osnet/dist/lib/libdtrace/common/drti.c b/external/cddl/osnet/dist/lib/libdtrace/common/drti.c index f8570e686f5b3..3b5f0cbbdf306 100644 --- a/external/cddl/osnet/dist/lib/libdtrace/common/drti.c +++ b/external/cddl/osnet/dist/lib/libdtrace/common/drti.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -61,13 +58,14 @@ static const char *olddevname = "/devices/pseudo/dtrace@0:helper"; static const char *modname; /* Name of this load object */ static int gen; /* DOF helper generation */ extern dof_hdr_t __SUNW_dof; /* DOF defined in the .SUNW_dof section */ +static boolean_t dof_init_debug = B_FALSE; /* From DTRACE_DOF_INIT_DEBUG */ static void dprintf(int debug, const char *fmt, ...) 
{ va_list ap; - if (debug && getenv("DTRACE_DOF_INIT_DEBUG") == NULL) + if (debug && !dof_init_debug) return; va_start(ap, fmt); @@ -104,6 +102,9 @@ dtrace_dof_init(void) if (getenv("DTRACE_DOF_INIT_DISABLE") != NULL) return; + if (getenv("DTRACE_DOF_INIT_DEBUG") != NULL) + dof_init_debug = B_TRUE; + if (dlinfo(RTLD_SELF, RTLD_DI_LINKMAP, &lmp) == -1 || lmp == NULL) { dprintf(1, "couldn't discover module name or address\n"); return; diff --git a/external/cddl/osnet/dist/lib/libdtrace/common/dt_consume.c b/external/cddl/osnet/dist/lib/libdtrace/common/dt_consume.c index 62d39e07dd416..564189a000adb 100644 --- a/external/cddl/osnet/dist/lib/libdtrace/common/dt_consume.c +++ b/external/cddl/osnet/dist/lib/libdtrace/common/dt_consume.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -1063,7 +1061,7 @@ dt_print_usym(dtrace_hdl_t *dtp, FILE *fp, caddr_t addr, dtrace_actkind_t act) do { n = len; s = alloca(n); - } while ((len = dtrace_uaddr2str(dtp, pid, pc, s, n)) >= n); + } while ((len = dtrace_uaddr2str(dtp, pid, pc, s, n)) > n); return (dt_printf(dtp, fp, format, s)); } diff --git a/external/cddl/osnet/dist/lib/libdtrace/common/dt_error.c b/external/cddl/osnet/dist/lib/libdtrace/common/dt_error.c index 5005f593a43da..0bfabc919c857 100644 --- a/external/cddl/osnet/dist/lib/libdtrace/common/dt_error.c +++ b/external/cddl/osnet/dist/lib/libdtrace/common/dt_error.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include @@ -105,7 +103,8 @@ static const struct { { EDT_BADSETOPT, "Invalid setopt() library action" }, { EDT_BADSTACKPC, "Invalid stack program counter size" }, { EDT_BADAGGVAR, "Invalid aggregation variable identifier" }, - { EDT_OVERSION, "Client requested deprecated version of library" } + { EDT_OVERSION, "Client requested deprecated version of library" }, + { EDT_ENABLING_ERR, "Failed to enable probe" } }; static const int _dt_nerr = sizeof (_dt_errlist) / sizeof (_dt_errlist[0]); diff --git a/external/cddl/osnet/dist/lib/libdtrace/common/dt_impl.h b/external/cddl/osnet/dist/lib/libdtrace/common/dt_impl.h index 9b22dfbb641a1..1937ce06474de 100644 --- a/external/cddl/osnet/dist/lib/libdtrace/common/dt_impl.h +++ b/external/cddl/osnet/dist/lib/libdtrace/common/dt_impl.h @@ -20,21 +20,20 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _DT_IMPL_H #define _DT_IMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include #include #include #include +#include #ifdef __cplusplus extern "C" { @@ -498,7 +497,8 @@ enum { EDT_BADSETOPT, /* invalid setopt library action */ EDT_BADSTACKPC, /* invalid stack program counter size */ EDT_BADAGGVAR, /* invalid aggregation variable identifier */ - EDT_OVERSION /* client is requesting deprecated version */ + EDT_OVERSION, /* client is requesting deprecated version */ + EDT_ENABLING_ERR /* failed to enable probe */ }; /* @@ -568,17 +568,8 @@ extern int dt_buffered_flush(dtrace_hdl_t *, dtrace_probedata_t *, extern void dt_buffered_disable(dtrace_hdl_t *); extern void dt_buffered_destroy(dtrace_hdl_t *); -extern int dt_rw_read_held(pthread_rwlock_t *); -extern int dt_rw_write_held(pthread_rwlock_t *); -extern int dt_mutex_held(pthread_mutex_t *); - extern uint64_t dt_stddev(uint64_t *, uint64_t); -#define DT_RW_READ_HELD(x) dt_rw_read_held(x) -#define DT_RW_WRITE_HELD(x) dt_rw_write_held(x) -#define DT_RW_LOCK_HELD(x) (DT_RW_READ_HELD(x) || DT_RW_WRITE_HELD(x)) -#define DT_MUTEX_HELD(x) dt_mutex_held(x) - extern int dt_options_load(dtrace_hdl_t *); extern void dt_dprintf(const char *, ...); diff --git a/external/cddl/osnet/dist/lib/libdtrace/common/dt_module.c b/external/cddl/osnet/dist/lib/libdtrace/common/dt_module.c index 25197031ce112..f8fdc4edbeb2a 100644 --- a/external/cddl/osnet/dist/lib/libdtrace/common/dt_module.c +++ b/external/cddl/osnet/dist/lib/libdtrace/common/dt_module.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -66,6 +64,10 @@ dt_module_symhash_insert(dt_module_t *dmp, const char *name, uint_t id) static uint_t dt_module_syminit32(dt_module_t *dmp) { +#if STT_NUM != (STT_TLS + 1) +#error "STT_NUM has grown. update dt_module_syminit32()" +#endif + const Elf32_Sym *sym = dmp->dm_symtab.cts_data; const char *base = dmp->dm_strtab.cts_data; size_t ss_size = dmp->dm_strtab.cts_size; @@ -95,6 +97,10 @@ dt_module_syminit32(dt_module_t *dmp) static uint_t dt_module_syminit64(dt_module_t *dmp) { +#if STT_NUM != (STT_TLS + 1) +#error "STT_NUM has grown. update dt_module_syminit64()" +#endif + const Elf64_Sym *sym = dmp->dm_symtab.cts_data; const char *base = dmp->dm_strtab.cts_data; size_t ss_size = dmp->dm_strtab.cts_size; @@ -468,7 +474,7 @@ dt_module_load_sect(dtrace_hdl_t *dtp, dt_module_t *dmp, ctf_sect_t *ctsp) Elf_Data *dp; Elf_Scn *sp; - if (elf_getshstrndx(dmp->dm_elf, &shstrs) == 0) + if (elf_getshdrstrndx(dmp->dm_elf, &shstrs) == -1) return (dt_set_errno(dtp, EDT_NOTLOADED)); for (sp = NULL; (sp = elf_nextscn(dmp->dm_elf, sp)) != NULL; ) { @@ -817,7 +823,7 @@ dt_module_update(dtrace_hdl_t *dtp, const char *name) (void) close(fd); if (dmp->dm_elf == NULL || err == -1 || - elf_getshstrndx(dmp->dm_elf, &shstrs) == 0) { + elf_getshdrstrndx(dmp->dm_elf, &shstrs) == -1) { dt_dprintf("failed to load %s: %s\n", fname, elf_errmsg(elf_errno())); dt_module_destroy(dtp, dmp); diff --git a/external/cddl/osnet/dist/lib/libdtrace/common/dt_pid.c b/external/cddl/osnet/dist/lib/libdtrace/common/dt_pid.c index 091772405ffd1..241805154adcf 100644 --- a/external/cddl/osnet/dist/lib/libdtrace/common/dt_pid.c +++ b/external/cddl/osnet/dist/lib/libdtrace/common/dt_pid.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. 
All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -576,7 +576,7 @@ dt_pid_create_usdt_probes(dtrace_probedesc_t *pdp, dtrace_hdl_t *dtp, struct ps_prochandle *P = dpr->dpr_proc; int ret = 0; - assert(DT_MUTEX_HELD(&dpr->dpr_lock)); + assert(MUTEX_HELD(&dpr->dpr_lock)); (void) Pupdate_maps(P); if (Pobject_iter(P, dt_pid_usdt_mapping, P) != 0) { diff --git a/external/cddl/osnet/dist/lib/libdtrace/common/dt_printf.c b/external/cddl/osnet/dist/lib/libdtrace/common/dt_printf.c index 953511b1d029c..4400771214c9b 100644 --- a/external/cddl/osnet/dist/lib/libdtrace/common/dt_printf.c +++ b/external/cddl/osnet/dist/lib/libdtrace/common/dt_printf.c @@ -20,12 +20,10 @@ */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -330,7 +328,7 @@ pfprint_addr(dtrace_hdl_t *dtp, FILE *fp, const char *format, do { n = len; s = alloca(n); - } while ((len = dtrace_addr2str(dtp, val, s, n)) >= n); + } while ((len = dtrace_addr2str(dtp, val, s, n)) > n); return (dt_printf(dtp, fp, format, s)); } @@ -383,7 +381,7 @@ pfprint_uaddr(dtrace_hdl_t *dtp, FILE *fp, const char *format, do { n = len; s = alloca(n); - } while ((len = dtrace_uaddr2str(dtp, pid, val, s, n)) >= n); + } while ((len = dtrace_uaddr2str(dtp, pid, val, s, n)) > n); return (dt_printf(dtp, fp, format, s)); } @@ -1223,6 +1221,20 @@ pfprint_average(dtrace_hdl_t *dtp, FILE *fp, const char *format, data[0] ? data[1] / normal / data[0] : 0)); } +/*ARGSUSED*/ +static int +pfprint_stddev(dtrace_hdl_t *dtp, FILE *fp, const char *format, + const dt_pfargd_t *pfd, const void *addr, size_t size, uint64_t normal) +{ + const uint64_t *data = addr; + + if (size != sizeof (uint64_t) * 4) + return (dt_set_errno(dtp, EDT_DMISMATCH)); + + return (dt_printf(dtp, fp, format, + dt_stddev((uint64_t *)data, normal))); +} + /*ARGSUSED*/ static int pfprint_quantize(dtrace_hdl_t *dtp, FILE *fp, const char *format, @@ -1415,6 +1427,9 @@ dt_printf_format(dtrace_hdl_t *dtp, FILE *fp, const dt_pfargv_t *pfv, case DTRACEAGG_AVG: func = pfprint_average; break; + case DTRACEAGG_STDDEV: + func = pfprint_stddev; + break; case DTRACEAGG_QUANTIZE: func = pfprint_quantize; break; diff --git a/external/cddl/osnet/dist/lib/libdtrace/common/dt_proc.c b/external/cddl/osnet/dist/lib/libdtrace/common/dt_proc.c index 419f13b8474ca..001534163bd04 100644 --- a/external/cddl/osnet/dist/lib/libdtrace/common/dt_proc.c +++ b/external/cddl/osnet/dist/lib/libdtrace/common/dt_proc.c @@ -20,12 +20,10 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * DTrace Process Control * @@ -99,7 +97,7 @@ dt_proc_bpcreate(dt_proc_t *dpr, uintptr_t addr, dt_bkpt_f *func, void *data) struct ps_prochandle *P = dpr->dpr_proc; dt_bkpt_t *dbp; - assert(DT_MUTEX_HELD(&dpr->dpr_lock)); + assert(MUTEX_HELD(&dpr->dpr_lock)); if ((dbp = dt_zalloc(dpr->dpr_hdl, sizeof (dt_bkpt_t))) != NULL) { dbp->dbp_func = func; @@ -121,7 +119,7 @@ dt_proc_bpdestroy(dt_proc_t *dpr, int delbkpts) int state = Pstate(dpr->dpr_proc); dt_bkpt_t *dbp, *nbp; - assert(DT_MUTEX_HELD(&dpr->dpr_lock)); + assert(MUTEX_HELD(&dpr->dpr_lock)); for (dbp = dt_list_next(&dpr->dpr_bps); dbp != NULL; dbp = nbp) { if (delbkpts && dbp->dbp_active && @@ -141,7 +139,7 @@ dt_proc_bpmatch(dtrace_hdl_t *dtp, dt_proc_t *dpr) const lwpstatus_t *psp = &Pstatus(dpr->dpr_proc)->pr_lwp; dt_bkpt_t *dbp; - assert(DT_MUTEX_HELD(&dpr->dpr_lock)); + assert(MUTEX_HELD(&dpr->dpr_lock)); for (dbp = dt_list_next(&dpr->dpr_bps); dbp != NULL; dbp = dt_list_next(dbp)) { @@ -167,7 +165,7 @@ dt_proc_bpenable(dt_proc_t *dpr) { dt_bkpt_t *dbp; - assert(DT_MUTEX_HELD(&dpr->dpr_lock)); + assert(MUTEX_HELD(&dpr->dpr_lock)); for (dbp = dt_list_next(&dpr->dpr_bps); dbp != NULL; dbp = dt_list_next(dbp)) { @@ -184,7 +182,7 @@ dt_proc_bpdisable(dt_proc_t *dpr) { dt_bkpt_t *dbp; - assert(DT_MUTEX_HELD(&dpr->dpr_lock)); + assert(MUTEX_HELD(&dpr->dpr_lock)); for (dbp = dt_list_next(&dpr->dpr_bps); dbp != NULL; dbp = dt_list_next(dbp)) { @@ -232,7 +230,7 @@ dt_proc_notify(dtrace_hdl_t *dtp, dt_proc_hash_t *dph, dt_proc_t *dpr, static void dt_proc_stop(dt_proc_t *dpr, uint8_t why) { - assert(DT_MUTEX_HELD(&dpr->dpr_lock)); + assert(MUTEX_HELD(&dpr->dpr_lock)); assert(why != DT_PROC_STOP_IDLE); if (dpr->dpr_stop & why) { @@ -333,7 +331,7 @@ dt_proc_attach(dt_proc_t *dpr, int exec) rd_err_e err; GElf_Sym sym; - assert(DT_MUTEX_HELD(&dpr->dpr_lock)); + assert(MUTEX_HELD(&dpr->dpr_lock)); if (exec) { if (psp->pr_lwp.pr_errno != 0) @@ -399,7 +397,7 @@ dt_proc_waitrun(dt_proc_t *dpr) const long wstop = PCWSTOP; int pfd = Pctlfd(P); - assert(DT_MUTEX_HELD(&dpr->dpr_lock)); + assert(MUTEX_HELD(&dpr->dpr_lock)); assert(psp->pr_flags & PR_STOPPED); assert(Pstate(P) == PS_STOP); @@ -712,9 +710,12 @@ dt_proc_destroy(dtrace_hdl_t *dtp, struct ps_prochandle *P) if (!(Pstatus(dpr->dpr_proc)->pr_flags & (PR_KLC | PR_RLC))) { dt_dprintf("abandoning pid %d\n", (int)dpr->dpr_pid); rflag = PRELEASE_HANG; + } else if (Pstatus(dpr->dpr_proc)->pr_flags & PR_KLC) { + dt_dprintf("killing pid %d\n", (int)dpr->dpr_pid); + rflag = PRELEASE_KILL; /* apply kill-on-last-close */ } else { dt_dprintf("releasing pid %d\n", (int)dpr->dpr_pid); - rflag = 0; /* apply kill or run-on-last-close */ + rflag = 0; /* apply run-on-last-close */ } if (dpr->dpr_tid) { diff --git a/external/cddl/osnet/dist/lib/libdtrace/common/dt_program.c b/external/cddl/osnet/dist/lib/libdtrace/common/dt_program.c index 29d883aca4d24..8105df0737d01 100644 --- a/external/cddl/osnet/dist/lib/libdtrace/common/dt_program.c +++ b/external/cddl/osnet/dist/lib/libdtrace/common/dt_program.c @@ -20,12 +20,10 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -173,6 +171,9 @@ dtrace_program_exec(dtrace_hdl_t *dtp, dtrace_prog_t *pgp, case E2BIG: err = EDT_DIFSIZE; break; + case EBUSY: + err = EDT_ENABLING_ERR; + break; default: err = errno; } diff --git a/external/cddl/osnet/dist/lib/libdtrace/common/dt_subr.c b/external/cddl/osnet/dist/lib/libdtrace/common/dt_subr.c index b2163e69e9a65..97221c84d6cc0 100644 --- a/external/cddl/osnet/dist/lib/libdtrace/common/dt_subr.c +++ b/external/cddl/osnet/dist/lib/libdtrace/common/dt_subr.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -19,13 +18,12 @@ * * CDDL HEADER END */ + /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include @@ -57,8 +55,8 @@ int dtrace_xstr2desc(dtrace_hdl_t *dtp, dtrace_probespec_t spec, const char *s, int argc, char *const argv[], dtrace_probedesc_t *pdp) { - size_t off, len, vlen; - const char *p, *q, *v; + size_t off, len, vlen, wlen; + const char *p, *q, *v, *w; char buf[32]; /* for id_t as %d (see below) */ @@ -74,6 +72,8 @@ dtrace_xstr2desc(dtrace_hdl_t *dtp, dtrace_probespec_t spec, q = p + 1; vlen = 0; + w = NULL; + wlen = 0; if ((v = strchr(q, '$')) != NULL && v < q + len) { /* @@ -98,14 +98,14 @@ dtrace_xstr2desc(dtrace_hdl_t *dtp, dtrace_probespec_t spec, } if (isdigit(v[1])) { - char *end; long i; errno = 0; - i = strtol(v + 1, &end, 10); + i = strtol(v + 1, (char **)&w, 10); + + wlen = vlen - (w - v); - if (i < 0 || i >= argc || - errno != 0 || end != v + vlen) + if (i < 0 || i >= argc || errno != 0) return (dt_set_errno(dtp, EDT_BADSPCV)); v = argv[i]; @@ -141,7 +141,7 @@ dtrace_xstr2desc(dtrace_hdl_t *dtp, dtrace_probespec_t spec, off = dtrace_probespecs[spec--].dtps_offset; bcopy(q, (char *)pdp + off, len); bcopy(v, (char *)pdp + off + len, vlen); - + bcopy(w, (char *)pdp + off + len + vlen, wlen); } while (--p >= s); pdp->dtpd_id = DTRACE_IDNONE; @@ -803,30 +803,6 @@ dt_popcb(const ulong_t *bp, ulong_t n) return (popc + dt_popc(bp[maxw] & ((1UL << maxb) - 1))); } -struct _rwlock; -struct _lwp_mutex; - -int -dt_rw_read_held(pthread_rwlock_t *lock) -{ - extern int _rw_read_held(struct _rwlock *); - return (_rw_read_held((struct _rwlock *)lock)); -} - -int -dt_rw_write_held(pthread_rwlock_t *lock) -{ - extern int _rw_write_held(struct _rwlock *); - return (_rw_write_held((struct _rwlock *)lock)); -} - -int -dt_mutex_held(pthread_mutex_t *lock) -{ - extern int _mutex_held(struct _lwp_mutex *); - return (_mutex_held((struct _lwp_mutex *)lock)); -} - static int dt_string2str(char *s, char *str, int nbytes) { diff --git a/external/cddl/osnet/dist/lib/libnvpair/libnvpair.c b/external/cddl/osnet/dist/lib/libnvpair/libnvpair.c index 0845cb08cf8d8..57915cd7373e2 100644 --- a/external/cddl/osnet/dist/lib/libnvpair/libnvpair.c +++ b/external/cddl/osnet/dist/lib/libnvpair/libnvpair.c @@ -19,14 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include +#include #include #include #include "libnvpair.h" @@ -272,6 +271,156 @@ nvlist_print(FILE *fp, nvlist_t *nvl) nvlist_print_with_indent(fp, nvl, 0); } + +#define NVP(elem, type, vtype, ptype, format) { \ + vtype value; \ +\ + (void) nvpair_value_##type(elem, &value); \ + (void) printf("%*s%s: " format "\n", indent, "", \ + nvpair_name(elem), (ptype)value); \ +} + +#define NVPA(elem, type, vtype, ptype, format) { \ + uint_t i, count; \ + vtype *value; \ +\ + (void) nvpair_value_##type(elem, &value, &count); \ + for (i = 0; i < count; i++) { \ + (void) printf("%*s%s[%d]: " format "\n", indent, "", \ + nvpair_name(elem), i, (ptype)value[i]); \ + } \ +} + +/* + * Similar to nvlist_print() but handles arrays slightly differently. + */ +void +dump_nvlist(nvlist_t *list, int indent) +{ + nvpair_t *elem = NULL; + boolean_t bool_value; + nvlist_t *nvlist_value; + nvlist_t **nvlist_array_value; + uint_t i, count; + + if (list == NULL) { + return; + } + + while ((elem = nvlist_next_nvpair(list, elem)) != NULL) { + switch (nvpair_type(elem)) { + case DATA_TYPE_BOOLEAN_VALUE: + (void) nvpair_value_boolean_value(elem, &bool_value); + (void) printf("%*s%s: %s\n", indent, "", + nvpair_name(elem), bool_value ? "true" : "false"); + break; + + case DATA_TYPE_BYTE: + NVP(elem, byte, uchar_t, int, "%u"); + break; + + case DATA_TYPE_INT8: + NVP(elem, int8, int8_t, int, "%d"); + break; + + case DATA_TYPE_UINT8: + NVP(elem, uint8, uint8_t, int, "%u"); + break; + + case DATA_TYPE_INT16: + NVP(elem, int16, int16_t, int, "%d"); + break; + + case DATA_TYPE_UINT16: + NVP(elem, uint16, uint16_t, int, "%u"); + break; + + case DATA_TYPE_INT32: + NVP(elem, int32, int32_t, long, "%ld"); + break; + + case DATA_TYPE_UINT32: + NVP(elem, uint32, uint32_t, ulong_t, "%lu"); + break; + + case DATA_TYPE_INT64: + NVP(elem, int64, int64_t, longlong_t, "%lld"); + break; + + case DATA_TYPE_UINT64: + NVP(elem, uint64, uint64_t, u_longlong_t, "%llu"); + break; + + case DATA_TYPE_STRING: + NVP(elem, string, char *, char *, "'%s'"); + break; + + case DATA_TYPE_BYTE_ARRAY: + NVPA(elem, byte_array, uchar_t, int, "%u"); + break; + + case DATA_TYPE_INT8_ARRAY: + NVPA(elem, int8_array, int8_t, int, "%d"); + break; + + case DATA_TYPE_UINT8_ARRAY: + NVPA(elem, uint8_array, uint8_t, int, "%u"); + break; + + case DATA_TYPE_INT16_ARRAY: + NVPA(elem, int16_array, int16_t, int, "%d"); + break; + + case DATA_TYPE_UINT16_ARRAY: + NVPA(elem, uint16_array, uint16_t, int, "%u"); + break; + + case DATA_TYPE_INT32_ARRAY: + NVPA(elem, int32_array, int32_t, long, "%ld"); + break; + + case DATA_TYPE_UINT32_ARRAY: + NVPA(elem, uint32_array, uint32_t, ulong_t, "%lu"); + break; + + case DATA_TYPE_INT64_ARRAY: + NVPA(elem, int64_array, int64_t, longlong_t, "%lld"); + break; + + case DATA_TYPE_UINT64_ARRAY: + NVPA(elem, uint64_array, uint64_t, u_longlong_t, + "%llu"); + break; + + case DATA_TYPE_STRING_ARRAY: + NVPA(elem, string_array, char *, char *, "'%s'"); + break; + + case DATA_TYPE_NVLIST: + (void) nvpair_value_nvlist(elem, &nvlist_value); + (void) printf("%*s%s:\n", indent, "", + nvpair_name(elem)); + dump_nvlist(nvlist_value, indent + 4); + break; + + case DATA_TYPE_NVLIST_ARRAY: + (void) nvpair_value_nvlist_array(elem, + &nvlist_array_value, &count); + for (i = 0; i < count; i++) { + (void) printf("%*s%s[%u]:\n", indent, "", + nvpair_name(elem), i); + dump_nvlist(nvlist_array_value[i], indent + 4); + } 
+ break; + + default: + (void) printf(dgettext(TEXT_DOMAIN, "bad config type " + "%d for %s\n"), nvpair_type(elem), + nvpair_name(elem)); + } + } +} + /* * Determine if string 'value' matches 'nvp' value. The 'value' string is * converted, depending on the type of 'nvp', prior to match. For numeric diff --git a/external/cddl/osnet/dist/lib/libnvpair/libnvpair.h b/external/cddl/osnet/dist/lib/libnvpair/libnvpair.h index e655e0d4069dc..15c1c781679f8 100644 --- a/external/cddl/osnet/dist/lib/libnvpair/libnvpair.h +++ b/external/cddl/osnet/dist/lib/libnvpair/libnvpair.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _LIBNVPAIR_H #define _LIBNVPAIR_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -40,6 +38,7 @@ extern "C" { void nvlist_print(FILE *, nvlist_t *); int nvpair_value_match(nvpair_t *, int, char *, char **); int nvpair_value_match_regex(nvpair_t *, int, char *, regex_t *, char **); +void dump_nvlist(nvlist_t *, int); #ifdef __cplusplus } diff --git a/external/cddl/osnet/dist/lib/libshare/common/libshare.h b/external/cddl/osnet/dist/lib/libshare/common/libshare.h index a560b7731fcdb..e733ea4d10bdb 100644 --- a/external/cddl/osnet/dist/lib/libshare/common/libshare.h +++ b/external/cddl/osnet/dist/lib/libshare/common/libshare.h @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -31,8 +31,6 @@ #ifndef _LIBSHARE_H #define _LIBSHARE_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif diff --git a/external/cddl/osnet/dist/lib/libzfs/common/libzfs.h b/external/cddl/osnet/dist/lib/libzfs/common/libzfs.h index c650865f30adb..b5630534749fd 100644 --- a/external/cddl/osnet/dist/lib/libzfs/common/libzfs.h +++ b/external/cddl/osnet/dist/lib/libzfs/common/libzfs.h @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
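dump_nvlist(), newly exported through libnvpair.h above, differs from nvlist_print() mainly in how it expands arrays and nested lists: one element per line, with each nesting level indented by four more spaces. A minimal illustrative caller (not part of the patch):

#include <libnvpair.h>

int
main(void)
{
	nvlist_t *nvl, *child;

	if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0 ||
	    nvlist_alloc(&child, NV_UNIQUE_NAME, 0) != 0)
		return (1);

	(void) nvlist_add_string(nvl, "pool", "tank");
	(void) nvlist_add_uint64(child, "ashift", 9);
	(void) nvlist_add_nvlist(nvl, "vdev", child);

	/* prints: pool: 'tank' / vdev: / (indented) ashift: 9 */
	dump_nvlist(nvl, 0);

	nvlist_free(child);
	nvlist_free(nvl);
	return (0);
}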
*/ @@ -29,6 +29,7 @@ #include #include +#include #include #include #include @@ -65,7 +66,6 @@ enum { EZFS_BADSTREAM, /* bad backup stream */ EZFS_DSREADONLY, /* dataset is readonly */ EZFS_VOLTOOBIG, /* volume is too large for 32-bit system */ - EZFS_VOLHASDATA, /* volume already contains data */ EZFS_INVALIDNAME, /* invalid dataset name */ EZFS_BADRESTORE, /* unable to restore to destination */ EZFS_BADBACKUP, /* backup failed */ @@ -84,7 +84,6 @@ enum { EZFS_UMOUNTFAILED, /* failed to unmount dataset */ EZFS_UNSHARENFSFAILED, /* unshare(1M) failed */ EZFS_SHARENFSFAILED, /* share(1M) failed */ - EZFS_DEVLINKS, /* failed to create zvol links */ EZFS_PERM, /* permission denied */ EZFS_NOSPC, /* out of space */ EZFS_IO, /* I/O error */ @@ -115,6 +114,13 @@ enum { EZFS_VDEVNOTSUP, /* unsupported vdev type */ EZFS_NOTSUP, /* ops not supported on this dataset */ EZFS_ACTIVE_SPARE, /* pool has active shared spare devices */ + EZFS_UNPLAYED_LOGS, /* log device has unplayed logs */ + EZFS_REFTAG_RELE, /* snapshot release: tag not found */ + EZFS_REFTAG_HOLD, /* snapshot hold: tag already exists */ + EZFS_TAGTOOLONG, /* snapshot hold/rele: tag too long */ + EZFS_PIPEFAILED, /* pipe create failed */ + EZFS_THREADCREATEFAILED, /* thread create failed */ + EZFS_POSTSPLIT_ONLINE, /* onlining a disk after splitting it */ EZFS_UNKNOWN }; @@ -175,6 +181,14 @@ extern void libzfs_print_on_error(libzfs_handle_t *, boolean_t); extern int libzfs_errno(libzfs_handle_t *); extern const char *libzfs_error_action(libzfs_handle_t *); extern const char *libzfs_error_description(libzfs_handle_t *); +extern void libzfs_mnttab_init(libzfs_handle_t *); +extern void libzfs_mnttab_fini(libzfs_handle_t *); +extern void libzfs_mnttab_cache(libzfs_handle_t *, boolean_t); +extern int libzfs_mnttab_find(libzfs_handle_t *, const char *, + struct mnttab *); +extern void libzfs_mnttab_add(libzfs_handle_t *, const char *, + const char *, const char *); +extern void libzfs_mnttab_remove(libzfs_handle_t *, const char *); /* * Basic handle functions @@ -201,11 +215,19 @@ extern int zpool_create(libzfs_handle_t *, const char *, nvlist_t *, extern int zpool_destroy(zpool_handle_t *); extern int zpool_add(zpool_handle_t *, nvlist_t *); +typedef struct splitflags { + /* do not split, but return the config that would be split off */ + int dryrun : 1; + + /* after splitting, import the pool */ + int import : 1; +} splitflags_t; + /* * Functions to manipulate pool and vdev state */ extern int zpool_scrub(zpool_handle_t *, pool_scrub_type_t); -extern int zpool_clear(zpool_handle_t *, const char *); +extern int zpool_clear(zpool_handle_t *, const char *, nvlist_t *); extern int zpool_vdev_online(zpool_handle_t *, const char *, int, vdev_state_t *); @@ -214,13 +236,17 @@ extern int zpool_vdev_attach(zpool_handle_t *, const char *, const char *, nvlist_t *, int); extern int zpool_vdev_detach(zpool_handle_t *, const char *); extern int zpool_vdev_remove(zpool_handle_t *, const char *); +extern int zpool_vdev_split(zpool_handle_t *, char *, nvlist_t **, nvlist_t *, + splitflags_t); -extern int zpool_vdev_fault(zpool_handle_t *, uint64_t); -extern int zpool_vdev_degrade(zpool_handle_t *, uint64_t); +extern int zpool_vdev_fault(zpool_handle_t *, uint64_t, vdev_aux_t); +extern int zpool_vdev_degrade(zpool_handle_t *, uint64_t, vdev_aux_t); extern int zpool_vdev_clear(zpool_handle_t *, uint64_t); extern nvlist_t *zpool_find_vdev(zpool_handle_t *, const char *, boolean_t *, boolean_t *, boolean_t *); +extern nvlist_t 
*zpool_find_vdev_by_physpath(zpool_handle_t *, const char *, + boolean_t *, boolean_t *, boolean_t *); extern int zpool_label_disk(libzfs_handle_t *, zpool_handle_t *, char *); /* @@ -256,9 +282,15 @@ typedef enum { ZPOOL_STATUS_HOSTID_MISMATCH, /* last accessed by another system */ ZPOOL_STATUS_IO_FAILURE_WAIT, /* failed I/O, failmode 'wait' */ ZPOOL_STATUS_IO_FAILURE_CONTINUE, /* failed I/O, failmode 'continue' */ + ZPOOL_STATUS_BAD_LOG, /* cannot read log chain(s) */ + + /* + * These faults have no corresponding message ID. At the time we are + * checking the status, the original reason for the FMA fault (I/O or + * checksum errors) has been lost. + */ ZPOOL_STATUS_FAULTED_DEV_R, /* faulted device with replicas */ ZPOOL_STATUS_FAULTED_DEV_NR, /* faulted device with no replicas */ - ZPOOL_STATUS_BAD_LOG, /* cannot read log chain(s) */ /* * The following are not faults per se, but still an error possibly @@ -268,6 +300,7 @@ typedef enum { ZPOOL_STATUS_VERSION_OLDER, /* older on-disk version */ ZPOOL_STATUS_RESILVERING, /* device being resilvered */ ZPOOL_STATUS_OFFLINE_DEV, /* device online */ + ZPOOL_STATUS_REMOVED_DEV, /* removed device */ /* * Finally, the following indicates a healthy pool. @@ -277,6 +310,7 @@ typedef enum { extern zpool_status_t zpool_get_status(zpool_handle_t *, char **); extern zpool_status_t zpool_import_status(nvlist_t *, char **); +extern void zpool_dump_ddt(const ddt_stat_t *dds, const ddt_histogram_t *ddh); /* * Statistics and configuration functions. @@ -289,6 +323,7 @@ extern int zpool_get_errlog(zpool_handle_t *, nvlist_t **); * Import and export functions */ extern int zpool_export(zpool_handle_t *, boolean_t); +extern int zpool_export_force(zpool_handle_t *); extern int zpool_import(libzfs_handle_t *, nvlist_t *, const char *, char *altroot); extern int zpool_import_props(libzfs_handle_t *, nvlist_t *, const char *, @@ -297,30 +332,48 @@ extern int zpool_import_props(libzfs_handle_t *, nvlist_t *, const char *, /* * Search for pools to import */ + +typedef struct importargs { + char **path; /* a list of paths to search */ + int paths; /* number of paths to search */ + char *poolname; /* name of a pool to find */ + uint64_t guid; /* guid of a pool to find */ + char *cachefile; /* cachefile to use for import */ + int can_be_active : 1; /* can the pool be active? */ + int unique : 1; /* does 'poolname' already exist? 
*/ + int exists : 1; /* set on return if pool already exists */ +} importargs_t; + +extern nvlist_t *zpool_search_import(libzfs_handle_t *, importargs_t *); + +/* legacy pool search routines */ extern nvlist_t *zpool_find_import(libzfs_handle_t *, int, char **); extern nvlist_t *zpool_find_import_cached(libzfs_handle_t *, const char *, char *, uint64_t); -extern nvlist_t *zpool_find_import_byname(libzfs_handle_t *, int, char **, - char *); -extern nvlist_t *zpool_find_import_byguid(libzfs_handle_t *, int, char **, - uint64_t); -extern nvlist_t *zpool_find_import_activeok(libzfs_handle_t *, int, char **); /* * Miscellaneous pool functions */ struct zfs_cmd; -extern char *zpool_vdev_name(libzfs_handle_t *, zpool_handle_t *, nvlist_t *); +extern const char *hist_event_table[LOG_END]; + +extern char *zpool_vdev_name(libzfs_handle_t *, zpool_handle_t *, nvlist_t *, + boolean_t verbose); extern int zpool_upgrade(zpool_handle_t *, uint64_t); extern int zpool_get_history(zpool_handle_t *, nvlist_t **); +extern int zpool_history_unpack(char *, uint64_t, uint64_t *, + nvlist_t ***, uint_t *); extern void zpool_set_history_str(const char *subcommand, int argc, char **argv, char *history_str); extern int zpool_stage_history(libzfs_handle_t *, const char *); extern void zpool_obj_to_path(zpool_handle_t *, uint64_t, uint64_t, char *, size_t len); extern int zfs_ioctl(libzfs_handle_t *, int, struct zfs_cmd *); -extern int zpool_get_physpath(zpool_handle_t *, char *); +extern int zpool_get_physpath(zpool_handle_t *, char *, size_t); +extern void zpool_explain_recover(libzfs_handle_t *, const char *, int, + nvlist_t *); + /* * Basic handle manipulations. These functions do not create or destroy the * underlying datasets, only the references to them. @@ -351,13 +404,20 @@ extern const char *zfs_prop_to_name(zfs_prop_t); extern int zfs_prop_set(zfs_handle_t *, const char *, const char *); extern int zfs_prop_get(zfs_handle_t *, zfs_prop_t, char *, size_t, zprop_source_t *, char *, size_t, boolean_t); +extern int zfs_prop_get_recvd(zfs_handle_t *, const char *, char *, size_t, + boolean_t); extern int zfs_prop_get_numeric(zfs_handle_t *, zfs_prop_t, uint64_t *, zprop_source_t *, char *, size_t); +extern int zfs_prop_get_userquota_int(zfs_handle_t *zhp, const char *propname, + uint64_t *propvalue); +extern int zfs_prop_get_userquota(zfs_handle_t *zhp, const char *propname, + char *propbuf, int proplen, boolean_t literal); extern uint64_t zfs_prop_get_int(zfs_handle_t *, zfs_prop_t); -extern int zfs_prop_inherit(zfs_handle_t *, const char *); +extern int zfs_prop_inherit(zfs_handle_t *, const char *, boolean_t); extern const char *zfs_prop_values(zfs_prop_t); extern int zfs_prop_is_string(zfs_prop_t prop); extern nvlist_t *zfs_get_user_props(zfs_handle_t *); +extern nvlist_t *zfs_get_recvd_props(zfs_handle_t *); typedef struct zprop_list { int pl_prop; @@ -365,10 +425,12 @@ typedef struct zprop_list { struct zprop_list *pl_next; boolean_t pl_all; size_t pl_width; + size_t pl_recvd_width; boolean_t pl_fixed; } zprop_list_t; -extern int zfs_expand_proplist(zfs_handle_t *, zprop_list_t **); +extern int zfs_expand_proplist(zfs_handle_t *, zprop_list_t **, boolean_t); +extern void zfs_prune_proplist(zfs_handle_t *, uint8_t *); #define ZFS_MOUNTPOINT_NONE "none" #define ZFS_MOUNTPOINT_LEGACY "legacy" @@ -391,13 +453,24 @@ extern int zprop_get_list(libzfs_handle_t *, char *, zprop_list_t **, zfs_type_t); extern void zprop_free_list(zprop_list_t *); +#define ZFS_GET_NCOLS 5 + +typedef enum { + GET_COL_NONE, + 
GET_COL_NAME, + GET_COL_PROPERTY, + GET_COL_VALUE, + GET_COL_RECVD, + GET_COL_SOURCE +} zfs_get_column_t; + /* * Functions for printing zfs or zpool properties */ typedef struct zprop_get_cbdata { int cb_sources; - int cb_columns[4]; - int cb_colwidths[5]; + zfs_get_column_t cb_columns[ZFS_GET_NCOLS]; + int cb_colwidths[ZFS_GET_NCOLS + 1]; boolean_t cb_scripted; boolean_t cb_literal; boolean_t cb_first; @@ -406,12 +479,8 @@ typedef struct zprop_get_cbdata { } zprop_get_cbdata_t; void zprop_print_one_property(const char *, zprop_get_cbdata_t *, - const char *, const char *, zprop_source_t, const char *); - -#define GET_COL_NAME 1 -#define GET_COL_PROPERTY 2 -#define GET_COL_VALUE 3 -#define GET_COL_SOURCE 4 + const char *, const char *, zprop_source_t, const char *, + const char *); /* * Iterator functions. @@ -422,6 +491,7 @@ extern int zfs_iter_children(zfs_handle_t *, zfs_iter_f, void *); extern int zfs_iter_dependents(zfs_handle_t *, boolean_t, zfs_iter_f, void *); extern int zfs_iter_filesystems(zfs_handle_t *, zfs_iter_f, void *); extern int zfs_iter_snapshots(zfs_handle_t *, zfs_iter_f, void *); +extern int zfs_iter_snapshots_sorted(zfs_handle_t *, zfs_iter_f, void *); /* * Functions to create and destroy datasets. @@ -429,15 +499,53 @@ extern int zfs_iter_snapshots(zfs_handle_t *, zfs_iter_f, void *); extern int zfs_create(libzfs_handle_t *, const char *, zfs_type_t, nvlist_t *); extern int zfs_create_ancestors(libzfs_handle_t *, const char *); -extern int zfs_destroy(zfs_handle_t *); -extern int zfs_destroy_snaps(zfs_handle_t *, char *); +extern int zfs_destroy(zfs_handle_t *, boolean_t); +extern int zfs_destroy_snaps(zfs_handle_t *, char *, boolean_t); extern int zfs_clone(zfs_handle_t *, const char *, nvlist_t *); extern int zfs_snapshot(libzfs_handle_t *, const char *, boolean_t, nvlist_t *); extern int zfs_rollback(zfs_handle_t *, zfs_handle_t *, boolean_t); extern int zfs_rename(zfs_handle_t *, const char *, boolean_t); + +typedef struct sendflags { + /* print informational messages (ie, -v was specified) */ + int verbose : 1; + + /* recursive send (ie, -R) */ + int replicate : 1; + + /* for incrementals, do all intermediate snapshots */ + int doall : 1; /* (ie, -I) */ + + /* if dataset is a clone, do incremental from its origin */ + int fromorigin : 1; + + /* do deduplication */ + int dedup : 1; + + /* send properties (ie, -p) */ + int props : 1; +} sendflags_t; + +typedef boolean_t (snapfilter_cb_t)(zfs_handle_t *, void *); + extern int zfs_send(zfs_handle_t *, const char *, const char *, - boolean_t, boolean_t, boolean_t, boolean_t, int); + sendflags_t, int, snapfilter_cb_t, void *); + extern int zfs_promote(zfs_handle_t *); +extern int zfs_hold(zfs_handle_t *, const char *, const char *, boolean_t, + boolean_t, boolean_t); +extern int zfs_hold_range(zfs_handle_t *, const char *, const char *, + const char *, boolean_t, boolean_t); +extern int zfs_release(zfs_handle_t *, const char *, const char *, boolean_t); +extern int zfs_release_range(zfs_handle_t *, const char *, const char *, + const char *, boolean_t); +extern uint64_t zvol_volsize_to_reservation(uint64_t, nvlist_t *); + +typedef int (*zfs_userspace_cb_t)(void *arg, const char *domain, + uid_t rid, uint64_t space); + +extern int zfs_userspace(zfs_handle_t *zhp, zfs_userquota_prop_t type, + zfs_userspace_cb_t func, void *arg); typedef struct recvflags { /* print informational messages (ie, -v was specified) */ @@ -446,6 +554,12 @@ typedef struct recvflags { /* the destination is a prefix, not the exact fs (ie, -d) */ 
int isprefix : 1; + /* + * Only the tail of the sent snapshot path is appended to the + * destination to determine the received snapshot name (ie, -e). + */ + int istail : 1; + /* do not actually do the recv, just check if it would work (ie, -n) */ int dryrun : 1; @@ -457,6 +571,9 @@ typedef struct recvflags { /* byteswap flag is used internally; callers need not specify */ int byteswap : 1; + + /* do not mount file systems as they are extracted (private) */ + int nomount : 1; } recvflags_t; extern int zfs_receive(libzfs_handle_t *, const char *, recvflags_t, @@ -473,17 +590,6 @@ extern boolean_t zfs_dataset_exists(libzfs_handle_t *, const char *, zfs_type_t); extern int zfs_spa_version(zfs_handle_t *, int *); -/* - * dataset permission functions. - */ -extern int zfs_perm_set(zfs_handle_t *, nvlist_t *); -extern int zfs_perm_remove(zfs_handle_t *, nvlist_t *); -extern int zfs_build_perms(zfs_handle_t *, char *, char *, - zfs_deleg_who_type_t, zfs_deleg_inherit_t, nvlist_t **nvlist_t); -extern int zfs_perm_get(zfs_handle_t *, zfs_allow_t **); -extern void zfs_free_allows(zfs_allow_t *); -extern void zfs_deleg_permissions(void); - /* * Mount support functions. */ @@ -518,7 +624,7 @@ extern boolean_t zfs_is_shared_iscsi(zfs_handle_t *); extern int zfs_share_iscsi(zfs_handle_t *); extern int zfs_unshare_iscsi(zfs_handle_t *); extern int zfs_iscsi_perm_check(libzfs_handle_t *, char *, ucred_t *); -extern int zfs_deleg_share_nfs(libzfs_handle_t *, char *, char *, +extern int zfs_deleg_share_nfs(libzfs_handle_t *, char *, char *, char *, void *, void *, int, zfs_share_op_t); /* @@ -543,18 +649,22 @@ extern int zpool_in_use(libzfs_handle_t *, int, pool_state_t *, char **, boolean_t *); /* - * ftyp special. Read the label from a given device. + * Label manipulation. */ extern int zpool_read_label(int, nvlist_t **); +extern int zpool_clear_label(int); + +/* is this zvol valid for use as a dump device? */ +extern int zvol_check_dump_config(char *); /* - * Create and remove zvol /dev links. + * Management interfaces for SMB ACL files */ -extern int zpool_create_zvol_links(zpool_handle_t *); -extern int zpool_remove_zvol_links(zpool_handle_t *); -/* is this zvol valid for use as a dump device? */ -extern int zvol_check_dump_config(char *); +int zfs_smb_acl_add(libzfs_handle_t *, char *, char *, char *); +int zfs_smb_acl_remove(libzfs_handle_t *, char *, char *, char *); +int zfs_smb_acl_purge(libzfs_handle_t *, char *, char *); +int zfs_smb_acl_rename(libzfs_handle_t *, char *, char *, char *, char *); /* * Enable and disable datasets within a pool by mounting/unmounting and @@ -563,6 +673,17 @@ extern int zvol_check_dump_config(char *); extern int zpool_enable_datasets(zpool_handle_t *, const char *, int); extern int zpool_disable_datasets(zpool_handle_t *, boolean_t); +/* + * Mappings between vdev and FRU. 
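As a usage sketch (not part of this change), the reworked zfs_send() now takes a sendflags_t plus an output descriptor and an optional snapshot filter; assuming the two string arguments are the from- and to-snapshot names, as in the OpenSolaris implementation, a full replicated send to stdout might look like this (the snapshot name is illustrative):

#include <unistd.h>
#include <libzfs.h>

/* Illustrative only: stream snapshot "snap" of the dataset zhp to stdout. */
static int
send_full_stream(zfs_handle_t *zhp)
{
	sendflags_t flags = { 0 };

	flags.replicate = 1;	/* like zfs send -R */
	flags.props = 1;	/* like zfs send -p */

	/* NULL from-snapshot: a full, non-incremental stream. */
	return (zfs_send(zhp, NULL, "snap", flags, STDOUT_FILENO, NULL, NULL));
}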
+ */ +extern void libzfs_fru_refresh(libzfs_handle_t *); +extern const char *libzfs_fru_lookup(libzfs_handle_t *, const char *); +extern const char *libzfs_fru_devpath(libzfs_handle_t *, const char *); +extern boolean_t libzfs_fru_compare(libzfs_handle_t *, const char *, + const char *); +extern boolean_t libzfs_fru_notself(libzfs_handle_t *, const char *); +extern int zpool_fru_set(zpool_handle_t *, uint64_t, const char *); + #ifdef __cplusplus } #endif diff --git a/external/cddl/osnet/dist/lib/libzfs/common/libzfs_changelist.c b/external/cddl/osnet/dist/lib/libzfs/common/libzfs_changelist.c index b905bc6cb6afc..c970d1e488d1b 100644 --- a/external/cddl/osnet/dist/lib/libzfs/common/libzfs_changelist.c +++ b/external/cddl/osnet/dist/lib/libzfs/common/libzfs_changelist.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * * Portions Copyright 2007 Ramprakash Jelari @@ -119,18 +119,8 @@ changelist_prefix(prop_changelist_t *clp) if (ZFS_IS_VOLUME(cn->cn_handle)) { switch (clp->cl_realprop) { case ZFS_PROP_NAME: - /* - * If this was a rename, unshare the zvol, and - * remove the /dev/zvol links. - */ + /* If this was a rename, unshare the zvol */ (void) zfs_unshare_iscsi(cn->cn_handle); - - if (zvol_remove_link(cn->cn_handle->zfs_hdl, - cn->cn_handle->zfs_name) != 0) { - ret = -1; - cn->cn_needpost = B_FALSE; - (void) zfs_share_iscsi(cn->cn_handle); - } break; case ZFS_PROP_VOLSIZE: @@ -218,6 +208,7 @@ changelist_postfix(prop_changelist_t *clp) boolean_t sharenfs; boolean_t sharesmb; + boolean_t mounted; /* * If we are in the global zone, but this dataset is exported @@ -234,15 +225,7 @@ changelist_postfix(prop_changelist_t *clp) zfs_refresh_properties(cn->cn_handle); if (ZFS_IS_VOLUME(cn->cn_handle)) { - /* - * If we're doing a rename, recreate the /dev/zvol - * links. - */ - if (clp->cl_realprop == ZFS_PROP_NAME && - zvol_create_link(cn->cn_handle->zfs_hdl, - cn->cn_handle->zfs_name) != 0) { - errors++; - } else if (cn->cn_shared || + if (cn->cn_shared || clp->cl_prop == ZFS_PROP_SHAREISCSI) { if (zfs_prop_get(cn->cn_handle, ZFS_PROP_SHAREISCSI, shareopts, @@ -272,20 +255,29 @@ changelist_postfix(prop_changelist_t *clp) shareopts, sizeof (shareopts), NULL, NULL, 0, B_FALSE) == 0) && (strcmp(shareopts, "off") != 0)); - if ((cn->cn_mounted || clp->cl_waslegacy || sharenfs || - sharesmb) && !zfs_is_mounted(cn->cn_handle, NULL) && - zfs_mount(cn->cn_handle, NULL, 0) != 0) - errors++; + mounted = zfs_is_mounted(cn->cn_handle, NULL); + + if (!mounted && (cn->cn_mounted || + ((sharenfs || sharesmb || clp->cl_waslegacy) && + (zfs_prop_get_int(cn->cn_handle, + ZFS_PROP_CANMOUNT) == ZFS_CANMOUNT_ON)))) { + + if (zfs_mount(cn->cn_handle, NULL, 0) != 0) + errors++; + else + mounted = TRUE; + } /* - * We always re-share even if the filesystem is currently - * shared, so that we can adopt any new options. + * If the file system is mounted we always re-share even + * if the filesystem is currently shared, so that we can + * adopt any new options. 
*/ - if (sharenfs) + if (sharenfs && mounted) errors += zfs_share_nfs(cn->cn_handle); else if (cn->cn_shared || clp->cl_waslegacy) errors += zfs_unshare_nfs(cn->cn_handle, NULL); - if (sharesmb) + if (sharesmb && mounted) errors += zfs_share_smb(cn->cn_handle); else if (cn->cn_shared || clp->cl_waslegacy) errors += zfs_unshare_smb(cn->cn_handle, NULL); @@ -498,6 +490,14 @@ change_one(zfs_handle_t *zhp, void *data) &idx); uu_list_insert(clp->cl_list, cn, idx); } else { + /* + * Add this child to beginning of the list. Children + * below this one in the hierarchy will get added above + * this one in the list. This produces a list in + * reverse dataset name order. + * This is necessary when the original mountpoint + * is legacy or none. + */ ASSERT(!clp->cl_alldependents); verify(uu_list_insert_before(clp->cl_list, uu_list_first(clp->cl_list), cn) == 0); @@ -564,6 +564,7 @@ changelist_gather(zfs_handle_t *zhp, zfs_prop_t prop, int gather_flags, zfs_handle_t *temp; char property[ZFS_MAXPROPLEN]; uu_compare_fn_t *compare = NULL; + boolean_t legacy = B_FALSE; if ((clp = zfs_alloc(zhp->zfs_hdl, sizeof (prop_changelist_t))) == NULL) return (NULL); @@ -576,8 +577,19 @@ changelist_gather(zfs_handle_t *zhp, zfs_prop_t prop, int gather_flags, if (prop == ZFS_PROP_NAME || prop == ZFS_PROP_ZONED || prop == ZFS_PROP_MOUNTPOINT || prop == ZFS_PROP_SHARENFS || prop == ZFS_PROP_SHARESMB) { - compare = compare_mountpoints; - clp->cl_sorted = B_TRUE; + + if (zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, + property, sizeof (property), + NULL, NULL, 0, B_FALSE) == 0 && + (strcmp(property, "legacy") == 0 || + strcmp(property, "none") == 0)) { + + legacy = B_TRUE; + } + if (!legacy) { + compare = compare_mountpoints; + clp->cl_sorted = B_TRUE; + } } clp->cl_pool = uu_list_pool_create("changelist_pool", @@ -621,8 +633,6 @@ changelist_gather(zfs_handle_t *zhp, zfs_prop_t prop, int gather_flags, clp->cl_prop = ZFS_PROP_MOUNTPOINT; } else if (prop == ZFS_PROP_VOLSIZE) { clp->cl_prop = ZFS_PROP_MOUNTPOINT; - } else if (prop == ZFS_PROP_VERSION) { - clp->cl_prop = ZFS_PROP_MOUNTPOINT; } else { clp->cl_prop = prop; } @@ -687,6 +697,12 @@ changelist_gather(zfs_handle_t *zhp, zfs_prop_t prop, int gather_flags, (void) uu_list_find(clp->cl_list, cn, NULL, &idx); uu_list_insert(clp->cl_list, cn, idx); } else { + /* + * Add the target dataset to the end of the list. + * The list is not really unsorted. The list will be + * in reverse dataset name order. This is necessary + * when the original mountpoint is legacy or none. + */ verify(uu_list_insert_after(clp->cl_list, uu_list_last(clp->cl_list), cn) == 0); } @@ -695,11 +711,7 @@ changelist_gather(zfs_handle_t *zhp, zfs_prop_t prop, int gather_flags, * If the mountpoint property was previously 'legacy', or 'none', * record it as the behavior of changelist_postfix() will be different. 
*/ - if ((clp->cl_prop == ZFS_PROP_MOUNTPOINT) && - (zfs_prop_get(zhp, prop, property, sizeof (property), - NULL, NULL, 0, B_FALSE) == 0 && - (strcmp(property, "legacy") == 0 || - strcmp(property, "none") == 0))) { + if ((clp->cl_prop == ZFS_PROP_MOUNTPOINT) && legacy) { /* * do not automatically mount ex-legacy datasets if * we specifically set canmount to noauto diff --git a/external/cddl/osnet/dist/lib/libzfs/common/libzfs_config.c b/external/cddl/osnet/dist/lib/libzfs/common/libzfs_config.c index 94640d1b128cf..dc27238c9cf37 100644 --- a/external/cddl/osnet/dist/lib/libzfs/common/libzfs_config.c +++ b/external/cddl/osnet/dist/lib/libzfs/common/libzfs_config.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * The pool configuration repository is stored in /etc/zfs/zpool.cache as a * single packed nvlist. While it would be nice to just read in this @@ -313,21 +311,33 @@ zpool_iter(libzfs_handle_t *hdl, zpool_iter_f func, void *data) zpool_handle_t *zhp; int ret; - if (namespace_reload(hdl) != 0) + /* + * If someone makes a recursive call to zpool_iter(), we want to avoid + * refreshing the namespace because that will invalidate the parent + * context. We allow recursive calls, but simply re-use the same + * namespace AVL tree. + */ + if (!hdl->libzfs_pool_iter && namespace_reload(hdl) != 0) return (-1); + hdl->libzfs_pool_iter++; for (cn = uu_avl_first(hdl->libzfs_ns_avl); cn != NULL; cn = uu_avl_next(hdl->libzfs_ns_avl, cn)) { - if (zpool_open_silent(hdl, cn->cn_name, &zhp) != 0) + if (zpool_open_silent(hdl, cn->cn_name, &zhp) != 0) { + hdl->libzfs_pool_iter--; return (-1); + } if (zhp == NULL) continue; - if ((ret = func(zhp, data)) != 0) + if ((ret = func(zhp, data)) != 0) { + hdl->libzfs_pool_iter--; return (ret); + } } + hdl->libzfs_pool_iter--; return (0); } diff --git a/external/cddl/osnet/dist/lib/libzfs/common/libzfs_dataset.c b/external/cddl/osnet/dist/lib/libzfs/common/libzfs_dataset.c index a8005ffc0cf53..bd63372301d53 100644 --- a/external/cddl/osnet/dist/lib/libzfs/common/libzfs_dataset.c +++ b/external/cddl/osnet/dist/lib/libzfs/common/libzfs_dataset.c @@ -20,14 +20,12 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#include #include #include -#include #include #include #include @@ -38,15 +36,17 @@ #include #include #include -#include #include -#include #include #include #include #include #include +#include +#include +#include +#include #include #include #include @@ -56,7 +56,8 @@ #include "libzfs_impl.h" #include "zfs_deleg.h" -static int zvol_create_link_common(libzfs_handle_t *, const char *, int); +static int userquota_propname_decode(const char *propname, boolean_t zoned, + zfs_userquota_prop_t *typep, char *domain, int domainlen, uint64_t *ridp); /* * Given a single type (not a mask of types), return the type in a human @@ -108,7 +109,6 @@ path_to_str(const char *path, int types) return (path_to_str(path, types & ~ZFS_TYPE_SNAPSHOT)); } - /* * The user has requested either filesystems or volumes. * We have no way of knowing a priori what type this would be, so always @@ -123,8 +123,8 @@ path_to_str(const char *path, int types) /* * Validate a ZFS path. 
This is used even before trying to open the dataset, to - * provide a more meaningful error message. We place a more useful message in - * 'buf' detailing exactly why the name was not valid. + * provide a more meaningful error message. We call zfs_error_aux() to + * explain exactly why the name was not valid. */ static int zfs_validate_name(libzfs_handle_t *hdl, const char *path, int type, @@ -319,21 +319,43 @@ zpool_free_handles(libzfs_handle_t *hdl) * Utility function to gather stats (objset and zpl) for the given object. */ static int -get_stats(zfs_handle_t *zhp) +get_stats_ioctl(zfs_handle_t *zhp, zfs_cmd_t *zc) { - zfs_cmd_t zc = { 0 }; libzfs_handle_t *hdl = zhp->zfs_hdl; - nvlist_t *allprops, *userprops; - (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + (void) strlcpy(zc->zc_name, zhp->zfs_name, sizeof (zc->zc_name)); + + while (ioctl(hdl->libzfs_fd, ZFS_IOC_OBJSET_STATS, zc) != 0) { + if (errno == ENOMEM) { + if (zcmd_expand_dst_nvlist(hdl, zc) != 0) { + return (-1); + } + } else { + return (-1); + } + } + return (0); +} + +/* + * Utility function to get the received properties of the given object. + */ +static int +get_recvd_props_ioctl(zfs_handle_t *zhp) +{ + libzfs_handle_t *hdl = zhp->zfs_hdl; + nvlist_t *recvdprops; + zfs_cmd_t zc = { 0 }; + int err; if (zcmd_alloc_dst_nvlist(hdl, &zc, 0) != 0) return (-1); - while (ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_OBJSET_STATS, &zc) != 0) { + (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + + while (ioctl(hdl->libzfs_fd, ZFS_IOC_OBJSET_RECVD_PROPS, &zc) != 0) { if (errno == ENOMEM) { if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) { - zcmd_free_nvlists(&zc); return (-1); } } else { @@ -342,15 +364,32 @@ get_stats(zfs_handle_t *zhp) } } - zhp->zfs_dmustats = zc.zc_objset_stats; /* structure assignment */ + err = zcmd_read_dst_nvlist(zhp->zfs_hdl, &zc, &recvdprops); + zcmd_free_nvlists(&zc); + if (err != 0) + return (-1); - if (zcmd_read_dst_nvlist(hdl, &zc, &allprops) != 0) { - zcmd_free_nvlists(&zc); + nvlist_free(zhp->zfs_recvd_props); + zhp->zfs_recvd_props = recvdprops; + + return (0); +} + +static int +put_stats_zhdl(zfs_handle_t *zhp, zfs_cmd_t *zc) +{ + nvlist_t *allprops, *userprops; + + zhp->zfs_dmustats = zc->zc_objset_stats; /* structure assignment */ + + if (zcmd_read_dst_nvlist(zhp->zfs_hdl, zc, &allprops) != 0) { return (-1); } - zcmd_free_nvlists(&zc); - + /* + * XXX Why do we store the user props separately, in addition to + * storing them in zfs_props? + */ if ((userprops = process_user_props(zhp, allprops)) == NULL) { nvlist_free(allprops); return (-1); @@ -365,6 +404,22 @@ get_stats(zfs_handle_t *zhp) return (0); } +static int +get_stats(zfs_handle_t *zhp) +{ + int rc = 0; + zfs_cmd_t zc = { 0 }; + + if (zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0) != 0) + return (-1); + if (get_stats_ioctl(zhp, &zc) != 0) + rc = -1; + else if (put_stats_zhdl(zhp, &zc) != 0) + rc = -1; + zcmd_free_nvlists(&zc); + return (rc); +} + /* * Refresh the properties currently stored in the handle. */ @@ -378,74 +433,11 @@ zfs_refresh_properties(zfs_handle_t *zhp) * Makes a handle from the given dataset name. Used by zfs_open() and * zfs_iter_* to create child handles on the fly. */ -zfs_handle_t * -make_dataset_handle(libzfs_handle_t *hdl, const char *path) +static int +make_dataset_handle_common(zfs_handle_t *zhp, zfs_cmd_t *zc) { - zfs_handle_t *zhp = calloc(sizeof (zfs_handle_t), 1); - char *logstr; - - if (zhp == NULL) - return (NULL); - - zhp->zfs_hdl = hdl; - - /* - * Preserve history log string. 
- * any changes performed here will be - * logged as an internal event. - */ - logstr = zhp->zfs_hdl->libzfs_log_str; - zhp->zfs_hdl->libzfs_log_str = NULL; -top: - (void) strlcpy(zhp->zfs_name, path, sizeof (zhp->zfs_name)); - - if (get_stats(zhp) != 0) { - zhp->zfs_hdl->libzfs_log_str = logstr; - free(zhp); - return (NULL); - } - - if (zhp->zfs_dmustats.dds_inconsistent) { - zfs_cmd_t zc = { 0 }; - - /* - * If it is dds_inconsistent, then we've caught it in - * the middle of a 'zfs receive' or 'zfs destroy', and - * it is inconsistent from the ZPL's point of view, so - * can't be mounted. However, it could also be that we - * have crashed in the middle of one of those - * operations, in which case we need to get rid of the - * inconsistent state. We do that by either rolling - * back to the previous snapshot (which will fail if - * there is none), or destroying the filesystem. Note - * that if we are still in the middle of an active - * 'receive' or 'destroy', then the rollback and destroy - * will fail with EBUSY and we will drive on as usual. - */ - - (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); - - if (zhp->zfs_dmustats.dds_type == DMU_OST_ZVOL) { - (void) zvol_remove_link(hdl, zhp->zfs_name); - zc.zc_objset_type = DMU_OST_ZVOL; - } else { - zc.zc_objset_type = DMU_OST_ZFS; - } - - /* - * If we can successfully destroy it, pretend that it - * never existed. - */ - if (ioctl(hdl->libzfs_fd, ZFS_IOC_DESTROY, &zc) == 0) { - zhp->zfs_hdl->libzfs_log_str = logstr; - free(zhp); - errno = ENOENT; - return (NULL); - } - /* If we can successfully roll it back, reget the stats */ - if (ioctl(hdl->libzfs_fd, ZFS_IOC_ROLLBACK, &zc) == 0) - goto top; - } + if (put_stats_zhdl(zhp, zc) != 0) + return (-1); /* * We've managed to open the dataset and gather statistics. 
Determine @@ -467,8 +459,53 @@ make_dataset_handle(libzfs_handle_t *hdl, const char *path) else abort(); /* we should never see any other types */ - zhp->zfs_hdl->libzfs_log_str = logstr; zhp->zpool_hdl = zpool_handle(zhp); + return (0); +} + +zfs_handle_t * +make_dataset_handle(libzfs_handle_t *hdl, const char *path) +{ + zfs_cmd_t zc = { 0 }; + + zfs_handle_t *zhp = calloc(sizeof (zfs_handle_t), 1); + + if (zhp == NULL) + return (NULL); + + zhp->zfs_hdl = hdl; + (void) strlcpy(zhp->zfs_name, path, sizeof (zhp->zfs_name)); + if (zcmd_alloc_dst_nvlist(hdl, &zc, 0) != 0) { + free(zhp); + return (NULL); + } + if (get_stats_ioctl(zhp, &zc) == -1) { + zcmd_free_nvlists(&zc); + free(zhp); + return (NULL); + } + if (make_dataset_handle_common(zhp, &zc) == -1) { + free(zhp); + zhp = NULL; + } + zcmd_free_nvlists(&zc); + return (zhp); +} + +static zfs_handle_t * +make_dataset_handle_zc(libzfs_handle_t *hdl, zfs_cmd_t *zc) +{ + zfs_handle_t *zhp = calloc(sizeof (zfs_handle_t), 1); + + if (zhp == NULL) + return (NULL); + + zhp->zfs_hdl = hdl; + (void) strlcpy(zhp->zfs_name, zc->zc_name, sizeof (zhp->zfs_name)); + if (make_dataset_handle_common(zhp, zc) == -1) { + free(zhp); + return (NULL); + } return (zhp); } @@ -524,9 +561,145 @@ zfs_close(zfs_handle_t *zhp) free(zhp->zfs_mntopts); nvlist_free(zhp->zfs_props); nvlist_free(zhp->zfs_user_props); + nvlist_free(zhp->zfs_recvd_props); free(zhp); } +typedef struct mnttab_node { + struct mnttab mtn_mt; + avl_node_t mtn_node; +} mnttab_node_t; + +static int +libzfs_mnttab_cache_compare(const void *arg1, const void *arg2) +{ + const mnttab_node_t *mtn1 = arg1; + const mnttab_node_t *mtn2 = arg2; + int rv; + + rv = strcmp(mtn1->mtn_mt.mnt_special, mtn2->mtn_mt.mnt_special); + + if (rv == 0) + return (0); + return (rv > 0 ? 
1 : -1); +} + +void +libzfs_mnttab_init(libzfs_handle_t *hdl) +{ + assert(avl_numnodes(&hdl->libzfs_mnttab_cache) == 0); + avl_create(&hdl->libzfs_mnttab_cache, libzfs_mnttab_cache_compare, + sizeof (mnttab_node_t), offsetof(mnttab_node_t, mtn_node)); +} + +void +libzfs_mnttab_update(libzfs_handle_t *hdl) +{ + struct mnttab entry; + + rewind(hdl->libzfs_mnttab); + while (getmntent(hdl->libzfs_mnttab, &entry) == 0) { + mnttab_node_t *mtn; + + if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) + continue; + mtn = zfs_alloc(hdl, sizeof (mnttab_node_t)); + mtn->mtn_mt.mnt_special = zfs_strdup(hdl, entry.mnt_special); + mtn->mtn_mt.mnt_mountp = zfs_strdup(hdl, entry.mnt_mountp); + mtn->mtn_mt.mnt_fstype = zfs_strdup(hdl, entry.mnt_fstype); + mtn->mtn_mt.mnt_mntopts = zfs_strdup(hdl, entry.mnt_mntopts); + avl_add(&hdl->libzfs_mnttab_cache, mtn); + } +} + +void +libzfs_mnttab_fini(libzfs_handle_t *hdl) +{ + void *cookie = NULL; + mnttab_node_t *mtn; + + while (mtn = avl_destroy_nodes(&hdl->libzfs_mnttab_cache, &cookie)) { + free(mtn->mtn_mt.mnt_special); + free(mtn->mtn_mt.mnt_mountp); + free(mtn->mtn_mt.mnt_fstype); + free(mtn->mtn_mt.mnt_mntopts); + free(mtn); + } + avl_destroy(&hdl->libzfs_mnttab_cache); +} + +void +libzfs_mnttab_cache(libzfs_handle_t *hdl, boolean_t enable) +{ + hdl->libzfs_mnttab_enable = enable; +} + +int +libzfs_mnttab_find(libzfs_handle_t *hdl, const char *fsname, + struct mnttab *entry) +{ + mnttab_node_t find; + mnttab_node_t *mtn; + + if (!hdl->libzfs_mnttab_enable) { + struct mnttab srch = { 0 }; + + if (avl_numnodes(&hdl->libzfs_mnttab_cache)) + libzfs_mnttab_fini(hdl); + rewind(hdl->libzfs_mnttab); + srch.mnt_special = (char *)fsname; + srch.mnt_fstype = MNTTYPE_ZFS; + if (getmntany(hdl->libzfs_mnttab, entry, &srch) == 0) + return (0); + else + return (ENOENT); + } + + if (avl_numnodes(&hdl->libzfs_mnttab_cache) == 0) + libzfs_mnttab_update(hdl); + + find.mtn_mt.mnt_special = (char *)fsname; + mtn = avl_find(&hdl->libzfs_mnttab_cache, &find, NULL); + if (mtn) { + *entry = mtn->mtn_mt; + return (0); + } + return (ENOENT); +} + +void +libzfs_mnttab_add(libzfs_handle_t *hdl, const char *special, + const char *mountp, const char *mntopts) +{ + mnttab_node_t *mtn; + + if (avl_numnodes(&hdl->libzfs_mnttab_cache) == 0) + return; + mtn = zfs_alloc(hdl, sizeof (mnttab_node_t)); + mtn->mtn_mt.mnt_special = zfs_strdup(hdl, special); + mtn->mtn_mt.mnt_mountp = zfs_strdup(hdl, mountp); + mtn->mtn_mt.mnt_fstype = zfs_strdup(hdl, MNTTYPE_ZFS); + mtn->mtn_mt.mnt_mntopts = zfs_strdup(hdl, mntopts); + avl_add(&hdl->libzfs_mnttab_cache, mtn); +} + +void +libzfs_mnttab_remove(libzfs_handle_t *hdl, const char *fsname) +{ + mnttab_node_t find; + mnttab_node_t *ret; + + find.mtn_mt.mnt_special = (char *)fsname; + if (ret = avl_find(&hdl->libzfs_mnttab_cache, (void *)&find, NULL)) { + avl_remove(&hdl->libzfs_mnttab_cache, ret); + free(ret->mtn_mt.mnt_special); + free(ret->mtn_mt.mnt_mountp); + free(ret->mtn_mt.mnt_fstype); + free(ret->mtn_mt.mnt_mntopts); + free(ret); + } +} + int zfs_spa_version(zfs_handle_t *zhp, int *spa_version) { @@ -581,23 +754,18 @@ zfs_valid_proplist(libzfs_handle_t *hdl, zfs_type_t type, nvlist_t *nvl, return (NULL); } + /* + * Make sure this property is valid and applies to this type. + */ + elem = NULL; while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) { const char *propname = nvpair_name(elem); - /* - * Make sure this property is valid and applies to this type. 
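As a usage sketch (not part of this change), the libzfs_mnttab_*() cache above lets bulk operations avoid rescanning /etc/mnttab for every dataset; the cache is enabled explicitly and filled lazily on the first lookup. The dataset name below is illustrative:

#include <sys/mnttab.h>
#include <libzfs.h>

/* Illustrative only: check whether "tank/home" is mounted via the cache. */
static boolean_t
is_mounted_cached(libzfs_handle_t *hdl)
{
	struct mnttab entry;
	boolean_t mounted;

	libzfs_mnttab_cache(hdl, B_TRUE);	/* use the AVL cache for lookups */
	mounted = (libzfs_mnttab_find(hdl, "tank/home", &entry) == 0);
	libzfs_mnttab_cache(hdl, B_FALSE);	/* revert to per-call mnttab scans */

	return (mounted);
}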
- */ - if ((prop = zfs_name_to_prop(propname)) == ZPROP_INVAL) { - if (!zfs_prop_user(propname)) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "invalid property '%s'"), propname); - (void) zfs_error(hdl, EZFS_BADPROP, errbuf); - goto error; - } - + prop = zfs_name_to_prop(propname); + if (prop == ZPROP_INVAL && zfs_prop_user(propname)) { /* - * If this is a user property, make sure it's a + * This is a user property: make sure it's a * string, and that it's less than ZAP_MAXNAMELEN. */ if (nvpair_type(elem) != DATA_TYPE_STRING) { @@ -623,6 +791,10 @@ zfs_valid_proplist(libzfs_handle_t *hdl, zfs_type_t type, nvlist_t *nvl, continue; } + /* + * Currently, only user properties can be modified on + * snapshots. + */ if (type == ZFS_TYPE_SNAPSHOT) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "this property can not be modified for snapshots")); @@ -630,6 +802,85 @@ zfs_valid_proplist(libzfs_handle_t *hdl, zfs_type_t type, nvlist_t *nvl, goto error; } + if (prop == ZPROP_INVAL && zfs_prop_userquota(propname)) { + zfs_userquota_prop_t uqtype; + char newpropname[128]; + char domain[128]; + uint64_t rid; + uint64_t valary[3]; + + if (userquota_propname_decode(propname, zoned, + &uqtype, domain, sizeof (domain), &rid) != 0) { + zfs_error_aux(hdl, + dgettext(TEXT_DOMAIN, + "'%s' has an invalid user/group name"), + propname); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + + if (uqtype != ZFS_PROP_USERQUOTA && + uqtype != ZFS_PROP_GROUPQUOTA) { + zfs_error_aux(hdl, + dgettext(TEXT_DOMAIN, "'%s' is readonly"), + propname); + (void) zfs_error(hdl, EZFS_PROPREADONLY, + errbuf); + goto error; + } + + if (nvpair_type(elem) == DATA_TYPE_STRING) { + (void) nvpair_value_string(elem, &strval); + if (strcmp(strval, "none") == 0) { + intval = 0; + } else if (zfs_nicestrtonum(hdl, + strval, &intval) != 0) { + (void) zfs_error(hdl, + EZFS_BADPROP, errbuf); + goto error; + } + } else if (nvpair_type(elem) == + DATA_TYPE_UINT64) { + (void) nvpair_value_uint64(elem, &intval); + if (intval == 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "use 'none' to disable " + "userquota/groupquota")); + goto error; + } + } else { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "'%s' must be a number"), propname); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + + /* + * Encode the prop name as + * userquota@-domain, to make it easy + * for the kernel to decode. + */ + (void) snprintf(newpropname, sizeof (newpropname), + "%s%llx-%s", zfs_userquota_prop_prefixes[uqtype], + (longlong_t)rid, domain); + valary[0] = uqtype; + valary[1] = rid; + valary[2] = intval; + if (nvlist_add_uint64_array(ret, newpropname, + valary, 3) != 0) { + (void) no_memory(hdl); + goto error; + } + continue; + } + + if (prop == ZPROP_INVAL) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "invalid property '%s'"), propname); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + if (!zfs_prop_valid_for_type(prop, type)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' does not " @@ -700,6 +951,60 @@ zfs_valid_proplist(libzfs_handle_t *hdl, zfs_type_t type, nvlist_t *nvl, break; + case ZFS_PROP_MLSLABEL: + { + /* + * Verify the mlslabel string and convert to + * internal hex label string. + */ + + m_label_t *new_sl; + char *hex = NULL; /* internal label string */ + + /* Default value is already OK. 
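As a usage sketch (not part of this change), the userquota handling above lets a {user,group}quota property pass through the normal property-setting path and be read back with the new accessor; assuming zfs_prop_set() accepts these names as the validation code suggests, and with an illustrative user and size:

#include <libzfs.h>

/* Illustrative only: give user "alice" a 10G quota and read it back. */
static int
set_alice_quota(zfs_handle_t *zhp)
{
	char buf[64];

	if (zfs_prop_set(zhp, "userquota@alice", "10G") != 0)
		return (-1);

	/* Human-readable form; literal == B_FALSE. */
	return (zfs_prop_get_userquota(zhp, "userquota@alice",
	    buf, sizeof (buf), B_FALSE));
}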
*/ + if (strcasecmp(strval, ZFS_MLSLABEL_DEFAULT) == 0) + break; + + /* Verify the label can be converted to binary form */ + if (((new_sl = m_label_alloc(MAC_LABEL)) == NULL) || + (str_to_label(strval, &new_sl, MAC_LABEL, + L_NO_CORRECTION, NULL) == -1)) { + goto badlabel; + } + + /* Now translate to hex internal label string */ + if (label_to_str(new_sl, &hex, M_INTERNAL, + DEF_NAMES) != 0) { + if (hex) + free(hex); + goto badlabel; + } + m_label_free(new_sl); + + /* If string is already in internal form, we're done. */ + if (strcmp(strval, hex) == 0) { + free(hex); + break; + } + + /* Replace the label string with the internal form. */ + (void) nvlist_remove(ret, zfs_prop_to_name(prop), + DATA_TYPE_STRING); + verify(nvlist_add_string(ret, zfs_prop_to_name(prop), + hex) == 0); + free(hex); + + break; + +badlabel: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "invalid mlslabel '%s'"), strval); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + m_label_free(new_sl); /* OK if null */ + goto error; + + } + case ZFS_PROP_MOUNTPOINT: { namecheck_err_t why; @@ -769,7 +1074,7 @@ zfs_valid_proplist(libzfs_handle_t *hdl, zfs_type_t type, nvlist_t *nvl, } else if (getzoneid() != GLOBAL_ZONEID) { /* * If zoned property is 'off', this must be in - * a globle zone. If not, something is wrong. + * a global zone. If not, something is wrong. */ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' cannot be set while dataset " @@ -953,808 +1258,82 @@ zfs_valid_proplist(libzfs_handle_t *hdl, zfs_type_t type, nvlist_t *nvl, return (NULL); } -static int -zfs_get_perm_who(const char *who, zfs_deleg_who_type_t *who_type, - uint64_t *ret_who) +void +zfs_setprop_error(libzfs_handle_t *hdl, zfs_prop_t prop, int err, + char *errbuf) { - struct passwd *pwd; - struct group *grp; - uid_t id; - - if (*who_type == ZFS_DELEG_EVERYONE || *who_type == ZFS_DELEG_CREATE || - *who_type == ZFS_DELEG_NAMED_SET) { - *ret_who = -1; - return (0); - } - if (who == NULL && !(*who_type == ZFS_DELEG_EVERYONE)) - return (EZFS_BADWHO); - - if (*who_type == ZFS_DELEG_WHO_UNKNOWN && - strcmp(who, "everyone") == 0) { - *ret_who = -1; - *who_type = ZFS_DELEG_EVERYONE; - return (0); - } - - pwd = getpwnam(who); - grp = getgrnam(who); - - if ((*who_type == ZFS_DELEG_USER) && pwd) { - *ret_who = pwd->pw_uid; - } else if ((*who_type == ZFS_DELEG_GROUP) && grp) { - *ret_who = grp->gr_gid; - } else if (pwd) { - *ret_who = pwd->pw_uid; - *who_type = ZFS_DELEG_USER; - } else if (grp) { - *ret_who = grp->gr_gid; - *who_type = ZFS_DELEG_GROUP; - } else { - char *end; - - id = strtol(who, &end, 10); - if (errno != 0 || *end != '\0') { - return (EZFS_BADWHO); - } else { - *ret_who = id; - if (*who_type == ZFS_DELEG_WHO_UNKNOWN) - *who_type = ZFS_DELEG_USER; - } - } - - return (0); -} - -static void -zfs_perms_add_to_nvlist(nvlist_t *who_nvp, char *name, nvlist_t *perms_nvp) -{ - if (perms_nvp != NULL) { - verify(nvlist_add_nvlist(who_nvp, - name, perms_nvp) == 0); - } else { - verify(nvlist_add_boolean(who_nvp, name) == 0); - } -} - -static void -helper(zfs_deleg_who_type_t who_type, uint64_t whoid, char *whostr, - zfs_deleg_inherit_t inherit, nvlist_t *who_nvp, nvlist_t *perms_nvp, - nvlist_t *sets_nvp) -{ - boolean_t do_perms, do_sets; - char name[ZFS_MAX_DELEG_NAME]; - - do_perms = (nvlist_next_nvpair(perms_nvp, NULL) != NULL); - do_sets = (nvlist_next_nvpair(sets_nvp, NULL) != NULL); - - if (!do_perms && !do_sets) - do_perms = do_sets = B_TRUE; - - if (do_perms) { - zfs_deleg_whokey(name, who_type, inherit, - (who_type == ZFS_DELEG_NAMED_SET) ? 
- whostr : (void *)&whoid); - zfs_perms_add_to_nvlist(who_nvp, name, perms_nvp); - } - if (do_sets) { - zfs_deleg_whokey(name, toupper(who_type), inherit, - (who_type == ZFS_DELEG_NAMED_SET) ? - whostr : (void *)&whoid); - zfs_perms_add_to_nvlist(who_nvp, name, sets_nvp); - } -} - -static void -zfs_perms_add_who_nvlist(nvlist_t *who_nvp, uint64_t whoid, void *whostr, - nvlist_t *perms_nvp, nvlist_t *sets_nvp, - zfs_deleg_who_type_t who_type, zfs_deleg_inherit_t inherit) -{ - if (who_type == ZFS_DELEG_NAMED_SET || who_type == ZFS_DELEG_CREATE) { - helper(who_type, whoid, whostr, 0, - who_nvp, perms_nvp, sets_nvp); - } else { - if (inherit & ZFS_DELEG_PERM_LOCAL) { - helper(who_type, whoid, whostr, ZFS_DELEG_LOCAL, - who_nvp, perms_nvp, sets_nvp); - } - if (inherit & ZFS_DELEG_PERM_DESCENDENT) { - helper(who_type, whoid, whostr, ZFS_DELEG_DESCENDENT, - who_nvp, perms_nvp, sets_nvp); - } - } -} - -/* - * Construct nvlist to pass down to kernel for setting/removing permissions. - * - * The nvlist is constructed as a series of nvpairs with an optional embedded - * nvlist of permissions to remove or set. The topmost nvpairs are the actual - * base attribute named stored in the dsl. - * Arguments: - * - * whostr: is a comma separated list of users, groups, or a single set name. - * whostr may be null for everyone or create perms. - * who_type: is the type of entry in whostr. Typically this will be - * ZFS_DELEG_WHO_UNKNOWN. - * perms: common separated list of permissions. May be null if user - * is requested to remove permissions by who. - * inherit: Specifies the inheritance of the permissions. Will be either - * ZFS_DELEG_PERM_LOCAL and/or ZFS_DELEG_PERM_DESCENDENT. - * nvp The constructed nvlist to pass to zfs_perm_set(). - * The output nvp will look something like this. - * ul$1234 -> {create ; destroy } - * Ul$1234 -> { @myset } - * s-$@myset - { snapshot; checksum; compression } - */ -int -zfs_build_perms(zfs_handle_t *zhp, char *whostr, char *perms, - zfs_deleg_who_type_t who_type, zfs_deleg_inherit_t inherit, nvlist_t **nvp) -{ - nvlist_t *who_nvp; - nvlist_t *perms_nvp = NULL; - nvlist_t *sets_nvp = NULL; - char errbuf[1024]; - char *who_tok, *perm; - int error; - - *nvp = NULL; - - if (perms) { - if ((error = nvlist_alloc(&perms_nvp, - NV_UNIQUE_NAME, 0)) != 0) { - return (1); - } - if ((error = nvlist_alloc(&sets_nvp, - NV_UNIQUE_NAME, 0)) != 0) { - nvlist_free(perms_nvp); - return (1); - } - } - - if ((error = nvlist_alloc(&who_nvp, NV_UNIQUE_NAME, 0)) != 0) { - if (perms_nvp) - nvlist_free(perms_nvp); - if (sets_nvp) - nvlist_free(sets_nvp); - return (1); - } - - if (who_type == ZFS_DELEG_NAMED_SET) { - namecheck_err_t why; - char what; - - if ((error = permset_namecheck(whostr, &why, &what)) != 0) { - nvlist_free(who_nvp); - if (perms_nvp) - nvlist_free(perms_nvp); - if (sets_nvp) - nvlist_free(sets_nvp); - - switch (why) { - case NAME_ERR_NO_AT: - zfs_error_aux(zhp->zfs_hdl, - dgettext(TEXT_DOMAIN, - "set definition must begin with an '@' " - "character")); - } - return (zfs_error(zhp->zfs_hdl, - EZFS_BADPERMSET, whostr)); - } - } - - /* - * Build up nvlist(s) of permissions. Two nvlists are maintained. - * The first nvlist perms_nvp will have normal permissions and the - * other sets_nvp will have only permssion set names in it. 
- */ - for (perm = strtok(perms, ","); perm; perm = strtok(NULL, ",")) { - const char *perm_canonical = zfs_deleg_canonicalize_perm(perm); - - if (perm_canonical) { - verify(nvlist_add_boolean(perms_nvp, - perm_canonical) == 0); - } else if (perm[0] == '@') { - verify(nvlist_add_boolean(sets_nvp, perm) == 0); - } else { - nvlist_free(who_nvp); - nvlist_free(perms_nvp); - nvlist_free(sets_nvp); - return (zfs_error(zhp->zfs_hdl, EZFS_BADPERM, perm)); - } - } - - if (whostr && who_type != ZFS_DELEG_CREATE) { - who_tok = strtok(whostr, ","); - if (who_tok == NULL) { - nvlist_free(who_nvp); - if (perms_nvp) - nvlist_free(perms_nvp); - if (sets_nvp) - nvlist_free(sets_nvp); - (void) snprintf(errbuf, sizeof (errbuf), - dgettext(TEXT_DOMAIN, "Who string is NULL"), - whostr); - return (zfs_error(zhp->zfs_hdl, EZFS_BADWHO, errbuf)); - } - } - - /* - * Now create the nvlist(s) - */ - do { - uint64_t who_id; - - error = zfs_get_perm_who(who_tok, &who_type, - &who_id); - if (error) { - nvlist_free(who_nvp); - if (perms_nvp) - nvlist_free(perms_nvp); - if (sets_nvp) - nvlist_free(sets_nvp); - (void) snprintf(errbuf, sizeof (errbuf), - dgettext(TEXT_DOMAIN, - "Unable to determine uid/gid for " - "%s "), who_tok); - return (zfs_error(zhp->zfs_hdl, EZFS_BADWHO, errbuf)); - } + switch (err) { + case ENOSPC: /* - * add entries for both local and descendent when required + * For quotas and reservations, ENOSPC indicates + * something different; setting a quota or reservation + * doesn't use any disk space. */ - zfs_perms_add_who_nvlist(who_nvp, who_id, who_tok, - perms_nvp, sets_nvp, who_type, inherit); - - } while (who_tok = strtok(NULL, ",")); - *nvp = who_nvp; - return (0); -} - -static int -zfs_perm_set_common(zfs_handle_t *zhp, nvlist_t *nvp, boolean_t unset) -{ - zfs_cmd_t zc = { 0 }; - int error; - char errbuf[1024]; - - (void) snprintf(errbuf, sizeof (errbuf), - dgettext(TEXT_DOMAIN, "Cannot update 'allows' for '%s'"), - zhp->zfs_name); - - if (zcmd_write_src_nvlist(zhp->zfs_hdl, &zc, nvp)) - return (-1); - - (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); - zc.zc_perm_action = unset; - - error = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_SET_FSACL, &zc); - if (error && errno == ENOTSUP) { - (void) snprintf(errbuf, sizeof (errbuf), - gettext("Pool must be upgraded to use 'allow/unallow'")); - zcmd_free_nvlists(&zc); - return (zfs_error(zhp->zfs_hdl, EZFS_BADVERSION, errbuf)); - } else if (error) { - return (zfs_standard_error(zhp->zfs_hdl, errno, errbuf)); - } - zcmd_free_nvlists(&zc); - - return (error); -} - -int -zfs_perm_set(zfs_handle_t *zhp, nvlist_t *nvp) -{ - return (zfs_perm_set_common(zhp, nvp, B_FALSE)); -} - -int -zfs_perm_remove(zfs_handle_t *zhp, nvlist_t *perms) -{ - return (zfs_perm_set_common(zhp, perms, B_TRUE)); -} - -static int -perm_compare(const void *arg1, const void *arg2) -{ - const zfs_perm_node_t *node1 = arg1; - const zfs_perm_node_t *node2 = arg2; - int ret; - - ret = strcmp(node1->z_pname, node2->z_pname); - - if (ret > 0) - return (1); - if (ret < 0) - return (-1); - else - return (0); -} - -static void -zfs_destroy_perm_tree(avl_tree_t *tree) -{ - zfs_perm_node_t *permnode; - void *cookie = NULL; - - while ((permnode = avl_destroy_nodes(tree, &cookie)) != NULL) - free(permnode); - avl_destroy(tree); -} - -static void -zfs_destroy_tree(avl_tree_t *tree) -{ - zfs_allow_node_t *allownode; - void *cookie = NULL; - - while ((allownode = avl_destroy_nodes(tree, &cookie)) != NULL) { - zfs_destroy_perm_tree(&allownode->z_localdescend); - 
zfs_destroy_perm_tree(&allownode->z_local); - zfs_destroy_perm_tree(&allownode->z_descend); - free(allownode); - } - avl_destroy(tree); -} - -void -zfs_free_allows(zfs_allow_t *allow) -{ - zfs_allow_t *allownext; - zfs_allow_t *freeallow; - - allownext = allow; - while (allownext) { - zfs_destroy_tree(&allownext->z_sets); - zfs_destroy_tree(&allownext->z_crperms); - zfs_destroy_tree(&allownext->z_user); - zfs_destroy_tree(&allownext->z_group); - zfs_destroy_tree(&allownext->z_everyone); - freeallow = allownext; - allownext = allownext->z_next; - free(freeallow); - } -} - -static zfs_allow_t * -zfs_alloc_perm_tree(zfs_handle_t *zhp, zfs_allow_t *prev, char *setpoint) -{ - zfs_allow_t *ptree; - - if ((ptree = zfs_alloc(zhp->zfs_hdl, - sizeof (zfs_allow_t))) == NULL) { - return (NULL); - } - - (void) strlcpy(ptree->z_setpoint, setpoint, sizeof (ptree->z_setpoint)); - avl_create(&ptree->z_sets, - perm_compare, sizeof (zfs_allow_node_t), - offsetof(zfs_allow_node_t, z_node)); - avl_create(&ptree->z_crperms, - perm_compare, sizeof (zfs_allow_node_t), - offsetof(zfs_allow_node_t, z_node)); - avl_create(&ptree->z_user, - perm_compare, sizeof (zfs_allow_node_t), - offsetof(zfs_allow_node_t, z_node)); - avl_create(&ptree->z_group, - perm_compare, sizeof (zfs_allow_node_t), - offsetof(zfs_allow_node_t, z_node)); - avl_create(&ptree->z_everyone, - perm_compare, sizeof (zfs_allow_node_t), - offsetof(zfs_allow_node_t, z_node)); - - if (prev) - prev->z_next = ptree; - ptree->z_next = NULL; - return (ptree); -} + switch (prop) { + case ZFS_PROP_QUOTA: + case ZFS_PROP_REFQUOTA: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "size is less than current used or " + "reserved space")); + (void) zfs_error(hdl, EZFS_PROPSPACE, errbuf); + break; -/* - * Add permissions to the appropriate AVL permission tree. - * The appropriate tree may not be the requested tree. - * For example if ld indicates a local permission, but - * same permission also exists as a descendent permission - * then the permission will be removed from the descendent - * tree and add the the local+descendent tree. - */ -static int -zfs_coalesce_perm(zfs_handle_t *zhp, zfs_allow_node_t *allownode, - char *perm, char ld) -{ - zfs_perm_node_t pnode, *permnode, *permnode2; - zfs_perm_node_t *newnode; - avl_index_t where, where2; - avl_tree_t *tree, *altree; - - (void) strlcpy(pnode.z_pname, perm, sizeof (pnode.z_pname)); - - if (ld == ZFS_DELEG_NA) { - tree = &allownode->z_localdescend; - altree = &allownode->z_descend; - } else if (ld == ZFS_DELEG_LOCAL) { - tree = &allownode->z_local; - altree = &allownode->z_descend; - } else { - tree = &allownode->z_descend; - altree = &allownode->z_local; - } - permnode = avl_find(tree, &pnode, &where); - permnode2 = avl_find(altree, &pnode, &where2); - - if (permnode2) { - avl_remove(altree, permnode2); - free(permnode2); - if (permnode == NULL) { - tree = &allownode->z_localdescend; - } - } + case ZFS_PROP_RESERVATION: + case ZFS_PROP_REFRESERVATION: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "size is greater than available space")); + (void) zfs_error(hdl, EZFS_PROPSPACE, errbuf); + break; - /* - * Now insert new permission in either requested location - * local/descendent or into ld when perm will exist in both. 
- */ - if (permnode == NULL) { - if ((newnode = zfs_alloc(zhp->zfs_hdl, - sizeof (zfs_perm_node_t))) == NULL) { - return (-1); + default: + (void) zfs_standard_error(hdl, err, errbuf); + break; } - *newnode = pnode; - avl_add(tree, newnode); - } - return (0); -} + break; -/* - * Uggh, this is going to be a bit complicated. - * we have an nvlist coming out of the kernel that - * will indicate where the permission is set and then - * it will contain allow of the various "who's", and what - * their permissions are. To further complicate this - * we will then have to coalesce the local,descendent - * and local+descendent permissions where appropriate. - * The kernel only knows about a permission as being local - * or descendent, but not both. - * - * In order to make this easier for zfs_main to deal with - * a series of AVL trees will be used to maintain - * all of this, primarily for sorting purposes as well - * as the ability to quickly locate a specific entry. - * - * What we end up with are tree's for sets, create perms, - * user, groups and everyone. With each of those trees - * we have subtrees for local, descendent and local+descendent - * permissions. - */ -int -zfs_perm_get(zfs_handle_t *zhp, zfs_allow_t **zfs_perms) -{ - zfs_cmd_t zc = { 0 }; - int error; - nvlist_t *nvlist; - nvlist_t *permnv, *sourcenv; - nvpair_t *who_pair, *source_pair; - nvpair_t *perm_pair; - char errbuf[1024]; - zfs_allow_t *zallowp, *newallowp; - char ld; - char *nvpname; - uid_t uid; - gid_t gid; - avl_tree_t *tree; - avl_index_t where; + case EBUSY: + (void) zfs_standard_error(hdl, EBUSY, errbuf); + break; - (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + case EROFS: + (void) zfs_error(hdl, EZFS_DSREADONLY, errbuf); + break; - if (zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0) != 0) - return (-1); + case ENOTSUP: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "pool and or dataset must be upgraded to set this " + "property or value")); + (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); + break; - while (ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_GET_FSACL, &zc) != 0) { - if (errno == ENOMEM) { - if (zcmd_expand_dst_nvlist(zhp->zfs_hdl, &zc) != 0) { - zcmd_free_nvlists(&zc); - return (-1); - } - } else if (errno == ENOTSUP) { - zcmd_free_nvlists(&zc); - (void) snprintf(errbuf, sizeof (errbuf), - gettext("Pool must be upgraded to use 'allow'")); - return (zfs_error(zhp->zfs_hdl, - EZFS_BADVERSION, errbuf)); + case ERANGE: + if (prop == ZFS_PROP_COMPRESSION) { + (void) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "property setting is not allowed on " + "bootable datasets")); + (void) zfs_error(hdl, EZFS_NOTSUP, errbuf); } else { - zcmd_free_nvlists(&zc); - return (-1); + (void) zfs_standard_error(hdl, err, errbuf); } - } - - if (zcmd_read_dst_nvlist(zhp->zfs_hdl, &zc, &nvlist) != 0) { - zcmd_free_nvlists(&zc); - return (-1); - } - - zcmd_free_nvlists(&zc); - - source_pair = nvlist_next_nvpair(nvlist, NULL); - - if (source_pair == NULL) { - *zfs_perms = NULL; - return (0); - } - - *zfs_perms = zfs_alloc_perm_tree(zhp, NULL, nvpair_name(source_pair)); - if (*zfs_perms == NULL) { - return (0); - } - - zallowp = *zfs_perms; - - for (;;) { - struct passwd *pwd; - struct group *grp; - zfs_allow_node_t *allownode; - zfs_allow_node_t findallownode; - zfs_allow_node_t *newallownode; - - (void) strlcpy(zallowp->z_setpoint, - nvpair_name(source_pair), - sizeof (zallowp->z_setpoint)); - - if ((error = nvpair_value_nvlist(source_pair, &sourcenv)) != 0) - goto abort; + break; + case EOVERFLOW: /* - * Make sure nvlist 
is composed correctly + * This platform can't address a volume this big. */ - if (zfs_deleg_verify_nvlist(sourcenv)) { - goto abort; - } - - who_pair = nvlist_next_nvpair(sourcenv, NULL); - if (who_pair == NULL) { - goto abort; - } - - do { - error = nvpair_value_nvlist(who_pair, &permnv); - if (error) { - goto abort; - } - - /* - * First build up the key to use - * for looking up in the various - * who trees. - */ - ld = nvpair_name(who_pair)[1]; - nvpname = nvpair_name(who_pair); - switch (nvpair_name(who_pair)[0]) { - case ZFS_DELEG_USER: - case ZFS_DELEG_USER_SETS: - tree = &zallowp->z_user; - uid = atol(&nvpname[3]); - pwd = getpwuid(uid); - (void) snprintf(findallownode.z_key, - sizeof (findallownode.z_key), "user %s", - (pwd) ? pwd->pw_name : - &nvpair_name(who_pair)[3]); - break; - case ZFS_DELEG_GROUP: - case ZFS_DELEG_GROUP_SETS: - tree = &zallowp->z_group; - gid = atol(&nvpname[3]); - grp = getgrgid(gid); - (void) snprintf(findallownode.z_key, - sizeof (findallownode.z_key), "group %s", - (grp) ? grp->gr_name : - &nvpair_name(who_pair)[3]); - break; - case ZFS_DELEG_CREATE: - case ZFS_DELEG_CREATE_SETS: - tree = &zallowp->z_crperms; - (void) strlcpy(findallownode.z_key, "", - sizeof (findallownode.z_key)); - break; - case ZFS_DELEG_EVERYONE: - case ZFS_DELEG_EVERYONE_SETS: - (void) snprintf(findallownode.z_key, - sizeof (findallownode.z_key), "everyone"); - tree = &zallowp->z_everyone; - break; - case ZFS_DELEG_NAMED_SET: - case ZFS_DELEG_NAMED_SET_SETS: - (void) snprintf(findallownode.z_key, - sizeof (findallownode.z_key), "%s", - &nvpair_name(who_pair)[3]); - tree = &zallowp->z_sets; - break; - } - - /* - * Place who in tree - */ - allownode = avl_find(tree, &findallownode, &where); - if (allownode == NULL) { - if ((newallownode = zfs_alloc(zhp->zfs_hdl, - sizeof (zfs_allow_node_t))) == NULL) { - goto abort; - } - avl_create(&newallownode->z_localdescend, - perm_compare, - sizeof (zfs_perm_node_t), - offsetof(zfs_perm_node_t, z_node)); - avl_create(&newallownode->z_local, - perm_compare, - sizeof (zfs_perm_node_t), - offsetof(zfs_perm_node_t, z_node)); - avl_create(&newallownode->z_descend, - perm_compare, - sizeof (zfs_perm_node_t), - offsetof(zfs_perm_node_t, z_node)); - (void) strlcpy(newallownode->z_key, - findallownode.z_key, - sizeof (findallownode.z_key)); - avl_insert(tree, newallownode, where); - allownode = newallownode; - } - - /* - * Now iterate over the permissions and - * place them in the appropriate local, - * descendent or local+descendent tree. - * - * The permissions are added to the tree - * via zfs_coalesce_perm(). 
- */ - perm_pair = nvlist_next_nvpair(permnv, NULL); - if (perm_pair == NULL) - goto abort; - do { - if (zfs_coalesce_perm(zhp, allownode, - nvpair_name(perm_pair), ld) != 0) - goto abort; - } while (perm_pair = nvlist_next_nvpair(permnv, - perm_pair)); - } while (who_pair = nvlist_next_nvpair(sourcenv, who_pair)); - - source_pair = nvlist_next_nvpair(nvlist, source_pair); - if (source_pair == NULL) +#ifdef _ILP32 + if (prop == ZFS_PROP_VOLSIZE) { + (void) zfs_error(hdl, EZFS_VOLTOOBIG, errbuf); break; - - /* - * allocate another node from the link list of - * zfs_allow_t structures - */ - newallowp = zfs_alloc_perm_tree(zhp, zallowp, - nvpair_name(source_pair)); - if (newallowp == NULL) { - goto abort; } - zallowp = newallowp; - } - nvlist_free(nvlist); - return (0); -abort: - zfs_free_allows(*zfs_perms); - nvlist_free(nvlist); - return (-1); -} - -static char * -zfs_deleg_perm_note(zfs_deleg_note_t note) -{ - /* - * Don't put newlines on end of lines - */ - switch (note) { - case ZFS_DELEG_NOTE_CREATE: - return (dgettext(TEXT_DOMAIN, - "Must also have the 'mount' ability")); - case ZFS_DELEG_NOTE_DESTROY: - return (dgettext(TEXT_DOMAIN, - "Must also have the 'mount' ability")); - case ZFS_DELEG_NOTE_SNAPSHOT: - return (dgettext(TEXT_DOMAIN, - "Must also have the 'mount' ability")); - case ZFS_DELEG_NOTE_ROLLBACK: - return (dgettext(TEXT_DOMAIN, - "Must also have the 'mount' ability")); - case ZFS_DELEG_NOTE_CLONE: - return (dgettext(TEXT_DOMAIN, "Must also have the 'create' " - "ability and 'mount'\n" - "\t\t\t\tability in the origin file system")); - case ZFS_DELEG_NOTE_PROMOTE: - return (dgettext(TEXT_DOMAIN, "Must also have the 'mount'\n" - "\t\t\t\tand 'promote' ability in the origin file system")); - case ZFS_DELEG_NOTE_RENAME: - return (dgettext(TEXT_DOMAIN, "Must also have the 'mount' " - "and 'create' \n\t\t\t\tability in the new parent")); - case ZFS_DELEG_NOTE_RECEIVE: - return (dgettext(TEXT_DOMAIN, "Must also have the 'mount'" - " and 'create' ability")); - case ZFS_DELEG_NOTE_USERPROP: - return (dgettext(TEXT_DOMAIN, - "Allows changing any user property")); - case ZFS_DELEG_NOTE_ALLOW: - return (dgettext(TEXT_DOMAIN, - "Must also have the permission that is being\n" - "\t\t\t\tallowed")); - case ZFS_DELEG_NOTE_MOUNT: - return (dgettext(TEXT_DOMAIN, - "Allows mount/umount of ZFS datasets")); - case ZFS_DELEG_NOTE_SHARE: - return (dgettext(TEXT_DOMAIN, - "Allows sharing file systems over NFS or SMB\n" - "\t\t\t\tprotocols")); - case ZFS_DELEG_NOTE_NONE: +#endif + /* FALLTHROUGH */ default: - return (dgettext(TEXT_DOMAIN, "")); + (void) zfs_standard_error(hdl, err, errbuf); } } -typedef enum { - ZFS_DELEG_SUBCOMMAND, - ZFS_DELEG_PROP, - ZFS_DELEG_OTHER -} zfs_deleg_perm_type_t; - -/* - * is the permission a subcommand or other? 
- */ -zfs_deleg_perm_type_t -zfs_deleg_perm_type(const char *perm) -{ - if (strcmp(perm, "userprop") == 0) - return (ZFS_DELEG_OTHER); - else - return (ZFS_DELEG_SUBCOMMAND); -} - -static char * -zfs_deleg_perm_type_str(zfs_deleg_perm_type_t type) -{ - switch (type) { - case ZFS_DELEG_SUBCOMMAND: - return (dgettext(TEXT_DOMAIN, "subcommand")); - case ZFS_DELEG_PROP: - return (dgettext(TEXT_DOMAIN, "property")); - case ZFS_DELEG_OTHER: - return (dgettext(TEXT_DOMAIN, "other")); - } - return (""); -} - -/*ARGSUSED*/ -static int -zfs_deleg_prop_cb(int prop, void *cb) -{ - if (zfs_prop_delegatable(prop)) - (void) fprintf(stderr, "%-15s %-15s\n", zfs_prop_to_name(prop), - zfs_deleg_perm_type_str(ZFS_DELEG_PROP)); - - return (ZPROP_CONT); -} - -void -zfs_deleg_permissions(void) -{ - int i; - - (void) fprintf(stderr, "\n%-15s %-15s\t%s\n\n", "NAME", - "TYPE", "NOTES"); - - /* - * First print out the subcommands - */ - for (i = 0; zfs_deleg_perm_tab[i].z_perm != NULL; i++) { - (void) fprintf(stderr, "%-15s %-15s\t%s\n", - zfs_deleg_perm_tab[i].z_perm, - zfs_deleg_perm_type_str( - zfs_deleg_perm_type(zfs_deleg_perm_tab[i].z_perm)), - zfs_deleg_perm_note(zfs_deleg_perm_tab[i].z_note)); - } - - (void) zprop_iter(zfs_deleg_prop_cb, NULL, B_FALSE, B_TRUE, - ZFS_TYPE_DATASET|ZFS_TYPE_VOLUME); -} - /* * Given a property name and value, set the property for the given dataset. */ @@ -1821,80 +1400,9 @@ zfs_prop_set(zfs_handle_t *zhp, const char *propname, const char *propval) goto error; ret = zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc); - if (ret != 0) { - switch (errno) { - - case ENOSPC: - /* - * For quotas and reservations, ENOSPC indicates - * something different; setting a quota or reservation - * doesn't use any disk space. - */ - switch (prop) { - case ZFS_PROP_QUOTA: - case ZFS_PROP_REFQUOTA: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "size is less than current used or " - "reserved space")); - (void) zfs_error(hdl, EZFS_PROPSPACE, errbuf); - break; - - case ZFS_PROP_RESERVATION: - case ZFS_PROP_REFRESERVATION: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "size is greater than available space")); - (void) zfs_error(hdl, EZFS_PROPSPACE, errbuf); - break; - - default: - (void) zfs_standard_error(hdl, errno, errbuf); - break; - } - break; - - case EBUSY: - if (prop == ZFS_PROP_VOLBLOCKSIZE) - (void) zfs_error(hdl, EZFS_VOLHASDATA, errbuf); - else - (void) zfs_standard_error(hdl, EBUSY, errbuf); - break; - - case EROFS: - (void) zfs_error(hdl, EZFS_DSREADONLY, errbuf); - break; - - case ENOTSUP: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "pool and or dataset must be upgraded to set this " - "property or value")); - (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); - break; - case ERANGE: - if (prop == ZFS_PROP_COMPRESSION) { - (void) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "property setting is not allowed on " - "bootable datasets")); - (void) zfs_error(hdl, EZFS_NOTSUP, errbuf); - } else { - (void) zfs_standard_error(hdl, errno, errbuf); - } - break; - - case EOVERFLOW: - /* - * This platform can't address a volume this big. 
- */ -#ifdef _ILP32 - if (prop == ZFS_PROP_VOLSIZE) { - (void) zfs_error(hdl, EZFS_VOLTOOBIG, errbuf); - break; - } -#endif - /* FALLTHROUGH */ - default: - (void) zfs_standard_error(hdl, errno, errbuf); - } + if (ret != 0) { + zfs_setprop_error(hdl, prop, errno, errbuf); } else { if (do_prefix) ret = changelist_postfix(cl); @@ -1916,10 +1424,11 @@ zfs_prop_set(zfs_handle_t *zhp, const char *propname, const char *propval) } /* - * Given a property, inherit the value from the parent dataset. + * Given a property, inherit the value from the parent dataset, or if received + * is TRUE, revert to the received value, if any. */ int -zfs_prop_inherit(zfs_handle_t *zhp, const char *propname) +zfs_prop_inherit(zfs_handle_t *zhp, const char *propname, boolean_t received) { zfs_cmd_t zc = { 0 }; int ret; @@ -1931,6 +1440,7 @@ zfs_prop_inherit(zfs_handle_t *zhp, const char *propname) (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot inherit %s for '%s'"), propname, zhp->zfs_name); + zc.zc_cookie = received; if ((prop = zfs_name_to_prop(propname)) == ZPROP_INVAL) { /* * For user properties, the amount of work we have to do is very @@ -1957,7 +1467,7 @@ zfs_prop_inherit(zfs_handle_t *zhp, const char *propname) if (zfs_prop_readonly(prop)) return (zfs_error(hdl, EZFS_PROPREADONLY, errbuf)); - if (!zfs_prop_inheritable(prop)) + if (!zfs_prop_inheritable(prop) && !received) return (zfs_error(hdl, EZFS_PROPNONINHERIT, errbuf)); /* @@ -2031,6 +1541,8 @@ getprop_uint64(zfs_handle_t *zhp, zfs_prop_t prop, char **source) verify(nvlist_lookup_uint64(nv, ZPROP_VALUE, &value) == 0); (void) nvlist_lookup_string(nv, ZPROP_SOURCE, source); } else { + verify(!zhp->zfs_props_table || + zhp->zfs_props_table[prop] == B_TRUE); value = zfs_prop_default_numeric(prop); *source = ""; } @@ -2050,6 +1562,8 @@ getprop_string(zfs_handle_t *zhp, zfs_prop_t prop, char **source) verify(nvlist_lookup_string(nv, ZPROP_VALUE, &value) == 0); (void) nvlist_lookup_string(nv, ZPROP_SOURCE, source); } else { + verify(!zhp->zfs_props_table || + zhp->zfs_props_table[prop] == B_TRUE); if ((value = (char *)zfs_prop_default_string(prop)) == NULL) value = ""; *source = ""; @@ -2058,6 +1572,26 @@ getprop_string(zfs_handle_t *zhp, zfs_prop_t prop, char **source) return (value); } +static boolean_t +zfs_is_recvd_props_mode(zfs_handle_t *zhp) +{ + return (zhp->zfs_props == zhp->zfs_recvd_props); +} + +static void +zfs_set_recvd_props_mode(zfs_handle_t *zhp, uint64_t *cookie) +{ + *cookie = (uint64_t)(uintptr_t)zhp->zfs_props; + zhp->zfs_props = zhp->zfs_recvd_props; +} + +static void +zfs_unset_recvd_props_mode(zfs_handle_t *zhp, uint64_t *cookie) +{ + zhp->zfs_props = (nvlist_t *)(uintptr_t)*cookie; + *cookie = 0; +} + /* * Internal function for getting a numeric property. Both zfs_prop_get() and * zfs_prop_get_int() are built using this interface. 
@@ -2076,6 +1610,7 @@ get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zprop_source_t *src, struct mnttab mnt; char *mntopt_on = NULL; char *mntopt_off = NULL; + boolean_t received = zfs_is_recvd_props_mode(zhp); *source = NULL; @@ -2123,15 +1658,11 @@ get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zprop_source_t *src, */ if (!zhp->zfs_mntcheck && (mntopt_on != NULL || prop == ZFS_PROP_MOUNTED)) { - struct mnttab entry, search = { 0 }; - FILE *mnttab = zhp->zfs_hdl->libzfs_mnttab; + libzfs_handle_t *hdl = zhp->zfs_hdl; + struct mnttab entry; - search.mnt_special = (char *)zhp->zfs_name; - search.mnt_fstype = MNTTYPE_ZFS; - rewind(mnttab); - - if (getmntany(mnttab, &entry, &search) == 0) { - zhp->zfs_mntopts = zfs_strdup(zhp->zfs_hdl, + if (libzfs_mnttab_find(hdl, zhp->zfs_name, &entry) == 0) { + zhp->zfs_mntopts = zfs_strdup(hdl, entry.mnt_mntopts); if (zhp->zfs_mntopts == NULL) return (-1); @@ -2155,6 +1686,9 @@ get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zprop_source_t *src, case ZFS_PROP_NBMAND: *val = getprop_uint64(zhp, prop, source); + if (received) + break; + if (hasmntopt(&mnt, mntopt_on) && !*val) { *val = B_TRUE; if (src) @@ -2167,22 +1701,17 @@ get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zprop_source_t *src, break; case ZFS_PROP_CANMOUNT: - *val = getprop_uint64(zhp, prop, source); - if (*val != ZFS_CANMOUNT_ON) - *source = zhp->zfs_name; - else - *source = ""; /* default */ - break; - + case ZFS_PROP_VOLSIZE: case ZFS_PROP_QUOTA: case ZFS_PROP_REFQUOTA: case ZFS_PROP_RESERVATION: case ZFS_PROP_REFRESERVATION: *val = getprop_uint64(zhp, prop, source); - if (*val == 0) - *source = ""; /* default */ - else + + if (*source == NULL) { + /* not default, must be local */ *source = zhp->zfs_name; + } break; case ZFS_PROP_MOUNTED: @@ -2203,21 +1732,13 @@ get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zprop_source_t *src, (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_OBJSET_ZPLPROPS, &zc)) { zcmd_free_nvlists(&zc); - zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, - "unable to get %s property"), - zfs_prop_to_name(prop)); - return (zfs_error(zhp->zfs_hdl, EZFS_BADVERSION, - dgettext(TEXT_DOMAIN, "internal error"))); + return (-1); } if (zcmd_read_dst_nvlist(zhp->zfs_hdl, &zc, &zplprops) != 0 || nvlist_lookup_uint64(zplprops, zfs_prop_to_name(prop), val) != 0) { zcmd_free_nvlists(&zc); - zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, - "unable to get %s property"), - zfs_prop_to_name(prop)); - return (zfs_error(zhp->zfs_hdl, EZFS_NOMEM, - dgettext(TEXT_DOMAIN, "internal error"))); + return (-1); } if (zplprops) nvlist_free(zplprops); @@ -2230,13 +1751,13 @@ get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zprop_source_t *src, case PROP_TYPE_INDEX: *val = getprop_uint64(zhp, prop, source); /* - * If we tried to use a defalut value for a + * If we tried to use a default value for a * readonly property, it means that it was not - * present; return an error. + * present. 
*/ if (zfs_prop_readonly(prop) && - *source && (*source)[0] == '\0') { - return (-1); + *source != NULL && (*source)[0] == '\0') { + *source = NULL; } break; @@ -2266,6 +1787,8 @@ get_source(zfs_handle_t *zhp, zprop_source_t *srctype, char *source, *srctype = ZPROP_SRC_NONE; } else if (source[0] == '\0') { *srctype = ZPROP_SRC_DEFAULT; + } else if (strstr(source, ZPROP_SOURCE_VAL_RECVD) != NULL) { + *srctype = ZPROP_SRC_RECEIVED; } else { if (strcmp(source, zhp->zfs_name) == 0) { *srctype = ZPROP_SRC_LOCAL; @@ -2277,6 +1800,43 @@ get_source(zfs_handle_t *zhp, zprop_source_t *srctype, char *source, } +int +zfs_prop_get_recvd(zfs_handle_t *zhp, const char *propname, char *propbuf, + size_t proplen, boolean_t literal) +{ + zfs_prop_t prop; + int err = 0; + + if (zhp->zfs_recvd_props == NULL) + if (get_recvd_props_ioctl(zhp) != 0) + return (-1); + + prop = zfs_name_to_prop(propname); + + if (prop != ZPROP_INVAL) { + uint64_t cookie; + if (!nvlist_exists(zhp->zfs_recvd_props, propname)) + return (-1); + zfs_set_recvd_props_mode(zhp, &cookie); + err = zfs_prop_get(zhp, prop, propbuf, proplen, + NULL, NULL, 0, literal); + zfs_unset_recvd_props_mode(zhp, &cookie); + } else if (zfs_prop_userquota(propname)) { + return (-1); + } else { + nvlist_t *propval; + char *recvdval; + if (nvlist_lookup_nvlist(zhp->zfs_recvd_props, + propname, &propval) != 0) + return (-1); + verify(nvlist_lookup_string(propval, ZPROP_VALUE, + &recvdval) == 0); + (void) strlcpy(propbuf, recvdval, proplen); + } + + return (err == 0 ? 0 : -1); +} + /* * Retrieve a property from the given object. If 'literal' is specified, then * numbers are left as exact values. Otherwise, numbers are converted to a @@ -2292,6 +1852,7 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen, uint64_t val; char *str; const char *strval; + boolean_t received = zfs_is_recvd_props_mode(zhp); /* * Check to see if this property applies to our object @@ -2299,6 +1860,9 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen, if (!zfs_prop_valid_for_type(prop, zhp->zfs_type)) return (-1); + if (received && zfs_prop_readonly(prop)) + return (-1); + if (src) *src = ZPROP_SRC_NONE; @@ -2338,10 +1902,22 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen, if (str[0] == '/') { char buf[MAXPATHLEN]; char *root = buf; - const char *relpath = zhp->zfs_name + strlen(source); + const char *relpath; - if (relpath[0] == '/') - relpath++; + /* + * If we inherit the mountpoint, even from a dataset + * with a received value, the source will be the path of + * the dataset we inherit from. If source is + * ZPROP_SOURCE_VAL_RECVD, the received value is not + * inherited. 
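/*
 * Editorial sketch, not part of the original patch: zfs_prop_get_recvd()
 * added in this change exposes the received value of a property next to
 * its effective value, in the spirit of 'zfs get -o value,received'.
 * ZFS_MAXPROPLEN-sized buffers, an open handle 'zhp', and minimal error
 * handling are assumed.
 */
	char curval[ZFS_MAXPROPLEN], rcvval[ZFS_MAXPROPLEN];
	zprop_source_t src;

	(void) zfs_prop_get(zhp, ZFS_PROP_QUOTA, curval, sizeof (curval),
	    &src, NULL, 0, B_FALSE);
	if (zfs_prop_get_recvd(zhp, "quota", rcvval, sizeof (rcvval),
	    B_FALSE) != 0)
		(void) strlcpy(rcvval, "-", sizeof (rcvval));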
+ */ + if (strcmp(source, ZPROP_SOURCE_VAL_RECVD) == 0) { + relpath = ""; + } else { + relpath = zhp->zfs_name + strlen(source); + if (relpath[0] == '/') + relpath++; + } if ((zpool_get_prop(zhp->zpool_hdl, ZPOOL_PROP_ALTROOT, buf, MAXPATHLEN, NULL)) || @@ -2420,8 +1996,9 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen, case ZFS_PROP_COMPRESSRATIO: if (get_numeric_property(zhp, prop, src, &source, &val) != 0) return (-1); - (void) snprintf(propbuf, proplen, "%lld.%02lldx", (longlong_t) - val / 100, (longlong_t)val % 100); + (void) snprintf(propbuf, proplen, "%llu.%02llux", + (u_longlong_t)(val / 100), + (u_longlong_t)(val % 100)); break; case ZFS_PROP_TYPE: @@ -2466,6 +2043,44 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen, (void) strlcpy(propbuf, zhp->zfs_name, proplen); break; + case ZFS_PROP_MLSLABEL: + { + m_label_t *new_sl = NULL; + char *ascii = NULL; /* human readable label */ + + (void) strlcpy(propbuf, + getprop_string(zhp, prop, &source), proplen); + + if (literal || (strcasecmp(propbuf, + ZFS_MLSLABEL_DEFAULT) == 0)) + break; + + /* + * Try to translate the internal hex string to + * human-readable output. If there are any + * problems just use the hex string. + */ + + if (str_to_label(propbuf, &new_sl, MAC_LABEL, + L_NO_CORRECTION, NULL) == -1) { + m_label_free(new_sl); + break; + } + + if (label_to_str(new_sl, &ascii, M_LABEL, + DEF_NAMES) != 0) { + if (ascii) + free(ascii); + m_label_free(new_sl); + break; + } + m_label_free(new_sl); + + (void) strlcpy(propbuf, ascii, proplen); + free(ascii); + } + break; + default: switch (zfs_prop_get_type(prop)) { case PROP_TYPE_NUMBER: @@ -2520,40 +2135,249 @@ zfs_prop_get_int(zfs_handle_t *zhp, zfs_prop_t prop) } int -zfs_prop_set_int(zfs_handle_t *zhp, zfs_prop_t prop, uint64_t val) +zfs_prop_set_int(zfs_handle_t *zhp, zfs_prop_t prop, uint64_t val) +{ + char buf[64]; + + (void) snprintf(buf, sizeof (buf), "%llu", (longlong_t)val); + return (zfs_prop_set(zhp, zfs_prop_to_name(prop), buf)); +} + +/* + * Similar to zfs_prop_get(), but returns the value as an integer. 
+ */ +int +zfs_prop_get_numeric(zfs_handle_t *zhp, zfs_prop_t prop, uint64_t *value, + zprop_source_t *src, char *statbuf, size_t statlen) +{ + char *source; + + /* + * Check to see if this property applies to our object + */ + if (!zfs_prop_valid_for_type(prop, zhp->zfs_type)) { + return (zfs_error_fmt(zhp->zfs_hdl, EZFS_PROPTYPE, + dgettext(TEXT_DOMAIN, "cannot get property '%s'"), + zfs_prop_to_name(prop))); + } + + if (src) + *src = ZPROP_SRC_NONE; + + if (get_numeric_property(zhp, prop, src, &source, value) != 0) + return (-1); + + get_source(zhp, src, source, statbuf, statlen); + + return (0); +} + +static int +idmap_id_to_numeric_domain_rid(uid_t id, boolean_t isuser, + char **domainp, idmap_rid_t *ridp) +{ + idmap_handle_t *idmap_hdl = NULL; + idmap_get_handle_t *get_hdl = NULL; + idmap_stat status; + int err = EINVAL; + + if (idmap_init(&idmap_hdl) != IDMAP_SUCCESS) + goto out; + if (idmap_get_create(idmap_hdl, &get_hdl) != IDMAP_SUCCESS) + goto out; + + if (isuser) { + err = idmap_get_sidbyuid(get_hdl, id, + IDMAP_REQ_FLG_USE_CACHE, domainp, ridp, &status); + } else { + err = idmap_get_sidbygid(get_hdl, id, + IDMAP_REQ_FLG_USE_CACHE, domainp, ridp, &status); + } + if (err == IDMAP_SUCCESS && + idmap_get_mappings(get_hdl) == IDMAP_SUCCESS && + status == IDMAP_SUCCESS) + err = 0; + else + err = EINVAL; +out: + if (get_hdl) + idmap_get_destroy(get_hdl); + if (idmap_hdl) + (void) idmap_fini(idmap_hdl); + return (err); +} + +/* + * convert the propname into parameters needed by kernel + * Eg: userquota@ahrens -> ZFS_PROP_USERQUOTA, "", 126829 + * Eg: userused@matt@domain -> ZFS_PROP_USERUSED, "S-1-123-456", 789 + */ +static int +userquota_propname_decode(const char *propname, boolean_t zoned, + zfs_userquota_prop_t *typep, char *domain, int domainlen, uint64_t *ridp) +{ + zfs_userquota_prop_t type; + char *cp, *end; + char *numericsid = NULL; + boolean_t isuser; + + domain[0] = '\0'; + + /* Figure out the property type ({user|group}{quota|space}) */ + for (type = 0; type < ZFS_NUM_USERQUOTA_PROPS; type++) { + if (strncmp(propname, zfs_userquota_prop_prefixes[type], + strlen(zfs_userquota_prop_prefixes[type])) == 0) + break; + } + if (type == ZFS_NUM_USERQUOTA_PROPS) + return (EINVAL); + *typep = type; + + isuser = (type == ZFS_PROP_USERQUOTA || + type == ZFS_PROP_USERUSED); + + cp = strchr(propname, '@') + 1; + + if (strchr(cp, '@')) { + /* + * It's a SID name (eg "user@domain") that needs to be + * turned into S-1-domainID-RID. 
+ */ + directory_error_t e; + if (zoned && getzoneid() == GLOBAL_ZONEID) + return (ENOENT); + if (isuser) { + e = directory_sid_from_user_name(NULL, + cp, &numericsid); + } else { + e = directory_sid_from_group_name(NULL, + cp, &numericsid); + } + if (e != NULL) { + directory_error_free(e); + return (ENOENT); + } + if (numericsid == NULL) + return (ENOENT); + cp = numericsid; + /* will be further decoded below */ + } + + if (strncmp(cp, "S-1-", 4) == 0) { + /* It's a numeric SID (eg "S-1-234-567-89") */ + (void) strlcpy(domain, cp, domainlen); + cp = strrchr(domain, '-'); + *cp = '\0'; + cp++; + + errno = 0; + *ridp = strtoull(cp, &end, 10); + if (numericsid) { + free(numericsid); + numericsid = NULL; + } + if (errno != 0 || *end != '\0') + return (EINVAL); + } else if (!isdigit(*cp)) { + /* + * It's a user/group name (eg "user") that needs to be + * turned into a uid/gid + */ + if (zoned && getzoneid() == GLOBAL_ZONEID) + return (ENOENT); + if (isuser) { + struct passwd *pw; + pw = getpwnam(cp); + if (pw == NULL) + return (ENOENT); + *ridp = pw->pw_uid; + } else { + struct group *gr; + gr = getgrnam(cp); + if (gr == NULL) + return (ENOENT); + *ridp = gr->gr_gid; + } + } else { + /* It's a user/group ID (eg "12345"). */ + uid_t id = strtoul(cp, &end, 10); + idmap_rid_t rid; + char *mapdomain; + + if (*end != '\0') + return (EINVAL); + if (id > MAXUID) { + /* It's an ephemeral ID. */ + if (idmap_id_to_numeric_domain_rid(id, isuser, + &mapdomain, &rid) != 0) + return (ENOENT); + (void) strlcpy(domain, mapdomain, domainlen); + *ridp = rid; + } else { + *ridp = id; + } + } + + ASSERT3P(numericsid, ==, NULL); + return (0); +} + +static int +zfs_prop_get_userquota_common(zfs_handle_t *zhp, const char *propname, + uint64_t *propvalue, zfs_userquota_prop_t *typep) +{ + int err; + zfs_cmd_t zc = { 0 }; + + (void) strncpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + + err = userquota_propname_decode(propname, + zfs_prop_get_int(zhp, ZFS_PROP_ZONED), + typep, zc.zc_value, sizeof (zc.zc_value), &zc.zc_guid); + zc.zc_objset_type = *typep; + if (err) + return (err); + + err = ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_USERSPACE_ONE, &zc); + if (err) + return (err); + + *propvalue = zc.zc_cookie; + return (0); +} + +int +zfs_prop_get_userquota_int(zfs_handle_t *zhp, const char *propname, + uint64_t *propvalue) { - char buf[64]; + zfs_userquota_prop_t type; - zfs_nicenum(val, buf, sizeof (buf)); - return (zfs_prop_set(zhp, zfs_prop_to_name(prop), buf)); + return (zfs_prop_get_userquota_common(zhp, propname, propvalue, + &type)); } -/* - * Similar to zfs_prop_get(), but returns the value as an integer. 
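/*
 * Editorial sketch, not part of the original patch: the
 * userquota@/userused@ property names decoded by
 * userquota_propname_decode() accept a user or group name, a numeric
 * id, or a SID, for example
 *   userquota@ahrens, userused@12345, groupquota@S-1-5-21-...-513
 * and the value can be read back with zfs_prop_get_userquota_int().
 * The user name below is hypothetical.
 */
	uint64_t used;

	if (zfs_prop_get_userquota_int(zhp, "userused@ahrens", &used) == 0)
		(void) printf("ahrens uses %llu bytes\n", (u_longlong_t)used);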
- */ int -zfs_prop_get_numeric(zfs_handle_t *zhp, zfs_prop_t prop, uint64_t *value, - zprop_source_t *src, char *statbuf, size_t statlen) +zfs_prop_get_userquota(zfs_handle_t *zhp, const char *propname, + char *propbuf, int proplen, boolean_t literal) { - char *source; - - /* - * Check to see if this property applies to our object - */ - if (!zfs_prop_valid_for_type(prop, zhp->zfs_type)) { - return (zfs_error_fmt(zhp->zfs_hdl, EZFS_PROPTYPE, - dgettext(TEXT_DOMAIN, "cannot get property '%s'"), - zfs_prop_to_name(prop))); - } - - if (src) - *src = ZPROP_SRC_NONE; + int err; + uint64_t propvalue; + zfs_userquota_prop_t type; - if (get_numeric_property(zhp, prop, src, &source, value) != 0) - return (-1); + err = zfs_prop_get_userquota_common(zhp, propname, &propvalue, + &type); - get_source(zhp, src, source, statbuf, statlen); + if (err) + return (err); + if (literal) { + (void) snprintf(propbuf, proplen, "%llu", propvalue); + } else if (propvalue == 0 && + (type == ZFS_PROP_USERQUOTA || type == ZFS_PROP_GROUPQUOTA)) { + (void) strlcpy(propbuf, "none", proplen); + } else { + zfs_nicenum(propvalue, propbuf, proplen); + } return (0); } @@ -2575,6 +2399,46 @@ zfs_get_type(const zfs_handle_t *zhp) return (zhp->zfs_type); } +static int +zfs_do_list_ioctl(zfs_handle_t *zhp, int arg, zfs_cmd_t *zc) +{ + int rc; + uint64_t orig_cookie; + + orig_cookie = zc->zc_cookie; +top: + (void) strlcpy(zc->zc_name, zhp->zfs_name, sizeof (zc->zc_name)); + rc = ioctl(zhp->zfs_hdl->libzfs_fd, arg, zc); + + if (rc == -1) { + switch (errno) { + case ENOMEM: + /* expand nvlist memory and try again */ + if (zcmd_expand_dst_nvlist(zhp->zfs_hdl, zc) != 0) { + zcmd_free_nvlists(zc); + return (-1); + } + zc->zc_cookie = orig_cookie; + goto top; + /* + * An errno value of ESRCH indicates normal completion. + * If ENOENT is returned, then the underlying dataset + * has been removed since we obtained the handle. + */ + case ESRCH: + case ENOENT: + rc = 1; + break; + default: + rc = zfs_standard_error(zhp->zfs_hdl, errno, + dgettext(TEXT_DOMAIN, + "cannot iterate filesystems")); + break; + } + } + return (rc); +} + /* * Iterate over all child filesystems */ @@ -2588,37 +2452,27 @@ zfs_iter_filesystems(zfs_handle_t *zhp, zfs_iter_f func, void *data) if (zhp->zfs_type != ZFS_TYPE_FILESYSTEM) return (0); - for ((void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); - ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_DATASET_LIST_NEXT, &zc) == 0; - (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name))) { - /* - * Ignore private dataset names. - */ - if (dataset_name_hidden(zc.zc_name)) - continue; + if (zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0) != 0) + return (-1); + while ((ret = zfs_do_list_ioctl(zhp, ZFS_IOC_DATASET_LIST_NEXT, + &zc)) == 0) { /* * Silently ignore errors, as the only plausible explanation is * that the pool has since been removed. */ - if ((nzhp = make_dataset_handle(zhp->zfs_hdl, - zc.zc_name)) == NULL) + if ((nzhp = make_dataset_handle_zc(zhp->zfs_hdl, + &zc)) == NULL) { continue; + } - if ((ret = func(nzhp, data)) != 0) + if ((ret = func(nzhp, data)) != 0) { + zcmd_free_nvlists(&zc); return (ret); + } } - - /* - * An errno value of ESRCH indicates normal completion. If ENOENT is - * returned, then the underlying dataset has been removed since we - * obtained the handle. - */ - if (errno != ESRCH && errno != ENOENT) - return (zfs_standard_error(zhp->zfs_hdl, errno, - dgettext(TEXT_DOMAIN, "cannot iterate filesystems"))); - - return (0); + zcmd_free_nvlists(&zc); + return ((ret < 0) ? 
ret : 0); } /* @@ -2634,29 +2488,23 @@ zfs_iter_snapshots(zfs_handle_t *zhp, zfs_iter_f func, void *data) if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) return (0); - for ((void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); - ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_SNAPSHOT_LIST_NEXT, - &zc) == 0; - (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name))) { + if (zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0) != 0) + return (-1); + while ((ret = zfs_do_list_ioctl(zhp, ZFS_IOC_SNAPSHOT_LIST_NEXT, + &zc)) == 0) { - if ((nzhp = make_dataset_handle(zhp->zfs_hdl, - zc.zc_name)) == NULL) + if ((nzhp = make_dataset_handle_zc(zhp->zfs_hdl, + &zc)) == NULL) { continue; + } - if ((ret = func(nzhp, data)) != 0) + if ((ret = func(nzhp, data)) != 0) { + zcmd_free_nvlists(&zc); return (ret); + } } - - /* - * An errno value of ESRCH indicates normal completion. If ENOENT is - * returned, then the underlying dataset has been removed since we - * obtained the handle. Silently ignore this case, and return success. - */ - if (errno != ESRCH && errno != ENOENT) - return (zfs_standard_error(zhp->zfs_hdl, errno, - dgettext(TEXT_DOMAIN, "cannot iterate filesystems"))); - - return (0); + zcmd_free_nvlists(&zc); + return ((ret < 0) ? ret : 0); } /* @@ -2673,6 +2521,27 @@ zfs_iter_children(zfs_handle_t *zhp, zfs_iter_f func, void *data) return (zfs_iter_snapshots(zhp, func, data)); } +/* + * Is one dataset name a child dataset of another? + * + * Needs to handle these cases: + * Dataset 1 "a/foo" "a/foo" "a/foo" "a/foo" + * Dataset 2 "a/fo" "a/foobar" "a/bar/baz" "a/foo/bar" + * Descendant? No. No. No. Yes. + */ +static boolean_t +is_descendant(const char *ds1, const char *ds2) +{ + size_t d1len = strlen(ds1); + + /* ds2 can't be a descendant if it's smaller */ + if (strlen(ds2) < d1len) + return (B_FALSE); + + /* otherwise, compare strings and verify that there's a '/' char */ + return (ds2[d1len] == '/' && (strncmp(ds1, ds2, d1len) == 0)); +} + /* * Given a complete name, return just the portion that refers to the parent. * Can return NULL if this is a pool. 
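/*
 * Editorial sketch, not part of the original patch: the is_descendant()
 * helper added above requires a '/' right at the boundary, so sharing a
 * name prefix is not enough and a dataset is not its own descendant.
 * assert() from <assert.h> is used here only for illustration within
 * this file (the helper is static).
 */
	assert(is_descendant("a/foo", "a/foo/bar") == B_TRUE);
	assert(is_descendant("a/foo", "a/foobar") == B_FALSE);	/* no '/' */
	assert(is_descendant("a/foo", "a/foo") == B_FALSE);	/* same name */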
@@ -2708,9 +2577,10 @@ check_parents(libzfs_handle_t *hdl, const char *path, uint64_t *zoned, char *slash; zfs_handle_t *zhp; char errbuf[1024]; + uint64_t is_zoned; - (void) snprintf(errbuf, sizeof (errbuf), "cannot create '%s'", - path); + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "cannot create '%s'"), path); /* get parent, and check to see if this is just a pool */ if (parent_name(path, parent, sizeof (parent)) != 0) { @@ -2750,9 +2620,12 @@ check_parents(libzfs_handle_t *hdl, const char *path, uint64_t *zoned, return (zfs_standard_error(hdl, errno, errbuf)); } - *zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED); + is_zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED); + if (zoned != NULL) + *zoned = is_zoned; + /* we are in a non-global zone, but parent is in the global zone */ - if (getzoneid() != GLOBAL_ZONEID && !(*zoned)) { + if (getzoneid() != GLOBAL_ZONEID && !is_zoned) { (void) zfs_standard_error(hdl, EPERM, errbuf); zfs_close(zhp); return (-1); @@ -2884,11 +2757,10 @@ int zfs_create_ancestors(libzfs_handle_t *hdl, const char *path) { int prefix; - uint64_t zoned; char *path_copy; int rc; - if (check_parents(hdl, path, &zoned, B_TRUE, &prefix) != 0) + if (check_parents(hdl, path, NULL, B_TRUE, &prefix) != 0) return (-1); if ((path_copy = strdup(path)) != NULL) { @@ -3002,18 +2874,6 @@ zfs_create(libzfs_handle_t *hdl, const char *path, zfs_type_t type, /* create the dataset */ ret = zfs_ioctl(hdl, ZFS_IOC_CREATE, &zc); - if (ret == 0 && type == ZFS_TYPE_VOLUME) { - ret = zvol_create_link(hdl, path); - if (ret) { - (void) zfs_standard_error(hdl, errno, - dgettext(TEXT_DOMAIN, - "Volume successfully created, but device links " - "were not created")); - zcmd_free_nvlists(&zc); - return (-1); - } - } - zcmd_free_nvlists(&zc); /* check for failure */ @@ -3069,7 +2929,7 @@ zfs_create(libzfs_handle_t *hdl, const char *path, zfs_type_t type, * isn't mounted, and that there are no active dependents. */ int -zfs_destroy(zfs_handle_t *zhp) +zfs_destroy(zfs_handle_t *zhp, boolean_t defer) { zfs_cmd_t zc = { 0 }; @@ -3085,14 +2945,12 @@ zfs_destroy(zfs_handle_t *zhp) return (-1); } - if (zvol_remove_link(zhp->zfs_hdl, zhp->zfs_name) != 0) - return (-1); - zc.zc_objset_type = DMU_OST_ZVOL; } else { zc.zc_objset_type = DMU_OST_ZFS; } + zc.zc_defer_destroy = defer; if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_DESTROY, &zc) != 0) { return (zfs_standard_error_fmt(zhp->zfs_hdl, errno, dgettext(TEXT_DOMAIN, "cannot destroy '%s'"), @@ -3111,13 +2969,13 @@ struct destroydata { }; static int -zfs_remove_link_cb(zfs_handle_t *zhp, void *arg) +zfs_check_snap_cb(zfs_handle_t *zhp, void *arg) { struct destroydata *dd = arg; zfs_handle_t *szhp; char name[ZFS_MAXNAMELEN]; boolean_t closezhp = dd->closezhp; - int rv; + int rv = 0; (void) strlcpy(name, zhp->zfs_name, sizeof (name)); (void) strlcat(name, "@", sizeof (name)); @@ -3129,17 +2987,9 @@ zfs_remove_link_cb(zfs_handle_t *zhp, void *arg) zfs_close(szhp); } - if (zhp->zfs_type == ZFS_TYPE_VOLUME) { - (void) zvol_remove_link(zhp->zfs_hdl, name); - /* - * NB: this is simply a best-effort. We don't want to - * return an error, because then we wouldn't visit all - * the volumes. - */ - } - dd->closezhp = B_TRUE; - rv = zfs_iter_filesystems(zhp, zfs_remove_link_cb, arg); + if (!dd->gotone) + rv = zfs_iter_filesystems(zhp, zfs_check_snap_cb, arg); if (closezhp) zfs_close(zhp); return (rv); @@ -3149,14 +2999,14 @@ zfs_remove_link_cb(zfs_handle_t *zhp, void *arg) * Destroys all snapshots with the given name in zhp & descendants. 
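/*
 * Editorial sketch, not part of the original patch: the 'defer' argument
 * added to zfs_destroy() and zfs_destroy_snaps() sets zc_defer_destroy,
 * which is what 'zfs destroy -d' uses to mark a held snapshot for
 * deferred destruction instead of failing outright. The snapshot name
 * is hypothetical and error handling is minimal.
 */
	if (zfs_destroy_snaps(zhp, "daily-2010-01-01", B_TRUE) != 0)
		(void) fprintf(stderr, "deferred destroy failed\n");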
*/ int -zfs_destroy_snaps(zfs_handle_t *zhp, char *snapname) +zfs_destroy_snaps(zfs_handle_t *zhp, char *snapname, boolean_t defer) { zfs_cmd_t zc = { 0 }; int ret; struct destroydata dd = { 0 }; dd.snapname = snapname; - (void) zfs_remove_link_cb(zhp, &dd); + (void) zfs_check_snap_cb(zhp, &dd); if (!dd.gotone) { return (zfs_standard_error_fmt(zhp->zfs_hdl, ENOENT, @@ -3166,6 +3016,7 @@ zfs_destroy_snaps(zfs_handle_t *zhp, char *snapname) (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); (void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value)); + zc.zc_defer_destroy = defer; ret = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_DESTROY_SNAPS, &zc); if (ret != 0) { @@ -3273,70 +3124,11 @@ zfs_clone(zfs_handle_t *zhp, const char *target, nvlist_t *props) return (zfs_standard_error(zhp->zfs_hdl, errno, errbuf)); } - } else if (ZFS_IS_VOLUME(zhp)) { - ret = zvol_create_link(zhp->zfs_hdl, target); } return (ret); } -typedef struct promote_data { - char cb_mountpoint[MAXPATHLEN]; - const char *cb_target; - const char *cb_errbuf; - uint64_t cb_pivot_txg; -} promote_data_t; - -static int -promote_snap_cb(zfs_handle_t *zhp, void *data) -{ - promote_data_t *pd = data; - zfs_handle_t *szhp; - char snapname[MAXPATHLEN]; - int rv = 0; - - /* We don't care about snapshots after the pivot point */ - if (zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) > pd->cb_pivot_txg) { - zfs_close(zhp); - return (0); - } - - /* Remove the device link if it's a zvol. */ - if (ZFS_IS_VOLUME(zhp)) - (void) zvol_remove_link(zhp->zfs_hdl, zhp->zfs_name); - - /* Check for conflicting names */ - (void) strlcpy(snapname, pd->cb_target, sizeof (snapname)); - (void) strlcat(snapname, strchr(zhp->zfs_name, '@'), sizeof (snapname)); - szhp = make_dataset_handle(zhp->zfs_hdl, snapname); - if (szhp != NULL) { - zfs_close(szhp); - zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, - "snapshot name '%s' from origin \n" - "conflicts with '%s' from target"), - zhp->zfs_name, snapname); - rv = zfs_error(zhp->zfs_hdl, EZFS_EXISTS, pd->cb_errbuf); - } - zfs_close(zhp); - return (rv); -} - -static int -promote_snap_done_cb(zfs_handle_t *zhp, void *data) -{ - promote_data_t *pd = data; - - /* We don't care about snapshots after the pivot point */ - if (zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) <= pd->cb_pivot_txg) { - /* Create the device link if it's a zvol. */ - if (ZFS_IS_VOLUME(zhp)) - (void) zvol_create_link(zhp->zfs_hdl, zhp->zfs_name); - } - - zfs_close(zhp); - return (0); -} - /* * Promotes the given clone fs to be the clone parent. 
*/ @@ -3346,10 +3138,7 @@ zfs_promote(zfs_handle_t *zhp) libzfs_handle_t *hdl = zhp->zfs_hdl; zfs_cmd_t zc = { 0 }; char parent[MAXPATHLEN]; - char *cp; int ret; - zfs_handle_t *pzhp; - promote_data_t pd; char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, @@ -3367,29 +3156,7 @@ zfs_promote(zfs_handle_t *zhp) "not a cloned filesystem")); return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); } - cp = strchr(parent, '@'); - *cp = '\0'; - - /* Walk the snapshots we will be moving */ - pzhp = zfs_open(hdl, zhp->zfs_dmustats.dds_origin, ZFS_TYPE_SNAPSHOT); - if (pzhp == NULL) - return (-1); - pd.cb_pivot_txg = zfs_prop_get_int(pzhp, ZFS_PROP_CREATETXG); - zfs_close(pzhp); - pd.cb_target = zhp->zfs_name; - pd.cb_errbuf = errbuf; - pzhp = zfs_open(hdl, parent, ZFS_TYPE_DATASET); - if (pzhp == NULL) - return (-1); - (void) zfs_prop_get(pzhp, ZFS_PROP_MOUNTPOINT, pd.cb_mountpoint, - sizeof (pd.cb_mountpoint), NULL, NULL, 0, FALSE); - ret = zfs_iter_snapshots(pzhp, promote_snap_cb, &pd); - if (ret != 0) { - zfs_close(pzhp); - return (-1); - } - /* issue the ioctl */ (void) strlcpy(zc.zc_value, zhp->zfs_dmustats.dds_origin, sizeof (zc.zc_value)); (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); @@ -3398,62 +3165,18 @@ zfs_promote(zfs_handle_t *zhp) if (ret != 0) { int save_errno = errno; - (void) zfs_iter_snapshots(pzhp, promote_snap_done_cb, &pd); - zfs_close(pzhp); - switch (save_errno) { case EEXIST: - /* - * There is a conflicting snapshot name. We - * should have caught this above, but they could - * have renamed something in the mean time. - */ + /* There is a conflicting snapshot name. */ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "conflicting snapshot name from parent '%s'"), - parent); + "conflicting snapshot '%s' from parent '%s'"), + zc.zc_string, parent); return (zfs_error(hdl, EZFS_EXISTS, errbuf)); default: return (zfs_standard_error(hdl, save_errno, errbuf)); } - } else { - (void) zfs_iter_snapshots(zhp, promote_snap_done_cb, &pd); - } - - zfs_close(pzhp); - return (ret); -} - -struct createdata { - const char *cd_snapname; - int cd_ifexists; -}; - -static int -zfs_create_link_cb(zfs_handle_t *zhp, void *arg) -{ - struct createdata *cd = arg; - int ret; - - if (zhp->zfs_type == ZFS_TYPE_VOLUME) { - char name[MAXPATHLEN]; - - (void) strlcpy(name, zhp->zfs_name, sizeof (name)); - (void) strlcat(name, "@", sizeof (name)); - (void) strlcat(name, cd->cd_snapname, sizeof (name)); - (void) zvol_create_link_common(zhp->zfs_hdl, name, - cd->cd_ifexists); - /* - * NB: this is simply a best-effort. We don't want to - * return an error, because then we wouldn't visit all - * the volumes. - */ } - - ret = zfs_iter_filesystems(zhp, zfs_create_link_cb, cd); - - zfs_close(zhp); - return (ret); } @@ -3517,31 +3240,11 @@ zfs_snapshot(libzfs_handle_t *hdl, const char *path, boolean_t recursive, * if it was recursive, the one that actually failed will be in * zc.zc_name. 
*/ - if (ret != 0) + if (ret != 0) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot create snapshot '%s@%s'"), zc.zc_name, zc.zc_value); - - if (ret == 0 && recursive) { - struct createdata cd; - - cd.cd_snapname = delim + 1; - cd.cd_ifexists = B_FALSE; - (void) zfs_iter_filesystems(zhp, zfs_create_link_cb, &cd); - } - if (ret == 0 && zhp->zfs_type == ZFS_TYPE_VOLUME) { - ret = zvol_create_link(zhp->zfs_hdl, path); - if (ret != 0) { - (void) zfs_standard_error(hdl, errno, - dgettext(TEXT_DOMAIN, - "Volume successfully snapshotted, but device links " - "were not created")); - zfs_close(zhp); - return (-1); - } - } - - if (ret != 0) (void) zfs_standard_error(hdl, errno, errbuf); + } zfs_close(zhp); @@ -3581,7 +3284,7 @@ rollback_destroy(zfs_handle_t *zhp, void *data) logstr = zhp->zfs_hdl->libzfs_log_str; zhp->zfs_hdl->libzfs_log_str = NULL; - cbp->cb_error |= zfs_destroy(zhp); + cbp->cb_error |= zfs_destroy(zhp, B_FALSE); zhp->zfs_hdl->libzfs_log_str = logstr; } } else { @@ -3595,7 +3298,7 @@ rollback_destroy(zfs_handle_t *zhp, void *data) zfs_close(zhp); return (0); } - if (zfs_destroy(zhp) != 0) + if (zfs_destroy(zhp, B_FALSE) != 0) cbp->cb_error = B_TRUE; else changelist_remove(clp, zhp->zfs_name); @@ -3644,8 +3347,6 @@ zfs_rollback(zfs_handle_t *zhp, zfs_handle_t *snap, boolean_t force) */ if (zhp->zfs_type == ZFS_TYPE_VOLUME) { - if (zvol_remove_link(zhp->zfs_hdl, zhp->zfs_name) != 0) - return (-1); if (zfs_which_resv_prop(zhp, &resv_prop) < 0) return (-1); old_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE); @@ -3683,10 +3384,6 @@ zfs_rollback(zfs_handle_t *zhp, zfs_handle_t *snap, boolean_t force) */ if ((zhp->zfs_type == ZFS_TYPE_VOLUME) && (zhp = make_dataset_handle(zhp->zfs_hdl, zhp->zfs_name))) { - if (err = zvol_create_link(zhp->zfs_hdl, zhp->zfs_name)) { - zfs_close(zhp); - return (err); - } if (restore_resv) { new_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE); if (old_volsize != new_volsize) @@ -3801,14 +3498,11 @@ zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive) if (!zfs_validate_name(hdl, target, zhp->zfs_type, B_TRUE)) return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); - uint64_t unused; /* validate parents */ - if (check_parents(hdl, target, &unused, B_FALSE, NULL) != 0) + if (check_parents(hdl, target, NULL, B_FALSE, NULL) != 0) return (-1); - (void) parent_name(target, parent, sizeof (parent)); - /* make sure we're in the same pool */ verify((delim = strchr(target, '/')) != NULL); if (strncmp(zhp->zfs_name, target, delim - target) != 0 || @@ -3819,10 +3513,9 @@ zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive) } /* new name cannot be a child of the current dataset name */ - if (strncmp(parent, zhp->zfs_name, - strlen(zhp->zfs_name)) == 0) { + if (is_descendant(zhp->zfs_name, target)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "New dataset name cannot be a descendent of " + "New dataset name cannot be a descendant of " "current dataset name")); return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); } @@ -3839,7 +3532,6 @@ zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive) } if (recursive) { - struct destroydata dd; parentname = zfs_strdup(zhp->zfs_hdl, zhp->zfs_name); if (parentname == NULL) { @@ -3854,15 +3546,6 @@ zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive) goto error; } - dd.snapname = delim + 1; - dd.gotone = B_FALSE; - dd.closezhp = B_TRUE; - - /* We remove any zvol links prior to renaming them */ - ret = zfs_iter_filesystems(zhrp, zfs_remove_link_cb, 
&dd); - if (ret) { - goto error; - } } else { if ((cl = changelist_gather(zhp, ZFS_PROP_NAME, 0, 0)) == NULL) return (-1); @@ -3884,202 +3567,52 @@ zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive) else zc.zc_objset_type = DMU_OST_ZFS; - (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); - (void) strlcpy(zc.zc_value, target, sizeof (zc.zc_value)); - - zc.zc_cookie = recursive; - - if ((ret = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_RENAME, &zc)) != 0) { - /* - * if it was recursive, the one that actually failed will - * be in zc.zc_name - */ - (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, - "cannot rename '%s'"), zc.zc_name); - - if (recursive && errno == EEXIST) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "a child dataset already has a snapshot " - "with the new name")); - (void) zfs_error(hdl, EZFS_EXISTS, errbuf); - } else { - (void) zfs_standard_error(zhp->zfs_hdl, errno, errbuf); - } - - /* - * On failure, we still want to remount any filesystems that - * were previously mounted, so we don't alter the system state. - */ - if (recursive) { - struct createdata cd; - - /* only create links for datasets that had existed */ - cd.cd_snapname = delim + 1; - cd.cd_ifexists = B_TRUE; - (void) zfs_iter_filesystems(zhrp, zfs_create_link_cb, - &cd); - } else { - (void) changelist_postfix(cl); - } - } else { - if (recursive) { - struct createdata cd; - - /* only create links for datasets that had existed */ - cd.cd_snapname = strchr(target, '@') + 1; - cd.cd_ifexists = B_TRUE; - ret = zfs_iter_filesystems(zhrp, zfs_create_link_cb, - &cd); - } else { - changelist_rename(cl, zfs_get_name(zhp), target); - ret = changelist_postfix(cl); - } - } - -error: - if (parentname) { - free(parentname); - } - if (zhrp) { - zfs_close(zhrp); - } - if (cl) { - changelist_free(cl); - } - return (ret); -} - -/* - * Given a zvol dataset, issue the ioctl to create the appropriate minor node, - * poke devfsadm to create the /dev link, and then wait for the link to appear. - */ -int -zvol_create_link(libzfs_handle_t *hdl, const char *dataset) -{ - return (zvol_create_link_common(hdl, dataset, B_FALSE)); -} - -static int -zvol_create_link_common(libzfs_handle_t *hdl, const char *dataset, int ifexists) -{ - zfs_cmd_t zc = { 0 }; - di_devlink_handle_t dhdl; - priv_set_t *priv_effective; - int privileged; - - (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name)); - - /* - * Issue the appropriate ioctl. - */ - if (ioctl(hdl->libzfs_fd, ZFS_IOC_CREATE_MINOR, &zc) != 0) { - switch (errno) { - case EEXIST: - /* - * Silently ignore the case where the link already - * exists. This allows 'zfs volinit' to be run multiple - * times without errors. - */ - return (0); - - case ENOENT: - /* - * Dataset does not exist in the kernel. If we - * don't care (see zfs_rename), then ignore the - * error quietly. - */ - if (ifexists) { - return (0); - } - - /* FALLTHROUGH */ + (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + (void) strlcpy(zc.zc_value, target, sizeof (zc.zc_value)); - default: - return (zfs_standard_error_fmt(hdl, errno, - dgettext(TEXT_DOMAIN, "cannot create device links " - "for '%s'"), dataset)); - } - } + zc.zc_cookie = recursive; - /* - * If privileged call devfsadm and wait for the links to - * magically appear. - * Otherwise, print out an informational message. 
- */ + if ((ret = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_RENAME, &zc)) != 0) { + /* + * if it was recursive, the one that actually failed will + * be in zc.zc_name + */ + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot rename '%s'"), zc.zc_name); - priv_effective = priv_allocset(); - (void) getppriv(PRIV_EFFECTIVE, priv_effective); - privileged = (priv_isfullset(priv_effective) == B_TRUE); - priv_freeset(priv_effective); - - if (privileged) { - if ((dhdl = di_devlink_init(ZFS_DRIVER, - DI_MAKE_LINK)) == NULL) { - zfs_error_aux(hdl, strerror(errno)); - (void) zfs_error_fmt(hdl, errno, - dgettext(TEXT_DOMAIN, "cannot create device links " - "for '%s'"), dataset); - (void) ioctl(hdl->libzfs_fd, ZFS_IOC_REMOVE_MINOR, &zc); - return (-1); + if (recursive && errno == EEXIST) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "a child dataset already has a snapshot " + "with the new name")); + (void) zfs_error(hdl, EZFS_EXISTS, errbuf); } else { - (void) di_devlink_fini(&dhdl); + (void) zfs_standard_error(zhp->zfs_hdl, errno, errbuf); } - } else { - char pathname[MAXPATHLEN]; - struct stat64 statbuf; - int i; - -#define MAX_WAIT 10 /* - * This is the poor mans way of waiting for the link - * to show up. If after 10 seconds we still don't - * have it, then print out a message. + * On failure, we still want to remount any filesystems that + * were previously mounted, so we don't alter the system state. */ - (void) snprintf(pathname, sizeof (pathname), "/dev/zvol/dsk/%s", - dataset); - - for (i = 0; i != MAX_WAIT; i++) { - if (stat64(pathname, &statbuf) == 0) - break; - (void) sleep(1); + if (!recursive) + (void) changelist_postfix(cl); + } else { + if (!recursive) { + changelist_rename(cl, zfs_get_name(zhp), target); + ret = changelist_postfix(cl); } - if (i == MAX_WAIT) - (void) printf(gettext("%s may not be immediately " - "available\n"), pathname); } - return (0); -} - -/* - * Remove a minor node for the given zvol and the associated /dev links. - */ -int -zvol_remove_link(libzfs_handle_t *hdl, const char *dataset) -{ - zfs_cmd_t zc = { 0 }; - - (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name)); - - if (ioctl(hdl->libzfs_fd, ZFS_IOC_REMOVE_MINOR, &zc) != 0) { - switch (errno) { - case ENXIO: - /* - * Silently ignore the case where the link no longer - * exists, so that 'zfs volfini' can be run multiple - * times without errors. - */ - return (0); - - default: - return (zfs_standard_error_fmt(hdl, errno, - dgettext(TEXT_DOMAIN, "cannot remove device " - "links for '%s'"), dataset)); - } +error: + if (parentname) { + free(parentname); } - - return (0); + if (zhrp) { + zfs_close(zhrp); + } + if (cl) { + changelist_free(cl); + } + return (ret); } nvlist_t * @@ -4088,6 +3621,15 @@ zfs_get_user_props(zfs_handle_t *zhp) return (zhp->zfs_user_props); } +nvlist_t * +zfs_get_recvd_props(zfs_handle_t *zhp) +{ + if (zhp->zfs_recvd_props == NULL) + if (get_recvd_props_ioctl(zhp) != 0) + return (NULL); + return (zhp->zfs_recvd_props); +} + /* * This function is used by 'zfs list' to determine the exact set of columns to * display, and their maximum widths. This does two main things: @@ -4097,10 +3639,12 @@ zfs_get_user_props(zfs_handle_t *zhp) * for new unique user properties and add them to the list. * * - For non fixed-width properties, keep track of the maximum width seen - * so that we can size the column appropriately. + * so that we can size the column appropriately. If the user has + * requested received property values, we also need to compute the width + * of the RECEIVED column. 
*/ int -zfs_expand_proplist(zfs_handle_t *zhp, zprop_list_t **plp) +zfs_expand_proplist(zfs_handle_t *zhp, zprop_list_t **plp, boolean_t received) { libzfs_handle_t *hdl = zhp->zfs_hdl; zprop_list_t *entry; @@ -4171,12 +3715,24 @@ zfs_expand_proplist(zfs_handle_t *zhp, zprop_list_t **plp) if (strlen(buf) > entry->pl_width) entry->pl_width = strlen(buf); } - } else if (nvlist_lookup_nvlist(userprops, - entry->pl_user_prop, &propval) == 0) { - verify(nvlist_lookup_string(propval, - ZPROP_VALUE, &strval) == 0); - if (strlen(strval) > entry->pl_width) - entry->pl_width = strlen(strval); + if (received && zfs_prop_get_recvd(zhp, + zfs_prop_to_name(entry->pl_prop), + buf, sizeof (buf), B_FALSE) == 0) + if (strlen(buf) > entry->pl_recvd_width) + entry->pl_recvd_width = strlen(buf); + } else { + if (nvlist_lookup_nvlist(userprops, entry->pl_user_prop, + &propval) == 0) { + verify(nvlist_lookup_string(propval, + ZPROP_VALUE, &strval) == 0); + if (strlen(strval) > entry->pl_width) + entry->pl_width = strlen(strval); + } + if (received && zfs_prop_get_recvd(zhp, + entry->pl_user_prop, + buf, sizeof (buf), B_FALSE) == 0) + if (strlen(buf) > entry->pl_recvd_width) + entry->pl_recvd_width = strlen(buf); } } @@ -4231,18 +3787,406 @@ zfs_iscsi_perm_check(libzfs_handle_t *hdl, char *dataset, ucred_t *cred) int zfs_deleg_share_nfs(libzfs_handle_t *hdl, char *dataset, char *path, - void *export, void *sharetab, int sharemax, zfs_share_op_t operation) + char *resource, void *export, void *sharetab, + int sharemax, zfs_share_op_t operation) { zfs_cmd_t zc = { 0 }; int error; (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name)); (void) strlcpy(zc.zc_value, path, sizeof (zc.zc_value)); + if (resource) + (void) strlcpy(zc.zc_string, resource, sizeof (zc.zc_string)); zc.zc_share.z_sharedata = (uint64_t)(uintptr_t)sharetab; zc.zc_share.z_exportdata = (uint64_t)(uintptr_t)export; zc.zc_share.z_sharetype = operation; zc.zc_share.z_sharemax = sharemax; - error = ioctl(hdl->libzfs_fd, ZFS_IOC_SHARE, &zc); return (error); } + +void +zfs_prune_proplist(zfs_handle_t *zhp, uint8_t *props) +{ + nvpair_t *curr; + + /* + * Keep a reference to the props-table against which we prune the + * properties. + */ + zhp->zfs_props_table = props; + + curr = nvlist_next_nvpair(zhp->zfs_props, NULL); + + while (curr) { + zfs_prop_t zfs_prop = zfs_name_to_prop(nvpair_name(curr)); + nvpair_t *next = nvlist_next_nvpair(zhp->zfs_props, curr); + + /* + * User properties will result in ZPROP_INVAL, and since we + * only know how to prune standard ZFS properties, we always + * leave these in the list. This can also happen if we + * encounter an unknown DSL property (when running older + * software, for example). 
+ */ + if (zfs_prop != ZPROP_INVAL && props[zfs_prop] == B_FALSE) + (void) nvlist_remove(zhp->zfs_props, + nvpair_name(curr), nvpair_type(curr)); + curr = next; + } +} + +static int +zfs_smb_acl_mgmt(libzfs_handle_t *hdl, char *dataset, char *path, + zfs_smb_acl_op_t cmd, char *resource1, char *resource2) +{ + zfs_cmd_t zc = { 0 }; + nvlist_t *nvlist = NULL; + int error; + + (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name)); + (void) strlcpy(zc.zc_value, path, sizeof (zc.zc_value)); + zc.zc_cookie = (uint64_t)cmd; + + if (cmd == ZFS_SMB_ACL_RENAME) { + if (nvlist_alloc(&nvlist, NV_UNIQUE_NAME, 0) != 0) { + (void) no_memory(hdl); + return (NULL); + } + } + + switch (cmd) { + case ZFS_SMB_ACL_ADD: + case ZFS_SMB_ACL_REMOVE: + (void) strlcpy(zc.zc_string, resource1, sizeof (zc.zc_string)); + break; + case ZFS_SMB_ACL_RENAME: + if (nvlist_add_string(nvlist, ZFS_SMB_ACL_SRC, + resource1) != 0) { + (void) no_memory(hdl); + return (-1); + } + if (nvlist_add_string(nvlist, ZFS_SMB_ACL_TARGET, + resource2) != 0) { + (void) no_memory(hdl); + return (-1); + } + if (zcmd_write_src_nvlist(hdl, &zc, nvlist) != 0) { + nvlist_free(nvlist); + return (-1); + } + break; + case ZFS_SMB_ACL_PURGE: + break; + default: + return (-1); + } + error = ioctl(hdl->libzfs_fd, ZFS_IOC_SMB_ACL, &zc); + if (nvlist) + nvlist_free(nvlist); + return (error); +} + +int +zfs_smb_acl_add(libzfs_handle_t *hdl, char *dataset, + char *path, char *resource) +{ + return (zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_ADD, + resource, NULL)); +} + +int +zfs_smb_acl_remove(libzfs_handle_t *hdl, char *dataset, + char *path, char *resource) +{ + return (zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_REMOVE, + resource, NULL)); +} + +int +zfs_smb_acl_purge(libzfs_handle_t *hdl, char *dataset, char *path) +{ + return (zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_PURGE, + NULL, NULL)); +} + +int +zfs_smb_acl_rename(libzfs_handle_t *hdl, char *dataset, char *path, + char *oldname, char *newname) +{ + return (zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_RENAME, + oldname, newname)); +} + +int +zfs_userspace(zfs_handle_t *zhp, zfs_userquota_prop_t type, + zfs_userspace_cb_t func, void *arg) +{ + zfs_cmd_t zc = { 0 }; + int error; + zfs_useracct_t buf[100]; + + (void) strncpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + + zc.zc_objset_type = type; + zc.zc_nvlist_dst = (uintptr_t)buf; + + /* CONSTCOND */ + while (1) { + zfs_useracct_t *zua = buf; + + zc.zc_nvlist_dst_size = sizeof (buf); + error = ioctl(zhp->zfs_hdl->libzfs_fd, + ZFS_IOC_USERSPACE_MANY, &zc); + if (error || zc.zc_nvlist_dst_size == 0) + break; + + while (zc.zc_nvlist_dst_size > 0) { + error = func(arg, zua->zu_domain, zua->zu_rid, + zua->zu_space); + if (error != 0) + return (error); + zua++; + zc.zc_nvlist_dst_size -= sizeof (zfs_useracct_t); + } + } + + return (error); +} + +int +zfs_hold(zfs_handle_t *zhp, const char *snapname, const char *tag, + boolean_t recursive, boolean_t temphold, boolean_t enoent_ok) +{ + zfs_cmd_t zc = { 0 }; + libzfs_handle_t *hdl = zhp->zfs_hdl; + + (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + (void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value)); + if (strlcpy(zc.zc_string, tag, sizeof (zc.zc_string)) + >= sizeof (zc.zc_string)) + return (zfs_error(hdl, EZFS_TAGTOOLONG, tag)); + zc.zc_cookie = recursive; + zc.zc_temphold = temphold; + + if (zfs_ioctl(hdl, ZFS_IOC_HOLD, &zc) != 0) { + char errbuf[ZFS_MAXNAMELEN+32]; + + /* + * if it was recursive, the one that actually failed will be in + * 
zc.zc_name. + */ + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot hold '%s@%s'"), zc.zc_name, snapname); + switch (errno) { + case E2BIG: + /* + * Temporary tags wind up having the ds object id + * prepended. So even if we passed the length check + * above, it's still possible for the tag to wind + * up being slightly too long. + */ + return (zfs_error(hdl, EZFS_TAGTOOLONG, errbuf)); + case ENOTSUP: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "pool must be upgraded")); + return (zfs_error(hdl, EZFS_BADVERSION, errbuf)); + case EINVAL: + return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); + case EEXIST: + return (zfs_error(hdl, EZFS_REFTAG_HOLD, errbuf)); + case ENOENT: + if (enoent_ok) + return (0); + /* FALLTHROUGH */ + default: + return (zfs_standard_error_fmt(hdl, errno, errbuf)); + } + } + + return (0); +} + +struct hold_range_arg { + zfs_handle_t *origin; + const char *fromsnap; + const char *tosnap; + char lastsnapheld[ZFS_MAXNAMELEN]; + const char *tag; + boolean_t temphold; + boolean_t seento; + boolean_t seenfrom; + boolean_t holding; + boolean_t recursive; +}; + +static int +zfs_hold_range_one(zfs_handle_t *zhp, void *arg) +{ + struct hold_range_arg *hra = arg; + const char *thissnap; + int error; + + thissnap = strchr(zfs_get_name(zhp), '@') + 1; + + if (hra->fromsnap && !hra->seenfrom && + strcmp(hra->fromsnap, thissnap) == 0) + hra->seenfrom = B_TRUE; + + /* snap is older or newer than the desired range, ignore it */ + if (hra->seento || !hra->seenfrom) { + zfs_close(zhp); + return (0); + } + + if (hra->holding) { + /* We could be racing with destroy, so ignore ENOENT. */ + error = zfs_hold(hra->origin, thissnap, hra->tag, + hra->recursive, hra->temphold, B_TRUE); + if (error == 0) { + (void) strlcpy(hra->lastsnapheld, zfs_get_name(zhp), + sizeof (hra->lastsnapheld)); + } + } else { + error = zfs_release(hra->origin, thissnap, hra->tag, + hra->recursive); + } + + if (!hra->seento && strcmp(hra->tosnap, thissnap) == 0) + hra->seento = B_TRUE; + + zfs_close(zhp); + return (error); +} + +/* + * Add a user hold on the set of snapshots starting with fromsnap up to + * and including tosnap. If we're unable to to acquire a particular hold, + * undo any holds up to that point. + */ +int +zfs_hold_range(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, + const char *tag, boolean_t recursive, boolean_t temphold) +{ + struct hold_range_arg arg = { 0 }; + int error; + + arg.origin = zhp; + arg.fromsnap = fromsnap; + arg.tosnap = tosnap; + arg.tag = tag; + arg.temphold = temphold; + arg.holding = B_TRUE; + arg.recursive = recursive; + arg.seenfrom = (fromsnap == NULL); + + error = zfs_iter_snapshots_sorted(zhp, zfs_hold_range_one, &arg); + + /* + * Make sure we either hold the entire range or none. 
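/*
 * Editorial sketch, not part of the original patch: taking a temporary,
 * recursive user hold on a snapshot with zfs_hold(). The tag length is
 * checked against zc_string, so an over-long tag fails with
 * EZFS_TAGTOOLONG. Snapshot and tag names are hypothetical.
 */
	if (zfs_hold(zhp, "daily-2010-01-01", ".send-hold", B_TRUE,
	    B_TRUE, B_FALSE) != 0)
		(void) fprintf(stderr, "hold failed\n");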
+ */ + if (error && arg.lastsnapheld[0] != '\0') { + (void) zfs_release_range(zhp, fromsnap, + (const char *)arg.lastsnapheld, tag, recursive); + } + return (error); +} + +int +zfs_release(zfs_handle_t *zhp, const char *snapname, const char *tag, + boolean_t recursive) +{ + zfs_cmd_t zc = { 0 }; + libzfs_handle_t *hdl = zhp->zfs_hdl; + + (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + (void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value)); + if (strlcpy(zc.zc_string, tag, sizeof (zc.zc_string)) + >= sizeof (zc.zc_string)) + return (zfs_error(hdl, EZFS_TAGTOOLONG, tag)); + zc.zc_cookie = recursive; + + if (zfs_ioctl(hdl, ZFS_IOC_RELEASE, &zc) != 0) { + char errbuf[ZFS_MAXNAMELEN+32]; + + /* + * if it was recursive, the one that actually failed will be in + * zc.zc_name. + */ + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot release '%s' from '%s@%s'"), tag, zc.zc_name, + snapname); + switch (errno) { + case ESRCH: + return (zfs_error(hdl, EZFS_REFTAG_RELE, errbuf)); + case ENOTSUP: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "pool must be upgraded")); + return (zfs_error(hdl, EZFS_BADVERSION, errbuf)); + case EINVAL: + return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); + default: + return (zfs_standard_error_fmt(hdl, errno, errbuf)); + } + } + + return (0); +} + +/* + * Release a user hold from the set of snapshots starting with fromsnap + * up to and including tosnap. + */ +int +zfs_release_range(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, + const char *tag, boolean_t recursive) +{ + struct hold_range_arg arg = { 0 }; + + arg.origin = zhp; + arg.fromsnap = fromsnap; + arg.tosnap = tosnap; + arg.tag = tag; + arg.recursive = recursive; + arg.seenfrom = (fromsnap == NULL); + + return (zfs_iter_snapshots_sorted(zhp, zfs_hold_range_one, &arg)); +} + +uint64_t +zvol_volsize_to_reservation(uint64_t volsize, nvlist_t *props) +{ + uint64_t numdb; + uint64_t nblocks, volblocksize; + int ncopies; + char *strval; + + if (nvlist_lookup_string(props, + zfs_prop_to_name(ZFS_PROP_COPIES), &strval) == 0) + ncopies = atoi(strval); + else + ncopies = 1; + if (nvlist_lookup_uint64(props, + zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), + &volblocksize) != 0) + volblocksize = ZVOL_DEFAULT_BLOCKSIZE; + nblocks = volsize/volblocksize; + /* start with metadnode L0-L6 */ + numdb = 7; + /* calculate number of indirects */ + while (nblocks > 1) { + nblocks += DNODES_PER_LEVEL - 1; + nblocks /= DNODES_PER_LEVEL; + numdb += nblocks; + } + numdb *= MIN(SPA_DVAS_PER_BP, ncopies + 1); + volsize *= ncopies; + /* + * this is exactly DN_MAX_INDBLKSHIFT when metadata isn't + * compressed, but in practice they compress down to about + * 1100 bytes + */ + numdb *= 1ULL << DN_MAX_INDBLKSHIFT; + volsize += numdb; + return (volsize); +} diff --git a/external/cddl/osnet/dist/lib/libzfs/common/libzfs_fru.c b/external/cddl/osnet/dist/lib/libzfs/common/libzfs_fru.c new file mode 100644 index 0000000000000..788fa2cfb763d --- /dev/null +++ b/external/cddl/osnet/dist/lib/libzfs/common/libzfs_fru.c @@ -0,0 +1,452 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include "libzfs_impl.h" + +/* + * This file is responsible for determining the relationship between I/O + * devices paths and physical locations. In the world of MPxIO and external + * enclosures, the device path is not synonymous with the physical location. + * If you remove a drive and insert it into a different slot, it will end up + * with the same path under MPxIO. If you recable storage enclosures, the + * device paths may change. All of this makes it difficult to implement the + * 'autoreplace' property, which is supposed to automatically manage disk + * replacement based on physical slot. + * + * In order to work around these limitations, we have a per-vdev FRU property + * that is the libtopo path (minus disk-specific authority information) to the + * physical location of the device on the system. This is an optional + * property, and is only needed when using the 'autoreplace' property or when + * generating FMA faults against vdevs. + */ + +/* + * Because the FMA packages depend on ZFS, we have to dlopen() libtopo in case + * it is not present. We only need this once per library instance, so it is + * not part of the libzfs handle. + */ +static void *_topo_dlhandle; +static topo_hdl_t *(*_topo_open)(int, const char *, int *); +static void (*_topo_close)(topo_hdl_t *); +static char *(*_topo_snap_hold)(topo_hdl_t *, const char *, int *); +static void (*_topo_snap_release)(topo_hdl_t *); +static topo_walk_t *(*_topo_walk_init)(topo_hdl_t *, const char *, + topo_walk_cb_t, void *, int *); +static int (*_topo_walk_step)(topo_walk_t *, int); +static void (*_topo_walk_fini)(topo_walk_t *); +static void (*_topo_hdl_strfree)(topo_hdl_t *, char *); +static char *(*_topo_node_name)(tnode_t *); +static int (*_topo_prop_get_string)(tnode_t *, const char *, const char *, + char **, int *); +static int (*_topo_node_fru)(tnode_t *, nvlist_t **, nvlist_t *, int *); +static int (*_topo_fmri_nvl2str)(topo_hdl_t *, nvlist_t *, char **, int *); +static int (*_topo_fmri_strcmp_noauth)(topo_hdl_t *, const char *, + const char *); + +#define ZFS_FRU_HASH_SIZE 257 + +static size_t +fru_strhash(const char *key) +{ + ulong_t g, h = 0; + const char *p; + + for (p = key; *p != '\0'; p++) { + h = (h << 4) + *p; + + if ((g = (h & 0xf0000000)) != 0) { + h ^= (g >> 24); + h ^= g; + } + } + + return (h % ZFS_FRU_HASH_SIZE); +} + +static int +libzfs_fru_gather(topo_hdl_t *thp, tnode_t *tn, void *arg) +{ + libzfs_handle_t *hdl = arg; + nvlist_t *fru; + char *devpath, *frustr; + int err; + libzfs_fru_t *frup; + size_t idx; + + /* + * If this is the chassis node, and we don't yet have the system + * chassis ID, then fill in this value now. 
+ */ + if (hdl->libzfs_chassis_id[0] == '\0' && + strcmp(_topo_node_name(tn), "chassis") == 0) { + if (_topo_prop_get_string(tn, FM_FMRI_AUTHORITY, + FM_FMRI_AUTH_CHASSIS, &devpath, &err) == 0) + (void) strlcpy(hdl->libzfs_chassis_id, devpath, + sizeof (hdl->libzfs_chassis_id)); + } + + /* + * Skip non-disk nodes. + */ + if (strcmp(_topo_node_name(tn), "disk") != 0) + return (TOPO_WALK_NEXT); + + /* + * Get the devfs path and FRU. + */ + if (_topo_prop_get_string(tn, "io", "devfs-path", &devpath, &err) != 0) + return (TOPO_WALK_NEXT); + + if (libzfs_fru_lookup(hdl, devpath) != NULL) { + _topo_hdl_strfree(thp, devpath); + return (TOPO_WALK_NEXT); + } + + if (_topo_node_fru(tn, &fru, NULL, &err) != 0) { + _topo_hdl_strfree(thp, devpath); + return (TOPO_WALK_NEXT); + } + + /* + * Convert the FRU into a string. + */ + if (_topo_fmri_nvl2str(thp, fru, &frustr, &err) != 0) { + nvlist_free(fru); + _topo_hdl_strfree(thp, devpath); + return (TOPO_WALK_NEXT); + } + + nvlist_free(fru); + + /* + * Finally, we have a FRU string and device path. Add it to the hash. + */ + if ((frup = calloc(sizeof (libzfs_fru_t), 1)) == NULL) { + _topo_hdl_strfree(thp, devpath); + _topo_hdl_strfree(thp, frustr); + return (TOPO_WALK_NEXT); + } + + if ((frup->zf_device = strdup(devpath)) == NULL || + (frup->zf_fru = strdup(frustr)) == NULL) { + free(frup->zf_device); + free(frup); + _topo_hdl_strfree(thp, devpath); + _topo_hdl_strfree(thp, frustr); + return (TOPO_WALK_NEXT); + } + + _topo_hdl_strfree(thp, devpath); + _topo_hdl_strfree(thp, frustr); + + idx = fru_strhash(frup->zf_device); + frup->zf_chain = hdl->libzfs_fru_hash[idx]; + hdl->libzfs_fru_hash[idx] = frup; + frup->zf_next = hdl->libzfs_fru_list; + hdl->libzfs_fru_list = frup; + + return (TOPO_WALK_NEXT); +} + +/* + * Called during initialization to setup the dynamic libtopo connection. 
+ */ +#pragma init(libzfs_init_fru) +static void +libzfs_init_fru(void) +{ + char path[MAXPATHLEN]; + char isa[257]; + +#if defined(_LP64) + if (sysinfo(SI_ARCHITECTURE_64, isa, sizeof (isa)) < 0) + isa[0] = '\0'; +#else + isa[0] = '\0'; +#endif + (void) snprintf(path, sizeof (path), + "/usr/lib/fm/%s/libtopo.so", isa); + + if ((_topo_dlhandle = dlopen(path, RTLD_LAZY)) == NULL) + return; + + _topo_open = (topo_hdl_t *(*)()) + dlsym(_topo_dlhandle, "topo_open"); + _topo_close = (void (*)()) + dlsym(_topo_dlhandle, "topo_close"); + _topo_snap_hold = (char *(*)()) + dlsym(_topo_dlhandle, "topo_snap_hold"); + _topo_snap_release = (void (*)()) + dlsym(_topo_dlhandle, "topo_snap_release"); + _topo_walk_init = (topo_walk_t *(*)()) + dlsym(_topo_dlhandle, "topo_walk_init"); + _topo_walk_step = (int (*)()) + dlsym(_topo_dlhandle, "topo_walk_step"); + _topo_walk_fini = (void (*)()) + dlsym(_topo_dlhandle, "topo_walk_fini"); + _topo_hdl_strfree = (void (*)()) + dlsym(_topo_dlhandle, "topo_hdl_strfree"); + _topo_node_name = (char *(*)()) + dlsym(_topo_dlhandle, "topo_node_name"); + _topo_prop_get_string = (int (*)()) + dlsym(_topo_dlhandle, "topo_prop_get_string"); + _topo_node_fru = (int (*)()) + dlsym(_topo_dlhandle, "topo_node_fru"); + _topo_fmri_nvl2str = (int (*)()) + dlsym(_topo_dlhandle, "topo_fmri_nvl2str"); + _topo_fmri_strcmp_noauth = (int (*)()) + dlsym(_topo_dlhandle, "topo_fmri_strcmp_noauth"); + + if (_topo_open == NULL || _topo_close == NULL || + _topo_snap_hold == NULL || _topo_snap_release == NULL || + _topo_walk_init == NULL || _topo_walk_step == NULL || + _topo_walk_fini == NULL || _topo_hdl_strfree == NULL || + _topo_node_name == NULL || _topo_prop_get_string == NULL || + _topo_node_fru == NULL || _topo_fmri_nvl2str == NULL || + _topo_fmri_strcmp_noauth == NULL) { + (void) dlclose(_topo_dlhandle); + _topo_dlhandle = NULL; + } +} + +/* + * Refresh the mappings from device path -> FMRI. We do this by walking the + * hc topology looking for disk nodes, and recording the io/devfs-path and FRU. + * Note that we strip out the disk-specific authority information (serial, + * part, revision, etc) so that we are left with only the identifying + * characteristics of the slot (hc path and chassis-id). + */ +void +libzfs_fru_refresh(libzfs_handle_t *hdl) +{ + int err; + char *uuid; + topo_hdl_t *thp; + topo_walk_t *twp; + + if (_topo_dlhandle == NULL) + return; + + /* + * Clear the FRU hash and initialize our basic structures. + */ + libzfs_fru_clear(hdl, B_FALSE); + + if ((hdl->libzfs_topo_hdl = _topo_open(TOPO_VERSION, + NULL, &err)) == NULL) + return; + + thp = hdl->libzfs_topo_hdl; + + if ((uuid = _topo_snap_hold(thp, NULL, &err)) == NULL) + return; + + _topo_hdl_strfree(thp, uuid); + + if (hdl->libzfs_fru_hash == NULL && + (hdl->libzfs_fru_hash = + calloc(ZFS_FRU_HASH_SIZE * sizeof (void *), 1)) == NULL) + return; + + /* + * We now have a topo snapshot, so iterate over the hc topology looking + * for disks to add to the hash. + */ + twp = _topo_walk_init(thp, FM_FMRI_SCHEME_HC, + libzfs_fru_gather, hdl, &err); + if (twp != NULL) { + (void) _topo_walk_step(twp, TOPO_WALK_CHILD); + _topo_walk_fini(twp); + } +} + +/* + * Given a devfs path, return the FRU for the device, if known. This will + * automatically call libzfs_fru_refresh() if it hasn't already been called by + * the consumer. The string returned is valid until the next call to + * libzfs_fru_refresh(). 
+ */ +const char * +libzfs_fru_lookup(libzfs_handle_t *hdl, const char *devpath) +{ + size_t idx = fru_strhash(devpath); + libzfs_fru_t *frup; + + if (hdl->libzfs_fru_hash == NULL) + libzfs_fru_refresh(hdl); + + if (hdl->libzfs_fru_hash == NULL) + return (NULL); + + for (frup = hdl->libzfs_fru_hash[idx]; frup != NULL; + frup = frup->zf_chain) { + if (strcmp(devpath, frup->zf_device) == 0) + return (frup->zf_fru); + } + + return (NULL); +} + +/* + * Given a fru path, return the device path. This will automatically call + * libzfs_fru_refresh() if it hasn't already been called by the consumer. The + * string returned is valid until the next call to libzfs_fru_refresh(). + */ +const char * +libzfs_fru_devpath(libzfs_handle_t *hdl, const char *fru) +{ + libzfs_fru_t *frup; + size_t idx; + + if (hdl->libzfs_fru_hash == NULL) + libzfs_fru_refresh(hdl); + + if (hdl->libzfs_fru_hash == NULL) + return (NULL); + + for (idx = 0; idx < ZFS_FRU_HASH_SIZE; idx++) { + for (frup = hdl->libzfs_fru_hash[idx]; frup != NULL; + frup = frup->zf_next) { + if (_topo_fmri_strcmp_noauth(hdl->libzfs_topo_hdl, + fru, frup->zf_fru)) + return (frup->zf_device); + } + } + + return (NULL); +} + +/* + * Change the stored FRU for the given vdev. + */ +int +zpool_fru_set(zpool_handle_t *zhp, uint64_t vdev_guid, const char *fru) +{ + zfs_cmd_t zc = { 0 }; + + (void) strncpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); + (void) strncpy(zc.zc_value, fru, sizeof (zc.zc_value)); + zc.zc_guid = vdev_guid; + + if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_VDEV_SETFRU, &zc) != 0) + return (zpool_standard_error_fmt(zhp->zpool_hdl, errno, + dgettext(TEXT_DOMAIN, "cannot set FRU"))); + + return (0); +} + +/* + * Compare to two FRUs, ignoring any authority information. + */ +boolean_t +libzfs_fru_compare(libzfs_handle_t *hdl, const char *a, const char *b) +{ + if (hdl->libzfs_fru_hash == NULL) + libzfs_fru_refresh(hdl); + + if (hdl->libzfs_fru_hash == NULL) + return (strcmp(a, b) == 0); + + return (_topo_fmri_strcmp_noauth(hdl->libzfs_topo_hdl, a, b)); +} + +/* + * This special function checks to see whether the FRU indicates it's supposed + * to be in the system chassis, but the chassis-id doesn't match. This can + * happen in a clustered case, where both head nodes have the same logical + * disk, but opening the device on the other head node is meaningless. + */ +boolean_t +libzfs_fru_notself(libzfs_handle_t *hdl, const char *fru) +{ + const char *chassisid; + size_t len; + + if (hdl->libzfs_fru_hash == NULL) + libzfs_fru_refresh(hdl); + + if (hdl->libzfs_chassis_id[0] == '\0') + return (B_FALSE); + + if (strstr(fru, "/chassis=0/") == NULL) + return (B_FALSE); + + if ((chassisid = strstr(fru, ":chassis-id=")) == NULL) + return (B_FALSE); + + chassisid += 12; + len = strlen(hdl->libzfs_chassis_id); + if (strncmp(chassisid, hdl->libzfs_chassis_id, len) == 0 && + (chassisid[len] == '/' || chassisid[len] == ':')) + return (B_FALSE); + + return (B_TRUE); +} + +/* + * Clear memory associated with the FRU hash. 
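As a rough usage sketch (hypothetical code, not part of this patch), the FRU helpers above would typically be driven by fault-management or zpool-level code: given a libzfs_handle_t from libzfs_init(), an open zpool_handle_t, and a known vdev GUID, a consumer might resolve a disk's slot FRU and record it on the vdev like this (assumes <libzfs.h>):

	/*
	 * Hypothetical consumer sketch: map a devfs path to its slot FRU and
	 * store it on the corresponding vdev.  The first lookup triggers
	 * libzfs_fru_refresh() internally.
	 */
	static int
	record_fru(libzfs_handle_t *hdl, zpool_handle_t *zhp,
	    uint64_t vdev_guid, const char *devfs_path)
	{
		const char *fru;

		if ((fru = libzfs_fru_lookup(hdl, devfs_path)) == NULL)
			return (-1);	/* no libtopo snapshot, or slot unknown */

		return (zpool_fru_set(zhp, vdev_guid, fru));
	}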
+ */ +void +libzfs_fru_clear(libzfs_handle_t *hdl, boolean_t final) +{ + libzfs_fru_t *frup; + + while ((frup = hdl->libzfs_fru_list) != NULL) { + hdl->libzfs_fru_list = frup->zf_next; + free(frup->zf_device); + free(frup->zf_fru); + free(frup); + } + + hdl->libzfs_fru_list = NULL; + + if (hdl->libzfs_topo_hdl != NULL) { + _topo_snap_release(hdl->libzfs_topo_hdl); + _topo_close(hdl->libzfs_topo_hdl); + hdl->libzfs_topo_hdl = NULL; + } + + if (final) { + free(hdl->libzfs_fru_hash); + } else if (hdl->libzfs_fru_hash != NULL) { + bzero(hdl->libzfs_fru_hash, + ZFS_FRU_HASH_SIZE * sizeof (void *)); + } +} diff --git a/external/cddl/osnet/dist/lib/libzfs/common/libzfs_graph.c b/external/cddl/osnet/dist/lib/libzfs/common/libzfs_graph.c index e7cbf2386014e..bc21c51ae26c0 100644 --- a/external/cddl/osnet/dist/lib/libzfs/common/libzfs_graph.c +++ b/external/cddl/osnet/dist/lib/libzfs/common/libzfs_graph.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Iterate over all children of the current object. This includes the normal * dataset hierarchy, but also arbitrary hierarchies due to clones. We want to @@ -399,13 +397,6 @@ iterate_children(libzfs_handle_t *hdl, zfs_graph_t *zgp, const char *dataset) for ((void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name)); ioctl(hdl->libzfs_fd, ZFS_IOC_DATASET_LIST_NEXT, &zc) == 0; (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name))) { - - /* - * Ignore private dataset names. - */ - if (dataset_name_hidden(zc.zc_name)) - continue; - /* * Get statistics for this dataset, to determine the type of the * dataset and clone statistics. If this fails, the dataset has diff --git a/external/cddl/osnet/dist/lib/libzfs/common/libzfs_impl.h b/external/cddl/osnet/dist/lib/libzfs/common/libzfs_impl.h index 9f1f66d51db50..ef34591fe3945 100644 --- a/external/cddl/osnet/dist/lib/libzfs/common/libzfs_impl.h +++ b/external/cddl/osnet/dist/lib/libzfs/common/libzfs_impl.h @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -38,6 +38,8 @@ #include #include +#include + #ifdef __cplusplus extern "C" { #endif @@ -47,6 +49,13 @@ extern "C" { #endif #define VERIFY verify +typedef struct libzfs_fru { + char *zf_device; + char *zf_fru; + struct libzfs_fru *zf_chain; + struct libzfs_fru *zf_next; +} libzfs_fru_t; + struct libzfs_handle { int libzfs_error; int libzfs_fd; @@ -63,7 +72,15 @@ struct libzfs_handle { int libzfs_printerr; void *libzfs_sharehdl; /* libshare handle */ uint_t libzfs_shareflags; + boolean_t libzfs_mnttab_enable; + avl_tree_t libzfs_mnttab_cache; + int libzfs_pool_iter; + topo_hdl_t *libzfs_topo_hdl; + libzfs_fru_t **libzfs_fru_hash; + libzfs_fru_t *libzfs_fru_list; + char libzfs_chassis_id[256]; }; + #define ZFSSHARE_MISS 0x01 /* Didn't find entry in cache */ struct zfs_handle { @@ -75,8 +92,10 @@ struct zfs_handle { dmu_objset_stats_t zfs_dmustats; nvlist_t *zfs_props; nvlist_t *zfs_user_props; + nvlist_t *zfs_recvd_props; boolean_t zfs_mntcheck; char *zfs_mntopts; + uint8_t *zfs_props_table; }; /* @@ -169,9 +188,6 @@ zfs_handle_t *make_dataset_handle(libzfs_handle_t *, const char *); int zpool_open_silent(libzfs_handle_t *, const char *, zpool_handle_t **); -int zvol_create_link(libzfs_handle_t *, const char *); -int zvol_remove_link(libzfs_handle_t *, const char *); -int zpool_iter_zvol(zpool_handle_t *, int (*)(const char *, void *), void *); boolean_t zpool_name_valid(libzfs_handle_t *, boolean_t, const char *); void namespace_clear(libzfs_handle_t *); @@ -184,8 +200,11 @@ extern int zfs_init_libshare(libzfs_handle_t *, int); extern void zfs_uninit_libshare(libzfs_handle_t *); extern int zfs_parse_options(char *, zfs_share_proto_t); -extern int zfs_unshare_proto(zfs_handle_t *zhp, +extern int zfs_unshare_proto(zfs_handle_t *, const char *, zfs_share_proto_t *); + +extern void libzfs_fru_clear(libzfs_handle_t *, boolean_t); + #ifdef __cplusplus } #endif diff --git a/external/cddl/osnet/dist/lib/libzfs/common/libzfs_import.c b/external/cddl/osnet/dist/lib/libzfs/common/libzfs_import.c index d67776889d350..fd3044b1da333 100644 --- a/external/cddl/osnet/dist/lib/libzfs/common/libzfs_import.c +++ b/external/cddl/osnet/dist/lib/libzfs/common/libzfs_import.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Pool import support functions. * @@ -41,15 +39,21 @@ * using our derived config, and record the results. */ +#include #include #include #include #include +#include #include #include #include #include #include +#include +#include +#include +#include #include @@ -388,8 +392,6 @@ refresh_config(libzfs_handle_t *hdl, nvlist_t *config) } if (err) { - (void) zpool_standard_error(hdl, errno, - dgettext(TEXT_DOMAIN, "cannot discover pools")); zcmd_free_nvlists(&zc); return (NULL); } @@ -403,6 +405,21 @@ refresh_config(libzfs_handle_t *hdl, nvlist_t *config) return (nvl); } +/* + * Determine if the vdev id is a hole in the namespace. + */ +boolean_t +vdev_is_hole(uint64_t *hole_array, uint_t holes, uint_t id) +{ + for (int c = 0; c < holes; c++) { + + /* Top-level is a hole */ + if (hole_array[c] == id) + return (B_TRUE); + } + return (B_FALSE); +} + /* * Convert our list of pools into the definitive set of configurations. We * start by picking the best config for each toplevel vdev. 
Once that's done, @@ -425,17 +442,20 @@ get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok) uint64_t version, guid; uint_t children = 0; nvlist_t **child = NULL; + uint_t holes; + uint64_t *hole_array, max_id; uint_t c; boolean_t isactive; uint64_t hostid; nvlist_t *nvl; boolean_t found_one = B_FALSE; + boolean_t valid_top_config = B_FALSE; if (nvlist_alloc(&ret, 0, 0) != 0) goto nomem; for (pe = pl->pools; pe != NULL; pe = pe->pe_next) { - uint64_t id; + uint64_t id, max_txg = 0; if (nvlist_alloc(&config, NV_UNIQUE_NAME, 0) != 0) goto nomem; @@ -463,6 +483,42 @@ get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok) } } + /* + * We rely on the fact that the max txg for the + * pool will contain the most up-to-date information + * about the valid top-levels in the vdev namespace. + */ + if (best_txg > max_txg) { + (void) nvlist_remove(config, + ZPOOL_CONFIG_VDEV_CHILDREN, + DATA_TYPE_UINT64); + (void) nvlist_remove(config, + ZPOOL_CONFIG_HOLE_ARRAY, + DATA_TYPE_UINT64_ARRAY); + + max_txg = best_txg; + hole_array = NULL; + holes = 0; + max_id = 0; + valid_top_config = B_FALSE; + + if (nvlist_lookup_uint64(tmp, + ZPOOL_CONFIG_VDEV_CHILDREN, &max_id) == 0) { + verify(nvlist_add_uint64(config, + ZPOOL_CONFIG_VDEV_CHILDREN, + max_id) == 0); + valid_top_config = B_TRUE; + } + + if (nvlist_lookup_uint64_array(tmp, + ZPOOL_CONFIG_HOLE_ARRAY, &hole_array, + &holes) == 0) { + verify(nvlist_add_uint64_array(config, + ZPOOL_CONFIG_HOLE_ARRAY, + hole_array, holes) == 0); + } + } + if (!config_seen) { /* * Copy the relevant pieces of data to the pool @@ -522,6 +578,7 @@ get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok) ZPOOL_CONFIG_VDEV_TREE, &nvtop) == 0); verify(nvlist_lookup_uint64(nvtop, ZPOOL_CONFIG_ID, &id) == 0); + if (id >= children) { nvlist_t **newchild; @@ -542,9 +599,74 @@ get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok) } + /* + * If we have information about all the top-levels then + * clean up the nvlist which we've constructed. This + * means removing any extraneous devices that are + * beyond the valid range or adding devices to the end + * of our array which appear to be missing. + */ + if (valid_top_config) { + if (max_id < children) { + for (c = max_id; c < children; c++) + nvlist_free(child[c]); + children = max_id; + } else if (max_id > children) { + nvlist_t **newchild; + + newchild = zfs_alloc(hdl, (max_id) * + sizeof (nvlist_t *)); + if (newchild == NULL) + goto nomem; + + for (c = 0; c < children; c++) + newchild[c] = child[c]; + + free(child); + child = newchild; + children = max_id; + } + } + verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) == 0); + /* + * The vdev namespace may contain holes as a result of + * device removal. We must add them back into the vdev + * tree before we process any missing devices. + */ + if (holes > 0) { + ASSERT(valid_top_config); + + for (c = 0; c < children; c++) { + nvlist_t *holey; + + if (child[c] != NULL || + !vdev_is_hole(hole_array, holes, c)) + continue; + + if (nvlist_alloc(&holey, NV_UNIQUE_NAME, + 0) != 0) + goto nomem; + + /* + * Holes in the namespace are treated as + * "hole" top-level vdevs and have a + * special flag set on them. + */ + if (nvlist_add_string(holey, + ZPOOL_CONFIG_TYPE, + VDEV_TYPE_HOLE) != 0 || + nvlist_add_uint64(holey, + ZPOOL_CONFIG_ID, c) != 0 || + nvlist_add_uint64(holey, + ZPOOL_CONFIG_GUID, 0ULL) != 0) + goto nomem; + child[c] = holey; + } + } + /* * Look for any missing top-level vdevs. 
If this is the case, * create a faked up 'missing' vdev as a placeholder. We cannot @@ -552,7 +674,7 @@ get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok) * certain checks to make sure the vdev IDs match their location * in the configuration. */ - for (c = 0; c < children; c++) + for (c = 0; c < children; c++) { if (child[c] == NULL) { nvlist_t *missing; if (nvlist_alloc(&missing, NV_UNIQUE_NAME, @@ -570,6 +692,7 @@ get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok) } child[c] = missing; } + } /* * Put all of this pool's top-level vdevs into a root vdev. @@ -636,8 +759,11 @@ get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok) continue; } - if ((nvl = refresh_config(hdl, config)) == NULL) - goto error; + if ((nvl = refresh_config(hdl, config)) == NULL) { + nvlist_free(config); + config = NULL; + continue; + } nvlist_free(config); config = nvl; @@ -777,6 +903,212 @@ zpool_read_label(int fd, nvlist_t **config) return (0); } +typedef struct rdsk_node { + char *rn_name; + int rn_dfd; + libzfs_handle_t *rn_hdl; + nvlist_t *rn_config; + avl_tree_t *rn_avl; + avl_node_t rn_node; + boolean_t rn_nozpool; +} rdsk_node_t; + +static int +slice_cache_compare(const void *arg1, const void *arg2) +{ + const char *nm1 = ((rdsk_node_t *)arg1)->rn_name; + const char *nm2 = ((rdsk_node_t *)arg2)->rn_name; + char *nm1slice, *nm2slice; + int rv; + + /* + * slices zero and two are the most likely to provide results, + * so put those first + */ + nm1slice = strstr(nm1, "s0"); + nm2slice = strstr(nm2, "s0"); + if (nm1slice && !nm2slice) { + return (-1); + } + if (!nm1slice && nm2slice) { + return (1); + } + nm1slice = strstr(nm1, "s2"); + nm2slice = strstr(nm2, "s2"); + if (nm1slice && !nm2slice) { + return (-1); + } + if (!nm1slice && nm2slice) { + return (1); + } + + rv = strcmp(nm1, nm2); + if (rv == 0) + return (0); + return (rv > 0 ? 1 : -1); +} + +static void +check_one_slice(avl_tree_t *r, char *diskname, uint_t partno, + diskaddr_t size, uint_t blksz) +{ + rdsk_node_t tmpnode; + rdsk_node_t *node; + char sname[MAXNAMELEN]; + + tmpnode.rn_name = &sname[0]; + (void) snprintf(tmpnode.rn_name, MAXNAMELEN, "%s%u", + diskname, partno); + /* + * protect against division by zero for disk labels that + * contain a bogus sector size + */ + if (blksz == 0) + blksz = DEV_BSIZE; + /* too small to contain a zpool? 
*/ + if ((size < (SPA_MINDEVSIZE / blksz)) && + (node = avl_find(r, &tmpnode, NULL))) + node->rn_nozpool = B_TRUE; +} + +static void +nozpool_all_slices(avl_tree_t *r, const char *sname) +{ + char diskname[MAXNAMELEN]; + char *ptr; + int i; + + (void) strncpy(diskname, sname, MAXNAMELEN); + if (((ptr = strrchr(diskname, 's')) == NULL) && + ((ptr = strrchr(diskname, 'p')) == NULL)) + return; + ptr[0] = 's'; + ptr[1] = '\0'; + for (i = 0; i < NDKMAP; i++) + check_one_slice(r, diskname, i, 0, 1); + ptr[0] = 'p'; + for (i = 0; i <= FD_NUMPART; i++) + check_one_slice(r, diskname, i, 0, 1); +} + +static void +check_slices(avl_tree_t *r, int fd, const char *sname) +{ + struct extvtoc vtoc; + struct dk_gpt *gpt; + char diskname[MAXNAMELEN]; + char *ptr; + int i; + + (void) strncpy(diskname, sname, MAXNAMELEN); + if ((ptr = strrchr(diskname, 's')) == NULL || !isdigit(ptr[1])) + return; + ptr[1] = '\0'; + + if (read_extvtoc(fd, &vtoc) >= 0) { + for (i = 0; i < NDKMAP; i++) + check_one_slice(r, diskname, i, + vtoc.v_part[i].p_size, vtoc.v_sectorsz); + } else if (efi_alloc_and_read(fd, &gpt) >= 0) { + /* + * on x86 we'll still have leftover links that point + * to slices s[9-15], so use NDKMAP instead + */ + for (i = 0; i < NDKMAP; i++) + check_one_slice(r, diskname, i, + gpt->efi_parts[i].p_size, gpt->efi_lbasize); + /* nodes p[1-4] are never used with EFI labels */ + ptr[0] = 'p'; + for (i = 1; i <= FD_NUMPART; i++) + check_one_slice(r, diskname, i, 0, 1); + efi_free(gpt); + } +} + +static void +zpool_open_func(void *arg) +{ + rdsk_node_t *rn = arg; + struct stat64 statbuf; + nvlist_t *config; + int fd; + + if (rn->rn_nozpool) + return; + if ((fd = openat64(rn->rn_dfd, rn->rn_name, O_RDONLY)) < 0) { + /* symlink to a device that's no longer there */ + if (errno == ENOENT) + nozpool_all_slices(rn->rn_avl, rn->rn_name); + return; + } + /* + * Ignore failed stats. We only want regular + * files, character devs and block devs. + */ + if (fstat64(fd, &statbuf) != 0 || + (!S_ISREG(statbuf.st_mode) && + !S_ISCHR(statbuf.st_mode) && + !S_ISBLK(statbuf.st_mode))) { + (void) close(fd); + return; + } + /* this file is too small to hold a zpool */ + if (S_ISREG(statbuf.st_mode) && + statbuf.st_size < SPA_MINDEVSIZE) { + (void) close(fd); + return; + } else if (!S_ISREG(statbuf.st_mode)) { + /* + * Try to read the disk label first so we don't have to + * open a bunch of minor nodes that can't have a zpool. + */ + check_slices(rn->rn_avl, fd, rn->rn_name); + } + + if ((zpool_read_label(fd, &config)) != 0) { + (void) close(fd); + (void) no_memory(rn->rn_hdl); + return; + } + (void) close(fd); + + + rn->rn_config = config; + if (config != NULL) { + assert(rn->rn_nozpool == B_FALSE); + } +} + +/* + * Given a file descriptor, clear (zero) the label information. This function + * is currently only used in the appliance stack as part of the ZFS sysevent + * module. + */ +int +zpool_clear_label(int fd) +{ + struct stat64 statbuf; + int l; + vdev_label_t *label; + uint64_t size; + + if (fstat64(fd, &statbuf) == -1) + return (0); + size = P2ALIGN_TYPED(statbuf.st_size, sizeof (vdev_label_t), uint64_t); + + if ((label = calloc(sizeof (vdev_label_t), 1)) == NULL) + return (-1); + + for (l = 0; l < VDEV_LABELS; l++) { + if (pwrite64(fd, label, sizeof (vdev_label_t), + label_offset(size, l)) != sizeof (vdev_label_t)) + return (-1); + } + + free(label); + return (0); +} + /* * Given a list of directories to search, find all pools stored on disk. This * includes partial pools which are not available to import. 
If no args are @@ -785,30 +1117,28 @@ zpool_read_label(int fd, nvlist_t **config) * to import a specific pool. */ static nvlist_t * -zpool_find_import_impl(libzfs_handle_t *hdl, int argc, char **argv, - boolean_t active_ok, char *poolname, uint64_t guid) +zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg) { - int i; + int i, dirs = iarg->paths; DIR *dirp = NULL; struct dirent64 *dp; char path[MAXPATHLEN]; - char *end; + char *end, **dir = iarg->path; size_t pathleft; - struct stat64 statbuf; - nvlist_t *ret = NULL, *config; + nvlist_t *ret = NULL; static char *default_dir = "/dev/dsk"; - int fd; pool_list_t pools = { 0 }; pool_entry_t *pe, *penext; vdev_entry_t *ve, *venext; config_entry_t *ce, *cenext; name_entry_t *ne, *nenext; + avl_tree_t slice_cache; + rdsk_node_t *slice; + void *cookie; - verify(poolname == NULL || guid == 0); - - if (argc == 0) { - argc = 1; - argv = &default_dir; + if (dirs == 0) { + dirs = 1; + dir = &default_dir; } /* @@ -816,15 +1146,15 @@ zpool_find_import_impl(libzfs_handle_t *hdl, int argc, char **argv, * possible device, organizing the information according to pool GUID * and toplevel GUID. */ - for (i = 0; i < argc; i++) { + for (i = 0; i < dirs; i++) { + tpool_t *t; char *rdsk; int dfd; /* use realpath to normalize the path */ - if (realpath(argv[i], path) == 0) { + if (realpath(dir[i], path) == 0) { (void) zfs_error_fmt(hdl, EZFS_BADPATH, - dgettext(TEXT_DOMAIN, "cannot open '%s'"), - argv[i]); + dgettext(TEXT_DOMAIN, "cannot open '%s'"), dir[i]); goto error; } end = &path[strlen(path)]; @@ -851,6 +1181,8 @@ zpool_find_import_impl(libzfs_handle_t *hdl, int argc, char **argv, goto error; } + avl_create(&slice_cache, slice_cache_compare, + sizeof (rdsk_node_t), offsetof(rdsk_node_t, rn_node)); /* * This is not MT-safe, but we have no MT consumers of libzfs */ @@ -860,46 +1192,53 @@ zpool_find_import_impl(libzfs_handle_t *hdl, int argc, char **argv, (name[1] == 0 || (name[1] == '.' && name[2] == 0))) continue; - if ((fd = openat64(dfd, name, O_RDONLY)) < 0) - continue; - - /* - * Ignore failed stats. We only want regular - * files, character devs and block devs. - */ - if (fstat64(fd, &statbuf) != 0 || - (!S_ISREG(statbuf.st_mode) && - !S_ISCHR(statbuf.st_mode) && - !S_ISBLK(statbuf.st_mode))) { - (void) close(fd); - continue; - } - - if ((zpool_read_label(fd, &config)) != 0) { - (void) close(fd); - (void) no_memory(hdl); - goto error; - } - - (void) close(fd); - - if (config != NULL) { + slice = zfs_alloc(hdl, sizeof (rdsk_node_t)); + slice->rn_name = zfs_strdup(hdl, name); + slice->rn_avl = &slice_cache; + slice->rn_dfd = dfd; + slice->rn_hdl = hdl; + slice->rn_nozpool = B_FALSE; + avl_add(&slice_cache, slice); + } + /* + * create a thread pool to do all of this in parallel; + * rn_nozpool is not protected, so this is racy in that + * multiple tasks could decide that the same slice can + * not hold a zpool, which is benign. Also choose + * double the number of processors; we hold a lot of + * locks in the kernel, so going beyond this doesn't + * buy us much. 
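The scan described in this comment is exposed to callers through the importargs_t interface added by this patch (zpool_search_import(), further below). As a hypothetical caller sketch (not part of this patch), searching the default /dev/dsk directory for an exported pool by name might look like:

	/*
	 * Hypothetical sketch: leaving args.paths at 0 makes the
	 * implementation fall back to the default /dev/dsk directory.
	 */
	static nvlist_t *
	find_pool_by_name(libzfs_handle_t *hdl, char *name)
	{
		importargs_t args = { 0 };

		args.poolname = name;	/* match on pool name only */
		args.unique = B_TRUE;	/* detect an already-imported duplicate */

		return (zpool_search_import(hdl, &args));
	}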
+ */ + t = tpool_create(1, 2 * sysconf(_SC_NPROCESSORS_ONLN), + 0, NULL); + for (slice = avl_first(&slice_cache); slice; + (slice = avl_walk(&slice_cache, slice, + AVL_AFTER))) + (void) tpool_dispatch(t, zpool_open_func, slice); + tpool_wait(t); + tpool_destroy(t); + + cookie = NULL; + while ((slice = avl_destroy_nodes(&slice_cache, + &cookie)) != NULL) { + if (slice->rn_config != NULL) { + nvlist_t *config = slice->rn_config; boolean_t matched = B_TRUE; - if (poolname != NULL) { + if (iarg->poolname != NULL) { char *pname; matched = nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, &pname) == 0 && - strcmp(poolname, pname) == 0; - } else if (guid != 0) { + strcmp(iarg->poolname, pname) == 0; + } else if (iarg->guid != 0) { uint64_t this_guid; matched = nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &this_guid) == 0 && - guid == this_guid; + iarg->guid == this_guid; } if (!matched) { nvlist_free(config); @@ -907,17 +1246,20 @@ zpool_find_import_impl(libzfs_handle_t *hdl, int argc, char **argv, continue; } /* use the non-raw path for the config */ - (void) strlcpy(end, name, pathleft); + (void) strlcpy(end, slice->rn_name, pathleft); if (add_config(hdl, &pools, path, config) != 0) goto error; } + free(slice->rn_name); + free(slice); } + avl_destroy(&slice_cache); (void) closedir(dirp); dirp = NULL; } - ret = get_configs(hdl, &pools, active_ok); + ret = get_configs(hdl, &pools, iarg->can_be_active); error: for (pe = pools.pools; pe != NULL; pe = penext) { @@ -951,27 +1293,12 @@ zpool_find_import_impl(libzfs_handle_t *hdl, int argc, char **argv, nvlist_t * zpool_find_import(libzfs_handle_t *hdl, int argc, char **argv) { - return (zpool_find_import_impl(hdl, argc, argv, B_FALSE, NULL, 0)); -} + importargs_t iarg = { 0 }; -nvlist_t * -zpool_find_import_byname(libzfs_handle_t *hdl, int argc, char **argv, - char *pool) -{ - return (zpool_find_import_impl(hdl, argc, argv, B_FALSE, pool, 0)); -} + iarg.paths = argc; + iarg.path = argv; -nvlist_t * -zpool_find_import_byguid(libzfs_handle_t *hdl, int argc, char **argv, - uint64_t guid) -{ - return (zpool_find_import_impl(hdl, argc, argv, B_FALSE, NULL, guid)); -} - -nvlist_t * -zpool_find_import_activeok(libzfs_handle_t *hdl, int argc, char **argv) -{ - return (zpool_find_import_impl(hdl, argc, argv, B_TRUE, NULL, 0)); + return (zpool_find_import_impl(hdl, &iarg)); } /* @@ -1093,6 +1420,46 @@ zpool_find_import_cached(libzfs_handle_t *hdl, const char *cachefile, return (pools); } +static int +name_or_guid_exists(zpool_handle_t *zhp, void *data) +{ + importargs_t *import = data; + int found = 0; + + if (import->poolname != NULL) { + char *pool_name; + + verify(nvlist_lookup_string(zhp->zpool_config, + ZPOOL_CONFIG_POOL_NAME, &pool_name) == 0); + if (strcmp(pool_name, import->poolname) == 0) + found = 1; + } else { + uint64_t pool_guid; + + verify(nvlist_lookup_uint64(zhp->zpool_config, + ZPOOL_CONFIG_POOL_GUID, &pool_guid) == 0); + if (pool_guid == import->guid) + found = 1; + } + + zpool_close(zhp); + return (found); +} + +nvlist_t * +zpool_search_import(libzfs_handle_t *hdl, importargs_t *import) +{ + verify(import->poolname == NULL || import->guid == 0); + + if (import->unique) + import->exists = zpool_iter(hdl, name_or_guid_exists, import); + + if (import->cachefile != NULL) + return (zpool_find_import_cached(hdl, import->cachefile, + import->poolname, import->guid)); + + return (zpool_find_import_impl(hdl, import)); +} boolean_t find_guid(nvlist_t *nv, uint64_t guid) diff --git a/external/cddl/osnet/dist/lib/libzfs/common/libzfs_mount.c 
b/external/cddl/osnet/dist/lib/libzfs/common/libzfs_mount.c index 7c5c7f3ecaeed..62348b6cedc11 100644 --- a/external/cddl/osnet/dist/lib/libzfs/common/libzfs_mount.c +++ b/external/cddl/osnet/dist/lib/libzfs/common/libzfs_mount.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -74,7 +74,6 @@ #include #include #include -#include #include #include @@ -236,18 +235,9 @@ dir_is_empty(const char *dirname) boolean_t is_mounted(libzfs_handle_t *zfs_hdl, const char *special, char **where) { - struct mnttab search = { 0 }, entry; - - /* - * Search for the entry in /etc/mnttab. We don't bother getting the - * mountpoint, as we can just search for the special device. This will - * also let us find mounts when the mountpoint is 'legacy'. - */ - search.mnt_special = (char *)special; - search.mnt_fstype = MNTTYPE_ZFS; + struct mnttab entry; - rewind(zfs_hdl->libzfs_mnttab); - if (getmntany(zfs_hdl->libzfs_mnttab, &entry, &search) != 0) + if (libzfs_mnttab_find(zfs_hdl, special, &entry) != 0) return (B_FALSE); if (where != NULL) @@ -358,12 +348,14 @@ zfs_mount(zfs_handle_t *zhp, const char *options, int flags) } else { zfs_error_aux(hdl, strerror(errno)); } - return (zfs_error_fmt(hdl, EZFS_MOUNTFAILED, dgettext(TEXT_DOMAIN, "cannot mount '%s'"), zhp->zfs_name)); } + /* add the mounted entry into our cache */ + libzfs_mnttab_add(hdl, zfs_get_name(zhp), mountpoint, + mntopts); return (0); } @@ -389,26 +381,23 @@ unmount_one(libzfs_handle_t *hdl, const char *mountpoint, int flags) int zfs_unmount(zfs_handle_t *zhp, const char *mountpoint, int flags) { - struct mnttab search = { 0 }, entry; + libzfs_handle_t *hdl = zhp->zfs_hdl; + struct mnttab entry; char *mntpt = NULL; - /* check to see if need to unmount the filesystem */ - search.mnt_special = zhp->zfs_name; - search.mnt_fstype = MNTTYPE_ZFS; - rewind(zhp->zfs_hdl->libzfs_mnttab); + /* check to see if we need to unmount the filesystem */ if (mountpoint != NULL || ((zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) && - getmntany(zhp->zfs_hdl->libzfs_mnttab, &entry, &search) == 0)) { - + libzfs_mnttab_find(hdl, zhp->zfs_name, &entry) == 0)) { /* * mountpoint may have come from a call to * getmnt/getmntany if it isn't NULL. If it is NULL, - * we know it comes from getmntany which can then get - * overwritten later. We strdup it to play it safe. + * we know it comes from libzfs_mnttab_find which can + * then get freed later. We strdup it to play it safe. */ if (mountpoint == NULL) - mntpt = zfs_strdup(zhp->zfs_hdl, entry.mnt_mountp); + mntpt = zfs_strdup(hdl, entry.mnt_mountp); else - mntpt = zfs_strdup(zhp->zfs_hdl, mountpoint); + mntpt = zfs_strdup(hdl, mountpoint); /* * Unshare and unmount the filesystem @@ -416,11 +405,12 @@ zfs_unmount(zfs_handle_t *zhp, const char *mountpoint, int flags) if (zfs_unshare_proto(zhp, mntpt, share_all_proto) != 0) return (-1); - if (unmount_one(zhp->zfs_hdl, mntpt, flags) != 0) { + if (unmount_one(hdl, mntpt, flags) != 0) { free(mntpt); (void) zfs_shareall(zhp); return (-1); } + libzfs_mnttab_remove(hdl, zhp->zfs_name); free(mntpt); } @@ -849,7 +839,7 @@ unshare_one(libzfs_handle_t *hdl, const char *name, const char *mountpoint, char *mntpt; /* * Mountpoint could get trashed if libshare calls getmntany - * which id does during API initialization, so strdup the + * which it does during API initialization, so strdup the * value. 
*/ mntpt = zfs_strdup(hdl, mountpoint); @@ -887,18 +877,17 @@ int zfs_unshare_proto(zfs_handle_t *zhp, const char *mountpoint, zfs_share_proto_t *proto) { - struct mnttab search = { 0 }, entry; + libzfs_handle_t *hdl = zhp->zfs_hdl; + struct mnttab entry; char *mntpt = NULL; /* check to see if need to unmount the filesystem */ - search.mnt_special = (char *)zfs_get_name(zhp); - search.mnt_fstype = MNTTYPE_ZFS; rewind(zhp->zfs_hdl->libzfs_mnttab); if (mountpoint != NULL) - mntpt = zfs_strdup(zhp->zfs_hdl, mountpoint); + mountpoint = mntpt = zfs_strdup(hdl, mountpoint); if (mountpoint != NULL || ((zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) && - getmntany(zhp->zfs_hdl->libzfs_mnttab, &entry, &search) == 0)) { + libzfs_mnttab_find(hdl, zfs_get_name(zhp), &entry) == 0)) { zfs_share_proto_t *curr_proto; if (mountpoint == NULL) @@ -907,8 +896,8 @@ zfs_unshare_proto(zfs_handle_t *zhp, const char *mountpoint, for (curr_proto = proto; *curr_proto != PROTO_END; curr_proto++) { - if (is_shared(zhp->zfs_hdl, mntpt, *curr_proto) && - unshare_one(zhp->zfs_hdl, zhp->zfs_name, + if (is_shared(hdl, mntpt, *curr_proto) && + unshare_one(hdl, zhp->zfs_name, mntpt, *curr_proto) != 0) { if (mntpt != NULL) free(mntpt); @@ -1191,10 +1180,12 @@ zpool_enable_datasets(zpool_handle_t *zhp, const char *mntopts, int flags) /* * And mount all the datasets, keeping track of which ones - * succeeded or failed. By using zfs_alloc(), the good pointer - * will always be non-NULL. + * succeeded or failed. */ - good = zfs_alloc(zhp->zpool_hdl, cb.cb_used * sizeof (int)); + if ((good = zfs_alloc(zhp->zpool_hdl, + cb.cb_used * sizeof (int))) == NULL) + goto out; + ret = 0; for (i = 0; i < cb.cb_used; i++) { if (zfs_mount(cb.cb_datasets[i], mntopts, flags) != 0) @@ -1224,26 +1215,19 @@ zpool_enable_datasets(zpool_handle_t *zhp, const char *mntopts, int flags) return (ret); } - +/*ARGSUSED1*/ static int -zvol_cb(const char *dataset, void *data) +zvol_cb(zfs_handle_t *zhp, void *unused) { - libzfs_handle_t *hdl = data; - zfs_handle_t *zhp; - - /* - * Ignore snapshots and ignore failures from non-existant datasets. - */ - if (strchr(dataset, '@') != NULL || - (zhp = zfs_open(hdl, dataset, ZFS_TYPE_VOLUME)) == NULL) - return (0); - - if (zfs_unshare_iscsi(zhp) != 0) - return (-1); + int error = 0; + if (zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) + (void) zfs_iter_children(zhp, zvol_cb, NULL); + if (zfs_get_type(zhp) == ZFS_TYPE_VOLUME) + error = zfs_unshare_iscsi(zhp); zfs_close(zhp); - return (0); + return (error); } static int @@ -1255,6 +1239,8 @@ mountpoint_compare(const void *a, const void *b) return (strcmp(mountb, mounta)); } +/* alias for 2002/240 */ +#pragma weak zpool_unmount_datasets = zpool_disable_datasets /* * Unshare and unmount all datasets within the given pool. We don't want to * rely on traversing the DSL to discover the filesystems within the pool, @@ -1262,7 +1248,6 @@ mountpoint_compare(const void *a, const void *b) * arbitrarily (on I/O error, for example). Instead, we walk /etc/mnttab and * gather all the filesystems that are currently mounted. 
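As a hypothetical sketch (not part of this patch) of how these entry points fit together, a caller exporting a pool would unshare and unmount everything first and then issue the export:

	/*
	 * Hypothetical sketch: cleanly export a pool by disabling its
	 * datasets first; 'force' is passed through to both steps.
	 */
	static int
	export_pool(zpool_handle_t *zhp, boolean_t force)
	{
		if (zpool_disable_datasets(zhp, force) != 0)
			return (-1);
		return (zpool_export(zhp, force));
	}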
*/ -#pragma weak zpool_unmount_datasets = zpool_disable_datasets int zpool_disable_datasets(zpool_handle_t *zhp, boolean_t force) { @@ -1270,6 +1255,7 @@ zpool_disable_datasets(zpool_handle_t *zhp, boolean_t force) struct mnttab entry; size_t namelen; char **mountpoints = NULL; + zfs_handle_t *zfp; zfs_handle_t **datasets = NULL; libzfs_handle_t *hdl = zhp->zpool_hdl; int i; @@ -1279,8 +1265,12 @@ zpool_disable_datasets(zpool_handle_t *zhp, boolean_t force) /* * First unshare all zvols. */ - if (zpool_iter_zvol(zhp, zvol_cb, hdl) != 0) - return (-1); + zfp = zfs_open(zhp->zpool_hdl, zhp->zpool_name, + ZFS_TYPE_FILESYSTEM); + if (zfp != NULL) { + (void) zfs_iter_children(zfp, zvol_cb, NULL); + zfs_close(zfp); + } namelen = strlen(zhp->zpool_name); diff --git a/external/cddl/osnet/dist/lib/libzfs/common/libzfs_pool.c b/external/cddl/osnet/dist/lib/libzfs/common/libzfs_pool.c index 18ceb4859654e..3c0f46815b49a 100644 --- a/external/cddl/osnet/dist/lib/libzfs/common/libzfs_pool.c +++ b/external/cddl/osnet/dist/lib/libzfs/common/libzfs_pool.c @@ -20,32 +20,72 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#include -#include #include #include #include -#include #include #include #include #include #include #include -#include #include #include #include -#include -#include +#include #include "zfs_namecheck.h" #include "zfs_prop.h" #include "libzfs_impl.h" +#include "zfs_comutil.h" + +const char *hist_event_table[LOG_END] = { + "invalid event", + "pool create", + "vdev add", + "pool remove", + "pool destroy", + "pool export", + "pool import", + "vdev attach", + "vdev replace", + "vdev detach", + "vdev online", + "vdev offline", + "vdev upgrade", + "pool clear", + "pool scrub", + "pool property set", + "create", + "clone", + "destroy", + "destroy_begin_sync", + "inherit", + "property set", + "quota set", + "permission update", + "permission remove", + "permission who remove", + "promote", + "receive", + "rename", + "reservation set", + "replay_inc_sync", + "replay_full_sync", + "rollback", + "snapshot", + "filesystem version upgrade", + "refquota set", + "refreservation set", + "pool scrub done", + "user hold", + "user release", + "pool split", +}; static int read_efi_label(nvlist_t *config, diskaddr_t *sb); @@ -55,6 +95,10 @@ static int read_efi_label(nvlist_t *config, diskaddr_t *sb); #define BOOTCMD "installboot(1M)" #endif +#define DISK_ROOT "/dev/dsk" +#define RDISK_ROOT "/dev/rdsk" +#define BACKUP_SLICE "s2" + /* * ==================================================================== * zpool property functions @@ -188,6 +232,8 @@ zpool_state_to_name(vdev_state_t state, vdev_aux_t aux) case VDEV_STATE_CANT_OPEN: if (aux == VDEV_AUX_CORRUPT_DATA || aux == VDEV_AUX_BAD_LOG) return (gettext("FAULTED")); + else if (aux == VDEV_AUX_SPLIT_POOL) + return (gettext("SPLIT")); else return (gettext("UNAVAIL")); case VDEV_STATE_FAULTED: @@ -217,12 +263,39 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, size_t len, uint_t vsc; if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) { - if (prop == ZPOOL_PROP_NAME) + switch (prop) { + case ZPOOL_PROP_NAME: (void) strlcpy(buf, zpool_get_name(zhp), len); - else if (prop == ZPOOL_PROP_HEALTH) + break; + + case ZPOOL_PROP_HEALTH: (void) strlcpy(buf, "FAULTED", len); - else + break; + + case ZPOOL_PROP_GUID: + intval = zpool_get_prop_int(zhp, prop, &src); + (void) snprintf(buf, len, "%llu", intval); + break; + + case 
ZPOOL_PROP_ALTROOT: + case ZPOOL_PROP_CACHEFILE: + if (zhp->zpool_props != NULL || + zpool_get_all_props(zhp) == 0) { + (void) strlcpy(buf, + zpool_get_prop_string(zhp, prop, &src), + len); + if (srctype != NULL) + *srctype = src; + return (0); + } + /* FALLTHROUGH */ + default: (void) strlcpy(buf, "-", len); + break; + } + + if (srctype != NULL) + *srctype = src; return (0); } @@ -241,8 +314,8 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, size_t len, switch (prop) { case ZPOOL_PROP_SIZE: - case ZPOOL_PROP_USED: - case ZPOOL_PROP_AVAILABLE: + case ZPOOL_PROP_ALLOCATED: + case ZPOOL_PROP_FREE: (void) zfs_nicenum(intval, buf, len); break; @@ -251,6 +324,12 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, size_t len, (u_longlong_t)intval); break; + case ZPOOL_PROP_DEDUPRATIO: + (void) snprintf(buf, len, "%llu.%02llux", + (u_longlong_t)(intval / 100), + (u_longlong_t)(intval % 100)); + break; + case ZPOOL_PROP_HEALTH: verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL), ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); @@ -532,9 +611,6 @@ zpool_set_prop(zpool_handle_t *zhp, const char *propname, const char *propval) dgettext(TEXT_DOMAIN, "cannot set property for '%s'"), zhp->zpool_name); - if (zhp->zpool_props == NULL && zpool_get_all_props(zhp)) - return (zfs_error(zhp->zpool_hdl, EZFS_POOLPROPS, errbuf)); - if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) return (no_memory(zhp->zpool_hdl)); @@ -603,6 +679,12 @@ zpool_expand_proplist(zpool_handle_t *zhp, zprop_list_t **plp) } +/* + * Don't start the slice at the default block of 34; many storage + * devices will use a stripe width of 128k, so start there instead. + */ +#define NEW_START_BLOCK 256 + /* * Validate the given pool name, optionally putting an extended error message in * 'buf'. @@ -969,9 +1051,6 @@ zpool_destroy(zpool_handle_t *zhp) ZFS_TYPE_FILESYSTEM)) == NULL) return (-1); - if (zpool_remove_zvol_links(zhp) != 0) - return (-1); - (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_DESTROY, &zc) != 0) { @@ -1037,7 +1116,8 @@ zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "device '%s' contains an EFI label and " "cannot be used on root pools."), - zpool_vdev_name(hdl, NULL, spares[s])); + zpool_vdev_name(hdl, NULL, spares[s], + B_FALSE)); return (zfs_error(hdl, EZFS_POOL_NOTSUP, msg)); } } @@ -1127,19 +1207,17 @@ zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot) * mounted datasets in the pool. 
*/ int -zpool_export(zpool_handle_t *zhp, boolean_t force) +zpool_export_common(zpool_handle_t *zhp, boolean_t force, boolean_t hardforce) { zfs_cmd_t zc = { 0 }; char msg[1024]; - if (zpool_remove_zvol_links(zhp) != 0) - return (-1); - (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "cannot export '%s'"), zhp->zpool_name); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zc.zc_cookie = force; + zc.zc_guid = hardforce; if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_EXPORT, &zc) != 0) { switch (errno) { @@ -1160,6 +1238,139 @@ zpool_export(zpool_handle_t *zhp, boolean_t force) return (0); } +int +zpool_export(zpool_handle_t *zhp, boolean_t force) +{ + return (zpool_export_common(zhp, force, B_FALSE)); +} + +int +zpool_export_force(zpool_handle_t *zhp) +{ + return (zpool_export_common(zhp, B_TRUE, B_TRUE)); +} + +static void +zpool_rewind_exclaim(libzfs_handle_t *hdl, const char *name, boolean_t dryrun, + nvlist_t *rbi) +{ + uint64_t rewindto; + int64_t loss = -1; + struct tm t; + char timestr[128]; + + if (!hdl->libzfs_printerr || rbi == NULL) + return; + + if (nvlist_lookup_uint64(rbi, ZPOOL_CONFIG_LOAD_TIME, &rewindto) != 0) + return; + (void) nvlist_lookup_int64(rbi, ZPOOL_CONFIG_REWIND_TIME, &loss); + + if (localtime_r((time_t *)&rewindto, &t) != NULL && + strftime(timestr, 128, 0, &t) != 0) { + if (dryrun) { + (void) printf(dgettext(TEXT_DOMAIN, + "Would be able to return %s " + "to its state as of %s.\n"), + name, timestr); + } else { + (void) printf(dgettext(TEXT_DOMAIN, + "Pool %s returned to its state as of %s.\n"), + name, timestr); + } + if (loss > 120) { + (void) printf(dgettext(TEXT_DOMAIN, + "%s approximately %lld "), + dryrun ? "Would discard" : "Discarded", + (loss + 30) / 60); + (void) printf(dgettext(TEXT_DOMAIN, + "minutes of transactions.\n")); + } else if (loss > 0) { + (void) printf(dgettext(TEXT_DOMAIN, + "%s approximately %lld "), + dryrun ? "Would discard" : "Discarded", loss); + (void) printf(dgettext(TEXT_DOMAIN, + "seconds of transactions.\n")); + } + } +} + +void +zpool_explain_recover(libzfs_handle_t *hdl, const char *name, int reason, + nvlist_t *config) +{ + int64_t loss = -1; + uint64_t edata = UINT64_MAX; + uint64_t rewindto; + struct tm t; + char timestr[128]; + + if (!hdl->libzfs_printerr) + return; + + if (reason >= 0) + (void) printf(dgettext(TEXT_DOMAIN, "action: ")); + else + (void) printf(dgettext(TEXT_DOMAIN, "\t")); + + /* All attempted rewinds failed if ZPOOL_CONFIG_LOAD_TIME missing */ + if (nvlist_lookup_uint64(config, + ZPOOL_CONFIG_LOAD_TIME, &rewindto) != 0) + goto no_info; + + (void) nvlist_lookup_int64(config, ZPOOL_CONFIG_REWIND_TIME, &loss); + (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_LOAD_DATA_ERRORS, + &edata); + + (void) printf(dgettext(TEXT_DOMAIN, + "Recovery is possible, but will result in some data loss.\n")); + + if (localtime_r((time_t *)&rewindto, &t) != NULL && + strftime(timestr, 128, 0, &t) != 0) { + (void) printf(dgettext(TEXT_DOMAIN, + "\tReturning the pool to its state as of %s\n" + "\tshould correct the problem. "), + timestr); + } else { + (void) printf(dgettext(TEXT_DOMAIN, + "\tReverting the pool to an earlier state " + "should correct the problem.\n\t")); + } + + if (loss > 120) { + (void) printf(dgettext(TEXT_DOMAIN, + "Approximately %lld minutes of data\n" + "\tmust be discarded, irreversibly. "), (loss + 30) / 60); + } else if (loss > 0) { + (void) printf(dgettext(TEXT_DOMAIN, + "Approximately %lld seconds of data\n" + "\tmust be discarded, irreversibly. 
"), loss); + } + if (edata != 0 && edata != UINT64_MAX) { + if (edata == 1) { + (void) printf(dgettext(TEXT_DOMAIN, + "After rewind, at least\n" + "\tone persistent user-data error will remain. ")); + } else { + (void) printf(dgettext(TEXT_DOMAIN, + "After rewind, several\n" + "\tpersistent user-data errors will remain. ")); + } + } + (void) printf(dgettext(TEXT_DOMAIN, + "Recovery can be attempted\n\tby executing 'zpool %s -F %s'. "), + reason >= 0 ? "clear" : "import", name); + + (void) printf(dgettext(TEXT_DOMAIN, + "A scrub of the pool\n" + "\tis strongly recommended after recovery.\n")); + return; + +no_info: + (void) printf(dgettext(TEXT_DOMAIN, + "Destroy and re-create the pool from\n\ta backup source.\n")); +} + /* * zpool_import() is a contracted interface. Should be kept the same * if possible. @@ -1209,8 +1420,11 @@ zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname, nvlist_t *props, boolean_t importfaulted) { zfs_cmd_t zc = { 0 }; + zpool_rewind_policy_t policy; + nvlist_t *nvi = NULL; char *thename; char *origname; + uint64_t returned_size; int ret; char errbuf[1024]; @@ -1254,11 +1468,30 @@ zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname, nvlist_free(props); return (-1); } + returned_size = zc.zc_nvlist_conf_size + 512; + if (zcmd_alloc_dst_nvlist(hdl, &zc, returned_size) != 0) { + nvlist_free(props); + return (-1); + } zc.zc_cookie = (uint64_t)importfaulted; ret = 0; if (zfs_ioctl(hdl, ZFS_IOC_POOL_IMPORT, &zc) != 0) { char desc[1024]; + + (void) zcmd_read_dst_nvlist(hdl, &zc, &nvi); + zpool_get_rewind_policy(config, &policy); + /* + * Dry-run failed, but we print out what success + * looks like if we found a best txg + */ + if ((policy.zrp_request & ZPOOL_TRY_REWIND) && nvi) { + zpool_rewind_exclaim(hdl, newname ? origname : thename, + B_TRUE, nvi); + nvlist_free(nvi); + return (-1); + } + if (newname == NULL) (void) snprintf(desc, sizeof (desc), dgettext(TEXT_DOMAIN, "cannot import '%s'"), @@ -1281,7 +1514,12 @@ zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname, break; default: + (void) zcmd_read_dst_nvlist(hdl, &zc, &nvi); (void) zpool_standard_error(hdl, errno, desc); + zpool_explain_recover(hdl, + newname ? origname : thename, -errno, nvi); + nvlist_free(nvi); + break; } ret = -1; @@ -1291,13 +1529,20 @@ zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname, /* * This should never fail, but play it safe anyway. */ - if (zpool_open_silent(hdl, thename, &zhp) != 0) { + if (zpool_open_silent(hdl, thename, &zhp) != 0) ret = -1; - } else if (zhp != NULL) { - ret = zpool_create_zvol_links(zhp); + else if (zhp != NULL) zpool_close(zhp); + (void) zcmd_read_dst_nvlist(hdl, &zc, &nvi); + zpool_get_rewind_policy(config, &policy); + if (policy.zrp_request & + (ZPOOL_DO_REWIND | ZPOOL_TRY_REWIND)) { + zpool_rewind_exclaim(hdl, newname ? origname : thename, + ((policy.zrp_request & ZPOOL_TRY_REWIND) != 0), + nvi); } - + nvlist_free(nvi); + return (0); } zcmd_free_nvlists(&zc); @@ -1332,46 +1577,137 @@ zpool_scrub(zpool_handle_t *zhp, pool_scrub_type_t type) } /* + * Find a vdev that matches the search criteria specified. We use the + * the nvpair name to determine how we should look for the device. * 'avail_spare' is set to TRUE if the provided guid refers to an AVAIL * spare; but FALSE if its an INUSE spare. 
*/ static nvlist_t * -vdev_to_nvlist_iter(nvlist_t *nv, const char *search, uint64_t guid, - boolean_t *avail_spare, boolean_t *l2cache, boolean_t *log) +vdev_to_nvlist_iter(nvlist_t *nv, nvlist_t *search, boolean_t *avail_spare, + boolean_t *l2cache, boolean_t *log) { uint_t c, children; nvlist_t **child; - uint64_t theguid, present; - char *path; - uint64_t wholedisk = 0; nvlist_t *ret; uint64_t is_log; + char *srchkey; + nvpair_t *pair = nvlist_next_nvpair(search, NULL); + + /* Nothing to look for */ + if (search == NULL || pair == NULL) + return (NULL); + + /* Obtain the key we will use to search */ + srchkey = nvpair_name(pair); + + switch (nvpair_type(pair)) { + case DATA_TYPE_UINT64: { + uint64_t srchval, theguid, present; + + verify(nvpair_value_uint64(pair, &srchval) == 0); + if (strcmp(srchkey, ZPOOL_CONFIG_GUID) == 0) { + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, + &present) == 0) { + /* + * If the device has never been present since + * import, the only reliable way to match the + * vdev is by GUID. + */ + verify(nvlist_lookup_uint64(nv, + ZPOOL_CONFIG_GUID, &theguid) == 0); + if (theguid == srchval) + return (nv); + } + } + break; + } + + case DATA_TYPE_STRING: { + char *srchval, *val; - verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &theguid) == 0); + verify(nvpair_value_string(pair, &srchval) == 0); + if (nvlist_lookup_string(nv, srchkey, &val) != 0) + break; - if (search == NULL && - nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, &present) == 0) { /* - * If the device has never been present since import, the only - * reliable way to match the vdev is by GUID. + * Search for the requested value. We special case the search + * for ZPOOL_CONFIG_PATH when it's a wholedisk and when + * Looking for a top-level vdev name (i.e. ZPOOL_CONFIG_TYPE). + * Otherwise, all other searches are simple string compares. */ - if (theguid == guid) - return (nv); - } else if (search != NULL && - nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0) { - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, - &wholedisk); - if (wholedisk) { + if (strcmp(srchkey, ZPOOL_CONFIG_PATH) == 0 && val) { + uint64_t wholedisk = 0; + + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, + &wholedisk); + if (wholedisk) { + /* + * For whole disks, the internal path has 's0', + * but the path passed in by the user doesn't. + */ + if (strlen(srchval) == strlen(val) - 2 && + strncmp(srchval, val, strlen(srchval)) == 0) + return (nv); + break; + } + } else if (strcmp(srchkey, ZPOOL_CONFIG_TYPE) == 0 && val) { + char *type, *idx, *end, *p; + uint64_t id, vdev_id; + + /* + * Determine our vdev type, keeping in mind + * that the srchval is composed of a type and + * vdev id pair (i.e. mirror-4). + */ + if ((type = strdup(srchval)) == NULL) + return (NULL); + + if ((p = strrchr(type, '-')) == NULL) { + free(type); + break; + } + idx = p + 1; + *p = '\0'; + /* - * For whole disks, the internal path has 's0', but the - * path passed in by the user doesn't. + * If the types don't match then keep looking. 
*/ - if (strlen(search) == strlen(path) - 2 && - strncmp(search, path, strlen(search)) == 0) + if (strncmp(val, type, strlen(val)) != 0) { + free(type); + break; + } + + verify(strncmp(type, VDEV_TYPE_RAIDZ, + strlen(VDEV_TYPE_RAIDZ)) == 0 || + strncmp(type, VDEV_TYPE_MIRROR, + strlen(VDEV_TYPE_MIRROR)) == 0); + verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, + &id) == 0); + + errno = 0; + vdev_id = strtoull(idx, &end, 10); + + free(type); + if (errno != 0) + return (NULL); + + /* + * Now verify that we have the correct vdev id. + */ + if (vdev_id == id) return (nv); - } else if (strcmp(search, path) == 0) { - return (nv); } + + /* + * Common case + */ + if (strcmp(srchval, val) == 0) + return (nv); + break; + } + + default: + break; } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, @@ -1379,7 +1715,7 @@ vdev_to_nvlist_iter(nvlist_t *nv, const char *search, uint64_t guid, return (NULL); for (c = 0; c < children; c++) { - if ((ret = vdev_to_nvlist_iter(child[c], search, guid, + if ((ret = vdev_to_nvlist_iter(child[c], search, avail_spare, l2cache, NULL)) != NULL) { /* * The 'is_log' value is only set for the toplevel @@ -1400,7 +1736,7 @@ vdev_to_nvlist_iter(nvlist_t *nv, const char *search, uint64_t guid, if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child, &children) == 0) { for (c = 0; c < children; c++) { - if ((ret = vdev_to_nvlist_iter(child[c], search, guid, + if ((ret = vdev_to_nvlist_iter(child[c], search, avail_spare, l2cache, NULL)) != NULL) { *avail_spare = B_TRUE; return (ret); @@ -1411,7 +1747,7 @@ vdev_to_nvlist_iter(nvlist_t *nv, const char *search, uint64_t guid, if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, &child, &children) == 0) { for (c = 0; c < children; c++) { - if ((ret = vdev_to_nvlist_iter(child[c], search, guid, + if ((ret = vdev_to_nvlist_iter(child[c], search, avail_spare, l2cache, NULL)) != NULL) { *l2cache = B_TRUE; return (ret); @@ -1422,24 +1758,62 @@ vdev_to_nvlist_iter(nvlist_t *nv, const char *search, uint64_t guid, return (NULL); } +/* + * Given a physical path (minus the "/devices" prefix), find the + * associated vdev. + */ +nvlist_t * +zpool_find_vdev_by_physpath(zpool_handle_t *zhp, const char *ppath, + boolean_t *avail_spare, boolean_t *l2cache, boolean_t *log) +{ + nvlist_t *search, *nvroot, *ret; + + verify(nvlist_alloc(&search, NV_UNIQUE_NAME, KM_SLEEP) == 0); + verify(nvlist_add_string(search, ZPOOL_CONFIG_PHYS_PATH, ppath) == 0); + + verify(nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0); + + *avail_spare = B_FALSE; + ret = vdev_to_nvlist_iter(nvroot, search, avail_spare, l2cache, log); + nvlist_free(search); + + return (ret); +} + +/* + * Determine if we have an "interior" top-level vdev (i.e mirror/raidz). 
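The search nvlist carries a single nvpair, and its name selects the match strategy above (GUID, path, or type-index). As a hypothetical sketch (not part of this patch), the new type-index form lets a caller name an interior top-level vdev such as "mirror-1" directly through zpool_find_vdev(), shown just below:

	/*
	 * Hypothetical sketch: check whether a named top-level vdev
	 * (e.g. "mirror-1" or "raidz-0") exists in the pool.
	 */
	static boolean_t
	toplevel_exists(zpool_handle_t *zhp, const char *name)
	{
		boolean_t spare, l2cache, log;

		return (zpool_find_vdev(zhp, name, &spare, &l2cache,
		    &log) != NULL);
	}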
+ */ +boolean_t +zpool_vdev_is_interior(const char *name) +{ + if (strncmp(name, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0 || + strncmp(name, VDEV_TYPE_MIRROR, strlen(VDEV_TYPE_MIRROR)) == 0) + return (B_TRUE); + return (B_FALSE); +} + nvlist_t * zpool_find_vdev(zpool_handle_t *zhp, const char *path, boolean_t *avail_spare, boolean_t *l2cache, boolean_t *log) { char buf[MAXPATHLEN]; - const char *search; char *end; - nvlist_t *nvroot; + nvlist_t *nvroot, *search, *ret; uint64_t guid; + verify(nvlist_alloc(&search, NV_UNIQUE_NAME, KM_SLEEP) == 0); + guid = strtoull(path, &end, 10); if (guid != 0 && *end == '\0') { - search = NULL; + verify(nvlist_add_uint64(search, ZPOOL_CONFIG_GUID, guid) == 0); + } else if (zpool_vdev_is_interior(path)) { + verify(nvlist_add_string(search, ZPOOL_CONFIG_TYPE, path) == 0); } else if (path[0] != '/') { (void) snprintf(buf, sizeof (buf), "%s%s", "/dev/dsk/", path); - search = buf; + verify(nvlist_add_string(search, ZPOOL_CONFIG_PATH, buf) == 0); } else { - search = path; + verify(nvlist_add_string(search, ZPOOL_CONFIG_PATH, path) == 0); } verify(nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE, @@ -1449,8 +1823,10 @@ zpool_find_vdev(zpool_handle_t *zhp, const char *path, boolean_t *avail_spare, *l2cache = B_FALSE; if (log != NULL) *log = B_FALSE; - return (vdev_to_nvlist_iter(nvroot, search, guid, avail_spare, - l2cache, log)); + ret = vdev_to_nvlist_iter(nvroot, search, avail_spare, l2cache, log); + nvlist_free(search); + + return (ret); } static int @@ -1467,106 +1843,178 @@ vdev_online(nvlist_t *nv) } /* - * Get phys_path for a root pool - * Return 0 on success; non-zeron on failure. + * Helper function for zpool_get_physpaths(). */ -int -zpool_get_physpath(zpool_handle_t *zhp, char *physpath) +static int +vdev_get_one_physpath(nvlist_t *config, char *physpath, size_t physpath_size, + size_t *bytes_written) +{ + size_t bytes_left, pos, rsz; + char *tmppath; + const char *format; + + if (nvlist_lookup_string(config, ZPOOL_CONFIG_PHYS_PATH, + &tmppath) != 0) + return (EZFS_NODEVICE); + + pos = *bytes_written; + bytes_left = physpath_size - pos; + format = (pos == 0) ? "%s" : " %s"; + + rsz = snprintf(physpath + pos, bytes_left, format, tmppath); + *bytes_written += rsz; + + if (rsz >= bytes_left) { + /* if physpath was not copied properly, clear it */ + if (bytes_left != 0) { + physpath[pos] = 0; + } + return (EZFS_NOSPC); + } + return (0); +} + +static int +vdev_get_physpaths(nvlist_t *nv, char *physpath, size_t phypath_size, + size_t *rsz, boolean_t is_spare) +{ + char *type; + int ret; + + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) + return (EZFS_INVALCONFIG); + + if (strcmp(type, VDEV_TYPE_DISK) == 0) { + /* + * An active spare device has ZPOOL_CONFIG_IS_SPARE set. + * For a spare vdev, we only want to boot from the active + * spare device. 
+ */ + if (is_spare) { + uint64_t spare = 0; + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE, + &spare); + if (!spare) + return (EZFS_INVALCONFIG); + } + + if (vdev_online(nv)) { + if ((ret = vdev_get_one_physpath(nv, physpath, + phypath_size, rsz)) != 0) + return (ret); + } + } else if (strcmp(type, VDEV_TYPE_MIRROR) == 0 || + strcmp(type, VDEV_TYPE_REPLACING) == 0 || + (is_spare = (strcmp(type, VDEV_TYPE_SPARE) == 0))) { + nvlist_t **child; + uint_t count; + int i, ret; + + if (nvlist_lookup_nvlist_array(nv, + ZPOOL_CONFIG_CHILDREN, &child, &count) != 0) + return (EZFS_INVALCONFIG); + + for (i = 0; i < count; i++) { + ret = vdev_get_physpaths(child[i], physpath, + phypath_size, rsz, is_spare); + if (ret == EZFS_NOSPC) + return (ret); + } + } + + return (EZFS_POOL_INVALARG); +} + +/* + * Get phys_path for a root pool config. + * Return 0 on success; non-zero on failure. + */ +static int +zpool_get_config_physpath(nvlist_t *config, char *physpath, size_t phypath_size) { + size_t rsz; nvlist_t *vdev_root; nvlist_t **child; uint_t count; - int i; + char *type; - /* - * Make sure this is a root pool, as phys_path doesn't mean - * anything to a non-root pool. - */ - if (!pool_is_bootable(zhp)) - return (-1); + rsz = 0; - verify(nvlist_lookup_nvlist(zhp->zpool_config, - ZPOOL_CONFIG_VDEV_TREE, &vdev_root) == 0); + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &vdev_root) != 0) + return (EZFS_INVALCONFIG); - if (nvlist_lookup_nvlist_array(vdev_root, ZPOOL_CONFIG_CHILDREN, + if (nvlist_lookup_string(vdev_root, ZPOOL_CONFIG_TYPE, &type) != 0 || + nvlist_lookup_nvlist_array(vdev_root, ZPOOL_CONFIG_CHILDREN, &child, &count) != 0) - return (-2); + return (EZFS_INVALCONFIG); - for (i = 0; i < count; i++) { - nvlist_t **child2; - uint_t count2; - char *type; - char *tmppath; - int j; + /* + * root pool can not have EFI labeled disks and can only have + * a single top-level vdev. + */ + if (strcmp(type, VDEV_TYPE_ROOT) != 0 || count != 1 || + pool_uses_efi(vdev_root)) + return (EZFS_POOL_INVALARG); - if (nvlist_lookup_string(child[i], ZPOOL_CONFIG_TYPE, &type) - != 0) - return (-3); - - if (strcmp(type, VDEV_TYPE_DISK) == 0) { - if (!vdev_online(child[i])) - return (-8); - verify(nvlist_lookup_string(child[i], - ZPOOL_CONFIG_PHYS_PATH, &tmppath) == 0); - (void) strncpy(physpath, tmppath, strlen(tmppath)); - } else if (strcmp(type, VDEV_TYPE_MIRROR) == 0) { - if (nvlist_lookup_nvlist_array(child[i], - ZPOOL_CONFIG_CHILDREN, &child2, &count2) != 0) - return (-4); - - for (j = 0; j < count2; j++) { - if (!vdev_online(child2[j])) - return (-8); - if (nvlist_lookup_string(child2[j], - ZPOOL_CONFIG_PHYS_PATH, &tmppath) != 0) - return (-5); - - if ((strlen(physpath) + strlen(tmppath)) > - MAXNAMELEN) - return (-6); - - if (strlen(physpath) == 0) { - (void) strncpy(physpath, tmppath, - strlen(tmppath)); - } else { - (void) strcat(physpath, " "); - (void) strcat(physpath, tmppath); - } - } - } else { - return (-7); - } - } + (void) vdev_get_physpaths(child[0], physpath, phypath_size, &rsz, + B_FALSE); + + /* No online devices */ + if (rsz == 0) + return (EZFS_NODEVICE); return (0); } /* - * Returns TRUE if the given guid corresponds to the given type. - * This is used to check for hot spares (INUSE or not), and level 2 cache - * devices. + * Get phys_path for a root pool + * Return 0 on success; non-zero on failure. 
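As a hypothetical usage sketch (not part of this patch), a boot-related consumer would call the new size-checked zpool_get_physpath(), defined just below, with its own buffer; the buffer size here is an arbitrary choice for the sketch, since a mirrored root pool returns several space-separated paths:

	/*
	 * Hypothetical sketch: print the physical path(s) of the root
	 * pool's boot devices.
	 */
	static void
	print_bootpath(zpool_handle_t *zhp)
	{
		char physpath[MAXPATHLEN];

		if (zpool_get_physpath(zhp, physpath, sizeof (physpath)) == 0)
			(void) printf("%s\n", physpath);
	}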
*/ -static boolean_t -is_guid_type(zpool_handle_t *zhp, uint64_t guid, const char *type) +int +zpool_get_physpath(zpool_handle_t *zhp, char *physpath, size_t phypath_size) { - uint64_t target_guid; - nvlist_t *nvroot; - nvlist_t **list; - uint_t count; - int i; + return (zpool_get_config_physpath(zhp->zpool_config, physpath, + phypath_size)); +} - verify(nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE, - &nvroot) == 0); - if (nvlist_lookup_nvlist_array(nvroot, type, &list, &count) == 0) { - for (i = 0; i < count; i++) { - verify(nvlist_lookup_uint64(list[i], ZPOOL_CONFIG_GUID, - &target_guid) == 0); - if (guid == target_guid) - return (B_TRUE); - } +/* + * If the device has being dynamically expanded then we need to relabel + * the disk to use the new unallocated space. + */ +static int +zpool_relabel_disk(libzfs_handle_t *hdl, const char *name) +{ + char path[MAXPATHLEN]; + char errbuf[1024]; + int fd, error; + int (*_efi_use_whole_disk)(int); + + if ((_efi_use_whole_disk = (int (*)(int))dlsym(RTLD_DEFAULT, + "efi_use_whole_disk")) == NULL) + return (-1); + + (void) snprintf(path, sizeof (path), "%s/%s", RDISK_ROOT, name); + + if ((fd = open(path, O_RDWR | O_NDELAY)) < 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot " + "relabel '%s': unable to open device"), name); + return (zfs_error(hdl, EZFS_OPENFAILED, errbuf)); } - return (B_FALSE); + /* + * It's possible that we might encounter an error if the device + * does not have any unallocated space left. If so, we simply + * ignore that error and continue on. + */ + error = _efi_use_whole_disk(fd); + (void) close(fd); + if (error && error != VT_ENOSPC) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot " + "relabel '%s': unable to read disk capacity"), name); + return (zfs_error(hdl, EZFS_NOCAP, errbuf)); + } + return (0); } /* @@ -1580,28 +2028,64 @@ zpool_vdev_online(zpool_handle_t *zhp, const char *path, int flags, zfs_cmd_t zc = { 0 }; char msg[1024]; nvlist_t *tgt; - boolean_t avail_spare, l2cache; + boolean_t avail_spare, l2cache, islog; libzfs_handle_t *hdl = zhp->zpool_hdl; - (void) snprintf(msg, sizeof (msg), - dgettext(TEXT_DOMAIN, "cannot online %s"), path); + if (flags & ZFS_ONLINE_EXPAND) { + (void) snprintf(msg, sizeof (msg), + dgettext(TEXT_DOMAIN, "cannot expand %s"), path); + } else { + (void) snprintf(msg, sizeof (msg), + dgettext(TEXT_DOMAIN, "cannot online %s"), path); + } (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache, - NULL)) == NULL) + &islog)) == NULL) return (zfs_error(hdl, EZFS_NODEVICE, msg)); verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0); - if (avail_spare || - is_guid_type(zhp, zc.zc_guid, ZPOOL_CONFIG_SPARES) == B_TRUE) + if (avail_spare) return (zfs_error(hdl, EZFS_ISSPARE, msg)); + if (flags & ZFS_ONLINE_EXPAND || + zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOEXPAND, NULL)) { + char *pathname = NULL; + uint64_t wholedisk = 0; + + (void) nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_WHOLE_DISK, + &wholedisk); + verify(nvlist_lookup_string(tgt, ZPOOL_CONFIG_PATH, + &pathname) == 0); + + /* + * XXX - L2ARC 1.0 devices can't support expansion. 
+ */ + if (l2cache) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "cannot expand cache devices")); + return (zfs_error(hdl, EZFS_VDEVNOTSUP, msg)); + } + + if (wholedisk) { + pathname += strlen(DISK_ROOT) + 1; + (void) zpool_relabel_disk(zhp->zpool_hdl, pathname); + } + } + zc.zc_cookie = VDEV_STATE_ONLINE; zc.zc_obj = flags; - if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_VDEV_SET_STATE, &zc) != 0) + if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_VDEV_SET_STATE, &zc) != 0) { + if (errno == EINVAL) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "was split " + "from this pool into a new one. Use '%s' " + "instead"), "zpool detach"); + return (zfs_error(hdl, EZFS_POSTSPLIT_ONLINE, msg)); + } return (zpool_standard_error(hdl, errno, msg)); + } *newstate = zc.zc_cookie; return (0); @@ -1629,8 +2113,7 @@ zpool_vdev_offline(zpool_handle_t *zhp, const char *path, boolean_t istmp) verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0); - if (avail_spare || - is_guid_type(zhp, zc.zc_guid, ZPOOL_CONFIG_SPARES) == B_TRUE) + if (avail_spare) return (zfs_error(hdl, EZFS_ISSPARE, msg)); zc.zc_cookie = VDEV_STATE_OFFLINE; @@ -1647,6 +2130,12 @@ zpool_vdev_offline(zpool_handle_t *zhp, const char *path, boolean_t istmp) */ return (zfs_error(hdl, EZFS_NOREPLICAS, msg)); + case EEXIST: + /* + * The log device has unplayed logs + */ + return (zfs_error(hdl, EZFS_UNPLAYED_LOGS, msg)); + default: return (zpool_standard_error(hdl, errno, msg)); } @@ -1656,7 +2145,7 @@ zpool_vdev_offline(zpool_handle_t *zhp, const char *path, boolean_t istmp) * Mark the given vdev faulted. */ int -zpool_vdev_fault(zpool_handle_t *zhp, uint64_t guid) +zpool_vdev_fault(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux) { zfs_cmd_t zc = { 0 }; char msg[1024]; @@ -1668,6 +2157,7 @@ zpool_vdev_fault(zpool_handle_t *zhp, uint64_t guid) (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zc.zc_guid = guid; zc.zc_cookie = VDEV_STATE_FAULTED; + zc.zc_obj = aux; if (ioctl(zhp->zpool_hdl->libzfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0) return (0); @@ -1690,7 +2180,7 @@ zpool_vdev_fault(zpool_handle_t *zhp, uint64_t guid) * Mark the given vdev degraded. */ int -zpool_vdev_degrade(zpool_handle_t *zhp, uint64_t guid) +zpool_vdev_degrade(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux) { zfs_cmd_t zc = { 0 }; char msg[1024]; @@ -1702,6 +2192,7 @@ zpool_vdev_degrade(zpool_handle_t *zhp, uint64_t guid) (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zc.zc_guid = guid; zc.zc_cookie = VDEV_STATE_DEGRADED; + zc.zc_obj = aux; if (ioctl(zhp->zpool_hdl->libzfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0) return (0); @@ -1799,7 +2290,7 @@ zpool_vdev_attach(zpool_handle_t *zhp, verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL), ZPOOL_CONFIG_VDEV_TREE, &config_root) == 0); - if ((newname = zpool_vdev_name(NULL, NULL, child[0])) == NULL) + if ((newname = zpool_vdev_name(NULL, NULL, child[0], B_FALSE)) == NULL) return (-1); /* @@ -1851,6 +2342,14 @@ zpool_vdev_attach(zpool_handle_t *zhp, (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Please " "be sure to invoke %s to make '%s' bootable.\n"), BOOTCMD, new_disk); + + /* + * XXX need a better way to prevent user from + * booting up a half-baked vdev. + */ + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Make " + "sure to wait until resilver is done " + "before rebooting.\n")); } return (0); } @@ -1978,6 +2477,257 @@ zpool_vdev_detach(zpool_handle_t *zhp, const char *path) return (-1); } +/* + * Find a mirror vdev in the source nvlist. 
+ * + * The mchild array contains a list of disks in one of the top-level mirrors + * of the source pool. The schild array contains a list of disks that the + * user specified on the command line. We loop over the mchild array to + * see if any entry in the schild array matches. + * + * If a disk in the mchild array is found in the schild array, we return + * the index of that entry. Otherwise we return -1. + */ +static int +find_vdev_entry(zpool_handle_t *zhp, nvlist_t **mchild, uint_t mchildren, + nvlist_t **schild, uint_t schildren) +{ + uint_t mc; + + for (mc = 0; mc < mchildren; mc++) { + uint_t sc; + char *mpath = zpool_vdev_name(zhp->zpool_hdl, zhp, + mchild[mc], B_FALSE); + + for (sc = 0; sc < schildren; sc++) { + char *spath = zpool_vdev_name(zhp->zpool_hdl, zhp, + schild[sc], B_FALSE); + boolean_t result = (strcmp(mpath, spath) == 0); + + free(spath); + if (result) { + free(mpath); + return (mc); + } + } + + free(mpath); + } + + return (-1); +} + +/* + * Split a mirror pool. If newroot points to null, then a new nvlist + * is generated and it is the responsibility of the caller to free it. + */ +int +zpool_vdev_split(zpool_handle_t *zhp, char *newname, nvlist_t **newroot, + nvlist_t *props, splitflags_t flags) +{ + zfs_cmd_t zc = { 0 }; + char msg[1024]; + nvlist_t *tree, *config, **child, **newchild, *newconfig = NULL; + nvlist_t **varray = NULL, *zc_props = NULL; + uint_t c, children, newchildren, lastlog = 0, vcount, found = 0; + libzfs_handle_t *hdl = zhp->zpool_hdl; + uint64_t vers; + boolean_t freelist = B_FALSE, memory_err = B_TRUE; + int retval = 0; + + (void) snprintf(msg, sizeof (msg), + dgettext(TEXT_DOMAIN, "Unable to split %s"), zhp->zpool_name); + + if (!zpool_name_valid(hdl, B_FALSE, newname)) + return (zfs_error(hdl, EZFS_INVALIDNAME, msg)); + + if ((config = zpool_get_config(zhp, NULL)) == NULL) { + (void) fprintf(stderr, gettext("Internal error: unable to " + "retrieve pool configuration\n")); + return (-1); + } + + verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &tree) + == 0); + verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &vers) == 0); + + if (props) { + if ((zc_props = zpool_valid_proplist(hdl, zhp->zpool_name, + props, vers, B_TRUE, msg)) == NULL) + return (-1); + } + + if (nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &child, + &children) != 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Source pool is missing vdev tree")); + if (zc_props) + nvlist_free(zc_props); + return (-1); + } + + varray = zfs_alloc(hdl, children * sizeof (nvlist_t *)); + vcount = 0; + + if (*newroot == NULL || + nvlist_lookup_nvlist_array(*newroot, ZPOOL_CONFIG_CHILDREN, + &newchild, &newchildren) != 0) + newchildren = 0; + + for (c = 0; c < children; c++) { + uint64_t is_log = B_FALSE, is_hole = B_FALSE; + char *type; + nvlist_t **mchild, *vdev; + uint_t mchildren; + int entry; + + /* + * Unlike cache & spares, slogs are stored in the + * ZPOOL_CONFIG_CHILDREN array. We filter them out here. + */ + (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, + &is_log); + (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, + &is_hole); + if (is_log || is_hole) { + /* + * Create a hole vdev and put it in the config. 
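+			 * The hole keeps the top-level vdev numbering of the
+			 * new pool consistent with the source pool; trailing
+			 * holes are trimmed off later using lastlog.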
+ */ + if (nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) != 0) + goto out; + if (nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, + VDEV_TYPE_HOLE) != 0) + goto out; + if (nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_HOLE, + 1) != 0) + goto out; + if (lastlog == 0) + lastlog = vcount; + varray[vcount++] = vdev; + continue; + } + lastlog = 0; + verify(nvlist_lookup_string(child[c], ZPOOL_CONFIG_TYPE, &type) + == 0); + if (strcmp(type, VDEV_TYPE_MIRROR) != 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Source pool must be composed only of mirrors\n")); + retval = zfs_error(hdl, EZFS_INVALCONFIG, msg); + goto out; + } + + verify(nvlist_lookup_nvlist_array(child[c], + ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren) == 0); + + /* find or add an entry for this top-level vdev */ + if (newchildren > 0 && + (entry = find_vdev_entry(zhp, mchild, mchildren, + newchild, newchildren)) >= 0) { + /* We found a disk that the user specified. */ + vdev = mchild[entry]; + ++found; + } else { + /* User didn't specify a disk for this vdev. */ + vdev = mchild[mchildren - 1]; + } + + if (nvlist_dup(vdev, &varray[vcount++], 0) != 0) + goto out; + } + + /* did we find every disk the user specified? */ + if (found != newchildren) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Device list must " + "include at most one disk from each mirror")); + retval = zfs_error(hdl, EZFS_INVALCONFIG, msg); + goto out; + } + + /* Prepare the nvlist for populating. */ + if (*newroot == NULL) { + if (nvlist_alloc(newroot, NV_UNIQUE_NAME, 0) != 0) + goto out; + freelist = B_TRUE; + if (nvlist_add_string(*newroot, ZPOOL_CONFIG_TYPE, + VDEV_TYPE_ROOT) != 0) + goto out; + } else { + verify(nvlist_remove_all(*newroot, ZPOOL_CONFIG_CHILDREN) == 0); + } + + /* Add all the children we found */ + if (nvlist_add_nvlist_array(*newroot, ZPOOL_CONFIG_CHILDREN, varray, + lastlog == 0 ? vcount : lastlog) != 0) + goto out; + + /* + * If we're just doing a dry run, exit now with success. + */ + if (flags.dryrun) { + memory_err = B_FALSE; + freelist = B_FALSE; + goto out; + } + + /* now build up the config list & call the ioctl */ + if (nvlist_alloc(&newconfig, NV_UNIQUE_NAME, 0) != 0) + goto out; + + if (nvlist_add_nvlist(newconfig, + ZPOOL_CONFIG_VDEV_TREE, *newroot) != 0 || + nvlist_add_string(newconfig, + ZPOOL_CONFIG_POOL_NAME, newname) != 0 || + nvlist_add_uint64(newconfig, ZPOOL_CONFIG_VERSION, vers) != 0) + goto out; + + /* + * The new pool is automatically part of the namespace unless we + * explicitly export it. + */ + if (!flags.import) + zc.zc_cookie = ZPOOL_EXPORT_AFTER_SPLIT; + (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); + (void) strlcpy(zc.zc_string, newname, sizeof (zc.zc_string)); + if (zcmd_write_conf_nvlist(hdl, &zc, newconfig) != 0) + goto out; + if (zc_props != NULL && zcmd_write_src_nvlist(hdl, &zc, zc_props) != 0) + goto out; + + if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SPLIT, &zc) != 0) { + retval = zpool_standard_error(hdl, errno, msg); + goto out; + } + + freelist = B_FALSE; + memory_err = B_FALSE; + +out: + if (varray != NULL) { + int v; + + for (v = 0; v < vcount; v++) + nvlist_free(varray[v]); + free(varray); + } + zcmd_free_nvlists(&zc); + if (zc_props) + nvlist_free(zc_props); + if (newconfig) + nvlist_free(newconfig); + if (freelist) { + nvlist_free(*newroot); + *newroot = NULL; + } + + if (retval != 0) + return (retval); + + if (memory_err) + return (no_memory(hdl)); + + return (0); +} + /* * Remove the given device. Currently, this is supported only for hot spares * and level 2 cache devices. 
@@ -1988,24 +2738,34 @@ zpool_vdev_remove(zpool_handle_t *zhp, const char *path) zfs_cmd_t zc = { 0 }; char msg[1024]; nvlist_t *tgt; - boolean_t avail_spare, l2cache; + boolean_t avail_spare, l2cache, islog; libzfs_handle_t *hdl = zhp->zpool_hdl; + uint64_t version; (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "cannot remove %s"), path); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache, - NULL)) == 0) + &islog)) == 0) return (zfs_error(hdl, EZFS_NODEVICE, msg)); - - if (!avail_spare && !l2cache) { + /* + * XXX - this should just go away. + */ + if (!avail_spare && !l2cache && !islog) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "only inactive hot spares or cache devices " - "can be removed")); + "only inactive hot spares, cache, top-level, " + "or log devices can be removed")); return (zfs_error(hdl, EZFS_NODEVICE, msg)); } + version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL); + if (islog && version < SPA_VERSION_HOLES) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "pool must be upgrade to support log removal")); + return (zfs_error(hdl, EZFS_BADVERSION, msg)); + } + verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0); if (zfs_ioctl(hdl, ZFS_IOC_VDEV_REMOVE, &zc) == 0) @@ -2018,13 +2778,15 @@ zpool_vdev_remove(zpool_handle_t *zhp, const char *path) * Clear the errors for the pool, or the particular device if specified. */ int -zpool_clear(zpool_handle_t *zhp, const char *path) +zpool_clear(zpool_handle_t *zhp, const char *path, nvlist_t *rewindnvl) { zfs_cmd_t zc = { 0 }; char msg[1024]; nvlist_t *tgt; + zpool_rewind_policy_t policy; boolean_t avail_spare, l2cache; libzfs_handle_t *hdl = zhp->zpool_hdl; + nvlist_t *nvi = NULL; if (path) (void) snprintf(msg, sizeof (msg), @@ -2052,9 +2814,31 @@ zpool_clear(zpool_handle_t *zhp, const char *path) &zc.zc_guid) == 0); } - if (zfs_ioctl(hdl, ZFS_IOC_CLEAR, &zc) == 0) + zpool_get_rewind_policy(rewindnvl, &policy); + zc.zc_cookie = policy.zrp_request; + + if (zcmd_alloc_dst_nvlist(hdl, &zc, 8192) != 0) + return (-1); + + if (zcmd_write_src_nvlist(zhp->zpool_hdl, &zc, rewindnvl) != 0) + return (-1); + + if (zfs_ioctl(hdl, ZFS_IOC_CLEAR, &zc) == 0 || + ((policy.zrp_request & ZPOOL_TRY_REWIND) && + errno != EPERM && errno != EACCES)) { + if (policy.zrp_request & + (ZPOOL_DO_REWIND | ZPOOL_TRY_REWIND)) { + (void) zcmd_read_dst_nvlist(hdl, &zc, &nvi); + zpool_rewind_exclaim(hdl, zc.zc_name, + ((policy.zrp_request & ZPOOL_TRY_REWIND) != 0), + nvi); + nvlist_free(nvi); + } + zcmd_free_nvlists(&zc); return (0); + } + zcmd_free_nvlists(&zc); return (zpool_standard_error(hdl, errno, msg)); } @@ -2081,173 +2865,6 @@ zpool_vdev_clear(zpool_handle_t *zhp, uint64_t guid) return (zpool_standard_error(hdl, errno, msg)); } -/* - * Iterate over all zvols in a given pool by walking the /dev/zvol/dsk/ - * hierarchy. - */ -int -zpool_iter_zvol(zpool_handle_t *zhp, int (*cb)(const char *, void *), - void *data) -{ - libzfs_handle_t *hdl = zhp->zpool_hdl; - char (*paths)[MAXPATHLEN]; - size_t size = 4; - int curr, fd, base, ret = 0; - DIR *dirp; - struct dirent *dp; - struct stat st; - - if ((base = open("/dev/zvol/dsk", O_RDONLY)) < 0) - return (errno == ENOENT ? 0 : -1); - - if (fstatat(base, zhp->zpool_name, &st, 0) != 0) { - int err = errno; - (void) close(base); - return (err == ENOENT ? 0 : -1); - } - - /* - * Oddly this wasn't a directory -- ignore that failure since we - * know there are no links lower in the (non-existant) hierarchy. 
- */ - if (!S_ISDIR(st.st_mode)) { - (void) close(base); - return (0); - } - - if ((paths = zfs_alloc(hdl, size * sizeof (paths[0]))) == NULL) { - (void) close(base); - return (-1); - } - - (void) strlcpy(paths[0], zhp->zpool_name, sizeof (paths[0])); - curr = 0; - - while (curr >= 0) { - if (fstatat(base, paths[curr], &st, AT_SYMLINK_NOFOLLOW) != 0) - goto err; - - if (S_ISDIR(st.st_mode)) { - if ((fd = openat(base, paths[curr], O_RDONLY)) < 0) - goto err; - - if ((dirp = fdopendir(fd)) == NULL) { - (void) close(fd); - goto err; - } - - while ((dp = readdir(dirp)) != NULL) { - if (dp->d_name[0] == '.') - continue; - - if (curr + 1 == size) { - paths = zfs_realloc(hdl, paths, - size * sizeof (paths[0]), - size * 2 * sizeof (paths[0])); - if (paths == NULL) { - (void) closedir(dirp); - (void) close(fd); - goto err; - } - - size *= 2; - } - - (void) strlcpy(paths[curr + 1], paths[curr], - sizeof (paths[curr + 1])); - (void) strlcat(paths[curr], "/", - sizeof (paths[curr])); - (void) strlcat(paths[curr], dp->d_name, - sizeof (paths[curr])); - curr++; - } - - (void) closedir(dirp); - - } else { - if ((ret = cb(paths[curr], data)) != 0) - break; - } - - curr--; - } - - free(paths); - (void) close(base); - - return (ret); - -err: - free(paths); - (void) close(base); - return (-1); -} - -typedef struct zvol_cb { - zpool_handle_t *zcb_pool; - boolean_t zcb_create; -} zvol_cb_t; - -/*ARGSUSED*/ -static int -do_zvol_create(zfs_handle_t *zhp, void *data) -{ - int ret = 0; - - if (ZFS_IS_VOLUME(zhp)) { - (void) zvol_create_link(zhp->zfs_hdl, zhp->zfs_name); - ret = zfs_iter_snapshots(zhp, do_zvol_create, NULL); - } - - if (ret == 0) - ret = zfs_iter_filesystems(zhp, do_zvol_create, NULL); - - zfs_close(zhp); - - return (ret); -} - -/* - * Iterate over all zvols in the pool and make any necessary minor nodes. - */ -int -zpool_create_zvol_links(zpool_handle_t *zhp) -{ - zfs_handle_t *zfp; - int ret; - - /* - * If the pool is unavailable, just return success. - */ - if ((zfp = make_dataset_handle(zhp->zpool_hdl, - zhp->zpool_name)) == NULL) - return (0); - - ret = zfs_iter_filesystems(zfp, do_zvol_create, NULL); - - zfs_close(zfp); - return (ret); -} - -static int -do_zvol_remove(const char *dataset, void *data) -{ - zpool_handle_t *zhp = data; - - return (zvol_remove_link(zhp->zpool_hdl, dataset)); -} - -/* - * Iterate over all zvols in the pool and remove any minor nodes. We iterate - * by examining the /dev links so that a corrupted pool doesn't impede this - * operation. - */ -int -zpool_remove_zvol_links(zpool_handle_t *zhp) -{ - return (zpool_iter_zvol(zhp, do_zvol_remove, zhp)); -} - /* * Convert from a devid string to a path. */ @@ -2340,7 +2957,8 @@ set_path(zpool_handle_t *zhp, nvlist_t *nv, const char *path) * of these checks. */ char * -zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv) +zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv, + boolean_t verbose) { char *path, *devid; uint64_t value; @@ -2419,6 +3037,20 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv) (u_longlong_t)value); path = buf; } + + /* + * We identify each top-level vdev by using a + * naming convention. + */ + if (verbose) { + uint64_t id; + + verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, + &id) == 0); + (void) snprintf(buf, sizeof (buf), "%s-%llu", path, + (u_longlong_t)id); + path = buf; + } } return (zfs_strdup(hdl, path)); @@ -2637,7 +3269,7 @@ get_history(zpool_handle_t *zhp, char *buf, uint64_t *off, uint64_t *len) * into 'records'. 
'leftover' is set to the number of bytes that weren't * processed as there wasn't a complete record. */ -static int +int zpool_history_unpack(char *buf, uint64_t bytes_read, uint64_t *leftover, nvlist_t ***records, uint_t *numrecords) { @@ -2766,14 +3398,6 @@ zpool_obj_to_path(zpool_handle_t *zhp, uint64_t dsobj, uint64_t obj, free(mntpnt); } -#define RDISK_ROOT "/dev/rdsk" -#define BACKUP_SLICE "s2" -/* - * Don't start the slice at the default block of 34; many storage - * devices will use a stripe width of 128k, so start there instead. - */ -#define NEW_START_BLOCK 256 - /* * Read the EFI label from the config, if a label does not exist then * pass back the error to the caller. If the caller has passed a non-NULL @@ -2964,6 +3588,7 @@ supported_dump_vdev_type(libzfs_handle_t *hdl, nvlist_t *config, char *errbuf) if (strcmp(type, VDEV_TYPE_RAIDZ) == 0 || strcmp(type, VDEV_TYPE_FILE) == 0 || strcmp(type, VDEV_TYPE_LOG) == 0 || + strcmp(type, VDEV_TYPE_HOLE) == 0 || strcmp(type, VDEV_TYPE_MISSING) == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "vdev type '%s' is not supported"), type); diff --git a/external/cddl/osnet/dist/lib/libzfs/common/libzfs_sendrecv.c b/external/cddl/osnet/dist/lib/libzfs/common/libzfs_sendrecv.c index a3ed5cea8589b..c8d85c8b86024 100644 --- a/external/cddl/osnet/dist/lib/libzfs/common/libzfs_sendrecv.c +++ b/external/cddl/osnet/dist/lib/libzfs/common/libzfs_sendrecv.c @@ -20,14 +20,13 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #include #include #include -#include #include #include #include @@ -36,22 +35,382 @@ #include #include #include -#include -#include -#include -#include +#include +#include #include #include "zfs_namecheck.h" #include "zfs_prop.h" +#include "zfs_fletcher.h" #include "libzfs_impl.h" +#include +#include +#include -#include /* XXX */ +/* in libzfs_dataset.c */ +extern void zfs_setprop_error(libzfs_handle_t *, zfs_prop_t, int, char *); static int zfs_receive_impl(libzfs_handle_t *, const char *, recvflags_t, int, avl_tree_t *, char **); +static const zio_cksum_t zero_cksum = { 0 }; + +typedef struct dedup_arg { + int inputfd; + int outputfd; + libzfs_handle_t *dedup_hdl; +} dedup_arg_t; + +typedef struct dataref { + uint64_t ref_guid; + uint64_t ref_object; + uint64_t ref_offset; +} dataref_t; + +typedef struct dedup_entry { + struct dedup_entry *dde_next; + zio_cksum_t dde_chksum; + uint64_t dde_prop; + dataref_t dde_ref; +} dedup_entry_t; + +#define MAX_DDT_PHYSMEM_PERCENT 20 +#define SMALLEST_POSSIBLE_MAX_DDT_MB 128 + +typedef struct dedup_table { + dedup_entry_t **dedup_hash_array; + umem_cache_t *ddecache; + uint64_t max_ddt_size; /* max dedup table size in bytes */ + uint64_t cur_ddt_size; /* current dedup table size in bytes */ + uint64_t ddt_count; + int numhashbits; + boolean_t ddt_full; +} dedup_table_t; + +static int +high_order_bit(uint64_t n) +{ + int count; + + for (count = 0; n != 0; count++) + n >>= 1; + return (count); +} + +static size_t +ssread(void *buf, size_t len, FILE *stream) +{ + size_t outlen; + + if ((outlen = fread(buf, len, 1, stream)) == 0) + return (0); + + return (outlen); +} + +static void +ddt_hash_append(libzfs_handle_t *hdl, dedup_table_t *ddt, dedup_entry_t **ddepp, + zio_cksum_t *cs, uint64_t prop, dataref_t *dr) +{ + dedup_entry_t *dde; + + if (ddt->cur_ddt_size >= ddt->max_ddt_size) { + if (ddt->ddt_full == B_FALSE) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Dedup table 
full. Deduplication will continue " + "with existing table entries")); + ddt->ddt_full = B_TRUE; + } + return; + } + + if ((dde = umem_cache_alloc(ddt->ddecache, UMEM_DEFAULT)) + != NULL) { + assert(*ddepp == NULL); + dde->dde_next = NULL; + dde->dde_chksum = *cs; + dde->dde_prop = prop; + dde->dde_ref = *dr; + *ddepp = dde; + ddt->cur_ddt_size += sizeof (dedup_entry_t); + ddt->ddt_count++; + } +} + +/* + * Using the specified dedup table, do a lookup for an entry with + * the checksum cs. If found, return the block's reference info + * in *dr. Otherwise, insert a new entry in the dedup table, using + * the reference information specified by *dr. + * + * return value: true - entry was found + * false - entry was not found + */ +static boolean_t +ddt_update(libzfs_handle_t *hdl, dedup_table_t *ddt, zio_cksum_t *cs, + uint64_t prop, dataref_t *dr) +{ + uint32_t hashcode; + dedup_entry_t **ddepp; + + hashcode = BF64_GET(cs->zc_word[0], 0, ddt->numhashbits); + + for (ddepp = &(ddt->dedup_hash_array[hashcode]); *ddepp != NULL; + ddepp = &((*ddepp)->dde_next)) { + if (ZIO_CHECKSUM_EQUAL(((*ddepp)->dde_chksum), *cs) && + (*ddepp)->dde_prop == prop) { + *dr = (*ddepp)->dde_ref; + return (B_TRUE); + } + } + ddt_hash_append(hdl, ddt, ddepp, cs, prop, dr); + return (B_FALSE); +} + +static int +cksum_and_write(const void *buf, uint64_t len, zio_cksum_t *zc, int outfd) +{ + fletcher_4_incremental_native(buf, len, zc); + return (write(outfd, buf, len)); +} + +/* + * This function is started in a separate thread when the dedup option + * has been requested. The main send thread determines the list of + * snapshots to be included in the send stream and makes the ioctl calls + * for each one. But instead of having the ioctl send the output to the + * the output fd specified by the caller of zfs_send()), the + * ioctl is told to direct the output to a pipe, which is read by the + * alternate thread running THIS function. This function does the + * dedup'ing by: + * 1. building a dedup table (the DDT) + * 2. doing checksums on each data block and inserting a record in the DDT + * 3. looking for matching checksums, and + * 4. sending a DRR_WRITE_BYREF record instead of a write record whenever + * a duplicate block is found. + * The output of this function then goes to the output fd requested + * by the caller of zfs_send(). + */ +static void * +cksummer(void *arg) +{ + dedup_arg_t *dda = arg; + char *buf = malloc(1<<20); + dmu_replay_record_t thedrr; + dmu_replay_record_t *drr = &thedrr; + struct drr_begin *drrb = &thedrr.drr_u.drr_begin; + struct drr_end *drre = &thedrr.drr_u.drr_end; + struct drr_object *drro = &thedrr.drr_u.drr_object; + struct drr_write *drrw = &thedrr.drr_u.drr_write; + FILE *ofp; + int outfd; + dmu_replay_record_t wbr_drr = {0}; + struct drr_write_byref *wbr_drrr = &wbr_drr.drr_u.drr_write_byref; + dedup_table_t ddt; + zio_cksum_t stream_cksum; + uint64_t physmem = sysconf(_SC_PHYS_PAGES) * sysconf(_SC_PAGESIZE); + uint64_t numbuckets; + + ddt.max_ddt_size = + MAX((physmem * MAX_DDT_PHYSMEM_PERCENT)/100, + SMALLEST_POSSIBLE_MAX_DDT_MB<<20); + + numbuckets = ddt.max_ddt_size/(sizeof (dedup_entry_t)); + + /* + * numbuckets must be a power of 2. Increase number to + * a power of 2 if necessary. 
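+	 * high_order_bit() returns one past the index of the highest set
+	 * bit, so the shift below rounds numbuckets up to the next power
+	 * of two.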
+ */ + if (!ISP2(numbuckets)) + numbuckets = 1 << high_order_bit(numbuckets); + + ddt.dedup_hash_array = calloc(numbuckets, sizeof (dedup_entry_t *)); + ddt.ddecache = umem_cache_create("dde", sizeof (dedup_entry_t), 0, + NULL, NULL, NULL, NULL, NULL, 0); + ddt.cur_ddt_size = numbuckets * sizeof (dedup_entry_t *); + ddt.numhashbits = high_order_bit(numbuckets) - 1; + ddt.ddt_full = B_FALSE; + + /* Initialize the write-by-reference block. */ + wbr_drr.drr_type = DRR_WRITE_BYREF; + wbr_drr.drr_payloadlen = 0; + + outfd = dda->outputfd; + ofp = fdopen(dda->inputfd, "r"); + while (ssread(drr, sizeof (dmu_replay_record_t), ofp) != 0) { + + switch (drr->drr_type) { + case DRR_BEGIN: + { + int fflags; + ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0); + + /* set the DEDUP feature flag for this stream */ + fflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); + fflags |= (DMU_BACKUP_FEATURE_DEDUP | + DMU_BACKUP_FEATURE_DEDUPPROPS); + DMU_SET_FEATUREFLAGS(drrb->drr_versioninfo, fflags); + + if (cksum_and_write(drr, sizeof (dmu_replay_record_t), + &stream_cksum, outfd) == -1) + goto out; + if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == + DMU_COMPOUNDSTREAM && drr->drr_payloadlen != 0) { + int sz = drr->drr_payloadlen; + + if (sz > 1<<20) { + free(buf); + buf = malloc(sz); + } + (void) ssread(buf, sz, ofp); + if (ferror(stdin)) + perror("fread"); + if (cksum_and_write(buf, sz, &stream_cksum, + outfd) == -1) + goto out; + } + break; + } + + case DRR_END: + { + /* use the recalculated checksum */ + ZIO_SET_CHECKSUM(&drre->drr_checksum, + stream_cksum.zc_word[0], stream_cksum.zc_word[1], + stream_cksum.zc_word[2], stream_cksum.zc_word[3]); + if ((write(outfd, drr, + sizeof (dmu_replay_record_t))) == -1) + goto out; + break; + } + + case DRR_OBJECT: + { + if (cksum_and_write(drr, sizeof (dmu_replay_record_t), + &stream_cksum, outfd) == -1) + goto out; + if (drro->drr_bonuslen > 0) { + (void) ssread(buf, + P2ROUNDUP((uint64_t)drro->drr_bonuslen, 8), + ofp); + if (cksum_and_write(buf, + P2ROUNDUP((uint64_t)drro->drr_bonuslen, 8), + &stream_cksum, outfd) == -1) + goto out; + } + break; + } + + case DRR_FREEOBJECTS: + { + if (cksum_and_write(drr, sizeof (dmu_replay_record_t), + &stream_cksum, outfd) == -1) + goto out; + break; + } + + case DRR_WRITE: + { + dataref_t dataref; + + (void) ssread(buf, drrw->drr_length, ofp); + + /* + * Use the existing checksum if it's dedup-capable, + * else calculate a SHA256 checksum for it. 
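+			 * Either way the block ends up with a dedup-capable
+			 * checksum, which is what lets a later matching block
+			 * be replaced by a DRR_WRITE_BYREF record.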
+ */ + + if (ZIO_CHECKSUM_EQUAL(drrw->drr_key.ddk_cksum, + zero_cksum) || + !DRR_IS_DEDUP_CAPABLE(drrw->drr_checksumflags)) { + SHA256_CTX ctx; + zio_cksum_t tmpsha256; + + SHA256Init(&ctx); + SHA256Update(&ctx, buf, drrw->drr_length); + SHA256Final(&tmpsha256, &ctx); + drrw->drr_key.ddk_cksum.zc_word[0] = + BE_64(tmpsha256.zc_word[0]); + drrw->drr_key.ddk_cksum.zc_word[1] = + BE_64(tmpsha256.zc_word[1]); + drrw->drr_key.ddk_cksum.zc_word[2] = + BE_64(tmpsha256.zc_word[2]); + drrw->drr_key.ddk_cksum.zc_word[3] = + BE_64(tmpsha256.zc_word[3]); + drrw->drr_checksumtype = ZIO_CHECKSUM_SHA256; + drrw->drr_checksumflags = DRR_CHECKSUM_DEDUP; + } + + dataref.ref_guid = drrw->drr_toguid; + dataref.ref_object = drrw->drr_object; + dataref.ref_offset = drrw->drr_offset; + + if (ddt_update(dda->dedup_hdl, &ddt, + &drrw->drr_key.ddk_cksum, drrw->drr_key.ddk_prop, + &dataref)) { + /* block already present in stream */ + wbr_drrr->drr_object = drrw->drr_object; + wbr_drrr->drr_offset = drrw->drr_offset; + wbr_drrr->drr_length = drrw->drr_length; + wbr_drrr->drr_toguid = drrw->drr_toguid; + wbr_drrr->drr_refguid = dataref.ref_guid; + wbr_drrr->drr_refobject = + dataref.ref_object; + wbr_drrr->drr_refoffset = + dataref.ref_offset; + + wbr_drrr->drr_checksumtype = + drrw->drr_checksumtype; + wbr_drrr->drr_checksumflags = + drrw->drr_checksumtype; + wbr_drrr->drr_key.ddk_cksum = + drrw->drr_key.ddk_cksum; + wbr_drrr->drr_key.ddk_prop = + drrw->drr_key.ddk_prop; + + if (cksum_and_write(&wbr_drr, + sizeof (dmu_replay_record_t), &stream_cksum, + outfd) == -1) + goto out; + } else { + /* block not previously seen */ + if (cksum_and_write(drr, + sizeof (dmu_replay_record_t), &stream_cksum, + outfd) == -1) + goto out; + if (cksum_and_write(buf, + drrw->drr_length, + &stream_cksum, outfd) == -1) + goto out; + } + break; + } + + case DRR_FREE: + { + if (cksum_and_write(drr, sizeof (dmu_replay_record_t), + &stream_cksum, outfd) == -1) + goto out; + break; + } + + default: + (void) printf("INVALID record type 0x%x\n", + drr->drr_type); + /* should never happen, so assert */ + assert(B_FALSE); + } + } +out: + umem_cache_destroy(ddt.ddecache); + free(ddt.dedup_hash_array); + free(buf); + (void) fclose(ofp); + + return (NULL); +} + /* * Routines for dealing with the AVL tree of fs-nvlists */ @@ -113,6 +472,9 @@ fsavl_destroy(avl_tree_t *avl) free(avl); } +/* + * Given an nvlist, produce an avl tree of snapshots, ordered by guid + */ static avl_tree_t * fsavl_create(nvlist_t *fss) { @@ -170,6 +532,7 @@ typedef struct send_data { nvlist_t *snapprops; const char *fromsnap; const char *tosnap; + boolean_t recursive; /* * The header nvlist is of the following format: @@ -237,23 +600,50 @@ send_iterate_prop(zfs_handle_t *zhp, nvlist_t *nv) zfs_prop_t prop = zfs_name_to_prop(propname); nvlist_t *propnv; - if (!zfs_prop_user(propname) && zfs_prop_readonly(prop)) - continue; + if (!zfs_prop_user(propname)) { + /* + * Realistically, this should never happen. However, + * we want the ability to add DSL properties without + * needing to make incompatible version changes. We + * need to ignore unknown properties to allow older + * software to still send datasets containing these + * properties, with the unknown properties elided. 
+ */ + if (prop == ZPROP_INVAL) + continue; + + if (zfs_prop_readonly(prop)) + continue; + } verify(nvpair_value_nvlist(elem, &propnv) == 0); - if (prop == ZFS_PROP_QUOTA || prop == ZFS_PROP_RESERVATION) { - /* these guys are modifyable, but have no source */ + if (prop == ZFS_PROP_QUOTA || prop == ZFS_PROP_RESERVATION || + prop == ZFS_PROP_REFQUOTA || + prop == ZFS_PROP_REFRESERVATION) { + char *source; uint64_t value; verify(nvlist_lookup_uint64(propnv, ZPROP_VALUE, &value) == 0); if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) continue; + /* + * May have no source before SPA_VERSION_RECVD_PROPS, + * but is still modifiable. + */ + if (nvlist_lookup_string(propnv, + ZPROP_SOURCE, &source) == 0) { + if ((strcmp(source, zhp->zfs_name) != 0) && + (strcmp(source, + ZPROP_SOURCE_VAL_RECVD) != 0)) + continue; + } } else { char *source; if (nvlist_lookup_string(propnv, ZPROP_SOURCE, &source) != 0) continue; - if (strcmp(source, zhp->zfs_name) != 0) + if ((strcmp(source, zhp->zfs_name) != 0) && + (strcmp(source, ZPROP_SOURCE_VAL_RECVD) != 0)) continue; } @@ -272,12 +662,17 @@ send_iterate_prop(zfs_handle_t *zhp, nvlist_t *nv) } } +/* + * recursively generate nvlists describing datasets. See comment + * for the data structure send_data_t above for description of contents + * of the nvlist. + */ static int send_iterate_fs(zfs_handle_t *zhp, void *arg) { send_data_t *sd = arg; nvlist_t *nvfs, *nv; - int rv; + int rv = 0; uint64_t parent_fromsnap_guid_save = sd->parent_fromsnap_guid; uint64_t guid = zhp->zfs_dmustats.dds_guid; char guidstring[64]; @@ -319,7 +714,8 @@ send_iterate_fs(zfs_handle_t *zhp, void *arg) nvlist_free(nvfs); /* iterate over children */ - rv = zfs_iter_filesystems(zhp, send_iterate_fs, sd); + if (sd->recursive) + rv = zfs_iter_filesystems(zhp, send_iterate_fs, sd); sd->parent_fromsnap_guid = parent_fromsnap_guid_save; @@ -329,7 +725,7 @@ send_iterate_fs(zfs_handle_t *zhp, void *arg) static int gather_nvlist(libzfs_handle_t *hdl, const char *fsname, const char *fromsnap, - const char *tosnap, nvlist_t **nvlp, avl_tree_t **avlp) + const char *tosnap, boolean_t recursive, nvlist_t **nvlp, avl_tree_t **avlp) { zfs_handle_t *zhp; send_data_t sd = { 0 }; @@ -342,6 +738,7 @@ gather_nvlist(libzfs_handle_t *hdl, const char *fsname, const char *fromsnap, VERIFY(0 == nvlist_alloc(&sd.fss, NV_UNIQUE_NAME, 0)); sd.fromsnap = fromsnap; sd.tosnap = tosnap; + sd.recursive = recursive; if ((error = send_iterate_fs(zhp, &sd)) != 0) { nvlist_free(sd.fss); @@ -403,7 +800,7 @@ zfs_snapshot_compare(const void *larg, const void *rarg) return (0); } -static int +int zfs_iter_snapshots_sorted(zfs_handle_t *zhp, zfs_iter_f callback, void *data) { int ret = 0; @@ -434,13 +831,15 @@ typedef struct send_dump_data { /* these are all just the short snapname (the part after the @) */ const char *fromsnap; const char *tosnap; - char lastsnap[ZFS_MAXNAMELEN]; + char prevsnap[ZFS_MAXNAMELEN]; boolean_t seenfrom, seento, replicate, doall, fromorigin; boolean_t verbose; int outfd; boolean_t err; nvlist_t *fss; avl_tree_t *fsavl; + snapfilter_cb_t *filter_cb; + void *filter_cb_arg; } send_dump_data_t; /* @@ -449,7 +848,7 @@ typedef struct send_dump_data { */ static int dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, boolean_t fromorigin, - int outfd) + int outfd, boolean_t enoent_ok, boolean_t *got_enoent) { zfs_cmd_t zc = { 0 }; libzfs_handle_t *hdl = zhp->zfs_hdl; @@ -463,6 +862,8 @@ dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, boolean_t fromorigin, zc.zc_cookie = outfd; zc.zc_obj = fromorigin; + *got_enoent = 
B_FALSE; + if (ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_SEND, &zc) != 0) { char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, @@ -476,6 +877,10 @@ dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, boolean_t fromorigin, return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf)); case ENOENT: + if (enoent_ok) { + *got_enoent = B_TRUE; + return (0); + } if (zfs_dataset_exists(hdl, zc.zc_name, ZFS_TYPE_SNAPSHOT)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, @@ -512,13 +917,14 @@ dump_snapshot(zfs_handle_t *zhp, void *arg) send_dump_data_t *sdd = arg; const char *thissnap; int err; + boolean_t got_enoent; thissnap = strchr(zhp->zfs_name, '@') + 1; if (sdd->fromsnap && !sdd->seenfrom && strcmp(sdd->fromsnap, thissnap) == 0) { sdd->seenfrom = B_TRUE; - (void) strcpy(sdd->lastsnap, thissnap); + (void) strcpy(sdd->prevsnap, thissnap); zfs_close(zhp); return (0); } @@ -528,20 +934,41 @@ dump_snapshot(zfs_handle_t *zhp, void *arg) return (0); } + if (strcmp(sdd->tosnap, thissnap) == 0) + sdd->seento = B_TRUE; + + /* + * If a filter function exists, call it to determine whether + * this snapshot will be sent. + */ + if (sdd->filter_cb != NULL && + sdd->filter_cb(zhp, sdd->filter_cb_arg) == B_FALSE) { + /* + * This snapshot is filtered out. Don't send it, and don't + * set prevsnap, so it will be as if this snapshot didn't + * exist, and the next accepted snapshot will be sent as + * an incremental from the last accepted one, or as the + * first (and full) snapshot in the case of a replication, + * non-incremental send. + */ + zfs_close(zhp); + return (0); + } + /* send it */ if (sdd->verbose) { (void) fprintf(stderr, "sending from @%s to %s\n", - sdd->lastsnap, zhp->zfs_name); + sdd->prevsnap, zhp->zfs_name); } - err = dump_ioctl(zhp, sdd->lastsnap, - sdd->lastsnap[0] == '\0' && (sdd->fromorigin || sdd->replicate), - sdd->outfd); + err = dump_ioctl(zhp, sdd->prevsnap, + sdd->prevsnap[0] == '\0' && (sdd->fromorigin || sdd->replicate), + sdd->outfd, B_TRUE, &got_enoent); - if (!sdd->seento && strcmp(sdd->tosnap, thissnap) == 0) - sdd->seento = B_TRUE; - - (void) strcpy(sdd->lastsnap, thissnap); + if (got_enoent) + err = 0; + else + (void) strcpy(sdd->prevsnap, thissnap); zfs_close(zhp); return (err); } @@ -581,7 +1008,7 @@ dump_filesystem(zfs_handle_t *zhp, void *arg) } if (sdd->doall) { - sdd->seenfrom = sdd->seento = sdd->lastsnap[0] = 0; + sdd->seenfrom = sdd->seento = sdd->prevsnap[0] = 0; if (sdd->fromsnap == NULL || missingfrom) sdd->seenfrom = B_TRUE; @@ -594,12 +1021,18 @@ dump_filesystem(zfs_handle_t *zhp, void *arg) zhp->zfs_name, sdd->fromsnap); sdd->err = B_TRUE; } else if (!sdd->seento) { - (void) fprintf(stderr, - "WARNING: could not send %s@%s:\n" - "incremental source (%s@%s) " - "is not earlier than it\n", - zhp->zfs_name, sdd->tosnap, - zhp->zfs_name, sdd->fromsnap); + if (sdd->fromsnap) { + (void) fprintf(stderr, + "WARNING: could not send %s@%s:\n" + "incremental source (%s@%s) " + "is not earlier than it\n", + zhp->zfs_name, sdd->tosnap, + zhp->zfs_name, sdd->fromsnap); + } else { + (void) fprintf(stderr, "WARNING: " + "could not send %s@%s: does not exist\n", + zhp->zfs_name, sdd->tosnap); + } sdd->err = B_TRUE; } } else { @@ -612,10 +1045,16 @@ dump_filesystem(zfs_handle_t *zhp, void *arg) if (snapzhp == NULL) { rv = -1; } else { - rv = dump_ioctl(snapzhp, - missingfrom ? 
NULL : sdd->fromsnap, - sdd->fromorigin || missingfrom, - sdd->outfd); + if (sdd->filter_cb == NULL || + sdd->filter_cb(snapzhp, sdd->filter_cb_arg) == + B_TRUE) { + boolean_t got_enoent; + + rv = dump_ioctl(snapzhp, + missingfrom ? NULL : sdd->fromsnap, + sdd->fromorigin || missingfrom, + sdd->outfd, B_FALSE, &got_enoent); + } sdd->seento = B_TRUE; zfs_close(snapzhp); } @@ -681,20 +1120,39 @@ dump_filesystems(zfs_handle_t *rzhp, void *arg) } /* - * Dumps a backup of tosnap, incremental from fromsnap if it isn't NULL. - * If 'doall', dump all intermediate snaps. - * If 'replicate', dump special header and do recursively. + * Generate a send stream for the dataset identified by the argument zhp. + * + * The content of the send stream is the snapshot identified by + * 'tosnap'. Incremental streams are requested in two ways: + * - from the snapshot identified by "fromsnap" (if non-null) or + * - from the origin of the dataset identified by zhp, which must + * be a clone. In this case, "fromsnap" is null and "fromorigin" + * is TRUE. + * + * The send stream is recursive (i.e. dumps a hierarchy of snapshots) and + * uses a special header (with a hdrtype field of DMU_COMPOUNDSTREAM) + * if "replicate" is set. If "doall" is set, dump all the intermediate + * snapshots. The DMU_COMPOUNDSTREAM header is used in the "doall" + * case too. If "props" is set, send properties. */ int zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, - boolean_t replicate, boolean_t doall, boolean_t fromorigin, - boolean_t verbose, int outfd) + sendflags_t flags, int outfd, snapfilter_cb_t filter_func, + void *cb_arg) { char errbuf[1024]; send_dump_data_t sdd = { 0 }; int err; nvlist_t *fss = NULL; avl_tree_t *fsavl = NULL; + char holdtag[128]; + static uint64_t holdseq; + int spa_version; + boolean_t holdsnaps = B_FALSE; + pthread_t tid; + int pipefd[2]; + dedup_arg_t dda = { 0 }; + int featureflags = 0; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot send '%s'"), zhp->zfs_name); @@ -705,15 +1163,47 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, return (zfs_error(zhp->zfs_hdl, EZFS_NOENT, errbuf)); } - if (replicate || doall) { + if (zfs_spa_version(zhp, &spa_version) == 0 && + spa_version >= SPA_VERSION_USERREFS) + holdsnaps = B_TRUE; + + if (flags.dedup) { + featureflags |= (DMU_BACKUP_FEATURE_DEDUP | + DMU_BACKUP_FEATURE_DEDUPPROPS); + if (err = pipe(pipefd)) { + zfs_error_aux(zhp->zfs_hdl, strerror(errno)); + return (zfs_error(zhp->zfs_hdl, EZFS_PIPEFAILED, + errbuf)); + } + dda.outputfd = outfd; + dda.inputfd = pipefd[1]; + dda.dedup_hdl = zhp->zfs_hdl; + if (err = pthread_create(&tid, NULL, cksummer, &dda)) { + (void) close(pipefd[0]); + (void) close(pipefd[1]); + zfs_error_aux(zhp->zfs_hdl, strerror(errno)); + return (zfs_error(zhp->zfs_hdl, + EZFS_THREADCREATEFAILED, errbuf)); + } + } + + if (flags.replicate || flags.doall || flags.props) { dmu_replay_record_t drr = { 0 }; char *packbuf = NULL; size_t buflen = 0; zio_cksum_t zc = { 0 }; - assert(fromsnap || doall); + if (holdsnaps) { + (void) snprintf(holdtag, sizeof (holdtag), + ".send-%d-%llu", getpid(), (u_longlong_t)holdseq); + ++holdseq; + err = zfs_hold_range(zhp, fromsnap, tosnap, + holdtag, flags.replicate, B_TRUE); + if (err) + goto err_out; + } - if (replicate) { + if (flags.replicate || flags.props) { nvlist_t *hdrnv; VERIFY(0 == nvlist_alloc(&hdrnv, NV_UNIQUE_NAME, 0)); @@ -722,11 +1212,20 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, "fromsnap", 
fromsnap)); } VERIFY(0 == nvlist_add_string(hdrnv, "tosnap", tosnap)); + if (!flags.replicate) { + VERIFY(0 == nvlist_add_boolean(hdrnv, + "not_recursive")); + } err = gather_nvlist(zhp->zfs_hdl, zhp->zfs_name, - fromsnap, tosnap, &fss, &fsavl); - if (err) - return (err); + fromsnap, tosnap, flags.replicate, &fss, &fsavl); + if (err) { + if (holdsnaps) { + (void) zfs_release_range(zhp, fromsnap, + tosnap, holdtag, flags.replicate); + } + goto err_out; + } VERIFY(0 == nvlist_add_nvlist(hdrnv, "fss", fss)); err = nvlist_pack(hdrnv, &packbuf, &buflen, NV_ENCODE_XDR, 0); @@ -734,33 +1233,41 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, if (err) { fsavl_destroy(fsavl); nvlist_free(fss); - return (zfs_standard_error(zhp->zfs_hdl, - err, errbuf)); + if (holdsnaps) { + (void) zfs_release_range(zhp, fromsnap, + tosnap, holdtag, flags.replicate); + } + goto stderr_out; } } /* write first begin record */ drr.drr_type = DRR_BEGIN; drr.drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; - drr.drr_u.drr_begin.drr_version = DMU_BACKUP_HEADER_VERSION; + DMU_SET_STREAM_HDRTYPE(drr.drr_u.drr_begin.drr_versioninfo, + DMU_COMPOUNDSTREAM); + DMU_SET_FEATUREFLAGS(drr.drr_u.drr_begin.drr_versioninfo, + featureflags); (void) snprintf(drr.drr_u.drr_begin.drr_toname, sizeof (drr.drr_u.drr_begin.drr_toname), "%s@%s", zhp->zfs_name, tosnap); drr.drr_payloadlen = buflen; - fletcher_4_incremental_native(&drr, sizeof (drr), &zc); - err = write(outfd, &drr, sizeof (drr)); + err = cksum_and_write(&drr, sizeof (drr), &zc, outfd); /* write header nvlist */ - if (err != -1) { - fletcher_4_incremental_native(packbuf, buflen, &zc); - err = write(outfd, packbuf, buflen); + if (err != -1 && packbuf != NULL) { + err = cksum_and_write(packbuf, buflen, &zc, outfd); } free(packbuf); if (err == -1) { fsavl_destroy(fsavl); nvlist_free(fss); - return (zfs_standard_error(zhp->zfs_hdl, - errno, errbuf)); + if (holdsnaps) { + (void) zfs_release_range(zhp, fromsnap, tosnap, + holdtag, flags.replicate); + } + err = errno; + goto stderr_out; } /* write end record */ @@ -772,8 +1279,12 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, if (err == -1) { fsavl_destroy(fsavl); nvlist_free(fss); - return (zfs_standard_error(zhp->zfs_hdl, - errno, errbuf)); + err = errno; + if (holdsnaps) { + (void) zfs_release_range(zhp, fromsnap, + tosnap, holdtag, flags.replicate); + } + goto stderr_out; } } } @@ -781,18 +1292,28 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, /* dump each stream */ sdd.fromsnap = fromsnap; sdd.tosnap = tosnap; - sdd.outfd = outfd; - sdd.replicate = replicate; - sdd.doall = doall; - sdd.fromorigin = fromorigin; + if (flags.dedup) + sdd.outfd = pipefd[0]; + else + sdd.outfd = outfd; + sdd.replicate = flags.replicate; + sdd.doall = flags.doall; + sdd.fromorigin = flags.fromorigin; sdd.fss = fss; sdd.fsavl = fsavl; - sdd.verbose = verbose; + sdd.verbose = flags.verbose; + sdd.filter_cb = filter_func; + sdd.filter_cb_arg = cb_arg; err = dump_filesystems(zhp, &sdd); fsavl_destroy(fsavl); nvlist_free(fss); - if (replicate || doall) { + if (flags.dedup) { + (void) close(pipefd[0]); + (void) pthread_join(tid, NULL); + } + + if (flags.replicate || flags.doall || flags.props) { /* * write final end record. 
NB: want to do this even if * there was some error, because it might not be totally @@ -800,6 +1321,10 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, */ dmu_replay_record_t drr = { 0 }; drr.drr_type = DRR_END; + if (holdsnaps) { + (void) zfs_release_range(zhp, fromsnap, tosnap, + holdtag, flags.replicate); + } if (write(outfd, &drr, sizeof (drr)) == -1) { return (zfs_standard_error(zhp->zfs_hdl, errno, errbuf)); @@ -807,6 +1332,16 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, } return (err || sdd.err); + +stderr_out: + err = zfs_standard_error(zhp->zfs_hdl, err, errbuf); +err_out: + if (flags.dedup) { + (void) pthread_cancel(tid); + (void) pthread_join(tid, NULL); + (void) close(pipefd[0]); + } + return (err); } /* @@ -892,11 +1427,12 @@ recv_rename(libzfs_handle_t *hdl, const char *name, const char *tryname, if (err) return (err); + zc.zc_objset_type = DMU_OST_ZFS; + (void) strlcpy(zc.zc_name, name, sizeof (zc.zc_name)); + if (tryname) { (void) strcpy(newname, tryname); - zc.zc_objset_type = DMU_OST_ZFS; - (void) strlcpy(zc.zc_name, name, sizeof (zc.zc_name)); (void) strlcpy(zc.zc_value, tryname, sizeof (zc.zc_value)); if (flags.verbose) { @@ -951,12 +1487,18 @@ recv_destroy(libzfs_handle_t *hdl, const char *name, int baselen, int err = 0; prop_changelist_t *clp; zfs_handle_t *zhp; + boolean_t defer = B_FALSE; + int spa_version; zhp = zfs_open(hdl, name, ZFS_TYPE_DATASET); if (zhp == NULL) return (-1); clp = changelist_gather(zhp, ZFS_PROP_NAME, 0, flags.force ? MS_FORCE : 0); + if (zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT && + zfs_spa_version(zhp, &spa_version) == 0 && + spa_version >= SPA_VERSION_USERREFS) + defer = B_TRUE; zfs_close(zhp); if (clp == NULL) return (-1); @@ -965,12 +1507,12 @@ recv_destroy(libzfs_handle_t *hdl, const char *name, int baselen, return (err); zc.zc_objset_type = DMU_OST_ZFS; + zc.zc_defer_destroy = defer; (void) strlcpy(zc.zc_name, name, sizeof (zc.zc_name)); if (flags.verbose) (void) printf("attempting destroy %s\n", zc.zc_name); err = ioctl(hdl->libzfs_fd, ZFS_IOC_DESTROY, &zc); - if (err == 0) { if (flags.verbose) (void) printf("success\n"); @@ -980,8 +1522,14 @@ recv_destroy(libzfs_handle_t *hdl, const char *name, int baselen, (void) changelist_postfix(clp); changelist_free(clp); - if (err != 0) + /* + * Deferred destroy might destroy the snapshot or only mark it to be + * destroyed later, and it returns success in either case. 
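+	 * If the snapshot is still present (or the destroy failed), fall
+	 * back to renaming it out of the way.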
+ */ + if (err != 0 || (defer && zfs_dataset_exists(hdl, name, + ZFS_TYPE_SNAPSHOT))) { err = recv_rename(hdl, name, NULL, baselen, newname, flags); + } return (err); } @@ -999,6 +1547,7 @@ guid_to_name_cb(zfs_handle_t *zhp, void *arg) if (zhp->zfs_dmustats.dds_guid == gtnd->guid) { (void) strcpy(gtnd->name, zhp->zfs_name); + zfs_close(zhp); return (EEXIST); } err = zfs_iter_children(zhp, guid_to_name_cb, gtnd); @@ -1097,11 +1646,15 @@ recv_incremental_replication(libzfs_handle_t *hdl, const char *tofs, char *tosnap, *fromsnap; char newname[ZFS_MAXNAMELEN]; int error; - boolean_t needagain, progress; + boolean_t needagain, progress, recursive; + char *s1, *s2; VERIFY(0 == nvlist_lookup_string(stream_nv, "fromsnap", &fromsnap)); VERIFY(0 == nvlist_lookup_string(stream_nv, "tosnap", &tosnap)); + recursive = (nvlist_lookup_boolean(stream_nv, "not_recursive") == + ENOENT); + if (flags.dryrun) return (0); @@ -1109,7 +1662,7 @@ recv_incremental_replication(libzfs_handle_t *hdl, const char *tofs, needagain = progress = B_FALSE; if ((error = gather_nvlist(hdl, tofs, fromsnap, NULL, - &local_nv, &local_avl)) != 0) + recursive, &local_nv, &local_avl)) != 0) return (error); /* @@ -1232,7 +1785,7 @@ recv_incremental_replication(libzfs_handle_t *hdl, const char *tofs, stream_snapname, &props)) { zfs_cmd_t zc = { 0 }; - zc.zc_cookie = B_TRUE; /* clear current props */ + zc.zc_cookie = B_TRUE; /* received */ (void) snprintf(zc.zc_name, sizeof (zc.zc_name), "%s@%s", fsname, nvpair_name(snapelem)); if (zcmd_write_src_nvlist(hdl, &zc, @@ -1292,11 +1845,13 @@ recv_incremental_replication(libzfs_handle_t *hdl, const char *tofs, VERIFY(0 == nvlist_lookup_uint64(stream_nvfs, "parentfromsnap", &stream_parent_fromsnap_guid)); + s1 = strrchr(fsname, '/'); + s2 = strrchr(stream_fsname, '/'); + /* check for rename */ if ((stream_parent_fromsnap_guid != 0 && stream_parent_fromsnap_guid != parent_fromsnap_guid) || - strcmp(strrchr(fsname, '/'), - strrchr(stream_fsname, '/')) != 0) { + ((s1 != NULL) && (s2 != NULL) && strcmp(s1, s2) != 0)) { nvlist_t *parent; char tryname[ZFS_MAXNAMELEN]; @@ -1372,19 +1927,13 @@ zfs_receive_package(libzfs_handle_t *hdl, int fd, const char *destname, assert(drr->drr_type == DRR_BEGIN); assert(drr->drr_u.drr_begin.drr_magic == DMU_BACKUP_MAGIC); - assert(drr->drr_u.drr_begin.drr_version == DMU_BACKUP_HEADER_VERSION); + assert(DMU_GET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo) == + DMU_COMPOUNDSTREAM); /* * Read in the nvlist from the stream. */ if (drr->drr_payloadlen != 0) { - if (!flags.isprefix) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "must use -d to receive replication " - "(send -R) stream")); - return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); - } - error = recv_read_nvlist(hdl, fd, drr->drr_payloadlen, &stream_nv, flags.byteswap, zc); if (error) { @@ -1490,11 +2039,28 @@ zfs_receive_package(libzfs_handle_t *hdl, int fd, const char *destname, return (error); } +static void +trunc_prop_errs(int truncated) +{ + ASSERT(truncated != 0); + + if (truncated == 1) + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "1 more property could not be set\n")); + else + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "%d more properties could not be set\n"), truncated); +} + static int recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap) { dmu_replay_record_t *drr; void *buf = malloc(1<<20); + char errbuf[1024]; + + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot receive:")); /* XXX would be great to use lseek if possible... 
*/ drr = buf; @@ -1507,7 +2073,11 @@ recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap) switch (drr->drr_type) { case DRR_BEGIN: /* NB: not to be used on v2 stream packages */ - assert(drr->drr_payloadlen == 0); + if (drr->drr_payloadlen != 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "invalid substream header")); + return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); + } break; case DRR_END: @@ -1534,12 +2104,15 @@ recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap) drr->drr_u.drr_write.drr_length, B_FALSE, NULL); break; + case DRR_WRITE_BYREF: case DRR_FREEOBJECTS: case DRR_FREE: break; default: - assert(!"invalid record type"); + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "invalid record type")); + return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); } } @@ -1562,12 +2135,14 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, char *cp; struct drr_begin *drrb = &drr->drr_u.drr_begin; char errbuf[1024]; + char prop_errbuf[1024]; char chopprefix[ZFS_MAXNAMELEN]; boolean_t newfs = B_FALSE; boolean_t stream_wantsnewfs; uint64_t parent_snapguid = 0; prop_changelist_t *clp = NULL; nvlist_t *snapprops_nvlist = NULL; + zprop_errflags_t prop_errflags; begin_time = time(NULL); @@ -1615,23 +2190,27 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, (void) strcpy(chopprefix, drrb->drr_toname); if (flags.isprefix) { /* - * They specified a fs with -d, we want to tack on - * everything but the pool name stored in the stream + * They specified a fs with -d or -e. We want to tack on + * everything but the first element of the sent snapshot path + * (all but the pool name) in the case of -d, or only the tail + * of the sent snapshot path in the case of -e. */ if (strchr(tosnap, '@')) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid " - "argument - snapshot not allowed with -d")); + "argument - snapshot not allowed with %s"), + (flags.istail ? "-e" : "-d")); return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); } - cp = strchr(chopprefix, '/'); + cp = (flags.istail ? strrchr(chopprefix, '/') : + strchr(chopprefix, '/')); if (cp == NULL) cp = strchr(chopprefix, '@'); *cp = '\0'; } else if (strchr(tosnap, '@') == NULL) { /* - * If they specified a filesystem without -d, we want to - * tack on everything after the fs specified in the - * first name from the stream. + * If they specified a filesystem without -d or -e, we want to + * tack on everything after the fs specified in the first name + * from the stream. */ cp = strchr(chopprefix, '@'); *cp = '\0'; @@ -1641,6 +2220,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, /* * Determine name of destination snapshot, store in zc_value. 
*/ + (void) strcpy(zc.zc_top_ds, tosnap); (void) strcpy(zc.zc_value, tosnap); (void) strncat(zc.zc_value, drrb->drr_toname+choplen, sizeof (zc.zc_value)); @@ -1767,21 +2347,17 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, /* We can't do online recv in this case */ clp = changelist_gather(zhp, ZFS_PROP_NAME, 0, 0); if (clp == NULL) { + zfs_close(zhp); zcmd_free_nvlists(&zc); return (-1); } if (changelist_prefix(clp) != 0) { changelist_free(clp); + zfs_close(zhp); zcmd_free_nvlists(&zc); return (-1); } } - if (!flags.dryrun && zhp->zfs_type == ZFS_TYPE_VOLUME && - zvol_remove_link(hdl, zhp->zfs_name) != 0) { - zfs_close(zhp); - zcmd_free_nvlists(&zc); - return (-1); - } zfs_close(zhp); } else { /* @@ -1830,14 +2406,52 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, return (recv_skip(hdl, infd, flags.byteswap)); } + zc.zc_nvlist_dst = (uint64_t)(uintptr_t)prop_errbuf; + zc.zc_nvlist_dst_size = sizeof (prop_errbuf); + err = ioctl_err = zfs_ioctl(hdl, ZFS_IOC_RECV, &zc); ioctl_errno = errno; + prop_errflags = (zprop_errflags_t)zc.zc_obj; + + if (err == 0) { + nvlist_t *prop_errors; + VERIFY(0 == nvlist_unpack((void *)(uintptr_t)zc.zc_nvlist_dst, + zc.zc_nvlist_dst_size, &prop_errors, 0)); + + nvpair_t *prop_err = NULL; + + while ((prop_err = nvlist_next_nvpair(prop_errors, + prop_err)) != NULL) { + char tbuf[1024]; + zfs_prop_t prop; + int intval; + + prop = zfs_name_to_prop(nvpair_name(prop_err)); + (void) nvpair_value_int32(prop_err, &intval); + if (strcmp(nvpair_name(prop_err), + ZPROP_N_MORE_ERRORS) == 0) { + trunc_prop_errs(intval); + break; + } else { + (void) snprintf(tbuf, sizeof (tbuf), + dgettext(TEXT_DOMAIN, + "cannot receive %s property on %s"), + nvpair_name(prop_err), zc.zc_name); + zfs_setprop_error(hdl, prop, intval, tbuf); + } + } + nvlist_free(prop_errors); + } + + zc.zc_nvlist_dst = 0; + zc.zc_nvlist_dst_size = 0; zcmd_free_nvlists(&zc); if (err == 0 && snapprops_nvlist) { zfs_cmd_t zc2 = { 0 }; (void) strcpy(zc2.zc_name, zc.zc_value); + zc2.zc_cookie = B_TRUE; /* received */ if (zcmd_write_src_nvlist(hdl, &zc2, snapprops_nvlist) == 0) { (void) zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc2); zcmd_free_nvlists(&zc2); @@ -1860,7 +2474,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, * get a strange "does not exist" error message. */ *cp = '\0'; - if (gather_nvlist(hdl, zc.zc_value, NULL, NULL, + if (gather_nvlist(hdl, zc.zc_value, NULL, NULL, B_FALSE, &local_nv, &local_avl) == 0) { *cp = '@'; fs = fsavl_find(local_avl, drrb->drr_toguid, NULL); @@ -1872,14 +2486,13 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, (void) printf("snap %s already exists; " "ignoring\n", zc.zc_value); } - ioctl_err = recv_skip(hdl, infd, + err = ioctl_err = recv_skip(hdl, infd, flags.byteswap); } } *cp = '@'; } - if (ioctl_err != 0) { switch (ioctl_errno) { case ENODEV: @@ -1924,11 +2537,9 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, } /* - * Mount or recreate the /dev links for the target filesystem - * (if created, or if we tore them down to do an incremental - * restore), and the /dev links for the new snapshot (if - * created). Also mount any children of the target filesystem - * if we did an incremental receive. + * Mount the target filesystem (if created). Also mount any + * children of the target filesystem if we did a replication + * receive (indicated by stream_avl being non-NULL). 
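+	 * Volumes need no additional handling here; only filesystems are
+	 * recorded (via *top_zfs) for mounting and sharing afterwards.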
*/ cp = strchr(zc.zc_value, '@'); if (cp && (ioctl_err == 0 || !newfs)) { @@ -1940,11 +2551,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, if (h != NULL) { if (h->zfs_type == ZFS_TYPE_VOLUME) { *cp = '@'; - err = zvol_create_link(hdl, h->zfs_name); - if (err == 0 && ioctl_err == 0) - err = zvol_create_link(hdl, - zc.zc_value); - } else if (newfs) { + } else if (newfs || stream_avl) { /* * Track the first/top of hierarchy fs, * for mounting and sharing later. @@ -1962,6 +2569,19 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, changelist_free(clp); } + if (prop_errflags & ZPROP_ERR_NOCLEAR) { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Warning: " + "failed to clear unreceived properties on %s"), + zc.zc_name); + (void) fprintf(stderr, "\n"); + } + if (prop_errflags & ZPROP_ERR_NORESTORE) { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Warning: " + "failed to restore original properties on %s"), + zc.zc_name); + (void) fprintf(stderr, "\n"); + } + if (err || ioctl_err) return (-1); @@ -1991,6 +2611,8 @@ zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, recvflags_t flags, struct drr_begin *drrb = &drr.drr_u.drr_begin; char errbuf[1024]; zio_cksum_t zcksum = { 0 }; + uint64_t featureflags; + int hdrtype; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot receive")); @@ -2028,7 +2650,7 @@ zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, recvflags_t flags, drr.drr_type = BSWAP_32(drr.drr_type); drr.drr_payloadlen = BSWAP_32(drr.drr_payloadlen); drrb->drr_magic = BSWAP_64(drrb->drr_magic); - drrb->drr_version = BSWAP_64(drrb->drr_version); + drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo); drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time); drrb->drr_type = BSWAP_32(drrb->drr_type); drrb->drr_flags = BSWAP_32(drrb->drr_flags); @@ -2042,23 +2664,31 @@ zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, recvflags_t flags, return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); } + featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); + hdrtype = DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo); + + if (!DMU_STREAM_SUPPORTED(featureflags) || + (hdrtype != DMU_SUBSTREAM && hdrtype != DMU_COMPOUNDSTREAM)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "stream has unsupported feature, feature flags = %lx"), + featureflags); + return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); + } + if (strchr(drrb->drr_toname, '@') == NULL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid " "stream (bad snapshot name)")); return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); } - if (drrb->drr_version == DMU_BACKUP_STREAM_VERSION) { + if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == DMU_SUBSTREAM) { return (zfs_receive_one(hdl, infd, tosnap, flags, &drr, &drr_noswap, stream_avl, top_zfs)); - } else if (drrb->drr_version == DMU_BACKUP_HEADER_VERSION) { + } else { /* must be DMU_COMPOUNDSTREAM */ + assert(DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == + DMU_COMPOUNDSTREAM); return (zfs_receive_package(hdl, infd, tosnap, flags, &drr, &zcksum, top_zfs)); - } else { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "stream is unsupported version %llu"), - drrb->drr_version); - return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); } } @@ -2077,7 +2707,7 @@ zfs_receive(libzfs_handle_t *hdl, const char *tosnap, recvflags_t flags, err = zfs_receive_impl(hdl, tosnap, flags, infd, stream_avl, &top_zfs); - if (err == 0 && top_zfs) { + if (err == 0 && !flags.nomount && top_zfs) { zfs_handle_t *zhp; prop_changelist_t *clp; diff 
--git a/external/cddl/osnet/dist/lib/libzfs/common/libzfs_status.c b/external/cddl/osnet/dist/lib/libzfs/common/libzfs_status.c index c7eb04e74cac8..c4f907733f017 100644 --- a/external/cddl/osnet/dist/lib/libzfs/common/libzfs_status.c +++ b/external/cddl/osnet/dist/lib/libzfs/common/libzfs_status.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -104,6 +104,13 @@ vdev_offlined(uint64_t state, uint64_t aux, uint64_t errs) return (state == VDEV_STATE_OFFLINE); } +/* ARGSUSED */ +static int +vdev_removed(uint64_t state, uint64_t aux, uint64_t errs) +{ + return (state == VDEV_STATE_REMOVED); +} + /* * Detect if any leaf devices that have seen errors or could not be opened. */ @@ -275,6 +282,12 @@ check_status(nvlist_t *config, boolean_t isimport) if (find_vdev_problem(nvroot, vdev_offlined)) return (ZPOOL_STATUS_OFFLINE_DEV); + /* + * Removed device + */ + if (find_vdev_problem(nvroot, vdev_removed)) + return (ZPOOL_STATUS_REMOVED_DEV); + /* * Currently resilvering */ @@ -315,3 +328,68 @@ zpool_import_status(nvlist_t *config, char **msgid) return (ret); } + +static void +dump_ddt_stat(const ddt_stat_t *dds, int h) +{ + char refcnt[6]; + char blocks[6], lsize[6], psize[6], dsize[6]; + char ref_blocks[6], ref_lsize[6], ref_psize[6], ref_dsize[6]; + + if (dds == NULL || dds->dds_blocks == 0) + return; + + if (h == -1) + (void) strcpy(refcnt, "Total"); + else + zfs_nicenum(1ULL << h, refcnt, sizeof (refcnt)); + + zfs_nicenum(dds->dds_blocks, blocks, sizeof (blocks)); + zfs_nicenum(dds->dds_lsize, lsize, sizeof (lsize)); + zfs_nicenum(dds->dds_psize, psize, sizeof (psize)); + zfs_nicenum(dds->dds_dsize, dsize, sizeof (dsize)); + zfs_nicenum(dds->dds_ref_blocks, ref_blocks, sizeof (ref_blocks)); + zfs_nicenum(dds->dds_ref_lsize, ref_lsize, sizeof (ref_lsize)); + zfs_nicenum(dds->dds_ref_psize, ref_psize, sizeof (ref_psize)); + zfs_nicenum(dds->dds_ref_dsize, ref_dsize, sizeof (ref_dsize)); + + (void) printf("%6s %6s %5s %5s %5s %6s %5s %5s %5s\n", + refcnt, + blocks, lsize, psize, dsize, + ref_blocks, ref_lsize, ref_psize, ref_dsize); +} + +/* + * Print the DDT histogram and the column totals. + */ +void +zpool_dump_ddt(const ddt_stat_t *dds_total, const ddt_histogram_t *ddh) +{ + int h; + + (void) printf("\n"); + + (void) printf("bucket " + " allocated " + " referenced \n"); + (void) printf("______ " + "______________________________ " + "______________________________\n"); + + (void) printf("%6s %6s %5s %5s %5s %6s %5s %5s %5s\n", + "refcnt", + "blocks", "LSIZE", "PSIZE", "DSIZE", + "blocks", "LSIZE", "PSIZE", "DSIZE"); + + (void) printf("%6s %6s %5s %5s %5s %6s %5s %5s %5s\n", + "------", + "------", "-----", "-----", "-----", + "------", "-----", "-----", "-----"); + + for (h = 0; h < 64; h++) + dump_ddt_stat(&ddh->ddh_stat[h], h); + + dump_ddt_stat(dds_total, -1); + + (void) printf("\n"); +} diff --git a/external/cddl/osnet/dist/lib/libzfs/common/libzfs_util.c b/external/cddl/osnet/dist/lib/libzfs/common/libzfs_util.c index 54de0f4b50a4c..a400dc9c1e114 100644 --- a/external/cddl/osnet/dist/lib/libzfs/common/libzfs_util.c +++ b/external/cddl/osnet/dist/lib/libzfs/common/libzfs_util.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -94,8 +94,6 @@ libzfs_error_description(libzfs_handle_t *hdl) case EZFS_VOLTOOBIG: return (dgettext(TEXT_DOMAIN, "volume size exceeds limit for " "this system")); - case EZFS_VOLHASDATA: - return (dgettext(TEXT_DOMAIN, "volume has data")); case EZFS_INVALIDNAME: return (dgettext(TEXT_DOMAIN, "invalid name")); case EZFS_BADRESTORE: @@ -142,8 +140,6 @@ libzfs_error_description(libzfs_handle_t *hdl) return (dgettext(TEXT_DOMAIN, "iscsitgt service need to be enabled by " "a privileged user")); - case EZFS_DEVLINKS: - return (dgettext(TEXT_DOMAIN, "failed to create /dev links")); case EZFS_PERM: return (dgettext(TEXT_DOMAIN, "permission denied")); case EZFS_NOSPC: @@ -210,6 +206,23 @@ libzfs_error_description(libzfs_handle_t *hdl) case EZFS_ACTIVE_SPARE: return (dgettext(TEXT_DOMAIN, "pool has active shared spare " "device")); + case EZFS_UNPLAYED_LOGS: + return (dgettext(TEXT_DOMAIN, "log device has unplayed intent " + "logs")); + case EZFS_REFTAG_RELE: + return (dgettext(TEXT_DOMAIN, "no such tag on this dataset")); + case EZFS_REFTAG_HOLD: + return (dgettext(TEXT_DOMAIN, "tag already exists on this " + "dataset")); + case EZFS_TAGTOOLONG: + return (dgettext(TEXT_DOMAIN, "tag too long")); + case EZFS_PIPEFAILED: + return (dgettext(TEXT_DOMAIN, "pipe create failed")); + case EZFS_THREADCREATEFAILED: + return (dgettext(TEXT_DOMAIN, "thread create failed")); + case EZFS_POSTSPLIT_ONLINE: + return (dgettext(TEXT_DOMAIN, "disk was split from this pool " + "into a new one")); case EZFS_UNKNOWN: return (dgettext(TEXT_DOMAIN, "unknown error")); default: @@ -364,8 +377,13 @@ zfs_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...) case ENOTSUP: zfs_verror(hdl, EZFS_BADVERSION, fmt, ap); break; + case EAGAIN: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "pool I/O is currently suspended")); + zfs_verror(hdl, EZFS_POOLUNAVAIL, fmt, ap); + break; default: - zfs_error_aux(hdl, strerror(errno)); + zfs_error_aux(hdl, strerror(error)); zfs_verror(hdl, EZFS_UNKNOWN, fmt, ap); break; } @@ -437,6 +455,11 @@ zpool_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...) 
case EDQUOT: zfs_verror(hdl, EZFS_NOSPC, fmt, ap); return (-1); + case EAGAIN: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "pool I/O is currently suspended")); + zfs_verror(hdl, EZFS_POOLUNAVAIL, fmt, ap); + break; default: zfs_error_aux(hdl, strerror(error)); @@ -480,7 +503,6 @@ zfs_realloc(libzfs_handle_t *hdl, void *ptr, size_t oldsize, size_t newsize) if ((ret = realloc(ptr, newsize)) == NULL) { (void) no_memory(hdl); - free(ptr); return (NULL); } @@ -576,6 +598,7 @@ libzfs_init(void) zfs_prop_init(); zpool_prop_init(); + libzfs_mnttab_init(hdl); return (hdl); } @@ -592,7 +615,9 @@ libzfs_fini(libzfs_handle_t *hdl) if (hdl->libzfs_log_str) (void) free(hdl->libzfs_log_str); zpool_free_handles(hdl); + libzfs_fru_clear(hdl, B_TRUE); namespace_clear(hdl); + libzfs_mnttab_fini(hdl); free(hdl); } @@ -667,7 +692,7 @@ int zcmd_alloc_dst_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, size_t len) { if (len == 0) - len = 2048; + len = 4*1024; zc->zc_nvlist_dst_size = len; if ((zc->zc_nvlist_dst = (uint64_t)(uintptr_t) zfs_alloc(hdl, zc->zc_nvlist_dst_size)) == NULL) @@ -793,16 +818,22 @@ zprop_print_headers(zprop_get_cbdata_t *cbp, zfs_type_t type) "PROPERTY")); cbp->cb_colwidths[GET_COL_VALUE] = strlen(dgettext(TEXT_DOMAIN, "VALUE")); + cbp->cb_colwidths[GET_COL_RECVD] = strlen(dgettext(TEXT_DOMAIN, + "RECEIVED")); cbp->cb_colwidths[GET_COL_SOURCE] = strlen(dgettext(TEXT_DOMAIN, "SOURCE")); + /* first property is always NAME */ + assert(cbp->cb_proplist->pl_prop == + ((type == ZFS_TYPE_POOL) ? ZPOOL_PROP_NAME : ZFS_PROP_NAME)); + /* * Go through and calculate the widths for each column. For the * 'source' column, we kludge it up by taking the worst-case scenario of * inheriting from the longest name. This is acceptable because in the * majority of cases 'SOURCE' is the last column displayed, and we don't * use the width anyway. Note that the 'VALUE' column can be oversized, - * if the name of the property is much longer the any values we find. + * if the name of the property is much longer than any values we find. */ for (pl = cbp->cb_proplist; pl != NULL; pl = pl->pl_next) { /* @@ -823,12 +854,21 @@ zprop_print_headers(zprop_get_cbdata_t *cbp, zfs_type_t type) } /* - * 'VALUE' column + * 'VALUE' column. The first property is always the 'name' + * property that was tacked on either by /sbin/zfs's + * zfs_do_get() or when calling zprop_expand_list(), so we + * ignore its width. If the user specified the name property + * to display, then it will be later in the list in any case. */ - if ((pl->pl_prop != ZFS_PROP_NAME || !pl->pl_all) && + if (pl != cbp->cb_proplist && pl->pl_width > cbp->cb_colwidths[GET_COL_VALUE]) cbp->cb_colwidths[GET_COL_VALUE] = pl->pl_width; + /* 'RECEIVED' column. */ + if (pl != cbp->cb_proplist && + pl->pl_recvd_width > cbp->cb_colwidths[GET_COL_RECVD]) + cbp->cb_colwidths[GET_COL_RECVD] = pl->pl_recvd_width; + /* * 'NAME' and 'SOURCE' columns */ @@ -844,7 +884,7 @@ zprop_print_headers(zprop_get_cbdata_t *cbp, zfs_type_t type) /* * Now go through and print the headers. 
*/ - for (i = 0; i < 4; i++) { + for (i = 0; i < ZFS_GET_NCOLS; i++) { switch (cbp->cb_columns[i]) { case GET_COL_NAME: title = dgettext(TEXT_DOMAIN, "NAME"); @@ -855,6 +895,9 @@ zprop_print_headers(zprop_get_cbdata_t *cbp, zfs_type_t type) case GET_COL_VALUE: title = dgettext(TEXT_DOMAIN, "VALUE"); break; + case GET_COL_RECVD: + title = dgettext(TEXT_DOMAIN, "RECEIVED"); + break; case GET_COL_SOURCE: title = dgettext(TEXT_DOMAIN, "SOURCE"); break; @@ -863,7 +906,8 @@ zprop_print_headers(zprop_get_cbdata_t *cbp, zfs_type_t type) } if (title != NULL) { - if (i == 3 || cbp->cb_columns[i + 1] == 0) + if (i == (ZFS_GET_NCOLS - 1) || + cbp->cb_columns[i + 1] == GET_COL_NONE) (void) printf("%s", title); else (void) printf("%-*s ", @@ -881,7 +925,7 @@ zprop_print_headers(zprop_get_cbdata_t *cbp, zfs_type_t type) void zprop_print_one_property(const char *name, zprop_get_cbdata_t *cbp, const char *propname, const char *value, zprop_source_t sourcetype, - const char *source) + const char *source, const char *recvd_value) { int i; const char *str; @@ -896,7 +940,7 @@ zprop_print_one_property(const char *name, zprop_get_cbdata_t *cbp, if (cbp->cb_first) zprop_print_headers(cbp, cbp->cb_type); - for (i = 0; i < 4; i++) { + for (i = 0; i < ZFS_GET_NCOLS; i++) { switch (cbp->cb_columns[i]) { case GET_COL_NAME: str = name; @@ -933,14 +977,21 @@ zprop_print_one_property(const char *name, zprop_get_cbdata_t *cbp, "inherited from %s", source); str = buf; break; + case ZPROP_SRC_RECEIVED: + str = "received"; + break; } break; + case GET_COL_RECVD: + str = (recvd_value == NULL ? "-" : recvd_value); + break; + default: continue; } - if (cbp->cb_columns[i + 1] == 0) + if (cbp->cb_columns[i + 1] == GET_COL_NONE) (void) printf("%s", str); else if (cbp->cb_scripted) (void) printf("%s\t", str); @@ -948,7 +999,6 @@ zprop_print_one_property(const char *name, zprop_get_cbdata_t *cbp, (void) printf("%-*s ", cbp->cb_colwidths[cbp->cb_columns[i]], str); - } (void) printf("\n"); @@ -1010,9 +1060,9 @@ zfs_nicestrtonum(libzfs_handle_t *hdl, const char *value, uint64_t *num) return (-1); } - /* Rely on stroll() to process the numeric portion. */ + /* Rely on strtoull() to process the numeric portion. */ errno = 0; - *num = strtoll(value, &end, 10); + *num = strtoull(value, &end, 10); /* * Check for ERANGE, which indicates that the value is too large to fit @@ -1202,7 +1252,7 @@ addlist(libzfs_handle_t *hdl, char *propname, zprop_list_t **listp, * dataset property, */ if (prop == ZPROP_INVAL && (type == ZFS_TYPE_POOL || - !zfs_prop_user(propname))) { + (!zfs_prop_user(propname) && !zfs_prop_userquota(propname)))) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid property '%s'"), propname); return (zfs_error(hdl, EZFS_BADPROP, diff --git a/external/cddl/osnet/dist/lib/libzpool/common/taskq.c b/external/cddl/osnet/dist/lib/libzpool/common/taskq.c index 93acdcf8e4e37..142cd73f08f47 100644 --- a/external/cddl/osnet/dist/lib/libzpool/common/taskq.c +++ b/external/cddl/osnet/dist/lib/libzpool/common/taskq.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -114,8 +114,13 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t tqflags) mutex_exit(&tq->tq_lock); return (0); } - t->task_next = &tq->tq_task; - t->task_prev = tq->tq_task.task_prev; + if (tqflags & TQ_FRONT) { + t->task_next = tq->tq_task.task_next; + t->task_prev = &tq->tq_task; + } else { + t->task_next = &tq->tq_task; + t->task_prev = tq->tq_task.task_prev; + } t->task_next->task_prev = t; t->task_prev->task_next = t; t->task_func = func; @@ -174,6 +179,19 @@ taskq_create(const char *name, int nthreads, pri_t pri, taskq_t *tq = kmem_zalloc(sizeof (taskq_t), KM_SLEEP); int t; + if (flags & TASKQ_THREADS_CPU_PCT) { + int pct; + ASSERT3S(nthreads, >=, 0); + ASSERT3S(nthreads, <=, 100); + pct = MIN(nthreads, 100); + pct = MAX(pct, 0); + + nthreads = (sysconf(_SC_NPROCESSORS_ONLN) * pct) / 100; + nthreads = MAX(nthreads, 1); /* need at least 1 thread */ + } else { + ASSERT3S(nthreads, >=, 1); + } + rw_init(&tq->tq_threadlock, NULL, RW_DEFAULT, NULL); mutex_init(&tq->tq_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&tq->tq_dispatch_cv, NULL, CV_DEFAULT, NULL); @@ -259,3 +277,10 @@ system_taskq_init(void) system_taskq = taskq_create("system_taskq", 64, minclsyspri, 4, 512, TASKQ_DYNAMIC | TASKQ_PREPOPULATE); } + +void +system_taskq_fini(void) +{ + taskq_destroy(system_taskq); + system_taskq = NULL; /* defensive */ +} diff --git a/external/cddl/osnet/dist/tools/ctf/cvt/ctf.c b/external/cddl/osnet/dist/tools/ctf/cvt/ctf.c index 91e0f611cbb91..1e425758c2495 100644 --- a/external/cddl/osnet/dist/tools/ctf/cvt/ctf.c +++ b/external/cddl/osnet/dist/tools/ctf/cvt/ctf.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Create and parse buffers containing CTF data. 
*/ @@ -172,6 +170,12 @@ write_functions(iidesc_t *idp, ctf_buf_t *b) } nargs = idp->ii_nargs + (idp->ii_vargs != 0); + + if (nargs > CTF_MAX_VLEN) { + terminate("function %s has too many args: %d > %d\n", + idp->ii_name, nargs, CTF_MAX_VLEN); + } + fdata[0] = CTF_TYPE_INFO(CTF_K_FUNCTION, 1, nargs); fdata[1] = idp->ii_dtype->t_id; ctf_buf_write(b, fdata, sizeof (fdata)); @@ -312,6 +316,11 @@ write_type(tdesc_t *tp, ctf_buf_t *b) for (i = 0, mp = tp->t_members; mp != NULL; mp = mp->ml_next) i++; /* count up struct or union members */ + if (i > CTF_MAX_VLEN) { + terminate("sou %s has too many members: %d > %d\n", + tdesc_name(tp), i, CTF_MAX_VLEN); + } + if (tp->t_type == STRUCT) ctt.ctt_info = CTF_TYPE_INFO(CTF_K_STRUCT, isroot, i); else @@ -351,6 +360,11 @@ write_type(tdesc_t *tp, ctf_buf_t *b) for (i = 0, ep = tp->t_emem; ep != NULL; ep = ep->el_next) i++; /* count up enum members */ + if (i > CTF_MAX_VLEN) { + terminate("enum %s has too many values: %d > %d\n", + tdesc_name(tp), i, CTF_MAX_VLEN); + } + ctt.ctt_info = CTF_TYPE_INFO(CTF_K_ENUM, isroot, i); write_sized_type_rec(b, &ctt, tp->t_size); @@ -387,8 +401,14 @@ write_type(tdesc_t *tp, ctf_buf_t *b) break; case FUNCTION: - ctt.ctt_info = CTF_TYPE_INFO(CTF_K_FUNCTION, isroot, - tp->t_fndef->fn_nargs + tp->t_fndef->fn_vargs); + i = tp->t_fndef->fn_nargs + tp->t_fndef->fn_vargs; + + if (i > CTF_MAX_VLEN) { + terminate("function %s has too many args: %d > %d\n", + tdesc_name(tp), i, CTF_MAX_VLEN); + } + + ctt.ctt_info = CTF_TYPE_INFO(CTF_K_FUNCTION, isroot, i); ctt.ctt_type = tp->t_fndef->fn_ret->t_id; write_unsized_type_rec(b, &ctt); @@ -927,7 +947,7 @@ resurrect_types(ctf_header_t *h, tdata_t *td, tdesc_t **tdarr, int tdsize, if (CTF_NAME_STID(ctt->ctt_name) != CTF_STRTAB_0) parseterminate( - "Unable to cope with non-zero strtab id"); + "Unable to cope with non-zero strtab id"); if (CTF_NAME_OFFSET(ctt->ctt_name) != 0) { tdp->t_name = xstrdup(sbuf + CTF_NAME_OFFSET(ctt->ctt_name)); diff --git a/external/cddl/osnet/dist/tools/ctf/cvt/tdata.c b/external/cddl/osnet/dist/tools/ctf/cvt/tdata.c index 32d84829d70e5..295928586e136 100644 --- a/external/cddl/osnet/dist/tools/ctf/cvt/tdata.c +++ b/external/cddl/osnet/dist/tools/ctf/cvt/tdata.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Routines for manipulating tdesc and tdata structures */ @@ -86,9 +84,10 @@ tdesc_layouthash(int nbuckets, void *node) * Unnamed structures, which cannot have forward * declarations pointing to them. We can therefore * incorporate the name of the first member into - * the hash value. + * the hash value, assuming there are any. */ - name = tdp->t_members->ml_name; + if (tdp->t_members != NULL) + name = tdp->t_members->ml_name; break; case ENUM: /* Use the first element in the hash value */ diff --git a/external/cddl/osnet/dist/uts/common/dtrace/dtrace.c b/external/cddl/osnet/dist/uts/common/dtrace/dtrace.c index 91e3230737a6f..ea1ac53d44a97 100644 --- a/external/cddl/osnet/dist/uts/common/dtrace/dtrace.c +++ b/external/cddl/osnet/dist/uts/common/dtrace/dtrace.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms.
*/ @@ -239,10 +239,16 @@ static void dtrace_nullop(void) {} +static int +dtrace_enable_nullop(void) +{ + return (0); +} + static dtrace_pops_t dtrace_provider_ops = { (void (*)(void *, const dtrace_probedesc_t *))dtrace_nullop, (void (*)(void *, struct modctl *))dtrace_nullop, - (void (*)(void *, dtrace_id_t, void *))dtrace_nullop, + (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop, (void (*)(void *, dtrace_id_t, void *))dtrace_nullop, (void (*)(void *, dtrace_id_t, void *))dtrace_nullop, (void (*)(void *, dtrace_id_t, void *))dtrace_nullop, @@ -426,6 +432,7 @@ dtrace_load##bits(uintptr_t addr) \ #define DTRACE_DYNHASH_SINK 1 #define DTRACE_DYNHASH_VALID 2 +#define DTRACE_MATCH_FAIL -1 #define DTRACE_MATCH_NEXT 0 #define DTRACE_MATCH_DONE 1 #define DTRACE_ANCHORED(probe) ((probe)->dtpr_func[0] != '\0') @@ -6654,7 +6661,7 @@ dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid, { dtrace_probe_t template, *probe; dtrace_hash_t *hash = NULL; - int len, best = INT_MAX, nmatched = 0; + int len, rc, best = INT_MAX, nmatched = 0; dtrace_id_t i; ASSERT(MUTEX_HELD(&dtrace_lock)); @@ -6666,7 +6673,8 @@ dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid, if (pkp->dtpk_id != DTRACE_IDNONE) { if ((probe = dtrace_probe_lookup_id(pkp->dtpk_id)) != NULL && dtrace_match_probe(probe, pkp, priv, uid, zoneid) > 0) { - (void) (*matched)(probe, arg); + if ((*matched)(probe, arg) == DTRACE_MATCH_FAIL) + return (DTRACE_MATCH_FAIL); nmatched++; } return (nmatched); @@ -6713,8 +6721,12 @@ dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid, nmatched++; - if ((*matched)(probe, arg) != DTRACE_MATCH_NEXT) + if ((rc = (*matched)(probe, arg)) != + DTRACE_MATCH_NEXT) { + if (rc == DTRACE_MATCH_FAIL) + return (DTRACE_MATCH_FAIL); break; + } } return (nmatched); @@ -6733,8 +6745,11 @@ dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid, nmatched++; - if ((*matched)(probe, arg) != DTRACE_MATCH_NEXT) + if ((rc = (*matched)(probe, arg)) != DTRACE_MATCH_NEXT) { + if (rc == DTRACE_MATCH_FAIL) + return (DTRACE_MATCH_FAIL); break; + } } return (nmatched); @@ -6954,7 +6969,7 @@ dtrace_unregister(dtrace_provider_id_t id) dtrace_probe_t *probe, *first = NULL; if (old->dtpv_pops.dtps_enable == - (void (*)(void *, dtrace_id_t, void *))dtrace_nullop) { + (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop) { /* * If DTrace itself is the provider, we're called with locks * already held. @@ -7100,7 +7115,7 @@ dtrace_invalidate(dtrace_provider_id_t id) dtrace_provider_t *pvp = (dtrace_provider_t *)id; ASSERT(pvp->dtpv_pops.dtps_enable != - (void (*)(void *, dtrace_id_t, void *))dtrace_nullop); + (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop); mutex_enter(&dtrace_provider_lock); mutex_enter(&dtrace_lock); @@ -7141,7 +7156,7 @@ dtrace_condense(dtrace_provider_id_t id) * Make sure this isn't the dtrace provider itself. 
*/ ASSERT(prov->dtpv_pops.dtps_enable != - (void (*)(void *, dtrace_id_t, void *))dtrace_nullop); + (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop); mutex_enter(&dtrace_provider_lock); mutex_enter(&dtrace_lock); @@ -8102,7 +8117,7 @@ dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs, break; default: - err += efunc(dp->dtdo_len - 1, "bad return size"); + err += efunc(dp->dtdo_len - 1, "bad return size\n"); } } @@ -9095,7 +9110,7 @@ dtrace_ecb_add(dtrace_state_t *state, dtrace_probe_t *probe) return (ecb); } -static void +static int dtrace_ecb_enable(dtrace_ecb_t *ecb) { dtrace_probe_t *probe = ecb->dte_probe; @@ -9108,7 +9123,7 @@ dtrace_ecb_enable(dtrace_ecb_t *ecb) /* * This is the NULL probe -- there's nothing to do. */ - return; + return (0); } if (probe->dtpr_ecb == NULL) { @@ -9122,8 +9137,8 @@ dtrace_ecb_enable(dtrace_ecb_t *ecb) if (ecb->dte_predicate != NULL) probe->dtpr_predcache = ecb->dte_predicate->dtp_cacheid; - prov->dtpv_pops.dtps_enable(prov->dtpv_arg, - probe->dtpr_id, probe->dtpr_arg); + return (prov->dtpv_pops.dtps_enable(prov->dtpv_arg, + probe->dtpr_id, probe->dtpr_arg)); } else { /* * This probe is already active. Swing the last pointer to @@ -9136,6 +9151,7 @@ dtrace_ecb_enable(dtrace_ecb_t *ecb) probe->dtpr_predcache = 0; dtrace_sync(); + return (0); } } @@ -9919,7 +9935,9 @@ dtrace_ecb_create_enable(dtrace_probe_t *probe, void *arg) if ((ecb = dtrace_ecb_create(state, probe, enab)) == NULL) return (DTRACE_MATCH_DONE); - dtrace_ecb_enable(ecb); + if (dtrace_ecb_enable(ecb) < 0) + return (DTRACE_MATCH_FAIL); + return (DTRACE_MATCH_NEXT); } @@ -10714,7 +10732,7 @@ static int dtrace_enabling_match(dtrace_enabling_t *enab, int *nmatched) { int i = 0; - int matched = 0; + int total_matched = 0, matched = 0; ASSERT(MUTEX_HELD(&cpu_lock)); ASSERT(MUTEX_HELD(&dtrace_lock)); @@ -10725,7 +10743,14 @@ dtrace_enabling_match(dtrace_enabling_t *enab, int *nmatched) enab->dten_current = ep; enab->dten_error = 0; - matched += dtrace_probe_enable(&ep->dted_probe, enab); + /* + * If a provider failed to enable a probe then get out and + * let the consumer know we failed. 
+ */ + if ((matched = dtrace_probe_enable(&ep->dted_probe, enab)) < 0) + return (EBUSY); + + total_matched += matched; if (enab->dten_error != 0) { /* @@ -10753,7 +10778,7 @@ dtrace_enabling_match(dtrace_enabling_t *enab, int *nmatched) enab->dten_probegen = dtrace_probegen; if (nmatched != NULL) - *nmatched = matched; + *nmatched = total_matched; return (0); } @@ -10991,7 +11016,8 @@ dtrace_dof_copyin(uintptr_t uarg, int *errp) dof = kmem_alloc(hdr.dofh_loadsz, KM_SLEEP); - if (copyin((void *)uarg, dof, hdr.dofh_loadsz) != 0) { + if (copyin((void *)uarg, dof, hdr.dofh_loadsz) != 0 || + dof->dofh_loadsz != hdr.dofh_loadsz) { kmem_free(dof, hdr.dofh_loadsz); *errp = EFAULT; return (NULL); @@ -11719,6 +11745,13 @@ dtrace_dof_slurp(dof_hdr_t *dof, dtrace_vstate_t *vstate, cred_t *cr, } } + if (DOF_SEC_ISLOADABLE(sec->dofs_type) && + !(sec->dofs_flags & DOF_SECF_LOAD)) { + dtrace_dof_error(dof, "loadable section with load " + "flag unset"); + return (-1); + } + if (!(sec->dofs_flags & DOF_SECF_LOAD)) continue; /* just ignore non-loadable sections */ @@ -14449,7 +14482,7 @@ dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p) mutex_exit(&cpu_lock); if (state == NULL) { - if (--dtrace_opens == 0) + if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL) (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE); mutex_exit(&dtrace_lock); return (EAGAIN); @@ -14485,7 +14518,12 @@ dtrace_close(dev_t dev, int flag, int otyp, cred_t *cred_p) dtrace_state_destroy(state); ASSERT(dtrace_opens > 0); - if (--dtrace_opens == 0) + + /* + * Only relinquish control of the kernel debugger interface when there + * are no consumers and no anonymous enablings. + */ + if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL) (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE); mutex_exit(&dtrace_lock); diff --git a/external/cddl/osnet/dist/uts/common/dtrace/fasttrap.c b/external/cddl/osnet/dist/uts/common/dtrace/fasttrap.c index fee6d60a572ee..42263e4ef2745 100644 --- a/external/cddl/osnet/dist/uts/common/dtrace/fasttrap.c +++ b/external/cddl/osnet/dist/uts/common/dtrace/fasttrap.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -875,7 +875,7 @@ fasttrap_disable_callbacks(void) } /*ARGSUSED*/ -static void +static int fasttrap_pid_enable(void *arg, dtrace_id_t id, void *parg) { fasttrap_probe_t *probe = parg; @@ -903,7 +903,7 @@ fasttrap_pid_enable(void *arg, dtrace_id_t id, void *parg) * provider can't go away while we're in this code path. */ if (probe->ftp_prov->ftp_retired) - return; + return (0); /* * If we can't find the process, it may be that we're in the context of @@ -912,7 +912,7 @@ fasttrap_pid_enable(void *arg, dtrace_id_t id, void *parg) */ if ((p = sprlock(probe->ftp_pid)) == NULL) { if ((curproc->p_flag & SFORKING) == 0) - return; + return (0); mutex_enter(&pidlock); p = prfind(probe->ftp_pid); @@ -974,7 +974,7 @@ fasttrap_pid_enable(void *arg, dtrace_id_t id, void *parg) * drop our reference on the trap table entry. 
*/ fasttrap_disable_callbacks(); - return; + return (0); } } @@ -982,6 +982,7 @@ fasttrap_pid_enable(void *arg, dtrace_id_t id, void *parg) sprunlock(p); probe->ftp_enabled = 1; + return (0); } /*ARGSUSED*/ @@ -1945,7 +1946,8 @@ fasttrap_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv) probe = kmem_alloc(size, KM_SLEEP); - if (copyin(uprobe, probe, size) != 0) { + if (copyin(uprobe, probe, size) != 0 || + probe->ftps_noffs != noffs) { kmem_free(probe, size); return (EFAULT); } diff --git a/external/cddl/osnet/dist/uts/common/dtrace/lockstat.c b/external/cddl/osnet/dist/uts/common/dtrace/lockstat.c index 55b3fcf8ff7bc..69c8b7254486e 100644 --- a/external/cddl/osnet/dist/uts/common/dtrace/lockstat.c +++ b/external/cddl/osnet/dist/uts/common/dtrace/lockstat.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -83,7 +83,7 @@ static kmutex_t lockstat_test; /* for testing purposes only */ static dtrace_provider_id_t lockstat_id; /*ARGSUSED*/ -static void +static int lockstat_enable(void *arg, dtrace_id_t id, void *parg) { lockstat_probe_t *probe = parg; @@ -102,6 +102,7 @@ lockstat_enable(void *arg, dtrace_id_t id, void *parg) */ mutex_enter(&lockstat_test); mutex_exit(&lockstat_test); + return (0); } /*ARGSUSED*/ diff --git a/external/cddl/osnet/dist/uts/common/dtrace/profile.c b/external/cddl/osnet/dist/uts/common/dtrace/profile.c index da8f58a378619..c1a2d1f1c12fe 100644 --- a/external/cddl/osnet/dist/uts/common/dtrace/profile.c +++ b/external/cddl/osnet/dist/uts/common/dtrace/profile.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -360,7 +360,7 @@ profile_offline(void *arg, cpu_t *cpu, void *oarg) } /*ARGSUSED*/ -static void +static int profile_enable(void *arg, dtrace_id_t id, void *parg) { profile_probe_t *prof = parg; @@ -390,6 +390,7 @@ profile_enable(void *arg, dtrace_id_t id, void *parg) } else { prof->prof_cyclic = cyclic_add_omni(&omni); } + return (0); } /*ARGSUSED*/ diff --git a/external/cddl/osnet/dist/uts/common/dtrace/sdt_subr.c b/external/cddl/osnet/dist/uts/common/dtrace/sdt_subr.c index 20aabcc20867a..a89403ea75859 100644 --- a/external/cddl/osnet/dist/uts/common/dtrace/sdt_subr.c +++ b/external/cddl/osnet/dist/uts/common/dtrace/sdt_subr.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include static dtrace_pattr_t vtrace_attr = { @@ -43,6 +41,14 @@ static dtrace_pattr_t info_attr = { { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA }, }; +static dtrace_pattr_t fc_attr = { +{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_ISA }, +{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN }, +{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN }, +{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA }, +{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_ISA }, +}; + static dtrace_pattr_t fpu_attr = { { DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_ISA }, { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN }, @@ -83,6 +89,14 @@ static dtrace_pattr_t xpv_attr = { { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_PLATFORM }, }; +static dtrace_pattr_t iscsi_attr = { +{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_ISA }, +{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN }, +{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN }, +{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA }, +{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_ISA }, +}; + sdt_provider_t sdt_providers[] = { { "vtrace", "__vtrace_", &vtrace_attr, 0 }, { "sysinfo", "__cpu_sysinfo_", &info_attr, 0 }, @@ -94,9 +108,12 @@ sdt_provider_t sdt_providers[] = { { "ip", "__ip_", &stab_attr, 0 }, { "mib", "__mib_", &stab_attr, 0 }, { "fsinfo", "__fsinfo_", &fsinfo_attr, 0 }, + { "iscsi", "__iscsi_", &iscsi_attr, 0 }, { "nfsv3", "__nfsv3_", &stab_attr, 0 }, { "nfsv4", "__nfsv4_", &stab_attr, 0 }, { "xpv", "__xpv_", &xpv_attr, 0 }, + { "fc", "__fc_", &fc_attr, 0 }, + { "srp", "__srp_", &fc_attr, 0 }, { "sysevent", "__sysevent_", &stab_attr, 0 }, { "sdt", NULL, &sdt_attr, 0 }, { NULL } @@ -170,6 +187,73 @@ sdt_argdesc_t sdt_args[] = { { "fsinfo", NULL, 0, 0, "vnode_t *", "fileinfo_t *" }, { "fsinfo", NULL, 1, 1, "int", "int" }, + { "iscsi", "async-send", 0, 0, "idm_conn_t *", "conninfo_t *" }, + { "iscsi", "async-send", 1, 1, "iscsi_async_evt_hdr_t *", + "iscsiinfo_t *" }, + { "iscsi", "login-command", 0, 0, "idm_conn_t *", "conninfo_t *" }, + { "iscsi", "login-command", 1, 1, "iscsi_login_hdr_t *", + "iscsiinfo_t *" }, + { "iscsi", "login-response", 0, 0, "idm_conn_t *", "conninfo_t *" }, + { "iscsi", "login-response", 1, 1, "iscsi_login_rsp_hdr_t *", + "iscsiinfo_t *" }, + { "iscsi", "logout-command", 0, 0, "idm_conn_t *", "conninfo_t *" }, + { "iscsi", "logout-command", 1, 1, "iscsi_logout_hdr_t *", + "iscsiinfo_t *" }, + { "iscsi", "logout-response", 0, 0, "idm_conn_t *", "conninfo_t *" }, + { "iscsi", "logout-response", 1, 1, "iscsi_logout_rsp_hdr_t *", + "iscsiinfo_t *" }, + { "iscsi", "data-request", 0, 0, "idm_conn_t *", "conninfo_t *" }, + { "iscsi", "data-request", 1, 1, "iscsi_rtt_hdr_t *", + "iscsiinfo_t *" }, + { "iscsi", "data-send", 0, 0, "idm_conn_t *", "conninfo_t *" }, + { "iscsi", "data-send", 1, 1, "iscsi_data_rsp_hdr_t *", + "iscsiinfo_t *" }, + { "iscsi", "data-receive", 0, 0, "idm_conn_t *", "conninfo_t *" }, + { "iscsi", "data-receive", 1, 1, "iscsi_data_hdr_t *", + "iscsiinfo_t *" }, + { "iscsi", "nop-send", 0, 0, "idm_conn_t *", "conninfo_t *" }, + { "iscsi", "nop-send", 1, 1, "iscsi_nop_in_hdr_t *", "iscsiinfo_t *" }, + { "iscsi", "nop-receive", 0, 0, "idm_conn_t *", 
"conninfo_t *" }, + { "iscsi", "nop-receive", 1, 1, "iscsi_nop_out_hdr_t *", + "iscsiinfo_t *" }, + { "iscsi", "scsi-command", 0, 0, "idm_conn_t *", "conninfo_t *" }, + { "iscsi", "scsi-command", 1, 1, "iscsi_scsi_cmd_hdr_t *", + "iscsiinfo_t *" }, + { "iscsi", "scsi-command", 2, 2, "scsi_task_t *", "scsicmd_t *" }, + { "iscsi", "scsi-response", 0, 0, "idm_conn_t *", "conninfo_t *" }, + { "iscsi", "scsi-response", 1, 1, "iscsi_scsi_rsp_hdr_t *", + "iscsiinfo_t *" }, + { "iscsi", "task-command", 0, 0, "idm_conn_t *", "conninfo_t *" }, + { "iscsi", "task-command", 1, 1, "iscsi_scsi_task_mgt_hdr_t *", + "iscsiinfo_t *" }, + { "iscsi", "task-response", 0, 0, "idm_conn_t *", "conninfo_t *" }, + { "iscsi", "task-response", 1, 1, "iscsi_scsi_task_mgt_rsp_hdr_t *", + "iscsiinfo_t *" }, + { "iscsi", "text-command", 0, 0, "idm_conn_t *", "conninfo_t *" }, + { "iscsi", "text-command", 1, 1, "iscsi_text_hdr_t *", + "iscsiinfo_t *" }, + { "iscsi", "text-response", 0, 0, "idm_conn_t *", "conninfo_t *" }, + { "iscsi", "text-response", 1, 1, "iscsi_text_rsp_hdr_t *", + "iscsiinfo_t *" }, + { "iscsi", "xfer-start", 0, 0, "idm_conn_t *", "conninfo_t *" }, + { "iscsi", "xfer-start", 1, 0, "idm_conn_t *", "iscsiinfo_t *" }, + { "iscsi", "xfer-start", 2, 1, "uintptr_t", "xferinfo_t *" }, + { "iscsi", "xfer-start", 3, 2, "uint32_t"}, + { "iscsi", "xfer-start", 4, 3, "uintptr_t"}, + { "iscsi", "xfer-start", 5, 4, "uint32_t"}, + { "iscsi", "xfer-start", 6, 5, "uint32_t"}, + { "iscsi", "xfer-start", 7, 6, "uint32_t"}, + { "iscsi", "xfer-start", 8, 7, "int"}, + { "iscsi", "xfer-done", 0, 0, "idm_conn_t *", "conninfo_t *" }, + { "iscsi", "xfer-done", 1, 0, "idm_conn_t *", "iscsiinfo_t *" }, + { "iscsi", "xfer-done", 2, 1, "uintptr_t", "xferinfo_t *" }, + { "iscsi", "xfer-done", 3, 2, "uint32_t"}, + { "iscsi", "xfer-done", 4, 3, "uintptr_t"}, + { "iscsi", "xfer-done", 5, 4, "uint32_t"}, + { "iscsi", "xfer-done", 6, 5, "uint32_t"}, + { "iscsi", "xfer-done", 7, 6, "uint32_t"}, + { "iscsi", "xfer-done", 8, 7, "int"}, + { "nfsv3", "op-getattr-start", 0, 0, "struct svc_req *", "conninfo_t *" }, { "nfsv3", "op-getattr-start", 1, 1, "nfsv3oparg_t *", @@ -864,6 +948,154 @@ sdt_argdesc_t sdt_args[] = { { "xpv", "setvcpucontext-end", 0, 0, "int" }, { "xpv", "setvcpucontext-start", 0, 0, "domid_t" }, { "xpv", "setvcpucontext-start", 1, 1, "vcpu_guest_context_t *" }, + + { "srp", "service-up", 0, 0, "srpt_session_t *", "conninfo_t *" }, + { "srp", "service-up", 1, 0, "srpt_session_t *", "srp_portinfo_t *" }, + { "srp", "service-down", 0, 0, "srpt_session_t *", "conninfo_t *" }, + { "srp", "service-down", 1, 0, "srpt_session_t *", + "srp_portinfo_t *" }, + { "srp", "login-command", 0, 0, "srpt_session_t *", "conninfo_t *" }, + { "srp", "login-command", 1, 0, "srpt_session_t *", + "srp_portinfo_t *" }, + { "srp", "login-command", 2, 1, "srp_login_req_t *", + "srp_logininfo_t *" }, + { "srp", "login-response", 0, 0, "srpt_session_t *", "conninfo_t *" }, + { "srp", "login-response", 1, 0, "srpt_session_t *", + "srp_portinfo_t *" }, + { "srp", "login-response", 2, 1, "srp_login_rsp_t *", + "srp_logininfo_t *" }, + { "srp", "login-response", 3, 2, "srp_login_rej_t *" }, + { "srp", "logout-command", 0, 0, "srpt_channel_t *", "conninfo_t *" }, + { "srp", "logout-command", 1, 0, "srpt_channel_t *", + "srp_portinfo_t *" }, + { "srp", "task-command", 0, 0, "srpt_channel_t *", "conninfo_t *" }, + { "srp", "task-command", 1, 0, "srpt_channel_t *", + "srp_portinfo_t *" }, + { "srp", "task-command", 2, 1, "srp_cmd_req_t *", "srp_taskinfo_t *" 
}, + { "srp", "task-response", 0, 0, "srpt_channel_t *", "conninfo_t *" }, + { "srp", "task-response", 1, 0, "srpt_channel_t *", + "srp_portinfo_t *" }, + { "srp", "task-response", 2, 1, "srp_rsp_t *", "srp_taskinfo_t *" }, + { "srp", "task-response", 3, 2, "scsi_task_t *" }, + { "srp", "task-response", 4, 3, "int8_t" }, + { "srp", "scsi-command", 0, 0, "srpt_channel_t *", "conninfo_t *" }, + { "srp", "scsi-command", 1, 0, "srpt_channel_t *", + "srp_portinfo_t *" }, + { "srp", "scsi-command", 2, 1, "scsi_task_t *", "scsicmd_t *" }, + { "srp", "scsi-command", 3, 2, "srp_cmd_req_t *", "srp_taskinfo_t *" }, + { "srp", "scsi-response", 0, 0, "srpt_channel_t *", "conninfo_t *" }, + { "srp", "scsi-response", 1, 0, "srpt_channel_t *", + "srp_portinfo_t *" }, + { "srp", "scsi-response", 2, 1, "srp_rsp_t *", "srp_taskinfo_t *" }, + { "srp", "scsi-response", 3, 2, "scsi_task_t *" }, + { "srp", "scsi-response", 4, 3, "int8_t" }, + { "srp", "xfer-start", 0, 0, "srpt_channel_t *", "conninfo_t *" }, + { "srp", "xfer-start", 1, 0, "srpt_channel_t *", + "srp_portinfo_t *" }, + { "srp", "xfer-start", 2, 1, "ibt_wr_ds_t *", "xferinfo_t *" }, + { "srp", "xfer-start", 3, 2, "srpt_iu_t *", "srp_taskinfo_t *" }, + { "srp", "xfer-start", 4, 3, "ibt_send_wr_t *"}, + { "srp", "xfer-start", 5, 4, "uint32_t" }, + { "srp", "xfer-start", 6, 5, "uint32_t" }, + { "srp", "xfer-start", 7, 6, "uint32_t" }, + { "srp", "xfer-start", 8, 7, "uint32_t" }, + { "srp", "xfer-done", 0, 0, "srpt_channel_t *", "conninfo_t *" }, + { "srp", "xfer-done", 1, 0, "srpt_channel_t *", + "srp_portinfo_t *" }, + { "srp", "xfer-done", 2, 1, "ibt_wr_ds_t *", "xferinfo_t *" }, + { "srp", "xfer-done", 3, 2, "srpt_iu_t *", "srp_taskinfo_t *" }, + { "srp", "xfer-done", 4, 3, "ibt_send_wr_t *"}, + { "srp", "xfer-done", 5, 4, "uint32_t" }, + { "srp", "xfer-done", 6, 5, "uint32_t" }, + { "srp", "xfer-done", 7, 6, "uint32_t" }, + { "srp", "xfer-done", 8, 7, "uint32_t" }, + + { "fc", "link-up", 0, 0, "fct_i_local_port_t *", "conninfo_t *" }, + { "fc", "link-down", 0, 0, "fct_i_local_port_t *", "conninfo_t *" }, + { "fc", "fabric-login-start", 0, 0, "fct_i_local_port_t *", + "conninfo_t *" }, + { "fc", "fabric-login-start", 1, 0, "fct_i_local_port_t *", + "fc_port_info_t *" }, + { "fc", "fabric-login-end", 0, 0, "fct_i_local_port_t *", + "conninfo_t *" }, + { "fc", "fabric-login-end", 1, 0, "fct_i_local_port_t *", + "fc_port_info_t *" }, + { "fc", "rport-login-start", 0, 0, "fct_cmd_t *", + "conninfo_t *" }, + { "fc", "rport-login-start", 1, 1, "fct_local_port_t *", + "fc_port_info_t *" }, + { "fc", "rport-login-start", 2, 2, "fct_i_remote_port_t *", + "fc_port_info_t *" }, + { "fc", "rport-login-start", 3, 3, "int", "int" }, + { "fc", "rport-login-end", 0, 0, "fct_cmd_t *", + "conninfo_t *" }, + { "fc", "rport-login-end", 1, 1, "fct_local_port_t *", + "fc_port_info_t *" }, + { "fc", "rport-login-end", 2, 2, "fct_i_remote_port_t *", + "fc_port_info_t *" }, + { "fc", "rport-login-end", 3, 3, "int", "int" }, + { "fc", "rport-login-end", 4, 4, "int", "int" }, + { "fc", "rport-logout-start", 0, 0, "fct_cmd_t *", + "conninfo_t *" }, + { "fc", "rport-logout-start", 1, 1, "fct_local_port_t *", + "fc_port_info_t *" }, + { "fc", "rport-logout-start", 2, 2, "fct_i_remote_port_t *", + "fc_port_info_t *" }, + { "fc", "rport-logout-start", 3, 3, "int", "int" }, + { "fc", "rport-logout-end", 0, 0, "fct_cmd_t *", + "conninfo_t *" }, + { "fc", "rport-logout-end", 1, 1, "fct_local_port_t *", + "fc_port_info_t *" }, + { "fc", "rport-logout-end", 2, 2, "fct_i_remote_port_t 
*", + "fc_port_info_t *" }, + { "fc", "rport-logout-end", 3, 3, "int", "int" }, + { "fc", "scsi-command", 0, 0, "fct_cmd_t *", + "conninfo_t *" }, + { "fc", "scsi-command", 1, 1, "fct_i_local_port_t *", + "fc_port_info_t *" }, + { "fc", "scsi-command", 2, 2, "scsi_task_t *", + "scsicmd_t *" }, + { "fc", "scsi-command", 3, 3, "fct_i_remote_port_t *", + "fc_port_info_t *" }, + { "fc", "scsi-response", 0, 0, "fct_cmd_t *", + "conninfo_t *" }, + { "fc", "scsi-response", 1, 1, "fct_i_local_port_t *", + "fc_port_info_t *" }, + { "fc", "scsi-response", 2, 2, "scsi_task_t *", + "scsicmd_t *" }, + { "fc", "scsi-response", 3, 3, "fct_i_remote_port_t *", + "fc_port_info_t *" }, + { "fc", "xfer-start", 0, 0, "fct_cmd_t *", + "conninfo_t *" }, + { "fc", "xfer-start", 1, 1, "fct_i_local_port_t *", + "fc_port_info_t *" }, + { "fc", "xfer-start", 2, 2, "scsi_task_t *", + "scsicmd_t *" }, + { "fc", "xfer-start", 3, 3, "fct_i_remote_port_t *", + "fc_port_info_t *" }, + { "fc", "xfer-start", 4, 4, "stmf_data_buf_t *", + "fc_xferinfo_t *" }, + { "fc", "xfer-done", 0, 0, "fct_cmd_t *", + "conninfo_t *" }, + { "fc", "xfer-done", 1, 1, "fct_i_local_port_t *", + "fc_port_info_t *" }, + { "fc", "xfer-done", 2, 2, "scsi_task_t *", + "scsicmd_t *" }, + { "fc", "xfer-done", 3, 3, "fct_i_remote_port_t *", + "fc_port_info_t *" }, + { "fc", "xfer-done", 4, 4, "stmf_data_buf_t *", + "fc_xferinfo_t *" }, + { "fc", "rscn-receive", 0, 0, "fct_i_local_port_t *", + "conninfo_t *" }, + { "fc", "rscn-receive", 1, 1, "int", "int"}, + { "fc", "abts-receive", 0, 0, "fct_cmd_t *", + "conninfo_t *" }, + { "fc", "abts-receive", 1, 1, "fct_i_local_port_t *", + "fc_port_info_t *" }, + { "fc", "abts-receive", 2, 2, "fct_i_remote_port_t *", + "fc_port_info_t *" }, + + { NULL } }; diff --git a/external/cddl/osnet/dist/uts/common/dtrace/systrace.c b/external/cddl/osnet/dist/uts/common/dtrace/systrace.c index fe7bee1ac85ff..b864041c450da 100644 --- a/external/cddl/osnet/dist/uts/common/dtrace/systrace.c +++ b/external/cddl/osnet/dist/uts/common/dtrace/systrace.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -140,7 +140,7 @@ systrace_destroy(void *arg, dtrace_id_t id, void *parg) } /*ARGSUSED*/ -static void +static int systrace_enable(void *arg, dtrace_id_t id, void *parg) { int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg); @@ -161,7 +161,7 @@ systrace_enable(void *arg, dtrace_id_t id, void *parg) if (enabled) { ASSERT(sysent[sysnum].sy_callc == dtrace_systrace_syscall); - return; + return (0); } (void) casptr(&sysent[sysnum].sy_callc, @@ -172,6 +172,7 @@ systrace_enable(void *arg, dtrace_id_t id, void *parg) (void *)systrace_sysent32[sysnum].stsy_underlying, (void *)dtrace_systrace_syscall32); #endif + return (0); } /*ARGSUSED*/ diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/arc.c b/external/cddl/osnet/dist/uts/common/fs/zfs/arc.c index e6bba841ace5c..d0bf26e5f3b00 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/arc.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/arc.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -119,11 +119,11 @@ #include #include -#include #include #include #include #include +#include #ifdef _KERNEL #include #include @@ -132,6 +132,7 @@ #endif #include #include +#include static kmutex_t arc_reclaim_thr_lock; static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */ @@ -152,6 +153,12 @@ typedef enum arc_reclaim_strategy { /* number of seconds before growing cache again */ static int arc_grow_retry = 60; +/* shift of arc_c for calculating both min and max arc_p */ +static int arc_p_min_shift = 4; + +/* log2(fraction of arc to reclaim) */ +static int arc_shrink_shift = 5; + /* * minimum lifespan of a prefetch block in clock ticks * (initialized in arc_init()) @@ -171,7 +178,9 @@ static boolean_t arc_warm; uint64_t zfs_arc_max; uint64_t zfs_arc_min; uint64_t zfs_arc_meta_limit = 0; -int zfs_mdcomp_disable = 0; +int zfs_arc_grow_retry = 0; +int zfs_arc_shrink_shift = 0; +int zfs_arc_p_min_shift = 0; /* * Note that buffers can be in one of 6 states: @@ -239,6 +248,9 @@ typedef struct arc_stats { kstat_named_t arcstat_recycle_miss; kstat_named_t arcstat_mutex_miss; kstat_named_t arcstat_evict_skip; + kstat_named_t arcstat_evict_l2_cached; + kstat_named_t arcstat_evict_l2_eligible; + kstat_named_t arcstat_evict_l2_ineligible; kstat_named_t arcstat_hash_elements; kstat_named_t arcstat_hash_elements_max; kstat_named_t arcstat_hash_collisions; @@ -250,10 +262,14 @@ typedef struct arc_stats { kstat_named_t arcstat_c_max; kstat_named_t arcstat_size; kstat_named_t arcstat_hdr_size; + kstat_named_t arcstat_data_size; + kstat_named_t arcstat_other_size; kstat_named_t arcstat_l2_hits; kstat_named_t arcstat_l2_misses; kstat_named_t arcstat_l2_feeds; kstat_named_t arcstat_l2_rw_clash; + kstat_named_t arcstat_l2_read_bytes; + kstat_named_t arcstat_l2_write_bytes; kstat_named_t arcstat_l2_writes_sent; kstat_named_t arcstat_l2_writes_done; kstat_named_t arcstat_l2_writes_error; @@ -288,6 +304,9 @@ static arc_stats_t arc_stats = { { "recycle_miss", KSTAT_DATA_UINT64 }, { "mutex_miss", KSTAT_DATA_UINT64 }, { "evict_skip", KSTAT_DATA_UINT64 }, + { "evict_l2_cached", KSTAT_DATA_UINT64 }, + { "evict_l2_eligible", KSTAT_DATA_UINT64 }, + { "evict_l2_ineligible", KSTAT_DATA_UINT64 }, { "hash_elements", KSTAT_DATA_UINT64 }, { "hash_elements_max", KSTAT_DATA_UINT64 }, { "hash_collisions", KSTAT_DATA_UINT64 }, @@ -299,10 +318,14 @@ static arc_stats_t arc_stats = { { "c_max", KSTAT_DATA_UINT64 }, { "size", KSTAT_DATA_UINT64 }, { "hdr_size", KSTAT_DATA_UINT64 }, + { "data_size", KSTAT_DATA_UINT64 }, + { "other_size", KSTAT_DATA_UINT64 }, { "l2_hits", KSTAT_DATA_UINT64 }, { "l2_misses", KSTAT_DATA_UINT64 }, { "l2_feeds", KSTAT_DATA_UINT64 }, { "l2_rw_clash", KSTAT_DATA_UINT64 }, + { "l2_read_bytes", KSTAT_DATA_UINT64 }, + { "l2_write_bytes", KSTAT_DATA_UINT64 }, { "l2_writes_sent", KSTAT_DATA_UINT64 }, { "l2_writes_done", KSTAT_DATA_UINT64 }, { "l2_writes_error", KSTAT_DATA_UINT64 }, @@ -323,7 +346,7 @@ static arc_stats_t arc_stats = { #define ARCSTAT_INCR(stat, val) \ atomic_add_64(&arc_stats.stat.value.ui64, (val)); -#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) +#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) #define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) #define ARCSTAT_MAX(stat, val) { \ @@ -357,7 +380,7 @@ static arc_stats_t arc_stats = { } kstat_t *arc_ksp; -static arc_state_t *arc_anon; +static arc_state_t *arc_anon; static arc_state_t *arc_mru; static arc_state_t *arc_mru_ghost; static arc_state_t *arc_mfu; @@ -380,6 +403,7 @@ static arc_state_t *arc_l2c_only; static int 
arc_no_grow; /* Don't try to grow cache size */ static uint64_t arc_tempreserve; +static uint64_t arc_loaned_bytes; static uint64_t arc_meta_used; static uint64_t arc_meta_limit; static uint64_t arc_meta_max = 0; @@ -425,7 +449,7 @@ struct arc_buf_hdr { /* immutable */ arc_buf_contents_t b_type; uint64_t b_size; - spa_t *b_spa; + uint64_t b_spa; /* protected by arc state mutex */ arc_state_t *b_state; @@ -447,7 +471,9 @@ static arc_buf_hdr_t arc_eviction_hdr; static void arc_get_data_buf(arc_buf_t *buf); static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock); static int arc_evict_needed(arc_buf_contents_t type); -static void arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes); +static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes); + +static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab); #define GHOST_STATE(state) \ ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ @@ -471,11 +497,11 @@ static void arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes); #define ARC_L2_WRITING (1 << 16) /* L2ARC write in progress */ #define ARC_L2_EVICTED (1 << 17) /* evicted during I/O */ #define ARC_L2_WRITE_HEAD (1 << 18) /* head of write list */ -#define ARC_STORED (1 << 19) /* has been store()d to */ #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE) #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS) #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR) +#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_PREFETCH) #define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ) #define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE) #define HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS) @@ -529,8 +555,9 @@ uint64_t zfs_crc64_table[256]; */ #define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ -#define L2ARC_HEADROOM 4 /* num of writes */ -#define L2ARC_FEED_SECS 1 /* caching interval */ +#define L2ARC_HEADROOM 2 /* num of writes */ +#define L2ARC_FEED_SECS 1 /* caching interval secs */ +#define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */ #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) @@ -542,7 +569,10 @@ uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */ uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */ uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */ uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ +uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */ boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ +boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */ +boolean_t l2arc_norw = B_TRUE; /* no reads during writes */ /* * L2ARC Internals @@ -557,6 +587,7 @@ typedef struct l2arc_dev { uint64_t l2ad_end; /* last addr on device */ uint64_t l2ad_evict; /* last addr eviction reached */ boolean_t l2ad_first; /* first sweep through */ + boolean_t l2ad_writing; /* currently writing */ list_t *l2ad_buflist; /* buffer list */ list_node_t l2ad_node; /* device list node */ } l2arc_dev_t; @@ -587,7 +618,7 @@ typedef struct l2arc_write_callback { struct l2arc_buf_hdr { /* protected by arc_buf_hdr mutex */ l2arc_dev_t *b_dev; /* L2ARC device */ - daddr_t b_daddr; /* disk address, offset byte */ + uint64_t b_daddr; /* disk address, offset byte */ }; typedef struct l2arc_data_free { @@ -607,9 +638,8 @@ static void l2arc_hdr_stat_add(void); static 
void l2arc_hdr_stat_remove(void); static uint64_t -buf_hash(spa_t *spa, const dva_t *dva, uint64_t birth) +buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) { - uintptr_t spav = (uintptr_t)spa; uint8_t *vdva = (uint8_t *)dva; uint64_t crc = -1ULL; int i; @@ -619,7 +649,7 @@ buf_hash(spa_t *spa, const dva_t *dva, uint64_t birth) for (i = 0; i < sizeof (dva_t); i++) crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF]; - crc ^= (spav>>8) ^ birth; + crc ^= (spa>>8) ^ birth; return (crc); } @@ -635,7 +665,7 @@ buf_hash(spa_t *spa, const dva_t *dva, uint64_t birth) ((buf)->b_birth == birth) && ((buf)->b_spa == spa) static arc_buf_hdr_t * -buf_hash_find(spa_t *spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp) +buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp) { uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); kmutex_t *hash_lock = BUF_HASH_LOCK(idx); @@ -755,8 +785,8 @@ hdr_cons(void *vbuf, void *unused, int kmflag) refcount_create(&buf->b_refcnt); cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL); mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); + arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS); - ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE); return (0); } @@ -768,6 +798,8 @@ buf_cons(void *vbuf, void *unused, int kmflag) bzero(buf, sizeof (arc_buf_t)); rw_init(&buf->b_lock, NULL, RW_DEFAULT, NULL); + arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); + return (0); } @@ -781,11 +813,11 @@ hdr_dest(void *vbuf, void *unused) { arc_buf_hdr_t *buf = vbuf; + ASSERT(BUF_EMPTY(buf)); refcount_destroy(&buf->b_refcnt); cv_destroy(&buf->b_cv); mutex_destroy(&buf->b_freeze_lock); - - ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE); + arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS); } /* ARGSUSED */ @@ -795,6 +827,7 @@ buf_dest(void *vbuf, void *unused) arc_buf_t *buf = vbuf; rw_destroy(&buf->b_lock); + arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); } /* @@ -1004,6 +1037,8 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) ASSERT(new_state != old_state); ASSERT(refcnt == 0 || ab->b_datacnt > 0); ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state)); + ASSERT(ab->b_datacnt <= 1 || new_state != arc_anon); + ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon); from_delta = to_delta = ab->b_datacnt * ab->b_size; @@ -1081,15 +1116,49 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) } void -arc_space_consume(uint64_t space) +arc_space_consume(uint64_t space, arc_space_type_t type) { + ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); + + switch (type) { + case ARC_SPACE_DATA: + ARCSTAT_INCR(arcstat_data_size, space); + break; + case ARC_SPACE_OTHER: + ARCSTAT_INCR(arcstat_other_size, space); + break; + case ARC_SPACE_HDRS: + ARCSTAT_INCR(arcstat_hdr_size, space); + break; + case ARC_SPACE_L2HDRS: + ARCSTAT_INCR(arcstat_l2_hdr_size, space); + break; + } + atomic_add_64(&arc_meta_used, space); atomic_add_64(&arc_size, space); } void -arc_space_return(uint64_t space) +arc_space_return(uint64_t space, arc_space_type_t type) { + ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); + + switch (type) { + case ARC_SPACE_DATA: + ARCSTAT_INCR(arcstat_data_size, -space); + break; + case ARC_SPACE_OTHER: + ARCSTAT_INCR(arcstat_other_size, -space); + break; + case ARC_SPACE_HDRS: + ARCSTAT_INCR(arcstat_hdr_size, -space); + break; + case ARC_SPACE_L2HDRS: + ARCSTAT_INCR(arcstat_l2_hdr_size, -space); + break; + } + ASSERT(arc_meta_used >= space); if (arc_meta_max < arc_meta_used) arc_meta_max = 
arc_meta_used; @@ -1126,7 +1195,7 @@ arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type) ASSERT(BUF_EMPTY(hdr)); hdr->b_size = size; hdr->b_type = type; - hdr->b_spa = spa; + hdr->b_spa = spa_guid(spa); hdr->b_state = arc_anon; hdr->b_arc_access = 0; buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); @@ -1145,6 +1214,58 @@ arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type) return (buf); } +static char *arc_onloan_tag = "onloan"; + +/* + * Loan out an anonymous arc buffer. Loaned buffers are not counted as in + * flight data by arc_tempreserve_space() until they are "returned". Loaned + * buffers must be returned to the arc before they can be used by the DMU or + * freed. + */ +arc_buf_t * +arc_loan_buf(spa_t *spa, int size) +{ + arc_buf_t *buf; + + buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA); + + atomic_add_64(&arc_loaned_bytes, size); + return (buf); +} + +/* + * Return a loaned arc buffer to the arc. + */ +void +arc_return_buf(arc_buf_t *buf, void *tag) +{ + arc_buf_hdr_t *hdr = buf->b_hdr; + + ASSERT(buf->b_data != NULL); + (void) refcount_add(&hdr->b_refcnt, tag); + (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag); + + atomic_add_64(&arc_loaned_bytes, -hdr->b_size); +} + +/* Detach an arc_buf from a dbuf (tag) */ +void +arc_loan_inuse_buf(arc_buf_t *buf, void *tag) +{ + arc_buf_hdr_t *hdr; + + rw_enter(&buf->b_lock, RW_WRITER); + ASSERT(buf->b_data != NULL); + hdr = buf->b_hdr; + (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag); + (void) refcount_remove(&hdr->b_refcnt, tag); + buf->b_efunc = NULL; + buf->b_private = NULL; + + atomic_add_64(&arc_loaned_bytes, hdr->b_size); + rw_exit(&buf->b_lock); +} + static arc_buf_t * arc_buf_clone(arc_buf_t *from) { @@ -1152,6 +1273,8 @@ arc_buf_clone(arc_buf_t *from) arc_buf_hdr_t *hdr = from->b_hdr; uint64_t size = hdr->b_size; + ASSERT(hdr->b_state != arc_anon); + buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; @@ -1189,6 +1312,7 @@ arc_buf_add_ref(arc_buf_t *buf, void* tag) ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); add_reference(hdr, hash_lock, tag); + DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); arc_access(hdr, hash_lock); mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); @@ -1232,15 +1356,17 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) arc_buf_contents_t type = buf->b_hdr->b_type; arc_cksum_verify(buf); + if (!recycle) { if (type == ARC_BUFC_METADATA) { arc_buf_data_free(buf->b_hdr, zio_buf_free, buf->b_data, size); - arc_space_return(size); + arc_space_return(size, ARC_SPACE_DATA); } else { ASSERT(type == ARC_BUFC_DATA); arc_buf_data_free(buf->b_hdr, zio_data_buf_free, buf->b_data, size); + ARCSTAT_INCR(arcstat_data_size, -size); atomic_add_64(&arc_size, -size); } } @@ -1282,34 +1408,36 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) ASSERT(refcount_is_zero(&hdr->b_refcnt)); ASSERT3P(hdr->b_state, ==, arc_anon); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - ASSERT(!(hdr->b_flags & ARC_STORED)); + l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr; - if (hdr->b_l2hdr != NULL) { - if (!MUTEX_HELD(&l2arc_buflist_mtx)) { - /* - * To prevent arc_free() and l2arc_evict() from - * attempting to free the same buffer at the same time, - * a FREE_IN_PROGRESS flag is given to arc_free() to - * give it priority. l2arc_evict() can't destroy this - * header while we are waiting on l2arc_buflist_mtx. - * - * The hdr may be removed from l2ad_buflist before we - * grab l2arc_buflist_mtx, so b_l2hdr is rechecked. 
- */ + if (l2hdr != NULL) { + boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx); + /* + * To prevent arc_free() and l2arc_evict() from + * attempting to free the same buffer at the same time, + * a FREE_IN_PROGRESS flag is given to arc_free() to + * give it priority. l2arc_evict() can't destroy this + * header while we are waiting on l2arc_buflist_mtx. + * + * The hdr may be removed from l2ad_buflist before we + * grab l2arc_buflist_mtx, so b_l2hdr is rechecked. + */ + if (!buflist_held) { mutex_enter(&l2arc_buflist_mtx); - if (hdr->b_l2hdr != NULL) { - list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist, - hdr); - } - mutex_exit(&l2arc_buflist_mtx); - } else { - list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist, hdr); + l2hdr = hdr->b_l2hdr; } - ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); - kmem_free(hdr->b_l2hdr, sizeof (l2arc_buf_hdr_t)); - if (hdr->b_state == arc_l2c_only) - l2arc_hdr_stat_remove(); - hdr->b_l2hdr = NULL; + + if (l2hdr != NULL) { + list_remove(l2hdr->b_dev->l2ad_buflist, hdr); + ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); + kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); + if (hdr->b_state == arc_l2c_only) + l2arc_hdr_stat_remove(); + hdr->b_l2hdr = NULL; + } + + if (!buflist_held) + mutex_exit(&l2arc_buflist_mtx); } if (!BUF_EMPTY(hdr)) { @@ -1361,10 +1489,13 @@ arc_buf_free(arc_buf_t *buf, void *tag) mutex_enter(hash_lock); (void) remove_reference(hdr, hash_lock, tag); - if (hdr->b_datacnt > 1) + if (hdr->b_datacnt > 1) { arc_buf_destroy(buf, FALSE, TRUE); - else + } else { + ASSERT(buf == hdr->b_buf); + ASSERT(buf->b_efunc == NULL); hdr->b_flags |= ARC_BUF_AVAILABLE; + } mutex_exit(hash_lock); } else if (HDR_IO_IN_PROGRESS(hdr)) { int destroy_hdr; @@ -1398,6 +1529,7 @@ arc_buf_remove_ref(arc_buf_t *buf, void* tag) int no_callback = (buf->b_efunc == NULL); if (hdr->b_state == arc_anon) { + ASSERT(hdr->b_datacnt == 1); arc_buf_free(buf, tag); return (no_callback); } @@ -1412,6 +1544,7 @@ arc_buf_remove_ref(arc_buf_t *buf, void* tag) arc_buf_destroy(buf, FALSE, TRUE); } else if (no_callback) { ASSERT(hdr->b_buf == buf && buf->b_next == NULL); + ASSERT(buf->b_efunc == NULL); hdr->b_flags |= ARC_BUF_AVAILABLE; } ASSERT(no_callback || hdr->b_datacnt > 1 || @@ -1440,7 +1573,7 @@ arc_buf_size(arc_buf_t *buf) * It may also return without evicting as much space as requested. */ static void * -arc_evict(arc_state_t *state, spa_t *spa, int64_t bytes, boolean_t recycle, +arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, arc_buf_contents_t type) { arc_state_t *evicted_state; @@ -1464,7 +1597,8 @@ arc_evict(arc_state_t *state, spa_t *spa, int64_t bytes, boolean_t recycle, if (HDR_IO_IN_PROGRESS(ab) || (spa && ab->b_spa != spa) || (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) && - lbolt - ab->b_arc_access < arc_min_prefetch_lifespan)) { + ddi_get_lbolt() - ab->b_arc_access < + arc_min_prefetch_lifespan)) { skipped++; continue; } @@ -1508,6 +1642,21 @@ arc_evict(arc_state_t *state, spa_t *spa, int64_t bytes, boolean_t recycle, buf->b_data == stolen, TRUE); } } + + if (ab->b_l2hdr) { + ARCSTAT_INCR(arcstat_evict_l2_cached, + ab->b_size); + } else { + if (l2arc_write_eligible(ab->b_spa, ab)) { + ARCSTAT_INCR(arcstat_evict_l2_eligible, + ab->b_size); + } else { + ARCSTAT_INCR( + arcstat_evict_l2_ineligible, + ab->b_size); + } + } + if (ab->b_datacnt == 0) { arc_change_state(evicted_state, ab, hash_lock); ASSERT(HDR_IN_HASH_TABLE(ab)); @@ -1566,13 +1715,14 @@ arc_evict(arc_state_t *state, spa_t *spa, int64_t bytes, boolean_t recycle, * bytes. 
Destroy the buffers that are removed. */ static void -arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes) +arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes) { arc_buf_hdr_t *ab, *ab_prev; list_t *list = &state->arcs_list[ARC_BUFC_DATA]; kmutex_t *hash_lock; uint64_t bytes_deleted = 0; uint64_t bufs_skipped = 0; + boolean_t have_lock; ASSERT(GHOST_STATE(state)); top: @@ -1582,7 +1732,8 @@ arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes) if (spa && ab->b_spa != spa) continue; hash_lock = HDR_LOCK(ab); - if (mutex_tryenter(hash_lock)) { + have_lock = MUTEX_HELD(hash_lock); + if (have_lock || mutex_tryenter(hash_lock)) { ASSERT(!HDR_IO_IN_PROGRESS(ab)); ASSERT(ab->b_buf == NULL); ARCSTAT_BUMP(arcstat_deleted); @@ -1594,10 +1745,12 @@ arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes) * don't destroy the header. */ arc_change_state(arc_l2c_only, ab, hash_lock); - mutex_exit(hash_lock); + if (!have_lock) + mutex_exit(hash_lock); } else { arc_change_state(arc_anon, ab, hash_lock); - mutex_exit(hash_lock); + if (!have_lock) + mutex_exit(hash_lock); arc_hdr_destroy(ab); } @@ -1635,61 +1788,63 @@ arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes) static void arc_adjust(void) { - int64_t top_sz, mru_over, arc_over, todelete; + int64_t adjustment, delta; - top_sz = arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used; + /* + * Adjust MRU size + */ + + adjustment = MIN(arc_size - arc_c, + arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used - arc_p); - if (top_sz > arc_p && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) { - int64_t toevict = - MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], top_sz - arc_p); - (void) arc_evict(arc_mru, NULL, toevict, FALSE, ARC_BUFC_DATA); - top_sz = arc_anon->arcs_size + arc_mru->arcs_size; + if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) { + delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment); + (void) arc_evict(arc_mru, NULL, delta, FALSE, ARC_BUFC_DATA); + adjustment -= delta; } - if (top_sz > arc_p && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { - int64_t toevict = - MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], top_sz - arc_p); - (void) arc_evict(arc_mru, NULL, toevict, FALSE, + if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { + delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment); + (void) arc_evict(arc_mru, NULL, delta, FALSE, ARC_BUFC_METADATA); - top_sz = arc_anon->arcs_size + arc_mru->arcs_size; } - mru_over = top_sz + arc_mru_ghost->arcs_size - arc_c; + /* + * Adjust MFU size + */ - if (mru_over > 0) { - if (arc_mru_ghost->arcs_size > 0) { - todelete = MIN(arc_mru_ghost->arcs_size, mru_over); - arc_evict_ghost(arc_mru_ghost, NULL, todelete); - } + adjustment = arc_size - arc_c; + + if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) { + delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]); + (void) arc_evict(arc_mfu, NULL, delta, FALSE, ARC_BUFC_DATA); + adjustment -= delta; } - if ((arc_over = arc_size - arc_c) > 0) { - int64_t tbl_over; + if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { + int64_t delta = MIN(adjustment, + arc_mfu->arcs_lsize[ARC_BUFC_METADATA]); + (void) arc_evict(arc_mfu, NULL, delta, FALSE, + ARC_BUFC_METADATA); + } - if (arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) { - int64_t toevict = - MIN(arc_mfu->arcs_lsize[ARC_BUFC_DATA], arc_over); - (void) arc_evict(arc_mfu, NULL, toevict, FALSE, - ARC_BUFC_DATA); - arc_over = arc_size - arc_c; - } + /* + * Adjust ghost lists + */ - if (arc_over > 0 && - 
arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { - int64_t toevict = - MIN(arc_mfu->arcs_lsize[ARC_BUFC_METADATA], - arc_over); - (void) arc_evict(arc_mfu, NULL, toevict, FALSE, - ARC_BUFC_METADATA); - } + adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c; - tbl_over = arc_size + arc_mru_ghost->arcs_size + - arc_mfu_ghost->arcs_size - arc_c * 2; + if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) { + delta = MIN(arc_mru_ghost->arcs_size, adjustment); + arc_evict_ghost(arc_mru_ghost, NULL, delta); + } - if (tbl_over > 0 && arc_mfu_ghost->arcs_size > 0) { - todelete = MIN(arc_mfu_ghost->arcs_size, tbl_over); - arc_evict_ghost(arc_mfu_ghost, NULL, todelete); - } + adjustment = + arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c; + + if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) { + delta = MIN(arc_mfu_ghost->arcs_size, adjustment); + arc_evict_ghost(arc_mfu_ghost, NULL, delta); } } @@ -1723,29 +1878,34 @@ arc_do_user_evicts(void) void arc_flush(spa_t *spa) { + uint64_t guid = 0; + + if (spa) + guid = spa_guid(spa); + while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) { - (void) arc_evict(arc_mru, spa, -1, FALSE, ARC_BUFC_DATA); + (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA); if (spa) break; } while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) { - (void) arc_evict(arc_mru, spa, -1, FALSE, ARC_BUFC_METADATA); + (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA); if (spa) break; } while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) { - (void) arc_evict(arc_mfu, spa, -1, FALSE, ARC_BUFC_DATA); + (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA); if (spa) break; } while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) { - (void) arc_evict(arc_mfu, spa, -1, FALSE, ARC_BUFC_METADATA); + (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA); if (spa) break; } - arc_evict_ghost(arc_mru_ghost, spa, -1); - arc_evict_ghost(arc_mfu_ghost, spa, -1); + arc_evict_ghost(arc_mru_ghost, guid, -1); + arc_evict_ghost(arc_mfu_ghost, guid, -1); mutex_enter(&arc_reclaim_thr_lock); arc_do_user_evicts(); @@ -1753,8 +1913,6 @@ arc_flush(spa_t *spa) ASSERT(spa || arc_eviction_list == NULL); } -int arc_shrink_shift = 5; /* log2(fraction of arc to reclaim) */ - void arc_shrink(void) { @@ -1915,12 +2073,12 @@ arc_reclaim_thread(void) } /* reset the growth delay for every reclaim */ - growtime = lbolt + (arc_grow_retry * hz); + growtime = ddi_get_lbolt() + (arc_grow_retry * hz); arc_kmem_reap_now(last_reclaim); arc_warm = B_TRUE; - } else if (arc_no_grow && lbolt >= growtime) { + } else if (arc_no_grow && ddi_get_lbolt() >= growtime) { arc_no_grow = FALSE; } @@ -1934,7 +2092,7 @@ arc_reclaim_thread(void) /* block until needed, or one second, whichever is shorter */ CALLB_CPR_SAFE_BEGIN(&cpr); (void) cv_timedwait(&arc_reclaim_thr_cv, - &arc_reclaim_thr_lock, (lbolt + hz)); + &arc_reclaim_thr_lock, (ddi_get_lbolt() + hz)); CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock); } @@ -1953,6 +2111,7 @@ static void arc_adapt(int bytes, arc_state_t *state) { int mult; + uint64_t arc_p_min = (arc_c >> arc_p_min_shift); if (state == arc_l2c_only) return; @@ -1970,12 +2129,15 @@ arc_adapt(int bytes, arc_state_t *state) mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ? 
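/*
 * Worked example of the rewritten arc_adjust() above, using purely
 * hypothetical sizes: arc_c = 8 GB, arc_p = 3 GB, arc_size = 9 GB, and
 * 4 GB on the anon + MRU lists plus metadata.  The MRU pass evicts
 * MIN(arc_size - arc_c, anon + mru + meta - arc_p) = MIN(1 GB, 1 GB) =
 * 1 GB, taking data buffers first and metadata only for the remainder.
 * The MFU pass then re-reads arc_size; if the MRU pass freed the full
 * 1 GB, the MFU lists are left alone.  Finally the ghost passes trim
 * until mru + mru_ghost <= arc_c and mru_ghost + mfu_ghost <= arc_c.
 */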
1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size)); - arc_p = MIN(arc_c, arc_p + bytes * mult); + arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult); } else if (state == arc_mfu_ghost) { + uint64_t delta; + mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ? 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size)); - arc_p = MAX(0, (int64_t)arc_p - bytes * mult); + delta = MIN(bytes * mult, arc_p); + arc_p = MAX(arc_p_min, arc_p - delta); } ASSERT((int64_t)arc_p >= 0); @@ -2073,10 +2235,11 @@ arc_get_data_buf(arc_buf_t *buf) if (!arc_evict_needed(type)) { if (type == ARC_BUFC_METADATA) { buf->b_data = zio_buf_alloc(size); - arc_space_consume(size); + arc_space_consume(size, ARC_SPACE_DATA); } else { ASSERT(type == ARC_BUFC_DATA); buf->b_data = zio_data_buf_alloc(size); + ARCSTAT_INCR(arcstat_data_size, size); atomic_add_64(&arc_size, size); } goto out; @@ -2093,21 +2256,22 @@ arc_get_data_buf(arc_buf_t *buf) if (state == arc_mru || state == arc_anon) { uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size; - state = (arc_mfu->arcs_lsize[type] > 0 && + state = (arc_mfu->arcs_lsize[type] >= size && arc_p > mru_used) ? arc_mfu : arc_mru; } else { /* MFU cases */ uint64_t mfu_space = arc_c - arc_p; - state = (arc_mru->arcs_lsize[type] > 0 && + state = (arc_mru->arcs_lsize[type] >= size && mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu; } if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) { if (type == ARC_BUFC_METADATA) { buf->b_data = zio_buf_alloc(size); - arc_space_consume(size); + arc_space_consume(size, ARC_SPACE_DATA); } else { ASSERT(type == ARC_BUFC_DATA); buf->b_data = zio_data_buf_alloc(size); + ARCSTAT_INCR(arcstat_data_size, size); atomic_add_64(&arc_size, size); } ARCSTAT_BUMP(arcstat_recycle_miss); @@ -2143,6 +2307,8 @@ arc_get_data_buf(arc_buf_t *buf) static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) { + clock_t now; + ASSERT(MUTEX_HELD(hash_lock)); if (buf->b_state == arc_anon) { @@ -2153,11 +2319,13 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) */ ASSERT(buf->b_arc_access == 0); - buf->b_arc_access = lbolt; + buf->b_arc_access = ddi_get_lbolt(); DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); arc_change_state(arc_mru, buf, hash_lock); } else if (buf->b_state == arc_mru) { + now = ddi_get_lbolt(); + /* * If this buffer is here because of a prefetch, then either: * - clear the flag if this is a "referencing" read @@ -2173,7 +2341,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) buf->b_flags &= ~ARC_PREFETCH; ARCSTAT_BUMP(arcstat_mru_hits); } - buf->b_arc_access = lbolt; + buf->b_arc_access = now; return; } @@ -2182,13 +2350,13 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) * but it is still in the cache. Move it to the MFU * state. */ - if (lbolt > buf->b_arc_access + ARC_MINTIME) { + if (now > buf->b_arc_access + ARC_MINTIME) { /* * More than 125ms have passed since we * instantiated this buffer. Move it to the * most frequently used state. 
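/*
 * Sketch of the MRU hit-promotion rule in arc_access(): a second hit on
 * an MRU buffer only moves it to MFU once more than ARC_MINTIME ticks
 * (the "125ms" mentioned in the comment) have elapsed since
 * b_arc_access was last stamped, and all timestamps now come from
 * ddi_get_lbolt() rather than the lbolt global.
 */
static boolean_t
arc_mru_hit_promotes(arc_buf_hdr_t *buf)
{
	clock_t now = ddi_get_lbolt();

	return (now > buf->b_arc_access + ARC_MINTIME);
}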
*/ - buf->b_arc_access = lbolt; + buf->b_arc_access = now; DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); arc_change_state(arc_mfu, buf, hash_lock); } @@ -2211,7 +2379,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); } - buf->b_arc_access = lbolt; + buf->b_arc_access = ddi_get_lbolt(); arc_change_state(new_state, buf, hash_lock); ARCSTAT_BUMP(arcstat_mru_ghost_hits); @@ -2230,7 +2398,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) ASSERT(list_link_active(&buf->b_arc_node)); } ARCSTAT_BUMP(arcstat_mfu_hits); - buf->b_arc_access = lbolt; + buf->b_arc_access = ddi_get_lbolt(); } else if (buf->b_state == arc_mfu_ghost) { arc_state_t *new_state = arc_mfu; /* @@ -2248,7 +2416,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) new_state = arc_mru; } - buf->b_arc_access = lbolt; + buf->b_arc_access = ddi_get_lbolt(); DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); arc_change_state(new_state, buf, hash_lock); @@ -2258,7 +2426,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) * This buffer is on the 2nd Level ARC. */ - buf->b_arc_access = lbolt; + buf->b_arc_access = ddi_get_lbolt(); DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); arc_change_state(arc_mfu, buf, hash_lock); } else { @@ -2309,7 +2477,7 @@ arc_read_done(zio_t *zio) * reason for it not to be found is if we were freed during the * read. */ - found = buf_hash_find(zio->io_spa, &hdr->b_dva, hdr->b_birth, + found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth, &hash_lock); ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) || @@ -2323,7 +2491,7 @@ arc_read_done(zio_t *zio) /* byteswap if necessary */ callback_list = hdr->b_acb; ASSERT(callback_list != NULL); - if (BP_SHOULD_BYTESWAP(zio->io_bp)) { + if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) { arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ? byteswap_uint64_array : dmu_ot[BP_GET_TYPE(zio->io_bp)].ot_byteswap; @@ -2332,6 +2500,16 @@ arc_read_done(zio_t *zio) arc_cksum_compute(buf, B_FALSE); + if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) { + /* + * Only call arc_access on anonymous buffers. This is because + * if we've issued an I/O for an evicted buffer, we've already + * called arc_access (to prevent any simultaneous readers from + * getting confused). + */ + arc_access(hdr, hash_lock); + } + /* create copies of the data buffer for the callers */ abuf = buf; for (acb = callback_list; acb; acb = acb->acb_next) { @@ -2345,8 +2523,11 @@ arc_read_done(zio_t *zio) hdr->b_acb = NULL; hdr->b_flags &= ~ARC_IO_IN_PROGRESS; ASSERT(!HDR_BUF_AVAILABLE(hdr)); - if (abuf == buf) + if (abuf == buf) { + ASSERT(buf->b_efunc == NULL); + ASSERT(hdr->b_datacnt == 1); hdr->b_flags |= ARC_BUF_AVAILABLE; + } ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL); @@ -2367,14 +2548,6 @@ arc_read_done(zio_t *zio) cv_broadcast(&hdr->b_cv); if (hash_lock) { - /* - * Only call arc_access on anonymous buffers. This is because - * if we've issued an I/O for an evicted buffer, we've already - * called arc_access (to prevent any simultaneous readers from - * getting confused). - */ - if (zio->io_error == 0 && hdr->b_state == arc_anon) - arc_access(hdr, hash_lock); mutex_exit(hash_lock); } else { /* @@ -2425,16 +2598,14 @@ arc_read_done(zio_t *zio) * * Normal callers should use arc_read and pass the arc buffer and offset * for the bp. But if you know you don't need locking, you can use - * arc_read_nolock. 
Callers cannot use a "done" function in a prefetch - * call (i.e., with ARC_NOWAIT set). + * arc_read_bp. */ int -arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf, +arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_buf_t *pbuf, arc_done_func_t *done, void *private, int priority, int zio_flags, uint32_t *arc_flags, const zbookmark_t *zb) { int err; - arc_buf_hdr_t *hdr = pbuf->b_hdr; ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt)); ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size); @@ -2442,14 +2613,13 @@ arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf, err = arc_read_nolock(pio, spa, bp, done, private, priority, zio_flags, arc_flags, zb); - - ASSERT3P(hdr, ==, pbuf->b_hdr); rw_exit(&pbuf->b_lock); + return (err); } int -arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp, +arc_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, void *private, int priority, int zio_flags, uint32_t *arc_flags, const zbookmark_t *zb) { @@ -2457,9 +2627,11 @@ arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *buf; kmutex_t *hash_lock; zio_t *rzio; + uint64_t guid = spa_guid(spa); top: - hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock); + hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp), + &hash_lock); if (hdr && hdr->b_datacnt > 0) { *arc_flags |= ARC_CACHED; @@ -2482,7 +2654,7 @@ arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp, acb->acb_private = private; if (pio != NULL) acb->acb_zio_dummy = zio_null(pio, - spa, NULL, NULL, zio_flags); + spa, NULL, NULL, NULL, zio_flags); ASSERT(acb->acb_done != NULL); acb->acb_next = hdr->b_acb; @@ -2513,6 +2685,7 @@ arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp, } else { buf = arc_buf_clone(buf); } + } else if (*arc_flags & ARC_PREFETCH && refcount_count(&hdr->b_refcnt) == 0) { hdr->b_flags |= ARC_PREFETCH; @@ -2533,7 +2706,8 @@ arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp, uint64_t size = BP_GET_LSIZE(bp); arc_callback_t *acb; vdev_t *vd = NULL; - daddr_t addr; + uint64_t addr; + boolean_t devw = B_FALSE; if (hdr == NULL) { /* this block is not in the cache */ @@ -2542,7 +2716,7 @@ arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp, buf = arc_buf_alloc(spa, size, private, type); hdr = buf->b_hdr; hdr->b_dva = *BP_IDENTITY(bp); - hdr->b_birth = bp->blk_birth; + hdr->b_birth = BP_PHYSICAL_BIRTH(bp); hdr->b_cksum0 = bp->blk_cksum.zc_word[0]; exists = buf_hash_insert(hdr, &hash_lock); if (exists) { @@ -2588,7 +2762,6 @@ arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_get_data_buf(buf); ASSERT(hdr->b_datacnt == 0); hdr->b_datacnt = 1; - } acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); @@ -2612,6 +2785,7 @@ arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp, if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL && (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) { + devw = hdr->b_l2hdr->b_dev->l2ad_writing; addr = hdr->b_l2hdr->b_daddr; /* * Lock out device removal. 
@@ -2624,14 +2798,14 @@ arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp, mutex_exit(hash_lock); ASSERT3U(hdr->b_size, ==, size); - DTRACE_PROBE3(arc__miss, blkptr_t *, bp, uint64_t, size, - zbookmark_t *, zb); + DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, + uint64_t, size, zbookmark_t *, zb); ARCSTAT_BUMP(arcstat_misses); ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, data, metadata, misses); - if (vd != NULL) { + if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { /* * Read from the L2ARC if the following are true: * 1. The L2ARC vdev was previously cached. @@ -2639,9 +2813,11 @@ arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp, * 3. This buffer isn't currently writing to the L2ARC. * 4. The L2ARC entry wasn't evicted, which may * also have invalidated the vdev. + * 5. This isn't prefetch and l2arc_noprefetch is set. */ if (hdr->b_l2hdr != NULL && - !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr)) { + !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && + !(l2arc_noprefetch && HDR_PREFETCH(hdr))) { l2arc_read_callback_t *cb; DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); @@ -2667,6 +2843,7 @@ arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp, ZIO_FLAG_DONT_RETRY, B_FALSE); DTRACE_PROBE2(l2arc__read, vdev_t *, vd, zio_t *, rzio); + ARCSTAT_INCR(arcstat_l2_read_bytes, size); if (*arc_flags & ARC_NOWAIT) { zio_nowait(rzio); @@ -2686,6 +2863,14 @@ arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp, ARCSTAT_BUMP(arcstat_l2_rw_clash); spa_config_exit(spa, SCL_L2ARC, vd); } + } else { + if (vd != NULL) + spa_config_exit(spa, SCL_L2ARC, vd); + if (l2arc_ndev != 0) { + DTRACE_PROBE1(l2arc__miss, + arc_buf_hdr_t *, hdr); + ARCSTAT_BUMP(arcstat_l2_misses); + } } rzio = zio_read(pio, spa, bp, buf->b_data, size, @@ -2700,46 +2885,15 @@ arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp, return (0); } -/* - * arc_read() variant to support pool traversal. If the block is already - * in the ARC, make a copy of it; otherwise, the caller will do the I/O. - * The idea is that we don't want pool traversal filling up memory, but - * if the ARC already has the data anyway, we shouldn't pay for the I/O. 
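/*
 * Sketch of the combined L2ARC read-eligibility test that the hunk
 * above spells out (conditions 1 through 5 in its comment), assuming
 * vd and devw were captured from hdr->b_l2hdr while the hash lock was
 * held.  Only when every clause holds does arc_read_nolock() issue the
 * read against the cache device instead of the main pool.
 */
static boolean_t
l2arc_read_ok_example(arc_buf_hdr_t *hdr, vdev_t *vd, boolean_t devw)
{
	return (vd != NULL && l2arc_ndev != 0 &&
	    !(l2arc_norw && devw) &&		/* device not busy writing */
	    hdr->b_l2hdr != NULL &&		/* entry not evicted */
	    !HDR_L2_WRITING(hdr) &&
	    !HDR_L2_EVICTED(hdr) &&
	    !(l2arc_noprefetch && HDR_PREFETCH(hdr)));
}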
- */ -int -arc_tryread(spa_t *spa, blkptr_t *bp, void *data) -{ - arc_buf_hdr_t *hdr; - kmutex_t *hash_mtx; - int rc = 0; - - hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx); - - if (hdr && hdr->b_datacnt > 0 && !HDR_IO_IN_PROGRESS(hdr)) { - arc_buf_t *buf = hdr->b_buf; - - ASSERT(buf); - while (buf->b_data == NULL) { - buf = buf->b_next; - ASSERT(buf); - } - bcopy(buf->b_data, data, hdr->b_size); - } else { - rc = ENOENT; - } - - if (hash_mtx) - mutex_exit(hash_mtx); - - return (rc); -} - void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private) { ASSERT(buf->b_hdr != NULL); ASSERT(buf->b_hdr->b_state != arc_anon); ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL); + ASSERT(buf->b_efunc == NULL); + ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr)); + buf->b_efunc = func; buf->b_private = private; } @@ -2838,13 +2992,13 @@ arc_release(arc_buf_t *buf, void *tag) kmutex_t *hash_lock; l2arc_buf_hdr_t *l2hdr; uint64_t buf_size; + boolean_t released = B_FALSE; rw_enter(&buf->b_lock, RW_WRITER); hdr = buf->b_hdr; /* this buffer is not on any list */ ASSERT(refcount_count(&hdr->b_refcnt) > 0); - ASSERT(!(hdr->b_flags & ARC_STORED)); if (hdr->b_state == arc_anon) { /* this buffer is already released */ @@ -2853,12 +3007,12 @@ arc_release(arc_buf_t *buf, void *tag) ASSERT(buf->b_efunc == NULL); arc_buf_thaw(buf); rw_exit(&buf->b_lock); - return; + released = B_TRUE; + } else { + hash_lock = HDR_LOCK(hdr); + mutex_enter(hash_lock); } - hash_lock = HDR_LOCK(hdr); - mutex_enter(hash_lock); - l2hdr = hdr->b_l2hdr; if (l2hdr) { mutex_enter(&l2arc_buflist_mtx); @@ -2866,6 +3020,9 @@ arc_release(arc_buf_t *buf, void *tag) buf_size = hdr->b_size; } + if (released) + goto out; + /* * Do we have more than one buf? */ @@ -2873,7 +3030,7 @@ arc_release(arc_buf_t *buf, void *tag) arc_buf_hdr_t *nhdr; arc_buf_t **bufp; uint64_t blksz = hdr->b_size; - spa_t *spa = hdr->b_spa; + uint64_t spa = hdr->b_spa; arc_buf_contents_t type = hdr->b_type; uint32_t flags = hdr->b_flags; @@ -2933,6 +3090,7 @@ arc_release(arc_buf_t *buf, void *tag) buf->b_efunc = NULL; buf->b_private = NULL; +out: if (l2hdr) { list_remove(l2hdr->b_dev->l2ad_buflist, hdr); kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); @@ -3011,11 +3169,16 @@ arc_write_done(zio_t *zio) arc_buf_t *buf = callback->awcb_buf; arc_buf_hdr_t *hdr = buf->b_hdr; - hdr->b_acb = NULL; + ASSERT(hdr->b_acb == NULL); + + if (zio->io_error == 0) { + hdr->b_dva = *BP_IDENTITY(zio->io_bp); + hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); + hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0]; + } else { + ASSERT(BUF_EMPTY(hdr)); + } - hdr->b_dva = *BP_IDENTITY(zio->io_bp); - hdr->b_birth = zio->io_bp->blk_birth; - hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0]; /* * If the block to be written was all-zero, we may have * compressed it away. In this case no write was performed @@ -3026,6 +3189,8 @@ arc_write_done(zio_t *zio) arc_buf_hdr_t *exists; kmutex_t *hash_lock; + ASSERT(zio->io_error == 0); + arc_cksum_verify(buf); exists = buf_hash_insert(hdr, &hash_lock); @@ -3035,106 +3200,54 @@ arc_write_done(zio_t *zio) * sync-to-convergence, because we remove * buffers from the hash table when we arc_free(). 
*/ - ASSERT(zio->io_flags & ZIO_FLAG_IO_REWRITE); - ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig), - BP_IDENTITY(zio->io_bp))); - ASSERT3U(zio->io_bp_orig.blk_birth, ==, - zio->io_bp->blk_birth); - - ASSERT(refcount_is_zero(&exists->b_refcnt)); - arc_change_state(arc_anon, exists, hash_lock); - mutex_exit(hash_lock); - arc_hdr_destroy(exists); - exists = buf_hash_insert(hdr, &hash_lock); - ASSERT3P(exists, ==, NULL); + if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { + if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) + panic("bad overwrite, hdr=%p exists=%p", + (void *)hdr, (void *)exists); + ASSERT(refcount_is_zero(&exists->b_refcnt)); + arc_change_state(arc_anon, exists, hash_lock); + mutex_exit(hash_lock); + arc_hdr_destroy(exists); + exists = buf_hash_insert(hdr, &hash_lock); + ASSERT3P(exists, ==, NULL); + } else { + /* Dedup */ + ASSERT(hdr->b_datacnt == 1); + ASSERT(hdr->b_state == arc_anon); + ASSERT(BP_GET_DEDUP(zio->io_bp)); + ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); + } } hdr->b_flags &= ~ARC_IO_IN_PROGRESS; /* if it's not anon, we are doing a scrub */ - if (hdr->b_state == arc_anon) + if (!exists && hdr->b_state == arc_anon) arc_access(hdr, hash_lock); mutex_exit(hash_lock); - } else if (callback->awcb_done == NULL) { - int destroy_hdr; - /* - * This is an anonymous buffer with no user callback, - * destroy it if there are no active references. - */ - mutex_enter(&arc_eviction_mtx); - destroy_hdr = refcount_is_zero(&hdr->b_refcnt); - hdr->b_flags &= ~ARC_IO_IN_PROGRESS; - mutex_exit(&arc_eviction_mtx); - if (destroy_hdr) - arc_hdr_destroy(hdr); } else { hdr->b_flags &= ~ARC_IO_IN_PROGRESS; } - hdr->b_flags &= ~ARC_STORED; - if (callback->awcb_done) { - ASSERT(!refcount_is_zero(&hdr->b_refcnt)); - callback->awcb_done(zio, buf, callback->awcb_private); - } + ASSERT(!refcount_is_zero(&hdr->b_refcnt)); + callback->awcb_done(zio, buf, callback->awcb_private); kmem_free(callback, sizeof (arc_write_callback_t)); } -void -write_policy(spa_t *spa, const writeprops_t *wp, zio_prop_t *zp) -{ - boolean_t ismd = (wp->wp_level > 0 || dmu_ot[wp->wp_type].ot_metadata); - - /* Determine checksum setting */ - if (ismd) { - /* - * Metadata always gets checksummed. If the data - * checksum is multi-bit correctable, and it's not a - * ZBT-style checksum, then it's suitable for metadata - * as well. Otherwise, the metadata checksum defaults - * to fletcher4. - */ - if (zio_checksum_table[wp->wp_oschecksum].ci_correctable && - !zio_checksum_table[wp->wp_oschecksum].ci_zbt) - zp->zp_checksum = wp->wp_oschecksum; - else - zp->zp_checksum = ZIO_CHECKSUM_FLETCHER_4; - } else { - zp->zp_checksum = zio_checksum_select(wp->wp_dnchecksum, - wp->wp_oschecksum); - } - - /* Determine compression setting */ - if (ismd) { - /* - * XXX -- we should design a compression algorithm - * that specializes in arrays of bps. - */ - zp->zp_compress = zfs_mdcomp_disable ? 
ZIO_COMPRESS_EMPTY : - ZIO_COMPRESS_LZJB; - } else { - zp->zp_compress = zio_compress_select(wp->wp_dncompress, - wp->wp_oscompress); - } - - zp->zp_type = wp->wp_type; - zp->zp_level = wp->wp_level; - zp->zp_ndvas = MIN(wp->wp_copies + ismd, spa_max_replication(spa)); -} - zio_t * -arc_write(zio_t *pio, spa_t *spa, const writeprops_t *wp, - boolean_t l2arc, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, - arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority, - int zio_flags, const zbookmark_t *zb) +arc_write(zio_t *pio, spa_t *spa, uint64_t txg, + blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp, + arc_done_func_t *ready, arc_done_func_t *done, void *private, + int priority, int zio_flags, const zbookmark_t *zb) { arc_buf_hdr_t *hdr = buf->b_hdr; arc_write_callback_t *callback; zio_t *zio; - zio_prop_t zp; ASSERT(ready != NULL); + ASSERT(done != NULL); ASSERT(!HDR_IO_ERROR(hdr)); ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0); - ASSERT(hdr->b_acb == 0); + ASSERT(hdr->b_acb == NULL); if (l2arc) hdr->b_flags |= ARC_L2CACHE; callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); @@ -3143,36 +3256,25 @@ arc_write(zio_t *pio, spa_t *spa, const writeprops_t *wp, callback->awcb_private = private; callback->awcb_buf = buf; - write_policy(spa, wp, &zp); - zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, &zp, + zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp, arc_write_ready, arc_write_done, callback, priority, zio_flags, zb); return (zio); } -int -arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - zio_done_func_t *done, void *private, uint32_t arc_flags) +void +arc_free(spa_t *spa, const blkptr_t *bp) { arc_buf_hdr_t *ab; kmutex_t *hash_lock; - zio_t *zio; + uint64_t guid = spa_guid(spa); /* - * If this buffer is in the cache, release it, so it - * can be re-used. + * If this buffer is in the cache, release it, so it can be re-used. */ - ab = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock); + ab = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp), + &hash_lock); if (ab != NULL) { - /* - * The checksum of blocks to free is not always - * preserved (eg. on the deadlist). However, if it is - * nonzero, it should match what we have in the cache. - */ - ASSERT(bp->blk_cksum.zc_word[0] == 0 || - bp->blk_cksum.zc_word[0] == ab->b_cksum0 || - bp->blk_fill == BLK_FILL_ALREADY_FREED); - if (ab->b_state != arc_anon) arc_change_state(arc_anon, ab, hash_lock); if (HDR_IO_IN_PROGRESS(ab)) { @@ -3191,44 +3293,20 @@ arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, ab->b_buf->b_efunc = NULL; ab->b_buf->b_private = NULL; mutex_exit(hash_lock); - } else if (refcount_is_zero(&ab->b_refcnt)) { + } else { + ASSERT(refcount_is_zero(&ab->b_refcnt)); ab->b_flags |= ARC_FREE_IN_PROGRESS; mutex_exit(hash_lock); arc_hdr_destroy(ab); ARCSTAT_BUMP(arcstat_deleted); - } else { - /* - * We still have an active reference on this - * buffer. This can happen, e.g., from - * dbuf_unoverride(). 
- */ - ASSERT(!HDR_IN_HASH_TABLE(ab)); - ab->b_arc_access = 0; - bzero(&ab->b_dva, sizeof (dva_t)); - ab->b_birth = 0; - ab->b_cksum0 = 0; - ab->b_buf->b_efunc = NULL; - ab->b_buf->b_private = NULL; - mutex_exit(hash_lock); } } - - zio = zio_free(pio, spa, txg, bp, done, private, ZIO_FLAG_MUSTSUCCEED); - - if (arc_flags & ARC_WAIT) - return (zio_wait(zio)); - - ASSERT(arc_flags & ARC_NOWAIT); - zio_nowait(zio); - - return (0); } static int -arc_memory_throttle(uint64_t reserve, uint64_t txg) +arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg) { #ifdef _KERNEL - uint64_t inflight_data = arc_anon->arcs_size; uint64_t available_memory = ptob(freemem); static uint64_t page_load = 0; static uint64_t last_txg = 0; @@ -3290,6 +3368,7 @@ int arc_tempreserve_space(uint64_t reserve, uint64_t txg) { int error; + uint64_t anon_size; #ifdef ZFS_DEBUG /* @@ -3305,12 +3384,19 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg) if (reserve > arc_c) return (ENOMEM); + /* + * Don't count loaned bufs as in flight dirty data to prevent long + * network delays from blocking transactions that are ready to be + * assigned to a txg. + */ + anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0); + /* * Writes will, almost always, require additional memory allocations * in order to compress/encrypt/etc the data. We therefor need to * make sure that there is sufficient available memory for this. */ - if (error = arc_memory_throttle(reserve, txg)) + if (error = arc_memory_throttle(reserve, anon_size, txg)) return (error); /* @@ -3320,8 +3406,9 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg) * Note: if two requests come in concurrently, we might let them * both succeed, when one of them should fail. Not a huge deal. */ - if (reserve + arc_tempreserve + arc_anon->arcs_size > arc_c / 2 && - arc_anon->arcs_size > arc_c / 4) { + + if (reserve + arc_tempreserve + anon_size > arc_c / 2 && + anon_size > arc_c / 4) { dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", arc_tempreserve>>10, @@ -3386,6 +3473,15 @@ arc_init(void) if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) arc_c_min = arc_meta_limit / 2; + if (zfs_arc_grow_retry > 0) + arc_grow_retry = zfs_arc_grow_retry; + + if (zfs_arc_shrink_shift > 0) + arc_shrink_shift = zfs_arc_shrink_shift; + + if (zfs_arc_p_min_shift > 0) + arc_p_min_shift = zfs_arc_p_min_shift; + /* if kmem_flags are set, lets try to use less memory */ if (kmem_debugging()) arc_c = arc_c / 2; @@ -3492,10 +3588,13 @@ arc_fini(void) mutex_destroy(&arc_mru_ghost->arcs_mtx); mutex_destroy(&arc_mfu->arcs_mtx); mutex_destroy(&arc_mfu_ghost->arcs_mtx); + mutex_destroy(&arc_l2c_only->arcs_mtx); mutex_destroy(&zfs_write_limit_lock); buf_fini(); + + ASSERT(arc_loaned_bytes == 0); } /* @@ -3623,8 +3722,70 @@ arc_fini(void) * * Tunables may be removed or added as future performance improvements are * integrated, and also may become zpool properties. + * + * There are three key functions that control how the L2ARC warms up: + * + * l2arc_write_eligible() check if a buffer is eligible to cache + * l2arc_write_size() calculate how much to write + * l2arc_write_interval() calculate sleep delay between writes + * + * These three functions determine what to write, how much, and how quickly + * to send writes. */ +static boolean_t +l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab) +{ + /* + * A buffer is *not* eligible for the L2ARC if it: + * 1. belongs to a different spa. + * 2. 
is already cached on the L2ARC. + * 3. has an I/O in progress (it may be an incomplete read). + * 4. is flagged not eligible (zfs property). + */ + if (ab->b_spa != spa_guid || ab->b_l2hdr != NULL || + HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab)) + return (B_FALSE); + + return (B_TRUE); +} + +static uint64_t +l2arc_write_size(l2arc_dev_t *dev) +{ + uint64_t size; + + size = dev->l2ad_write; + + if (arc_warm == B_FALSE) + size += dev->l2ad_boost; + + return (size); + +} + +static clock_t +l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) +{ + clock_t interval, next, now; + + /* + * If the ARC lists are busy, increase our write rate; if the + * lists are stale, idle back. This is achieved by checking + * how much we previously wrote - if it was more than half of + * what we wanted, schedule the next write much sooner. + */ + if (l2arc_feed_again && wrote > (wanted / 2)) + interval = (hz * l2arc_feed_min_ms) / 1000; + else + interval = hz * l2arc_feed_secs; + + now = ddi_get_lbolt(); + next = MAX(now, MIN(now + interval, began + interval)); + + return (next); +} + static void l2arc_hdr_stat_add(void) { @@ -3857,11 +4018,15 @@ l2arc_read_done(zio_t *zio) * storage now. If there *is* a waiter, the caller must * issue the i/o in a context where it's OK to block. */ - if (zio->io_waiter == NULL) - zio_nowait(zio_read(zio->io_parent, - cb->l2rcb_spa, &cb->l2rcb_bp, + if (zio->io_waiter == NULL) { + zio_t *pio = zio_unique_parent(zio); + + ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); + + zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp, buf->b_data, zio->io_size, arc_read_done, buf, zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb)); + } } kmem_free(cb, sizeof (l2arc_read_callback_t)); @@ -4035,7 +4200,7 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) } mutex_exit(&l2arc_buflist_mtx); - spa_l2cache_space_update(dev->l2ad_vdev, 0, -(taddr - dev->l2ad_evict)); + vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0); dev->l2ad_evict = taddr; } @@ -4045,7 +4210,7 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid * for reading until they have completed writing. */ -static void +static uint64_t l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) { arc_buf_hdr_t *ab, *ab_prev, *head; @@ -4057,6 +4222,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) boolean_t have_lock, full; l2arc_write_callback_t *cb; zio_t *pio, *wzio; + uint64_t guid = spa_guid(spa); ASSERT(dev->l2ad_vdev != NULL); @@ -4110,20 +4276,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) break; } - if (ab->b_spa != spa) { - mutex_exit(hash_lock); - continue; - } - - if (ab->b_l2hdr != NULL) { - /* - * Already in L2ARC. 
- */ - mutex_exit(hash_lock); - continue; - } - - if (HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab)) { + if (!l2arc_write_eligible(guid, ab)) { mutex_exit(hash_lock); continue; } @@ -4134,12 +4287,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) break; } - if (ab->b_buf == NULL) { - DTRACE_PROBE1(l2arc__buf__null, void *, ab); - mutex_exit(hash_lock); - continue; - } - if (pio == NULL) { /* * Insert a dummy header on the buflist so @@ -4206,27 +4353,32 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) if (pio == NULL) { ASSERT3U(write_sz, ==, 0); kmem_cache_free(hdr_cache, head); - return; + return (0); } ASSERT3U(write_sz, <=, target_sz); ARCSTAT_BUMP(arcstat_l2_writes_sent); + ARCSTAT_INCR(arcstat_l2_write_bytes, write_sz); ARCSTAT_INCR(arcstat_l2_size, write_sz); - spa_l2cache_space_update(dev->l2ad_vdev, 0, write_sz); + vdev_space_update(dev->l2ad_vdev, write_sz, 0, 0); /* * Bump device hand to the device start if it is approaching the end. * l2arc_evict() will already have evicted ahead for this case. */ if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) { - spa_l2cache_space_update(dev->l2ad_vdev, 0, - dev->l2ad_end - dev->l2ad_hand); + vdev_space_update(dev->l2ad_vdev, + dev->l2ad_end - dev->l2ad_hand, 0, 0); dev->l2ad_hand = dev->l2ad_start; dev->l2ad_evict = dev->l2ad_start; dev->l2ad_first = B_FALSE; } + dev->l2ad_writing = B_TRUE; (void) zio_wait(pio); + dev->l2ad_writing = B_FALSE; + + return (write_sz); } /* @@ -4239,20 +4391,19 @@ l2arc_feed_thread(void) callb_cpr_t cpr; l2arc_dev_t *dev; spa_t *spa; - uint64_t size; + uint64_t size, wrote; + clock_t begin, next = ddi_get_lbolt(); CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); mutex_enter(&l2arc_feed_thr_lock); while (l2arc_thread_exit == 0) { - /* - * Pause for l2arc_feed_secs seconds between writes. - */ CALLB_CPR_SAFE_BEGIN(&cpr); (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, - lbolt + (hz * l2arc_feed_secs)); + next); CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); + next = ddi_get_lbolt() + hz; /* * Quick check for L2ARC devices. @@ -4263,6 +4414,7 @@ l2arc_feed_thread(void) continue; } mutex_exit(&l2arc_dev_mtx); + begin = ddi_get_lbolt(); /* * This selects the next l2arc device to write to, and in @@ -4291,9 +4443,7 @@ l2arc_feed_thread(void) ARCSTAT_BUMP(arcstat_l2_feeds); - size = dev->l2ad_write; - if (arc_warm == B_FALSE) - size += dev->l2ad_boost; + size = l2arc_write_size(dev); /* * Evict L2ARC buffers that will be overwritten. @@ -4303,7 +4453,12 @@ l2arc_feed_thread(void) /* * Write ARC buffers. */ - l2arc_write_buffers(spa, dev, size); + wrote = l2arc_write_buffers(spa, dev, size); + + /* + * Calculate interval between writes. + */ + next = l2arc_write_interval(begin, size, wrote); spa_config_exit(spa, SCL_L2ARC, dev); } @@ -4334,7 +4489,7 @@ l2arc_vdev_present(vdev_t *vd) * validated the vdev and opened it. 
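/*
 * Worked example of l2arc_write_interval() as used by the feed thread
 * above, with hypothetical tunables hz = 100, l2arc_feed_secs = 1 and
 * l2arc_feed_min_ms = 200.  If the last pass wrote more than half of
 * its target (say wanted = 8 MB, wrote = 6 MB), the next feed is
 * scheduled 20 ticks (about 200 ms) after `began`; otherwise the
 * thread idles back to the full 100-tick (1 s) interval.  The
 * MAX(now, ...) clamp keeps the wakeup from landing in the past when
 * the write itself took longer than the chosen interval.
 */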
*/ void -l2arc_add_vdev(spa_t *spa, vdev_t *vd, uint64_t start, uint64_t end) +l2arc_add_vdev(spa_t *spa, vdev_t *vd) { l2arc_dev_t *adddev; @@ -4348,11 +4503,12 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd, uint64_t start, uint64_t end) adddev->l2ad_vdev = vd; adddev->l2ad_write = l2arc_write_max; adddev->l2ad_boost = l2arc_write_boost; - adddev->l2ad_start = start; - adddev->l2ad_end = end; + adddev->l2ad_start = VDEV_LABEL_START_SIZE; + adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd); adddev->l2ad_hand = adddev->l2ad_start; adddev->l2ad_evict = adddev->l2ad_start; adddev->l2ad_first = B_TRUE; + adddev->l2ad_writing = B_FALSE; ASSERT3U(adddev->l2ad_write, >, 0); /* @@ -4363,7 +4519,7 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd, uint64_t start, uint64_t end) list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l2node)); - spa_l2cache_space_update(vd, adddev->l2ad_end - adddev->l2ad_hand, 0); + vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); /* * Add device to global list @@ -4458,7 +4614,7 @@ l2arc_fini(void) void l2arc_start(void) { - if (!(spa_mode & FWRITE)) + if (!(spa_mode_global & FWRITE)) return; (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0, @@ -4468,7 +4624,7 @@ l2arc_start(void) void l2arc_stop(void) { - if (!(spa_mode & FWRITE)) + if (!(spa_mode_global & FWRITE)) return; mutex_enter(&l2arc_feed_thr_lock); diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/bplist.c b/external/cddl/osnet/dist/uts/common/fs/zfs/bplist.c index 93b7741d77be2..e03dd2e6f98c2 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/bplist.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/bplist.c @@ -19,13 +19,27 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #include #include +void +bplist_init(bplist_t *bpl) +{ + bzero(bpl, sizeof (*bpl)); + mutex_init(&bpl->bpl_lock, NULL, MUTEX_DEFAULT, NULL); +} + +void +bplist_fini(bplist_t *bpl) +{ + ASSERT(bpl->bpl_queue == NULL); + mutex_destroy(&bpl->bpl_lock); +} + static int bplist_hold(bplist_t *bpl) { @@ -208,12 +222,13 @@ bplist_enqueue(bplist_t *bpl, const blkptr_t *bp, dmu_tx_t *tx) bparray[off].blk_fill = 0; /* The bplist will compress better if we can leave off the checksum */ - bzero(&bparray[off].blk_cksum, sizeof (bparray[off].blk_cksum)); + if (!BP_GET_DEDUP(&bparray[off])) + bzero(&bparray[off].blk_cksum, sizeof (bparray[off].blk_cksum)); dmu_buf_will_dirty(bpl->bpl_dbuf, tx); bpl->bpl_phys->bpl_entries++; bpl->bpl_phys->bpl_bytes += - bp_get_dasize(dmu_objset_spa(bpl->bpl_mos), bp); + bp_get_dsize_sync(dmu_objset_spa(bpl->bpl_mos), bp); if (bpl->bpl_havecomp) { bpl->bpl_phys->bpl_comp += BP_GET_PSIZE(bp); bpl->bpl_phys->bpl_uncomp += BP_GET_UCSIZE(bp); @@ -223,8 +238,14 @@ bplist_enqueue(bplist_t *bpl, const blkptr_t *bp, dmu_tx_t *tx) return (0); } +void +bplist_enqueue_cb(void *bpl, const blkptr_t *bp, dmu_tx_t *tx) +{ + VERIFY(bplist_enqueue(bpl, bp, tx) == 0); +} + /* - * Deferred entry; will be written later by bplist_sync(). + * Deferred entry; will be processed later by bplist_sync(). 
*/ void bplist_enqueue_deferred(bplist_t *bpl, const blkptr_t *bp) @@ -240,7 +261,7 @@ bplist_enqueue_deferred(bplist_t *bpl, const blkptr_t *bp) } void -bplist_sync(bplist_t *bpl, dmu_tx_t *tx) +bplist_sync(bplist_t *bpl, bplist_sync_cb_t *func, void *arg, dmu_tx_t *tx) { bplist_q_t *bpq; @@ -248,7 +269,7 @@ bplist_sync(bplist_t *bpl, dmu_tx_t *tx) while ((bpq = bpl->bpl_queue) != NULL) { bpl->bpl_queue = bpq->bpq_next; mutex_exit(&bpl->bpl_lock); - VERIFY(0 == bplist_enqueue(bpl, &bpq->bpq_blk, tx)); + func(arg, &bpq->bpq_blk, tx); kmem_free(bpq, sizeof (*bpq)); mutex_enter(&bpl->bpl_lock); } @@ -311,12 +332,12 @@ bplist_space(bplist_t *bpl, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) } /* - * Return (in *dasizep) the amount of space on the deadlist which is: + * Return (in *dsizep) the amount of space on the deadlist which is: * mintxg < blk_birth <= maxtxg */ int bplist_space_birthrange(bplist_t *bpl, uint64_t mintxg, uint64_t maxtxg, - uint64_t *dasizep) + uint64_t *dsizep) { uint64_t size = 0; uint64_t itor = 0; @@ -331,19 +352,18 @@ bplist_space_birthrange(bplist_t *bpl, uint64_t mintxg, uint64_t maxtxg, mutex_enter(&bpl->bpl_lock); err = bplist_hold(bpl); if (err == 0) - *dasizep = bpl->bpl_phys->bpl_bytes; + *dsizep = bpl->bpl_phys->bpl_bytes; mutex_exit(&bpl->bpl_lock); return (err); } while ((err = bplist_iterate(bpl, &itor, &bp)) == 0) { if (bp.blk_birth > mintxg && bp.blk_birth <= maxtxg) { - size += - bp_get_dasize(dmu_objset_spa(bpl->bpl_mos), &bp); + size += bp_get_dsize(dmu_objset_spa(bpl->bpl_mos), &bp); } } if (err == ENOENT) err = 0; - *dasizep = size; + *dsizep = size; return (err); } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/dbuf.c b/external/cddl/osnet/dist/uts/common/fs/zfs/dbuf.c index d04610317a4ea..1608f7d3c1cf1 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/dbuf.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/dbuf.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -38,10 +38,6 @@ static void dbuf_destroy(dmu_buf_impl_t *db); static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); -static arc_done_func_t dbuf_write_ready; -static arc_done_func_t dbuf_write_done; -static zio_done_func_t dbuf_skip_write_ready; -static zio_done_func_t dbuf_skip_write_done; /* * Global data structures and functions for the dbuf cache. 
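/*
 * Sketch of the new callback form of bplist_sync() introduced above.
 * Draining the deferred queue through bplist_enqueue_cb() reproduces
 * the old behaviour of writing every deferred entry into the on-disk
 * list; other callers can supply their own bplist_sync_cb_t instead.
 * The helper name is hypothetical and syncing context is assumed.
 */
static void
bplist_drain_deferred(bplist_t *bpl, dmu_tx_t *tx)
{
	bplist_sync(bpl, bplist_enqueue_cb, bpl, tx);
}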
@@ -109,7 +105,7 @@ dmu_buf_impl_t * dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid) { dbuf_hash_table_t *h = &dbuf_hash_table; - objset_impl_t *os = dn->dn_objset; + objset_t *os = dn->dn_objset; uint64_t obj = dn->dn_object; uint64_t hv = DBUF_HASH(os, obj, level, blkid); uint64_t idx = hv & h->hash_table_mask; @@ -140,7 +136,7 @@ static dmu_buf_impl_t * dbuf_hash_insert(dmu_buf_impl_t *db) { dbuf_hash_table_t *h = &dbuf_hash_table; - objset_impl_t *os = db->db_objset; + objset_t *os = db->db_objset; uint64_t obj = db->db.db_object; int level = db->db_level; uint64_t blkid = db->db_blkid; @@ -285,6 +281,7 @@ static void dbuf_verify(dmu_buf_impl_t *db) { dnode_t *dn = db->db_dnode; + dbuf_dirty_record_t *dr; ASSERT(MUTEX_HELD(&db->db_mtx)); @@ -310,13 +307,19 @@ dbuf_verify(dmu_buf_impl_t *db) ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); } + for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next) + ASSERT(dr->dr_dbuf == db); + + for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next) + ASSERT(dr->dr_dbuf == db); + /* * We can't assert that db_size matches dn_datablksz because it * can be momentarily different when another thread is doing * dnode_set_blksz(). */ if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) { - dbuf_dirty_record_t *dr = db->db_data_pending; + dr = db->db_data_pending; /* * It should only be modified in syncing context, so * make sure we only have one copy of the data. @@ -329,7 +332,7 @@ dbuf_verify(dmu_buf_impl_t *db) if (db->db_parent == dn->dn_dbuf) { /* db is pointed to by the dnode */ /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */ - if (db->db.db_object == DMU_META_DNODE_OBJECT) + if (DMU_OBJECT_IS_SPECIAL(db->db.db_object)) ASSERT(db->db_parent == NULL); else ASSERT(db->db_parent != NULL); @@ -403,6 +406,29 @@ dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) } } +/* + * Loan out an arc_buf for read. Return the loaned arc_buf. + */ +arc_buf_t * +dbuf_loan_arcbuf(dmu_buf_impl_t *db) +{ + arc_buf_t *abuf; + + mutex_enter(&db->db_mtx); + if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) { + int blksz = db->db.db_size; + mutex_exit(&db->db_mtx); + abuf = arc_loan_buf(db->db_dnode->dn_objset->os_spa, blksz); + bcopy(db->db.db_data, abuf->b_data, blksz); + } else { + abuf = db->db_buf; + arc_loan_inuse_buf(abuf, db); + dbuf_set_data(db, NULL); + mutex_exit(&db->db_mtx); + } + return (abuf); +} + uint64_t dbuf_whichblock(dnode_t *dn, uint64_t offset) { @@ -465,15 +491,15 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) ASSERT(db->db_buf == NULL); if (db->db_blkid == DB_BONUS_BLKID) { - int bonuslen = dn->dn_bonuslen; + int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen); ASSERT3U(bonuslen, <=, db->db.db_size); db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN); - arc_space_consume(DN_MAX_BONUSLEN); + arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); if (bonuslen < DN_MAX_BONUSLEN) bzero(db->db.db_data, DN_MAX_BONUSLEN); - bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, - bonuslen); + if (bonuslen) + bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen); dbuf_update_data(db); db->db_state = DB_CACHED; mutex_exit(&db->db_mtx); @@ -505,11 +531,9 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) if (DBUF_IS_L2CACHEABLE(db)) aflags |= ARC_L2CACHE; - zb.zb_objset = db->db_objset->os_dsl_dataset ? 
- db->db_objset->os_dsl_dataset->ds_object : 0; - zb.zb_object = db->db.db_object; - zb.zb_level = db->db_level; - zb.zb_blkid = db->db_blkid; + SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ? + db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET, + db->db.db_object, db->db_level, db->db_blkid); dbuf_add_ref(db, NULL); /* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */ @@ -665,7 +689,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) if (db->db_blkid == DB_BONUS_BLKID) { /* Note that the data bufs here are zio_bufs */ dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN); - arc_space_consume(DN_MAX_BONUSLEN); + arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN); } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) { int size = db->db.db_size; @@ -682,6 +706,7 @@ void dbuf_unoverride(dbuf_dirty_record_t *dr) { dmu_buf_impl_t *db = dr->dr_dbuf; + blkptr_t *bp = &dr->dt.dl.dr_overridden_by; uint64_t txg = dr->dr_txg; ASSERT(MUTEX_HELD(&db->db_mtx)); @@ -692,13 +717,12 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN) return; + ASSERT(db->db_data_pending != dr); + /* free this block */ - if (!BP_IS_HOLE(&dr->dt.dl.dr_overridden_by)) { - /* XXX can get silent EIO here */ - (void) dsl_free(NULL, - spa_get_dsl(db->db_dnode->dn_objset->os_spa), - txg, &dr->dt.dl.dr_overridden_by, NULL, NULL, ARC_WAIT); - } + if (!BP_IS_HOLE(bp)) + dsl_free(spa_get_dsl(db->db_dnode->dn_objset->os_spa), txg, bp); + dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; /* * Release the already-written buffer, so we leave it in @@ -894,7 +918,7 @@ dbuf_dirty_record_t * dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { dnode_t *dn = db->db_dnode; - objset_impl_t *os = dn->dn_objset; + objset_t *os = dn->dn_objset; dbuf_dirty_record_t **drp, *dr; int drop_struct_lock = FALSE; boolean_t do_free_accounting = B_FALSE; @@ -908,15 +932,11 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) * Shouldn't dirty a regular buffer in syncing context. Private * objects may be dirtied in syncing context, but only if they * were already pre-dirtied in open context. - * XXX We may want to prohibit dirtying in syncing context even - * if they did pre-dirty. */ ASSERT(!dmu_tx_is_syncing(tx) || BP_IS_HOLE(dn->dn_objset->os_rootbp) || - dn->dn_object == DMU_META_DNODE_OBJECT || - dn->dn_objset->os_dsl_dataset == NULL || - dsl_dir_is_private(dn->dn_objset->os_dsl_dataset->ds_dir)); - + DMU_OBJECT_IS_SPECIAL(dn->dn_object) || + dn->dn_objset->os_dsl_dataset == NULL); /* * We make this assert for private objects as well, but after we * check if we're already dirty. They are allowed to re-dirty @@ -965,7 +985,8 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) * we now need to reset its state. */ dbuf_unoverride(dr); - if (db->db.db_object != DMU_META_DNODE_OBJECT) + if (db->db.db_object != DMU_META_DNODE_OBJECT && + db->db_state != DB_NOFILL) arc_buf_thaw(db->db_buf); } mutex_exit(&db->db_mtx); @@ -975,7 +996,8 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) /* * Only valid if not already dirty. */ - ASSERT(dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == + ASSERT(dn->dn_object == 0 || + dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == (dmu_tx_is_syncing(tx) ? 
DN_DIRTY_SYNC : DN_DIRTY_OPEN)); ASSERT3U(dn->dn_nlevels, >, db->db_level); @@ -987,15 +1009,13 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) /* * We should only be dirtying in syncing context if it's the - * mos, a spa os, or we're initializing the os. However, we are - * allowed to dirty in syncing context provided we already - * dirtied it in open context. Hence we must make this - * assertion only if we're not already dirty. + * mos or we're initializing the os or it's a special object. + * However, we are allowed to dirty in syncing context provided + * we already dirtied it in open context. Hence we must make + * this assertion only if we're not already dirty. */ - ASSERT(!dmu_tx_is_syncing(tx) || - os->os_dsl_dataset == NULL || - !dsl_dir_is_private(os->os_dsl_dataset->ds_dir) || - !BP_IS_HOLE(os->os_rootbp)); + ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) || + os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp)); ASSERT(db->db.db_size != 0); dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); @@ -1005,7 +1025,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) * Update the accounting. * Note: we delay "free accounting" until after we drop * the db_mtx. This keeps us from grabbing other locks - * (and possibly deadlocking) in bp_get_dasize() while + * (and possibly deadlocking) in bp_get_dsize() while * also holding the db_mtx. */ dnode_willuse_space(dn, db->db.db_size, tx); @@ -1084,7 +1104,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) } else if (do_free_accounting) { blkptr_t *bp = db->db_blkptr; int64_t willfree = (bp && !BP_IS_HOLE(bp)) ? - bp_get_dasize(os->os_spa, bp) : db->db.db_size; + bp_get_dsize(os->os_spa, bp) : db->db.db_size; /* * This is only a guess -- if the dbuf is dirty * in a previous txg, we don't know how much @@ -1165,7 +1185,6 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) ASSERT(db->db_blkid != DB_BONUS_BLKID); mutex_enter(&db->db_mtx); - /* * If this buffer is not dirty, we're done. */ @@ -1177,6 +1196,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) return (0); } ASSERT(dr->dr_txg == txg); + ASSERT(dr->dr_dbuf == db); /* * If this buffer is currently held, we cannot undirty @@ -1236,7 +1256,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) { arc_buf_t *buf = db->db_buf; - ASSERT(arc_released(buf)); + ASSERT(db->db_state == DB_NOFILL || arc_released(buf)); dbuf_set_data(db, NULL); VERIFY(arc_buf_remove_ref(buf, db) == 1); dbuf_evict(db); @@ -1311,6 +1331,70 @@ dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx) mutex_exit(&db->db_mtx); } +/* + * Directly assign a provided arc buf to a given dbuf if it's not referenced + * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf. 
+ */ +void +dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) +{ + ASSERT(!refcount_is_zero(&db->db_holds)); + ASSERT(db->db_dnode->dn_object != DMU_META_DNODE_OBJECT); + ASSERT(db->db_blkid != DB_BONUS_BLKID); + ASSERT(db->db_level == 0); + ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA); + ASSERT(buf != NULL); + ASSERT(arc_buf_size(buf) == db->db.db_size); + ASSERT(tx->tx_txg != 0); + + arc_return_buf(buf, db); + ASSERT(arc_released(buf)); + + mutex_enter(&db->db_mtx); + + while (db->db_state == DB_READ || db->db_state == DB_FILL) + cv_wait(&db->db_changed, &db->db_mtx); + + ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED); + + if (db->db_state == DB_CACHED && + refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) { + mutex_exit(&db->db_mtx); + (void) dbuf_dirty(db, tx); + bcopy(buf->b_data, db->db.db_data, db->db.db_size); + VERIFY(arc_buf_remove_ref(buf, db) == 1); + xuio_stat_wbuf_copied(); + return; + } + + xuio_stat_wbuf_nocopy(); + if (db->db_state == DB_CACHED) { + dbuf_dirty_record_t *dr = db->db_last_dirty; + + ASSERT(db->db_buf != NULL); + if (dr != NULL && dr->dr_txg == tx->tx_txg) { + ASSERT(dr->dt.dl.dr_data == db->db_buf); + if (!arc_released(db->db_buf)) { + ASSERT(dr->dt.dl.dr_override_state == + DR_OVERRIDDEN); + arc_release(db->db_buf, db); + } + dr->dt.dl.dr_data = buf; + VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1); + } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) { + arc_release(db->db_buf, db); + VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1); + } + db->db_buf = NULL; + } + ASSERT(db->db_buf == NULL); + dbuf_set_data(db, buf); + db->db_state = DB_FILL; + mutex_exit(&db->db_mtx); + (void) dbuf_dirty(db, tx); + dbuf_fill_done(db, tx); +} + /* * "Clear" the contents of this dbuf. This will mark the dbuf * EVICTING and clear *most* of its references. 
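/*
 * Sketch of a write that hands a loaned buffer straight to a dbuf via
 * dbuf_assign_arcbuf() above.  When the dbuf has no holds beyond its
 * dirty records, the loaned buffer is attached directly and no copy is
 * made; if extra holds exist, dbuf_assign_arcbuf() falls back to a
 * bcopy() (the xuio_stat_wbuf_copied() case).  The helper and its
 * direct use of dmu_buf_impl_t are illustrative only; real callers
 * reach this path through the DMU.
 */
static void
dbuf_write_loaned_example(spa_t *spa, dmu_buf_impl_t *db, const void *src,
    dmu_tx_t *tx)
{
	int size = db->db.db_size;
	arc_buf_t *abuf = arc_loan_buf(spa, size);

	bcopy(src, abuf->b_data, size);
	dbuf_assign_arcbuf(db, abuf, tx);	/* returns the loan internally */
}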
Unfortunetely, @@ -1341,7 +1425,7 @@ dbuf_clear(dmu_buf_impl_t *db) ASSERT(db->db.db_data != NULL); if (db->db_blkid == DB_BONUS_BLKID) { zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN); - arc_space_return(DN_MAX_BONUSLEN); + arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); } db->db.db_data = NULL; db->db_state = DB_UNCACHED; @@ -1431,7 +1515,7 @@ static dmu_buf_impl_t * dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, dmu_buf_impl_t *parent, blkptr_t *blkptr) { - objset_impl_t *os = dn->dn_objset; + objset_t *os = dn->dn_objset; dmu_buf_impl_t *db, *odb; ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); @@ -1463,7 +1547,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, db->db.db_offset = DB_BONUS_BLKID; db->db_state = DB_UNCACHED; /* the bonus dbuf is not placed in the hash table */ - arc_space_consume(sizeof (dmu_buf_impl_t)); + arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); return (db); } else { int blocksize = @@ -1490,7 +1574,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, list_insert_head(&dn->dn_dbufs, db); db->db_state = DB_UNCACHED; mutex_exit(&dn->dn_dbufs_mtx); - arc_space_consume(sizeof (dmu_buf_impl_t)); + arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); if (parent && parent != dn->dn_dbuf) dbuf_add_ref(parent, db); @@ -1559,7 +1643,7 @@ dbuf_destroy(dmu_buf_impl_t *db) ASSERT(db->db_data_pending == NULL); kmem_cache_free(dbuf_cache, db); - arc_space_return(sizeof (dmu_buf_impl_t)); + arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); } void @@ -1592,13 +1676,12 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid) if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) { if (bp && !BP_IS_HOLE(bp)) { arc_buf_t *pbuf; + dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; zbookmark_t zb; - zb.zb_objset = dn->dn_objset->os_dsl_dataset ? - dn->dn_objset->os_dsl_dataset->ds_object : 0; - zb.zb_object = dn->dn_object; - zb.zb_level = 0; - zb.zb_blkid = blkid; + + SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, + dn->dn_object, 0, blkid); if (db) pbuf = db->db_buf; @@ -1743,10 +1826,21 @@ dbuf_add_ref(dmu_buf_impl_t *db, void *tag) #pragma weak dmu_buf_rele = dbuf_rele void dbuf_rele(dmu_buf_impl_t *db, void *tag) +{ + mutex_enter(&db->db_mtx); + dbuf_rele_and_unlock(db, tag); +} + +/* + * dbuf_rele() for an already-locked dbuf. This is necessary to allow + * db_dirtycnt and db_holds to be updated atomically. 
+ */ +void +dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) { int64_t holds; - mutex_enter(&db->db_mtx); + ASSERT(MUTEX_HELD(&db->db_mtx)); DBUF_VERIFY(db); holds = refcount_remove(&db->db_holds, tag); @@ -1855,6 +1949,19 @@ dmu_buf_get_user(dmu_buf_t *db_fake) return (db->db_user_ptr); } +boolean_t +dmu_buf_freeable(dmu_buf_t *dbuf) +{ + boolean_t res = B_FALSE; + dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; + + if (db->db_blkptr) + res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset, + db->db_blkptr->blk_birth); + + return (res); +} + static void dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) { @@ -1941,9 +2048,8 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) arc_buf_t **datap = &dr->dt.dl.dr_data; dmu_buf_impl_t *db = dr->dr_dbuf; dnode_t *dn = db->db_dnode; - objset_impl_t *os = dn->dn_objset; + objset_t *os = dn->dn_objset; uint64_t txg = tx->tx_txg; - int blksz; ASSERT(dmu_tx_is_syncing(tx)); @@ -1980,19 +2086,19 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen); if (*datap != db->db.db_data) { zio_buf_free(*datap, DN_MAX_BONUSLEN); - arc_space_return(DN_MAX_BONUSLEN); + arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); } db->db_data_pending = NULL; drp = &db->db_last_dirty; while (*drp != dr) drp = &(*drp)->dr_next; ASSERT(dr->dr_next == NULL); + ASSERT(dr->dr_dbuf == db); *drp = dr->dr_next; kmem_free(dr, sizeof (dbuf_dirty_record_t)); ASSERT(db->db_dirtycnt > 0); db->db_dirtycnt -= 1; - mutex_exit(&db->db_mtx); - dbuf_rele(db, (void *)(uintptr_t)txg); + dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); return; } @@ -2014,67 +2120,26 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN); } - /* - * If this dbuf has already been written out via an immediate write, - * just complete the write by copying over the new block pointer and - * updating the accounting via the write-completion functions. - */ - if (dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { - zio_t zio_fake; - - zio_fake.io_private = &db; - zio_fake.io_error = 0; - zio_fake.io_bp = db->db_blkptr; - zio_fake.io_bp_orig = *db->db_blkptr; - zio_fake.io_txg = txg; - zio_fake.io_flags = 0; - - *db->db_blkptr = dr->dt.dl.dr_overridden_by; - dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; - db->db_data_pending = dr; - dr->dr_zio = &zio_fake; - mutex_exit(&db->db_mtx); - - ASSERT(!DVA_EQUAL(BP_IDENTITY(zio_fake.io_bp), - BP_IDENTITY(&zio_fake.io_bp_orig)) || - BP_IS_HOLE(zio_fake.io_bp)); - - if (BP_IS_OLDER(&zio_fake.io_bp_orig, txg)) - (void) dsl_dataset_block_kill(os->os_dsl_dataset, - &zio_fake.io_bp_orig, dn->dn_zio, tx); - - dbuf_write_ready(&zio_fake, db->db_buf, db); - dbuf_write_done(&zio_fake, db->db_buf, db); - - return; - } - - if (db->db_state != DB_NOFILL) { - blksz = arc_buf_size(*datap); - - if (dn->dn_object != DMU_META_DNODE_OBJECT) { - /* - * If this buffer is currently "in use" (i.e., there - * are active holds and db_data still references it), - * then make a copy before we start the write so that - * any modifications from the open txg will not leak - * into this write. - * - * NOTE: this copy does not need to be made for - * objects only modified in the syncing context (e.g. - * DNONE_DNODE blocks). 
- */ - if (refcount_count(&db->db_holds) > 1 && - *datap == db->db_buf) { - arc_buf_contents_t type = - DBUF_GET_BUFC_TYPE(db); - *datap = - arc_buf_alloc(os->os_spa, blksz, db, type); - bcopy(db->db.db_data, (*datap)->b_data, blksz); - } - } - - ASSERT(*datap != NULL); + if (db->db_state != DB_NOFILL && + dn->dn_object != DMU_META_DNODE_OBJECT && + refcount_count(&db->db_holds) > 1 && + dr->dt.dl.dr_override_state != DR_OVERRIDDEN && + *datap == db->db_buf) { + /* + * If this buffer is currently "in use" (i.e., there + * are active holds and db_data still references it), + * then make a copy before we start the write so that + * any modifications from the open txg will not leak + * into this write. + * + * NOTE: this copy does not need to be made for + * objects only modified in the syncing context (e.g. + * DNONE_DNODE blocks). + */ + int blksz = arc_buf_size(*datap); + arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); + *datap = arc_buf_alloc(os->os_spa, blksz, db, type); + bcopy(db->db.db_data, (*datap)->b_data, blksz); } db->db_data_pending = dr; @@ -2115,130 +2180,27 @@ dbuf_sync_list(list_t *list, dmu_tx_t *tx) } } -static void -dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) -{ - dmu_buf_impl_t *db = dr->dr_dbuf; - dnode_t *dn = db->db_dnode; - objset_impl_t *os = dn->dn_objset; - dmu_buf_impl_t *parent = db->db_parent; - uint64_t txg = tx->tx_txg; - zbookmark_t zb; - writeprops_t wp = { 0 }; - zio_t *zio; - - if (!BP_IS_HOLE(db->db_blkptr) && - (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE)) { - /* - * Private object buffers are released here rather - * than in dbuf_dirty() since they are only modified - * in the syncing context and we don't want the - * overhead of making multiple copies of the data. - */ - arc_release(data, db); - } else if (db->db_state != DB_NOFILL) { - ASSERT(arc_released(data)); - /* XXX why do we need to thaw here? */ - arc_buf_thaw(data); - } - - if (parent != dn->dn_dbuf) { - ASSERT(parent && parent->db_data_pending); - ASSERT(db->db_level == parent->db_level-1); - ASSERT(arc_released(parent->db_buf)); - zio = parent->db_data_pending->dr_zio; - } else { - ASSERT(db->db_level == dn->dn_phys->dn_nlevels-1); - ASSERT3P(db->db_blkptr, ==, - &dn->dn_phys->dn_blkptr[db->db_blkid]); - zio = dn->dn_zio; - } - - ASSERT(db->db_level == 0 || data == db->db_buf); - ASSERT3U(db->db_blkptr->blk_birth, <=, txg); - ASSERT(zio); - - zb.zb_objset = os->os_dsl_dataset ? 
os->os_dsl_dataset->ds_object : 0; - zb.zb_object = db->db.db_object; - zb.zb_level = db->db_level; - zb.zb_blkid = db->db_blkid; - - wp.wp_type = dn->dn_type; - wp.wp_level = db->db_level; - wp.wp_copies = os->os_copies; - wp.wp_dncompress = dn->dn_compress; - wp.wp_oscompress = os->os_compress; - wp.wp_dnchecksum = dn->dn_checksum; - wp.wp_oschecksum = os->os_checksum; - - if (BP_IS_OLDER(db->db_blkptr, txg)) - (void) dsl_dataset_block_kill( - os->os_dsl_dataset, db->db_blkptr, zio, tx); - - if (db->db_state == DB_NOFILL) { - zio_prop_t zp = { 0 }; - - write_policy(os->os_spa, &wp, &zp); - dr->dr_zio = zio_write(zio, os->os_spa, - txg, db->db_blkptr, NULL, - db->db.db_size, &zp, dbuf_skip_write_ready, - dbuf_skip_write_done, db, ZIO_PRIORITY_ASYNC_WRITE, - ZIO_FLAG_MUSTSUCCEED, &zb); - } else { - dr->dr_zio = arc_write(zio, os->os_spa, &wp, - DBUF_IS_L2CACHEABLE(db), txg, db->db_blkptr, - data, dbuf_write_ready, dbuf_write_done, db, - ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); - } -} - -/* wrapper function for dbuf_write_ready bypassing ARC */ -static void -dbuf_skip_write_ready(zio_t *zio) -{ - blkptr_t *bp = zio->io_bp; - - if (!BP_IS_GANG(bp)) - zio_skip_write(zio); - - dbuf_write_ready(zio, NULL, zio->io_private); -} - -/* wrapper function for dbuf_write_done bypassing ARC */ -static void -dbuf_skip_write_done(zio_t *zio) -{ - dbuf_write_done(zio, NULL, zio->io_private); -} - /* ARGSUSED */ static void dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) { dmu_buf_impl_t *db = vdb; - dnode_t *dn = db->db_dnode; - objset_impl_t *os = dn->dn_objset; blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; + dnode_t *dn = db->db_dnode; + spa_t *spa = zio->io_spa; + int64_t delta; uint64_t fill = 0; - int old_size, new_size, i; + int i; ASSERT(db->db_blkptr == bp); - dprintf_dbuf_bp(db, bp_orig, "bp_orig: %s", ""); - - old_size = bp_get_dasize(os->os_spa, bp_orig); - new_size = bp_get_dasize(os->os_spa, bp); - - dnode_diduse_space(dn, new_size - old_size); + delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig); + dnode_diduse_space(dn, delta - zio->io_prev_space_delta); + zio->io_prev_space_delta = delta; if (BP_IS_HOLE(bp)) { - dsl_dataset_t *ds = os->os_dsl_dataset; - dmu_tx_t *tx = os->os_synctx; - - if (bp_orig->blk_birth == tx->tx_txg) - (void) dsl_dataset_block_kill(ds, bp_orig, zio, tx); - ASSERT3U(bp->blk_fill, ==, 0); + ASSERT(bp->blk_fill == 0); return; } @@ -2269,9 +2231,6 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) { if (BP_IS_HOLE(ibp)) continue; - ASSERT3U(BP_GET_LSIZE(ibp), ==, - db->db_level == 1 ? 
dn->dn_datablksz : - (1<dn_phys->dn_indblkshift)); fill += ibp->blk_fill; } } @@ -2279,17 +2238,6 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) bp->blk_fill = fill; mutex_exit(&db->db_mtx); - - if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { - ASSERT(DVA_EQUAL(BP_IDENTITY(bp), BP_IDENTITY(bp_orig))); - } else { - dsl_dataset_t *ds = os->os_dsl_dataset; - dmu_tx_t *tx = os->os_synctx; - - if (bp_orig->blk_birth == tx->tx_txg) - (void) dsl_dataset_block_kill(ds, bp_orig, zio, tx); - dsl_dataset_block_born(ds, bp, tx); - } } /* ARGSUSED */ @@ -2297,37 +2245,50 @@ static void dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) { dmu_buf_impl_t *db = vdb; + blkptr_t *bp = zio->io_bp; + blkptr_t *bp_orig = &zio->io_bp_orig; + dnode_t *dn = db->db_dnode; + objset_t *os = dn->dn_objset; uint64_t txg = zio->io_txg; dbuf_dirty_record_t **drp, *dr; ASSERT3U(zio->io_error, ==, 0); + ASSERT(db->db_blkptr == bp); + + if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { + ASSERT(BP_EQUAL(bp, bp_orig)); + } else { + dsl_dataset_t *ds = os->os_dsl_dataset; + dmu_tx_t *tx = os->os_synctx; + + (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); + dsl_dataset_block_born(ds, bp, tx); + } mutex_enter(&db->db_mtx); + DBUF_VERIFY(db); + drp = &db->db_last_dirty; while ((dr = *drp) != db->db_data_pending) drp = &dr->dr_next; ASSERT(!list_link_active(&dr->dr_dirty_node)); ASSERT(dr->dr_txg == txg); + ASSERT(dr->dr_dbuf == db); ASSERT(dr->dr_next == NULL); *drp = dr->dr_next; if (db->db_level == 0) { ASSERT(db->db_blkid != DB_BONUS_BLKID); ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); - if (db->db_state != DB_NOFILL) { if (dr->dt.dl.dr_data != db->db_buf) VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db) == 1); - else if (!BP_IS_HOLE(db->db_blkptr)) + else if (!arc_released(db->db_buf)) arc_set_callback(db->db_buf, dbuf_do_evict, db); - else - ASSERT(arc_released(db->db_buf)); } } else { - dnode_t *dn = db->db_dnode; - ASSERT(list_head(&dr->dt.di.dr_children) == NULL); ASSERT3U(db->db.db_size, ==, 1<dn_phys->dn_indblkshift); if (!BP_IS_HOLE(db->db_blkptr)) { @@ -2348,9 +2309,122 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) ASSERT(db->db_dirtycnt > 0); db->db_dirtycnt -= 1; db->db_data_pending = NULL; + dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); +} + +static void +dbuf_write_nofill_ready(zio_t *zio) +{ + dbuf_write_ready(zio, NULL, zio->io_private); +} + +static void +dbuf_write_nofill_done(zio_t *zio) +{ + dbuf_write_done(zio, NULL, zio->io_private); +} + +static void +dbuf_write_override_ready(zio_t *zio) +{ + dbuf_dirty_record_t *dr = zio->io_private; + dmu_buf_impl_t *db = dr->dr_dbuf; + + dbuf_write_ready(zio, NULL, db); +} + +static void +dbuf_write_override_done(zio_t *zio) +{ + dbuf_dirty_record_t *dr = zio->io_private; + dmu_buf_impl_t *db = dr->dr_dbuf; + blkptr_t *obp = &dr->dt.dl.dr_overridden_by; + + mutex_enter(&db->db_mtx); + if (!BP_EQUAL(zio->io_bp, obp)) { + if (!BP_IS_HOLE(obp)) + dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp); + arc_release(dr->dt.dl.dr_data, db); + } mutex_exit(&db->db_mtx); - dprintf_dbuf_bp(db, zio->io_bp, "bp: %s", ""); + dbuf_write_done(zio, NULL, db); +} + +static void +dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = dr->dr_dbuf; + dnode_t *dn = db->db_dnode; + objset_t *os = dn->dn_objset; + dmu_buf_impl_t *parent = db->db_parent; + uint64_t txg = tx->tx_txg; + zbookmark_t zb; + zio_prop_t zp; + zio_t *zio; - dbuf_rele(db, (void *)(uintptr_t)txg); + if (db->db_state != DB_NOFILL) { + if 
(db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) { + /* + * Private object buffers are released here rather + * than in dbuf_dirty() since they are only modified + * in the syncing context and we don't want the + * overhead of making multiple copies of the data. + */ + if (BP_IS_HOLE(db->db_blkptr)) { + arc_buf_thaw(data); + } else { + arc_release(data, db); + } + } + } + + if (parent != dn->dn_dbuf) { + ASSERT(parent && parent->db_data_pending); + ASSERT(db->db_level == parent->db_level-1); + ASSERT(arc_released(parent->db_buf)); + zio = parent->db_data_pending->dr_zio; + } else { + ASSERT(db->db_level == dn->dn_phys->dn_nlevels-1); + ASSERT3P(db->db_blkptr, ==, + &dn->dn_phys->dn_blkptr[db->db_blkid]); + zio = dn->dn_zio; + } + + ASSERT(db->db_level == 0 || data == db->db_buf); + ASSERT3U(db->db_blkptr->blk_birth, <=, txg); + ASSERT(zio); + + SET_BOOKMARK(&zb, os->os_dsl_dataset ? + os->os_dsl_dataset->ds_object : DMU_META_OBJSET, + db->db.db_object, db->db_level, db->db_blkid); + + dmu_write_policy(os, dn, db->db_level, + db->db_state == DB_NOFILL ? WP_NOFILL : 0, &zp); + + if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { + ASSERT(db->db_state != DB_NOFILL); + dr->dr_zio = zio_write(zio, os->os_spa, txg, + db->db_blkptr, data->b_data, arc_buf_size(data), &zp, + dbuf_write_override_ready, dbuf_write_override_done, dr, + ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); + mutex_enter(&db->db_mtx); + dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; + zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by, + dr->dt.dl.dr_copies); + mutex_exit(&db->db_mtx); + } else if (db->db_state == DB_NOFILL) { + ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF); + dr->dr_zio = zio_write(zio, os->os_spa, txg, + db->db_blkptr, NULL, db->db.db_size, &zp, + dbuf_write_nofill_ready, dbuf_write_nofill_done, db, + ZIO_PRIORITY_ASYNC_WRITE, + ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb); + } else { + ASSERT(arc_released(data)); + dr->dr_zio = arc_write(zio, os->os_spa, txg, + db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db), &zp, + dbuf_write_ready, dbuf_write_done, db, + ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); + } } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/ddt.c b/external/cddl/osnet/dist/uts/common/fs/zfs/ddt.c new file mode 100644 index 0000000000000..afe72af7db11c --- /dev/null +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/ddt.c @@ -0,0 +1,1064 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static const ddt_ops_t *ddt_ops[DDT_TYPES] = { + &ddt_zap_ops, +}; + +static const char *ddt_class_name[DDT_CLASSES] = { + "ditto", + "duplicate", + "unique", +}; + +static void +ddt_object_create(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + dmu_tx_t *tx) +{ + spa_t *spa = ddt->ddt_spa; + objset_t *os = ddt->ddt_os; + uint64_t *objectp = &ddt->ddt_object[type][class]; + boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_dedup; + char name[DDT_NAMELEN]; + + ddt_object_name(ddt, type, class, name); + + ASSERT(*objectp == 0); + VERIFY(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash) == 0); + ASSERT(*objectp != 0); + + VERIFY(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name, + sizeof (uint64_t), 1, objectp, tx) == 0); + + VERIFY(zap_add(os, spa->spa_ddt_stat_object, name, + sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), + &ddt->ddt_histogram[type][class], tx) == 0); +} + +static void +ddt_object_destroy(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + dmu_tx_t *tx) +{ + spa_t *spa = ddt->ddt_spa; + objset_t *os = ddt->ddt_os; + uint64_t *objectp = &ddt->ddt_object[type][class]; + char name[DDT_NAMELEN]; + + ddt_object_name(ddt, type, class, name); + + ASSERT(*objectp != 0); + ASSERT(ddt_object_count(ddt, type, class) == 0); + ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class])); + VERIFY(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx) == 0); + VERIFY(zap_remove(os, spa->spa_ddt_stat_object, name, tx) == 0); + VERIFY(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx) == 0); + + *objectp = 0; +} + +static int +ddt_object_load(ddt_t *ddt, enum ddt_type type, enum ddt_class class) +{ + char name[DDT_NAMELEN]; + int error; + + ddt_object_name(ddt, type, class, name); + + error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name, + sizeof (uint64_t), 1, &ddt->ddt_object[type][class]); + + if (error) + return (error); + + error = zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name, + sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), + &ddt->ddt_histogram[type][class]); + + ASSERT(error == 0); + return (error); +} + +static void +ddt_object_sync(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + dmu_tx_t *tx) +{ + char name[DDT_NAMELEN]; + + ddt_object_name(ddt, type, class, name); + + VERIFY(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name, + sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), + &ddt->ddt_histogram[type][class], tx) == 0); +} + +static int +ddt_object_lookup(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + ddt_entry_t *dde) +{ + if (!ddt_object_exists(ddt, type, class)) + return (ENOENT); + + return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os, + ddt->ddt_object[type][class], dde)); +} + +static int +ddt_object_update(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + ddt_entry_t *dde, dmu_tx_t *tx) +{ + ASSERT(ddt_object_exists(ddt, type, class)); + + return (ddt_ops[type]->ddt_op_update(ddt->ddt_os, + ddt->ddt_object[type][class], dde, tx)); +} + +static int +ddt_object_remove(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + ddt_entry_t *dde, dmu_tx_t *tx) +{ + ASSERT(ddt_object_exists(ddt, type, class)); + + return (ddt_ops[type]->ddt_op_remove(ddt->ddt_os, + ddt->ddt_object[type][class], dde, tx)); +} + +int +ddt_object_walk(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + uint64_t *walk, ddt_entry_t *dde) +{ + 
ASSERT(ddt_object_exists(ddt, type, class)); + + return (ddt_ops[type]->ddt_op_walk(ddt->ddt_os, + ddt->ddt_object[type][class], dde, walk)); +} + +uint64_t +ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class) +{ + ASSERT(ddt_object_exists(ddt, type, class)); + + return (ddt_ops[type]->ddt_op_count(ddt->ddt_os, + ddt->ddt_object[type][class])); +} + +int +ddt_object_info(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + dmu_object_info_t *doi) +{ + if (!ddt_object_exists(ddt, type, class)) + return (ENOENT); + + return (dmu_object_info(ddt->ddt_os, ddt->ddt_object[type][class], + doi)); +} + +boolean_t +ddt_object_exists(ddt_t *ddt, enum ddt_type type, enum ddt_class class) +{ + return (!!ddt->ddt_object[type][class]); +} + +void +ddt_object_name(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + char *name) +{ + (void) sprintf(name, DMU_POOL_DDT, + zio_checksum_table[ddt->ddt_checksum].ci_name, + ddt_ops[type]->ddt_op_name, ddt_class_name[class]); +} + +void +ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg) +{ + ASSERT(txg != 0); + + for (int d = 0; d < SPA_DVAS_PER_BP; d++) + bp->blk_dva[d] = ddp->ddp_dva[d]; + BP_SET_BIRTH(bp, txg, ddp->ddp_phys_birth); +} + +void +ddt_bp_create(enum zio_checksum checksum, + const ddt_key_t *ddk, const ddt_phys_t *ddp, blkptr_t *bp) +{ + BP_ZERO(bp); + + if (ddp != NULL) + ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth); + + bp->blk_cksum = ddk->ddk_cksum; + + BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk)); + BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk)); + BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk)); + BP_SET_CHECKSUM(bp, checksum); + BP_SET_TYPE(bp, DMU_OT_NONE); + BP_SET_LEVEL(bp, 0); + BP_SET_DEDUP(bp, 0); + BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); +} + +void +ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp) +{ + ddk->ddk_cksum = bp->blk_cksum; + ddk->ddk_prop = 0; + + DDK_SET_LSIZE(ddk, BP_GET_LSIZE(bp)); + DDK_SET_PSIZE(ddk, BP_GET_PSIZE(bp)); + DDK_SET_COMPRESS(ddk, BP_GET_COMPRESS(bp)); +} + +void +ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp) +{ + ASSERT(ddp->ddp_phys_birth == 0); + + for (int d = 0; d < SPA_DVAS_PER_BP; d++) + ddp->ddp_dva[d] = bp->blk_dva[d]; + ddp->ddp_phys_birth = BP_PHYSICAL_BIRTH(bp); +} + +void +ddt_phys_clear(ddt_phys_t *ddp) +{ + bzero(ddp, sizeof (*ddp)); +} + +void +ddt_phys_addref(ddt_phys_t *ddp) +{ + ddp->ddp_refcnt++; +} + +void +ddt_phys_decref(ddt_phys_t *ddp) +{ + ASSERT((int64_t)ddp->ddp_refcnt > 0); + ddp->ddp_refcnt--; +} + +void +ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg) +{ + blkptr_t blk; + + ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); + ddt_phys_clear(ddp); + zio_free(ddt->ddt_spa, txg, &blk); +} + +ddt_phys_t * +ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp) +{ + ddt_phys_t *ddp = (ddt_phys_t *)dde->dde_phys; + + for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { + if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) && + BP_PHYSICAL_BIRTH(bp) == ddp->ddp_phys_birth) + return (ddp); + } + return (NULL); +} + +uint64_t +ddt_phys_total_refcnt(const ddt_entry_t *dde) +{ + uint64_t refcnt = 0; + + for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) + refcnt += dde->dde_phys[p].ddp_refcnt; + + return (refcnt); +} + +static void +ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds) +{ + spa_t *spa = ddt->ddt_spa; + ddt_phys_t *ddp = dde->dde_phys; + ddt_key_t *ddk = &dde->dde_key; + uint64_t lsize = DDK_GET_LSIZE(ddk); + uint64_t psize = DDK_GET_PSIZE(ddk); + + bzero(dds, sizeof (*dds)); + + for (int p = 0; p < 
DDT_PHYS_TYPES; p++, ddp++) { + uint64_t dsize = 0; + uint64_t refcnt = ddp->ddp_refcnt; + + if (ddp->ddp_phys_birth == 0) + continue; + + for (int d = 0; d < SPA_DVAS_PER_BP; d++) + dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]); + + dds->dds_blocks += 1; + dds->dds_lsize += lsize; + dds->dds_psize += psize; + dds->dds_dsize += dsize; + + dds->dds_ref_blocks += refcnt; + dds->dds_ref_lsize += lsize * refcnt; + dds->dds_ref_psize += psize * refcnt; + dds->dds_ref_dsize += dsize * refcnt; + } +} + +void +ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg) +{ + const uint64_t *s = (const uint64_t *)src; + uint64_t *d = (uint64_t *)dst; + uint64_t *d_end = (uint64_t *)(dst + 1); + + ASSERT(neg == 0 || neg == -1ULL); /* add or subtract */ + + while (d < d_end) + *d++ += (*s++ ^ neg) - neg; +} + +static void +ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg) +{ + ddt_stat_t dds; + ddt_histogram_t *ddh; + int bucket; + + ddt_stat_generate(ddt, dde, &dds); + + bucket = highbit(dds.dds_ref_blocks) - 1; + ASSERT(bucket >= 0); + + ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class]; + + ddt_stat_add(&ddh->ddh_stat[bucket], &dds, neg); +} + +void +ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src) +{ + for (int h = 0; h < 64; h++) + ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0); +} + +void +ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh) +{ + bzero(dds, sizeof (*dds)); + + for (int h = 0; h < 64; h++) + ddt_stat_add(dds, &ddh->ddh_stat[h], 0); +} + +boolean_t +ddt_histogram_empty(const ddt_histogram_t *ddh) +{ + const uint64_t *s = (const uint64_t *)ddh; + const uint64_t *s_end = (const uint64_t *)(ddh + 1); + + while (s < s_end) + if (*s++ != 0) + return (B_FALSE); + + return (B_TRUE); +} + +void +ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo) +{ + dmu_object_info_t doi; + uint64_t count; + int error; + + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + ddt_t *ddt = spa->spa_ddt[c]; + for (enum ddt_type type = 0; type < DDT_TYPES; type++) { + for (enum ddt_class class = 0; class < DDT_CLASSES; + class++) { + error = ddt_object_info(ddt, type, class, &doi); + if (error == ENOENT) + continue; + ASSERT3U(error, ==, 0); + + count = ddt_object_count(ddt, type, class); + ddo->ddo_count += count; + ddo->ddo_dspace += + (doi.doi_physical_blocks_512 << 9) / count; + ddo->ddo_mspace += doi.doi_fill_count * + doi.doi_data_block_size / count; + } + } + } +} + +void +ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh) +{ + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + ddt_t *ddt = spa->spa_ddt[c]; + for (enum ddt_type type = 0; type < DDT_TYPES; type++) { + for (enum ddt_class class = 0; class < DDT_CLASSES; + class++) { + ddt_histogram_add(ddh, + &ddt->ddt_histogram[type][class]); + } + } + } +} + +void +ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total) +{ + ddt_histogram_t *ddh_total; + + ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP); + ddt_get_dedup_histogram(spa, ddh_total); + ddt_histogram_stat(dds_total, ddh_total); + kmem_free(ddh_total, sizeof (ddt_histogram_t)); +} + +uint64_t +ddt_get_dedup_dspace(spa_t *spa) +{ + ddt_stat_t dds_total = { 0 }; + + ddt_get_dedup_stats(spa, &dds_total); + return (dds_total.dds_ref_dsize - dds_total.dds_dsize); +} + +uint64_t +ddt_get_pool_dedup_ratio(spa_t *spa) +{ + ddt_stat_t dds_total = { 0 }; + + ddt_get_dedup_stats(spa, &dds_total); + if (dds_total.dds_dsize == 0) + return (100); + + return (dds_total.dds_ref_dsize * 
100 / dds_total.dds_dsize); +} + +int +ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde, ddt_phys_t *ddp_willref) +{ + spa_t *spa = ddt->ddt_spa; + uint64_t total_refcnt = 0; + uint64_t ditto = spa->spa_dedup_ditto; + int total_copies = 0; + int desired_copies = 0; + + for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { + ddt_phys_t *ddp = &dde->dde_phys[p]; + zio_t *zio = dde->dde_lead_zio[p]; + uint64_t refcnt = ddp->ddp_refcnt; /* committed refs */ + if (zio != NULL) + refcnt += zio->io_parent_count; /* pending refs */ + if (ddp == ddp_willref) + refcnt++; /* caller's ref */ + if (refcnt != 0) { + total_refcnt += refcnt; + total_copies += p; + } + } + + if (ditto == 0 || ditto > UINT32_MAX) + ditto = UINT32_MAX; + + if (total_refcnt >= 1) + desired_copies++; + if (total_refcnt >= ditto) + desired_copies++; + if (total_refcnt >= ditto * ditto) + desired_copies++; + + return (MAX(desired_copies, total_copies) - total_copies); +} + +int +ddt_ditto_copies_present(ddt_entry_t *dde) +{ + ddt_phys_t *ddp = &dde->dde_phys[DDT_PHYS_DITTO]; + dva_t *dva = ddp->ddp_dva; + int copies = 0 - DVA_GET_GANG(dva); + + for (int d = 0; d < SPA_DVAS_PER_BP; d++, dva++) + if (DVA_IS_VALID(dva)) + copies++; + + ASSERT(copies >= 0 && copies < SPA_DVAS_PER_BP); + + return (copies); +} + +size_t +ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len) +{ + uchar_t *version = dst++; + int cpfunc = ZIO_COMPRESS_ZLE; + zio_compress_info_t *ci = &zio_compress_table[cpfunc]; + size_t c_len; + + ASSERT(d_len >= s_len + 1); /* no compression plus version byte */ + + c_len = ci->ci_compress(src, dst, s_len, d_len - 1, ci->ci_level); + + if (c_len == s_len) { + cpfunc = ZIO_COMPRESS_OFF; + bcopy(src, dst, s_len); + } + + *version = (ZFS_HOST_BYTEORDER & DDT_COMPRESS_BYTEORDER_MASK) | cpfunc; + + return (c_len + 1); +} + +void +ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len) +{ + uchar_t version = *src++; + int cpfunc = version & DDT_COMPRESS_FUNCTION_MASK; + zio_compress_info_t *ci = &zio_compress_table[cpfunc]; + + if (ci->ci_decompress != NULL) + (void) ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level); + else + bcopy(src, dst, d_len); + + if ((version ^ ZFS_HOST_BYTEORDER) & DDT_COMPRESS_BYTEORDER_MASK) + byteswap_uint64_array(dst, d_len); +} + +ddt_t * +ddt_select_by_checksum(spa_t *spa, enum zio_checksum c) +{ + return (spa->spa_ddt[c]); +} + +ddt_t * +ddt_select(spa_t *spa, const blkptr_t *bp) +{ + return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]); +} + +void +ddt_enter(ddt_t *ddt) +{ + mutex_enter(&ddt->ddt_lock); +} + +void +ddt_exit(ddt_t *ddt) +{ + mutex_exit(&ddt->ddt_lock); +} + +static ddt_entry_t * +ddt_alloc(const ddt_key_t *ddk) +{ + ddt_entry_t *dde; + + dde = kmem_zalloc(sizeof (ddt_entry_t), KM_SLEEP); + cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL); + + dde->dde_key = *ddk; + + return (dde); +} + +static void +ddt_free(ddt_entry_t *dde) +{ + ASSERT(!dde->dde_loading); + + for (int p = 0; p < DDT_PHYS_TYPES; p++) + ASSERT(dde->dde_lead_zio[p] == NULL); + + if (dde->dde_repair_data != NULL) + zio_buf_free(dde->dde_repair_data, + DDK_GET_PSIZE(&dde->dde_key)); + + cv_destroy(&dde->dde_cv); + kmem_free(dde, sizeof (*dde)); +} + +void +ddt_remove(ddt_t *ddt, ddt_entry_t *dde) +{ + ASSERT(MUTEX_HELD(&ddt->ddt_lock)); + + avl_remove(&ddt->ddt_tree, dde); + ddt_free(dde); +} + +ddt_entry_t * +ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add) +{ + ddt_entry_t *dde, dde_search; + enum ddt_type type; + enum ddt_class class; + avl_index_t where; + int error; + + 
ASSERT(MUTEX_HELD(&ddt->ddt_lock)); + + ddt_key_fill(&dde_search.dde_key, bp); + + dde = avl_find(&ddt->ddt_tree, &dde_search, &where); + if (dde == NULL) { + if (!add) + return (NULL); + dde = ddt_alloc(&dde_search.dde_key); + avl_insert(&ddt->ddt_tree, dde, where); + } + + while (dde->dde_loading) + cv_wait(&dde->dde_cv, &ddt->ddt_lock); + + if (dde->dde_loaded) + return (dde); + + dde->dde_loading = B_TRUE; + + ddt_exit(ddt); + + error = ENOENT; + + for (type = 0; type < DDT_TYPES; type++) { + for (class = 0; class < DDT_CLASSES; class++) { + error = ddt_object_lookup(ddt, type, class, dde); + if (error != ENOENT) + break; + } + if (error != ENOENT) + break; + } + + ASSERT(error == 0 || error == ENOENT); + + ddt_enter(ddt); + + ASSERT(dde->dde_loaded == B_FALSE); + ASSERT(dde->dde_loading == B_TRUE); + + dde->dde_type = type; /* will be DDT_TYPES if no entry found */ + dde->dde_class = class; /* will be DDT_CLASSES if no entry found */ + dde->dde_loaded = B_TRUE; + dde->dde_loading = B_FALSE; + + if (error == 0) + ddt_stat_update(ddt, dde, -1ULL); + + cv_broadcast(&dde->dde_cv); + + return (dde); +} + +int +ddt_entry_compare(const void *x1, const void *x2) +{ + const ddt_entry_t *dde1 = x1; + const ddt_entry_t *dde2 = x2; + const uint64_t *u1 = (const uint64_t *)&dde1->dde_key; + const uint64_t *u2 = (const uint64_t *)&dde2->dde_key; + + for (int i = 0; i < DDT_KEY_WORDS; i++) { + if (u1[i] < u2[i]) + return (-1); + if (u1[i] > u2[i]) + return (1); + } + + return (0); +} + +static ddt_t * +ddt_table_alloc(spa_t *spa, enum zio_checksum c) +{ + ddt_t *ddt; + + ddt = kmem_zalloc(sizeof (*ddt), KM_SLEEP); + + mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL); + avl_create(&ddt->ddt_tree, ddt_entry_compare, + sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node)); + avl_create(&ddt->ddt_repair_tree, ddt_entry_compare, + sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node)); + ddt->ddt_checksum = c; + ddt->ddt_spa = spa; + ddt->ddt_os = spa->spa_meta_objset; + + return (ddt); +} + +static void +ddt_table_free(ddt_t *ddt) +{ + ASSERT(avl_numnodes(&ddt->ddt_tree) == 0); + ASSERT(avl_numnodes(&ddt->ddt_repair_tree) == 0); + avl_destroy(&ddt->ddt_tree); + avl_destroy(&ddt->ddt_repair_tree); + mutex_destroy(&ddt->ddt_lock); + kmem_free(ddt, sizeof (*ddt)); +} + +void +ddt_create(spa_t *spa) +{ + spa->spa_dedup_checksum = ZIO_DEDUPCHECKSUM; + + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) + spa->spa_ddt[c] = ddt_table_alloc(spa, c); +} + +int +ddt_load(spa_t *spa) +{ + int error; + + ddt_create(spa); + + error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_DDT_STATS, sizeof (uint64_t), 1, + &spa->spa_ddt_stat_object); + + if (error) + return (error == ENOENT ? 
0 : error); + + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + for (enum ddt_type type = 0; type < DDT_TYPES; type++) { + for (enum ddt_class class = 0; class < DDT_CLASSES; + class++) { + ddt_t *ddt = spa->spa_ddt[c]; + error = ddt_object_load(ddt, type, class); + if (error != 0 && error != ENOENT) + return (error); + } + } + } + + return (0); +} + +void +ddt_unload(spa_t *spa) +{ + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + if (spa->spa_ddt[c]) { + ddt_table_free(spa->spa_ddt[c]); + spa->spa_ddt[c] = NULL; + } + } +} + +boolean_t +ddt_class_contains(spa_t *spa, enum ddt_class max_class, const blkptr_t *bp) +{ + ddt_t *ddt; + ddt_entry_t dde; + + if (!BP_GET_DEDUP(bp)) + return (B_FALSE); + + if (max_class == DDT_CLASS_UNIQUE) + return (B_TRUE); + + ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)]; + + ddt_key_fill(&dde.dde_key, bp); + + for (enum ddt_type type = 0; type < DDT_TYPES; type++) + for (enum ddt_class class = 0; class <= max_class; class++) + if (ddt_object_lookup(ddt, type, class, &dde) == 0) + return (B_TRUE); + + return (B_FALSE); +} + +ddt_entry_t * +ddt_repair_start(ddt_t *ddt, const blkptr_t *bp) +{ + ddt_key_t ddk; + ddt_entry_t *dde; + + ddt_key_fill(&ddk, bp); + + dde = ddt_alloc(&ddk); + + for (enum ddt_type type = 0; type < DDT_TYPES; type++) { + for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { + /* + * We can only do repair if there are multiple copies + * of the block. For anything in the UNIQUE class, + * there's definitely only one copy, so don't even try. + */ + if (class != DDT_CLASS_UNIQUE && + ddt_object_lookup(ddt, type, class, dde) == 0) + return (dde); + } + } + + bzero(dde->dde_phys, sizeof (dde->dde_phys)); + + return (dde); +} + +void +ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde) +{ + avl_index_t where; + + ddt_enter(ddt); + + if (dde->dde_repair_data != NULL && spa_writeable(ddt->ddt_spa) && + avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL) + avl_insert(&ddt->ddt_repair_tree, dde, where); + else + ddt_free(dde); + + ddt_exit(ddt); +} + +static void +ddt_repair_entry_done(zio_t *zio) +{ + ddt_entry_t *rdde = zio->io_private; + + ddt_free(rdde); +} + +static void +ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio) +{ + ddt_phys_t *ddp = dde->dde_phys; + ddt_phys_t *rddp = rdde->dde_phys; + ddt_key_t *ddk = &dde->dde_key; + ddt_key_t *rddk = &rdde->dde_key; + zio_t *zio; + blkptr_t blk; + + zio = zio_null(rio, rio->io_spa, NULL, + ddt_repair_entry_done, rdde, rio->io_flags); + + for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++, rddp++) { + if (ddp->ddp_phys_birth == 0 || + ddp->ddp_phys_birth != rddp->ddp_phys_birth || + bcmp(ddp->ddp_dva, rddp->ddp_dva, sizeof (ddp->ddp_dva))) + continue; + ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); + zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk, + rdde->dde_repair_data, DDK_GET_PSIZE(rddk), NULL, NULL, + ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL)); + } + + zio_nowait(zio); +} + +static void +ddt_repair_table(ddt_t *ddt, zio_t *rio) +{ + spa_t *spa = ddt->ddt_spa; + ddt_entry_t *dde, *rdde_next, *rdde; + avl_tree_t *t = &ddt->ddt_repair_tree; + blkptr_t blk; + + if (spa_sync_pass(spa) > 1) + return; + + ddt_enter(ddt); + for (rdde = avl_first(t); rdde != NULL; rdde = rdde_next) { + rdde_next = AVL_NEXT(t, rdde); + avl_remove(&ddt->ddt_repair_tree, rdde); + ddt_exit(ddt); + ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, &blk); + dde = ddt_repair_start(ddt, &blk); + ddt_repair_entry(ddt, dde, rdde, rio); + 
ddt_repair_done(ddt, dde); + ddt_enter(ddt); + } + ddt_exit(ddt); +} + +static void +ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg) +{ + dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool; + ddt_phys_t *ddp = dde->dde_phys; + ddt_key_t *ddk = &dde->dde_key; + enum ddt_type otype = dde->dde_type; + enum ddt_type ntype = DDT_TYPE_CURRENT; + enum ddt_class oclass = dde->dde_class; + enum ddt_class nclass; + uint64_t total_refcnt = 0; + + ASSERT(dde->dde_loaded); + ASSERT(!dde->dde_loading); + + for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { + ASSERT(dde->dde_lead_zio[p] == NULL); + ASSERT((int64_t)ddp->ddp_refcnt >= 0); + if (ddp->ddp_phys_birth == 0) { + ASSERT(ddp->ddp_refcnt == 0); + continue; + } + if (p == DDT_PHYS_DITTO) { + if (ddt_ditto_copies_needed(ddt, dde, NULL) == 0) + ddt_phys_free(ddt, ddk, ddp, txg); + continue; + } + if (ddp->ddp_refcnt == 0) + ddt_phys_free(ddt, ddk, ddp, txg); + total_refcnt += ddp->ddp_refcnt; + } + + if (dde->dde_phys[DDT_PHYS_DITTO].ddp_phys_birth != 0) + nclass = DDT_CLASS_DITTO; + else if (total_refcnt > 1) + nclass = DDT_CLASS_DUPLICATE; + else + nclass = DDT_CLASS_UNIQUE; + + if (otype != DDT_TYPES && + (otype != ntype || oclass != nclass || total_refcnt == 0)) { + VERIFY(ddt_object_remove(ddt, otype, oclass, dde, tx) == 0); + ASSERT(ddt_object_lookup(ddt, otype, oclass, dde) == ENOENT); + } + + if (total_refcnt != 0) { + dde->dde_type = ntype; + dde->dde_class = nclass; + ddt_stat_update(ddt, dde, 0); + if (!ddt_object_exists(ddt, ntype, nclass)) + ddt_object_create(ddt, ntype, nclass, tx); + VERIFY(ddt_object_update(ddt, ntype, nclass, dde, tx) == 0); + + if (dp->dp_scrub_func != SCRUB_FUNC_NONE && + oclass > nclass && + nclass <= dp->dp_scrub_ddt_class_max) + dsl_pool_scrub_ddt_entry(dp, ddt->ddt_checksum, dde); + } +} + +static void +ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg) +{ + spa_t *spa = ddt->ddt_spa; + ddt_entry_t *dde; + void *cookie = NULL; + + if (avl_numnodes(&ddt->ddt_tree) == 0) + return; + + ASSERT(spa_sync_pass(spa) == 1); + ASSERT(spa->spa_uberblock.ub_version >= SPA_VERSION_DEDUP); + + if (spa->spa_ddt_stat_object == 0) { + spa->spa_ddt_stat_object = zap_create(ddt->ddt_os, + DMU_OT_DDT_STATS, DMU_OT_NONE, 0, tx); + VERIFY(zap_add(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_DDT_STATS, sizeof (uint64_t), 1, + &spa->spa_ddt_stat_object, tx) == 0); + } + + while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) { + ddt_sync_entry(ddt, dde, tx, txg); + ddt_free(dde); + } + + for (enum ddt_type type = 0; type < DDT_TYPES; type++) { + for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { + if (!ddt_object_exists(ddt, type, class)) + continue; + ddt_object_sync(ddt, type, class, tx); + if (ddt_object_count(ddt, type, class) == 0) + ddt_object_destroy(ddt, type, class, tx); + } + } +} + +void +ddt_sync(spa_t *spa, uint64_t txg) +{ + dmu_tx_t *tx; + zio_t *rio = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); + + ASSERT(spa_syncing_txg(spa) == txg); + + tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); + + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + ddt_t *ddt = spa->spa_ddt[c]; + if (ddt == NULL) + continue; + ddt_sync_table(ddt, tx, txg); + ddt_repair_table(ddt, rio); + } + + (void) zio_wait(rio); + + dmu_tx_commit(tx); +} + +int +ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde) +{ + do { + do { + do { + ddt_t *ddt = spa->spa_ddt[ddb->ddb_checksum]; + int error = ENOENT; + if (ddt_object_exists(ddt, ddb->ddb_type, + 
ddb->ddb_class)) { + error = ddt_object_walk(ddt, + ddb->ddb_type, ddb->ddb_class, + &ddb->ddb_cursor, dde); + } + if (error == 0) + return (0); + if (error != ENOENT) + return (error); + ddb->ddb_cursor = 0; + } while (++ddb->ddb_checksum < ZIO_CHECKSUM_FUNCTIONS); + ddb->ddb_checksum = 0; + } while (++ddb->ddb_type < DDT_TYPES); + ddb->ddb_type = 0; + } while (++ddb->ddb_class < DDT_CLASSES); + + return (ENOENT); +} diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/ddt_zap.c b/external/cddl/osnet/dist/uts/common/fs/zfs/ddt_zap.c new file mode 100644 index 0000000000000..1ba5278193373 --- /dev/null +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/ddt_zap.c @@ -0,0 +1,150 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include +#include +#include +#include +#include +#include +#include + +int ddt_zap_leaf_blockshift = 12; +int ddt_zap_indirect_blockshift = 12; + +static int +ddt_zap_create(objset_t *os, uint64_t *objectp, dmu_tx_t *tx, boolean_t prehash) +{ + zap_flags_t flags = ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY; + + if (prehash) + flags |= ZAP_FLAG_PRE_HASHED_KEY; + + *objectp = zap_create_flags(os, 0, flags, DMU_OT_DDT_ZAP, + ddt_zap_leaf_blockshift, ddt_zap_indirect_blockshift, + DMU_OT_NONE, 0, tx); + + return (*objectp == 0 ? 
ENOTSUP : 0); +} + +static int +ddt_zap_destroy(objset_t *os, uint64_t object, dmu_tx_t *tx) +{ + return (zap_destroy(os, object, tx)); +} + +static int +ddt_zap_lookup(objset_t *os, uint64_t object, ddt_entry_t *dde) +{ + uchar_t cbuf[sizeof (dde->dde_phys) + 1]; + uint64_t one, csize; + int error; + + error = zap_length_uint64(os, object, (uint64_t *)&dde->dde_key, + DDT_KEY_WORDS, &one, &csize); + if (error) + return (error); + + ASSERT(one == 1); + ASSERT(csize <= sizeof (cbuf)); + + error = zap_lookup_uint64(os, object, (uint64_t *)&dde->dde_key, + DDT_KEY_WORDS, 1, csize, cbuf); + if (error) + return (error); + + ddt_decompress(cbuf, dde->dde_phys, csize, sizeof (dde->dde_phys)); + + return (0); +} + +static int +ddt_zap_update(objset_t *os, uint64_t object, ddt_entry_t *dde, dmu_tx_t *tx) +{ + uchar_t cbuf[sizeof (dde->dde_phys) + 1]; + uint64_t csize; + + csize = ddt_compress(dde->dde_phys, cbuf, + sizeof (dde->dde_phys), sizeof (cbuf)); + + return (zap_update_uint64(os, object, (uint64_t *)&dde->dde_key, + DDT_KEY_WORDS, 1, csize, cbuf, tx)); +} + +static int +ddt_zap_remove(objset_t *os, uint64_t object, ddt_entry_t *dde, dmu_tx_t *tx) +{ + return (zap_remove_uint64(os, object, (uint64_t *)&dde->dde_key, + DDT_KEY_WORDS, tx)); +} + +static int +ddt_zap_walk(objset_t *os, uint64_t object, ddt_entry_t *dde, uint64_t *walk) +{ + zap_cursor_t zc; + zap_attribute_t za; + int error; + + zap_cursor_init_serialized(&zc, os, object, *walk); + if ((error = zap_cursor_retrieve(&zc, &za)) == 0) { + uchar_t cbuf[sizeof (dde->dde_phys) + 1]; + uint64_t csize = za.za_num_integers; + ASSERT(za.za_integer_length == 1); + error = zap_lookup_uint64(os, object, (uint64_t *)za.za_name, + DDT_KEY_WORDS, 1, csize, cbuf); + ASSERT(error == 0); + if (error == 0) { + ddt_decompress(cbuf, dde->dde_phys, csize, + sizeof (dde->dde_phys)); + dde->dde_key = *(ddt_key_t *)za.za_name; + } + zap_cursor_advance(&zc); + *walk = zap_cursor_serialize(&zc); + } + zap_cursor_fini(&zc); + return (error); +} + +static uint64_t +ddt_zap_count(objset_t *os, uint64_t object) +{ + uint64_t count = 0; + + VERIFY(zap_count(os, object, &count) == 0); + + return (count); +} + +const ddt_ops_t ddt_zap_ops = { + "zap", + ddt_zap_create, + ddt_zap_destroy, + ddt_zap_lookup, + ddt_zap_update, + ddt_zap_remove, + ddt_zap_walk, + ddt_zap_count, +}; diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/dmu.c b/external/cddl/osnet/dist/uts/common/fs/zfs/dmu.c index b6205bd500a87..ad73451cb3a6c 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/dmu.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/dmu.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -85,6 +85,11 @@ const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { { byteswap_uint64_array, TRUE, "FUID table size" }, { zap_byteswap, TRUE, "DSL dataset next clones"}, { zap_byteswap, TRUE, "scrub work queue" }, + { zap_byteswap, TRUE, "ZFS user/group used" }, + { zap_byteswap, TRUE, "ZFS user/group quota" }, + { zap_byteswap, TRUE, "snapshot refcount tags"}, + { zap_byteswap, TRUE, "DDT ZAP algorithm" }, + { zap_byteswap, TRUE, "DDT statistics" }, }; int @@ -96,7 +101,7 @@ dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, dmu_buf_impl_t *db; int err; - err = dnode_hold(os->os, object, FTAG, &dn); + err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); blkid = dbuf_whichblock(dn, offset); @@ -147,7 +152,7 @@ dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp) dmu_buf_impl_t *db; int error; - error = dnode_hold(os->os, object, FTAG, &dn); + error = dnode_hold(os, object, FTAG, &dn); if (error) return (error); @@ -180,22 +185,22 @@ dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp) * whose dnodes are in the same block. */ static int -dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, - uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) +dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, + int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags) { dsl_pool_t *dp = NULL; dmu_buf_t **dbp; uint64_t blkid, nblks, i; - uint32_t flags; + uint32_t dbuf_flags; int err; zio_t *zio; hrtime_t start; ASSERT(length <= DMU_MAX_ACCESS); - flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT; - if (length > zfetch_array_rd_sz) - flags |= DB_RF_NOPREFETCH; + dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT; + if (flags & DMU_READ_NO_PREFETCH || length > zfetch_array_rd_sz) + dbuf_flags |= DB_RF_NOPREFETCH; rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_datablkshift) { @@ -210,6 +215,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, os_dsl_dataset->ds_object, (longlong_t)dn->dn_object, dn->dn_datablksz, (longlong_t)offset, (longlong_t)length); + rw_exit(&dn->dn_struct_rwlock); return (EIO); } nblks = 1; @@ -232,9 +238,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, } /* initiate async i/o */ if (read) { - rw_exit(&dn->dn_struct_rwlock); - (void) dbuf_read(db, zio, flags); - rw_enter(&dn->dn_struct_rwlock, RW_READER); + (void) dbuf_read(db, zio, dbuf_flags); } dbp[i] = &db->db; } @@ -280,12 +284,12 @@ dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, dnode_t *dn; int err; - err = dnode_hold(os->os, object, FTAG, &dn); + err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, - numbufsp, dbpp); + numbufsp, dbpp, DMU_READ_PREFETCH); dnode_rele(dn, FTAG); @@ -300,7 +304,7 @@ dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset, int err; err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, - numbufsp, dbpp); + numbufsp, dbpp, DMU_READ_PREFETCH); return (err); } @@ -333,7 +337,7 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) return; if (len == 0) { /* they're interested in the bonus buffer */ - dn = os->os->os_meta_dnode; + dn = os->os_meta_dnode; if (object == 0 || object >= DN_MAX_OBJECT) return; @@ -350,7 +354,7 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) * already cached, we will do a *synchronous* read in the * dnode_hold() call. 
The same is true for any indirects. */ - err = dnode_hold(os->os, object, FTAG, &dn); + err = dnode_hold(os, object, FTAG, &dn); if (err != 0) return; @@ -374,56 +378,51 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) dnode_rele(dn, FTAG); } +/* + * Get the next "chunk" of file data to free. We traverse the file from + * the end so that the file gets shorter over time (if we crashes in the + * middle, this will leave us in a better state). We find allocated file + * data by simply searching the allocated level 1 indirects. + */ static int -get_next_chunk(dnode_t *dn, uint64_t *offset, uint64_t limit) +get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t limit) { - uint64_t len = *offset - limit; - uint64_t chunk_len = dn->dn_datablksz * DMU_MAX_DELETEBLKCNT; - uint64_t subchunk = + uint64_t len = *start - limit; + uint64_t blkcnt = 0; + uint64_t maxblks = DMU_MAX_ACCESS / (1ULL << (dn->dn_indblkshift + 1)); + uint64_t iblkrange = dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT); - ASSERT(limit <= *offset); + ASSERT(limit <= *start); - if (len <= chunk_len) { - *offset = limit; + if (len <= iblkrange * maxblks) { + *start = limit; return (0); } + ASSERT(ISP2(iblkrange)); - ASSERT(ISP2(subchunk)); - - while (*offset > limit) { - uint64_t initial_offset = P2ROUNDUP(*offset, subchunk); - uint64_t delta; + while (*start > limit && blkcnt < maxblks) { int err; - /* skip over allocated data */ + /* find next allocated L1 indirect */ err = dnode_next_offset(dn, - DNODE_FIND_HOLE|DNODE_FIND_BACKWARDS, offset, 1, 1, 0); - if (err == ESRCH) - *offset = limit; - else if (err) - return (err); + DNODE_FIND_BACKWARDS, start, 2, 1, 0); - ASSERT3U(*offset, <=, initial_offset); - *offset = P2ALIGN(*offset, subchunk); - delta = initial_offset - *offset; - if (delta >= chunk_len) { - *offset += delta - chunk_len; + /* if there are no more, then we are done */ + if (err == ESRCH) { + *start = limit; return (0); - } - chunk_len -= delta; - - /* skip over unallocated data */ - err = dnode_next_offset(dn, - DNODE_FIND_BACKWARDS, offset, 1, 1, 0); - if (err == ESRCH) - *offset = limit; - else if (err) + } else if (err) { return (err); + } + blkcnt += 1; - if (*offset < limit) - *offset = limit; - ASSERT3U(*offset, <, initial_offset); + /* reset offset to end of "next" block back */ + *start = P2ALIGN(*start, iblkrange); + if (*start <= limit) + *start = limit; + else + *start -= 1; } return (0); } @@ -442,7 +441,8 @@ dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset, object_size = align == 1 ? 
dn->dn_datablksz : (dn->dn_maxblkid + 1) << dn->dn_datablkshift; - if (trunc || (end = offset + length) > object_size) + end = offset + length; + if (trunc || end > object_size) end = object_size; if (end <= offset) return (0); @@ -450,6 +450,7 @@ dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset, while (length) { start = end; + /* assert(offset <= start) */ err = get_next_chunk(dn, &start, offset); if (err) return (err); @@ -485,7 +486,7 @@ dmu_free_long_range(objset_t *os, uint64_t object, dnode_t *dn; int err; - err = dnode_hold(os->os, object, FTAG, &dn); + err = dnode_hold(os, object, FTAG, &dn); if (err != 0) return (err); err = dmu_free_long_range_impl(os, dn, offset, length, FALSE); @@ -500,7 +501,7 @@ dmu_free_object(objset_t *os, uint64_t object) dmu_tx_t *tx; int err; - err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED, + err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, FTAG, &dn); if (err != 0) return (err); @@ -528,7 +529,7 @@ dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx) { dnode_t *dn; - int err = dnode_hold(os->os, object, FTAG, &dn); + int err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); ASSERT(offset < UINT64_MAX); @@ -540,13 +541,13 @@ dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - void *buf) + void *buf, uint32_t flags) { dnode_t *dn; dmu_buf_t **dbp; - int numbufs, i, err; + int numbufs, err; - err = dnode_hold(os->os, object, FTAG, &dn); + err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); @@ -555,7 +556,7 @@ dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, * block. If we ever do the tail block optimization, we will need to * handle that here as well. */ - if (dn->dn_datablkshift == 0) { + if (dn->dn_maxblkid == 0) { int newsz = offset > dn->dn_datablksz ? 0 : MIN(size, dn->dn_datablksz - offset); bzero((char *)buf + newsz, size - newsz); @@ -564,13 +565,14 @@ dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, while (size > 0) { uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2); + int i; /* * NB: we could do this block-at-a-time, but it's nice * to be reading in parallel. 
*/ err = dmu_buf_hold_array_by_dnode(dn, offset, mylen, - TRUE, FTAG, &numbufs, &dbp); + TRUE, FTAG, &numbufs, &dbp, flags); if (err) break; @@ -659,12 +661,136 @@ dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_buf_rele_array(dbp, numbufs, FTAG); } +/* + * DMU support for xuio + */ +kstat_t *xuio_ksp = NULL; + +int +dmu_xuio_init(xuio_t *xuio, int nblk) +{ + dmu_xuio_t *priv; + uio_t *uio = &xuio->xu_uio; + + uio->uio_iovcnt = nblk; + uio->uio_iov = kmem_zalloc(nblk * sizeof (iovec_t), KM_SLEEP); + + priv = kmem_zalloc(sizeof (dmu_xuio_t), KM_SLEEP); + priv->cnt = nblk; + priv->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_SLEEP); + priv->iovp = uio->uio_iov; + XUIO_XUZC_PRIV(xuio) = priv; + + if (XUIO_XUZC_RW(xuio) == UIO_READ) + XUIOSTAT_INCR(xuiostat_onloan_rbuf, nblk); + else + XUIOSTAT_INCR(xuiostat_onloan_wbuf, nblk); + + return (0); +} + +void +dmu_xuio_fini(xuio_t *xuio) +{ + dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); + int nblk = priv->cnt; + + kmem_free(priv->iovp, nblk * sizeof (iovec_t)); + kmem_free(priv->bufs, nblk * sizeof (arc_buf_t *)); + kmem_free(priv, sizeof (dmu_xuio_t)); + + if (XUIO_XUZC_RW(xuio) == UIO_READ) + XUIOSTAT_INCR(xuiostat_onloan_rbuf, -nblk); + else + XUIOSTAT_INCR(xuiostat_onloan_wbuf, -nblk); +} + +/* + * Initialize iov[priv->next] and priv->bufs[priv->next] with { off, n, abuf } + * and increase priv->next by 1. + */ +int +dmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n) +{ + struct iovec *iov; + uio_t *uio = &xuio->xu_uio; + dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); + int i = priv->next++; + + ASSERT(i < priv->cnt); + ASSERT(off + n <= arc_buf_size(abuf)); + iov = uio->uio_iov + i; + iov->iov_base = (char *)abuf->b_data + off; + iov->iov_len = n; + priv->bufs[i] = abuf; + return (0); +} + +int +dmu_xuio_cnt(xuio_t *xuio) +{ + dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); + return (priv->cnt); +} + +arc_buf_t * +dmu_xuio_arcbuf(xuio_t *xuio, int i) +{ + dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); + + ASSERT(i < priv->cnt); + return (priv->bufs[i]); +} + +void +dmu_xuio_clear(xuio_t *xuio, int i) +{ + dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); + + ASSERT(i < priv->cnt); + priv->bufs[i] = NULL; +} + +static void +xuio_stat_init(void) +{ + xuio_ksp = kstat_create("zfs", 0, "xuio_stats", "misc", + KSTAT_TYPE_NAMED, sizeof (xuio_stats) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + if (xuio_ksp != NULL) { + xuio_ksp->ks_data = &xuio_stats; + kstat_install(xuio_ksp); + } +} + +static void +xuio_stat_fini(void) +{ + if (xuio_ksp != NULL) { + kstat_delete(xuio_ksp); + xuio_ksp = NULL; + } +} + +void +xuio_stat_wbuf_copied() +{ + XUIOSTAT_BUMP(xuiostat_wbuf_copied); +} + +void +xuio_stat_wbuf_nocopy() +{ + XUIOSTAT_BUMP(xuiostat_wbuf_nocopy); +} + #ifdef _KERNEL int dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) { dmu_buf_t **dbp; int numbufs, i, err; + xuio_t *xuio = NULL; /* * NB: we could do this block-at-a-time, but it's nice @@ -675,6 +801,9 @@ dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) if (err) return (err); + if (uio->uio_extflg == UIO_XUIO) + xuio = (xuio_t *)uio; + for (i = 0; i < numbufs; i++) { int tocpy; int bufoff; @@ -685,8 +814,24 @@ dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) bufoff = uio->uio_loffset - db->db_offset; tocpy = (int)MIN(db->db_size - bufoff, size); - err = uiomove((char *)db->db_data + bufoff, tocpy, - UIO_READ, uio); + if (xuio) { + dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; + arc_buf_t *dbuf_abuf = 
dbi->db_buf; + arc_buf_t *abuf = dbuf_loan_arcbuf(dbi); + err = dmu_xuio_add(xuio, abuf, bufoff, tocpy); + if (!err) { + uio->uio_resid -= tocpy; + uio->uio_loffset += tocpy; + } + + if (abuf == dbuf_abuf) + XUIOSTAT_BUMP(xuiostat_rbuf_nocopy); + else + XUIOSTAT_BUMP(xuiostat_rbuf_copied); + } else { + err = uiomove((char *)db->db_data + bufoff, tocpy, + UIO_READ, uio); + } if (err) break; @@ -799,9 +944,6 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, if (tocpy == db->db_size) dmu_buf_fill_done(db, tx); - if (err) - break; - offset += tocpy; size -= tocpy; } @@ -810,48 +952,167 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, } #endif +/* + * Allocate a loaned anonymous arc buffer. + */ +arc_buf_t * +dmu_request_arcbuf(dmu_buf_t *handle, int size) +{ + dnode_t *dn = ((dmu_buf_impl_t *)handle)->db_dnode; + + return (arc_loan_buf(dn->dn_objset->os_spa, size)); +} + +/* + * Free a loaned arc buffer. + */ +void +dmu_return_arcbuf(arc_buf_t *buf) +{ + arc_return_buf(buf, FTAG); + VERIFY(arc_buf_remove_ref(buf, FTAG) == 1); +} + +/* + * When possible directly assign passed loaned arc buffer to a dbuf. + * If this is not possible copy the contents of passed arc buf via + * dmu_write(). + */ +void +dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, + dmu_tx_t *tx) +{ + dnode_t *dn = ((dmu_buf_impl_t *)handle)->db_dnode; + dmu_buf_impl_t *db; + uint32_t blksz = (uint32_t)arc_buf_size(buf); + uint64_t blkid; + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + blkid = dbuf_whichblock(dn, offset); + VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL); + rw_exit(&dn->dn_struct_rwlock); + + if (offset == db->db.db_offset && blksz == db->db.db_size) { + dbuf_assign_arcbuf(db, buf, tx); + dbuf_rele(db, FTAG); + } else { + dbuf_rele(db, FTAG); + dmu_write(dn->dn_objset, dn->dn_object, offset, blksz, + buf->b_data, tx); + dmu_return_arcbuf(buf); + XUIOSTAT_BUMP(xuiostat_wbuf_copied); + } +} + typedef struct { - dbuf_dirty_record_t *dr; - dmu_sync_cb_t *done; - void *arg; + dbuf_dirty_record_t *dsa_dr; + dmu_sync_cb_t *dsa_done; + zgd_t *dsa_zgd; + dmu_tx_t *dsa_tx; } dmu_sync_arg_t; /* ARGSUSED */ static void dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg) { + dmu_sync_arg_t *dsa = varg; + dmu_buf_t *db = dsa->dsa_zgd->zgd_db; + dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode; blkptr_t *bp = zio->io_bp; - if (!BP_IS_HOLE(bp)) { - dmu_sync_arg_t *in = varg; - dbuf_dirty_record_t *dr = in->dr; - dmu_buf_impl_t *db = dr->dr_dbuf; - ASSERT(BP_GET_TYPE(bp) == db->db_dnode->dn_type); - ASSERT(BP_GET_LEVEL(bp) == 0); - bp->blk_fill = 1; + if (zio->io_error == 0) { + if (BP_IS_HOLE(bp)) { + /* + * A block of zeros may compress to a hole, but the + * block size still needs to be known for replay. 
+ */ + BP_SET_LSIZE(bp, db->db_size); + } else { + ASSERT(BP_GET_TYPE(bp) == dn->dn_type); + ASSERT(BP_GET_LEVEL(bp) == 0); + bp->blk_fill = 1; + } } } +static void +dmu_sync_late_arrival_ready(zio_t *zio) +{ + dmu_sync_ready(zio, NULL, zio->io_private); +} + /* ARGSUSED */ static void dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) { - dmu_sync_arg_t *in = varg; - dbuf_dirty_record_t *dr = in->dr; + dmu_sync_arg_t *dsa = varg; + dbuf_dirty_record_t *dr = dsa->dsa_dr; dmu_buf_impl_t *db = dr->dr_dbuf; - dmu_sync_cb_t *done = in->done; mutex_enter(&db->db_mtx); ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC); - dr->dt.dl.dr_overridden_by = *zio->io_bp; /* structure assignment */ - dr->dt.dl.dr_override_state = DR_OVERRIDDEN; + if (zio->io_error == 0) { + dr->dt.dl.dr_overridden_by = *zio->io_bp; + dr->dt.dl.dr_override_state = DR_OVERRIDDEN; + dr->dt.dl.dr_copies = zio->io_prop.zp_copies; + if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by)) + BP_ZERO(&dr->dt.dl.dr_overridden_by); + } else { + dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; + } cv_broadcast(&db->db_changed); mutex_exit(&db->db_mtx); - if (done) - done(&(db->db), in->arg); + dsa->dsa_done(dsa->dsa_zgd, zio->io_error); + + kmem_free(dsa, sizeof (*dsa)); +} + +static void +dmu_sync_late_arrival_done(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + dmu_sync_arg_t *dsa = zio->io_private; + + if (zio->io_error == 0 && !BP_IS_HOLE(bp)) { + ASSERT(zio->io_bp->blk_birth == zio->io_txg); + ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa)); + zio_free(zio->io_spa, zio->io_txg, zio->io_bp); + } + + dmu_tx_commit(dsa->dsa_tx); + + dsa->dsa_done(dsa->dsa_zgd, zio->io_error); + + kmem_free(dsa, sizeof (*dsa)); +} + +static int +dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd, + zio_prop_t *zp, zbookmark_t *zb) +{ + dmu_sync_arg_t *dsa; + dmu_tx_t *tx; + + tx = dmu_tx_create(os); + dmu_tx_hold_space(tx, zgd->zgd_db->db_size); + if (dmu_tx_assign(tx, TXG_WAIT) != 0) { + dmu_tx_abort(tx); + return (EIO); /* Make zl_get_data do txg_waited_synced() */ + } - kmem_free(in, sizeof (dmu_sync_arg_t)); + dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); + dsa->dsa_dr = NULL; + dsa->dsa_done = done; + dsa->dsa_zgd = zgd; + dsa->dsa_tx = tx; + + zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp, + zgd->zgd_db->db_data, zgd->zgd_db->db_size, zp, + dmu_sync_late_arrival_ready, dmu_sync_late_arrival_done, dsa, + ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb)); + + return (0); } /* @@ -870,156 +1131,108 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) * EALREADY: this block is already in the process of being synced. * The caller should track its progress (somehow). * - * EINPROGRESS: the IO has been initiated. - * The caller should log this blkptr in the callback. + * EIO: could not do the I/O. + * The caller should do a txg_wait_synced(). * - * 0: completed. Sets *bp to the blkptr just written. - * The caller should log this blkptr immediately. + * 0: the I/O has been initiated. + * The caller should log this blkptr in the done callback. + * It is possible that the I/O will fail, in which case + * the error will be reported to the done callback and + * propagated to pio from zio_done(). 
*/ int -dmu_sync(zio_t *pio, dmu_buf_t *db_fake, - blkptr_t *bp, uint64_t txg, dmu_sync_cb_t *done, void *arg) +dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) { - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - objset_impl_t *os = db->db_objset; - dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool; - tx_state_t *tx = &dp->dp_tx; + blkptr_t *bp = zgd->zgd_bp; + dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db; + objset_t *os = db->db_objset; + dsl_dataset_t *ds = os->os_dsl_dataset; dbuf_dirty_record_t *dr; - dmu_sync_arg_t *in; + dmu_sync_arg_t *dsa; zbookmark_t zb; - writeprops_t wp = { 0 }; - zio_t *zio; - int err; + zio_prop_t zp; + ASSERT(pio != NULL); ASSERT(BP_IS_HOLE(bp)); ASSERT(txg != 0); - dprintf("dmu_sync txg=%llu, s,o,q %llu %llu %llu\n", - txg, tx->tx_synced_txg, tx->tx_open_txg, tx->tx_quiesced_txg); + SET_BOOKMARK(&zb, ds->ds_object, + db->db.db_object, db->db_level, db->db_blkid); + + dmu_write_policy(os, db->db_dnode, db->db_level, WP_DMU_SYNC, &zp); /* - * XXX - would be nice if we could do this without suspending... + * If we're frozen (running ziltest), we always need to generate a bp. */ - txg_suspend(dp); + if (txg > spa_freeze_txg(os->os_spa)) + return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb)); /* - * If this txg already synced, there's nothing to do. + * Grabbing db_mtx now provides a barrier between dbuf_sync_leaf() + * and us. If we determine that this txg is not yet syncing, + * but it begins to sync a moment later, that's OK because the + * sync thread will block in dbuf_sync_leaf() until we drop db_mtx. */ - if (txg <= tx->tx_synced_txg) { - txg_resume(dp); + mutex_enter(&db->db_mtx); + + if (txg <= spa_last_synced_txg(os->os_spa)) { /* - * If we're running ziltest, we need the blkptr regardless. + * This txg has already synced. There's nothing to do. */ - if (txg > spa_freeze_txg(dp->dp_spa)) { - /* if db_blkptr == NULL, this was an empty write */ - if (db->db_blkptr) - *bp = *db->db_blkptr; /* structure assignment */ - return (0); - } + mutex_exit(&db->db_mtx); return (EEXIST); } - mutex_enter(&db->db_mtx); - - if (txg == tx->tx_syncing_txg) { - while (db->db_data_pending) { - /* - * IO is in-progress. Wait for it to finish. - * XXX - would be nice to be able to somehow "attach" - * this zio to the parent zio passed in. - */ - cv_wait(&db->db_changed, &db->db_mtx); - if (!db->db_data_pending && - db->db_blkptr && BP_IS_HOLE(db->db_blkptr)) { - /* - * IO was compressed away - */ - *bp = *db->db_blkptr; /* structure assignment */ - mutex_exit(&db->db_mtx); - txg_resume(dp); - return (0); - } - ASSERT(db->db_data_pending || - (db->db_blkptr && db->db_blkptr->blk_birth == txg)); - } - - if (db->db_blkptr && db->db_blkptr->blk_birth == txg) { - /* - * IO is already completed. - */ - *bp = *db->db_blkptr; /* structure assignment */ - mutex_exit(&db->db_mtx); - txg_resume(dp); - return (0); - } + if (txg <= spa_syncing_txg(os->os_spa)) { + /* + * This txg is currently syncing, so we can't mess with + * the dirty record anymore; just write a new log block. + */ + mutex_exit(&db->db_mtx); + return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb)); } dr = db->db_last_dirty; - while (dr && dr->dr_txg > txg) + while (dr && dr->dr_txg != txg) dr = dr->dr_next; - if (dr == NULL || dr->dr_txg < txg) { + + if (dr == NULL) { /* - * This dbuf isn't dirty, must have been free_range'd. + * There's no dr for this dbuf, so it must have been freed. * There's no need to log writes to freed blocks, so we're done. 
*/ mutex_exit(&db->db_mtx); - txg_resume(dp); return (ENOENT); } ASSERT(dr->dr_txg == txg); - if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) { + if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC || + dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { /* - * We have already issued a sync write for this buffer. - */ - mutex_exit(&db->db_mtx); - txg_resume(dp); - return (EALREADY); - } else if (dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { - /* - * This buffer has already been synced. It could not + * We have already issued a sync write for this buffer, + * or this buffer has already been synced. It could not * have been dirtied since, or we would have cleared the state. */ - *bp = dr->dt.dl.dr_overridden_by; /* structure assignment */ mutex_exit(&db->db_mtx); - txg_resume(dp); - return (0); + return (EALREADY); } + ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC; - in = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); - in->dr = dr; - in->done = done; - in->arg = arg; mutex_exit(&db->db_mtx); - txg_resume(dp); - - zb.zb_objset = os->os_dsl_dataset->ds_object; - zb.zb_object = db->db.db_object; - zb.zb_level = db->db_level; - zb.zb_blkid = db->db_blkid; - wp.wp_type = db->db_dnode->dn_type; - wp.wp_level = db->db_level; - wp.wp_copies = os->os_copies; - wp.wp_dnchecksum = db->db_dnode->dn_checksum; - wp.wp_oschecksum = os->os_checksum; - wp.wp_dncompress = db->db_dnode->dn_compress; - wp.wp_oscompress = os->os_compress; + dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); + dsa->dsa_dr = dr; + dsa->dsa_done = done; + dsa->dsa_zgd = zgd; + dsa->dsa_tx = NULL; - ASSERT(BP_IS_HOLE(bp)); + zio_nowait(arc_write(pio, os->os_spa, txg, + bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db), &zp, + dmu_sync_ready, dmu_sync_done, dsa, + ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb)); - zio = arc_write(pio, os->os_spa, &wp, DBUF_IS_L2CACHEABLE(db), - txg, bp, dr->dt.dl.dr_data, dmu_sync_ready, dmu_sync_done, in, - ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); - if (pio) { - zio_nowait(zio); - err = EINPROGRESS; - } else { - err = zio_wait(zio); - ASSERT(err == 0); - } - return (err); + return (0); } int @@ -1029,7 +1242,7 @@ dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs, dnode_t *dn; int err; - err = dnode_hold(os->os, object, FTAG, &dn); + err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dnode_set_blksz(dn, size, ibs, tx); @@ -1044,7 +1257,7 @@ dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, dnode_t *dn; /* XXX assumes dnode_hold will not get an i/o error */ - (void) dnode_hold(os->os, object, FTAG, &dn); + (void) dnode_hold(os, object, FTAG, &dn); ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS); dn->dn_checksum = checksum; dnode_setdirty(dn, tx); @@ -1058,20 +1271,98 @@ dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, dnode_t *dn; /* XXX assumes dnode_hold will not get an i/o error */ - (void) dnode_hold(os->os, object, FTAG, &dn); + (void) dnode_hold(os, object, FTAG, &dn); ASSERT(compress < ZIO_COMPRESS_FUNCTIONS); dn->dn_compress = compress; dnode_setdirty(dn, tx); dnode_rele(dn, FTAG); } +int zfs_mdcomp_disable = 0; + +void +dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) +{ + dmu_object_type_t type = dn ? 
dn->dn_type : DMU_OT_OBJSET; + boolean_t ismd = (level > 0 || dmu_ot[type].ot_metadata); + enum zio_checksum checksum = os->os_checksum; + enum zio_compress compress = os->os_compress; + enum zio_checksum dedup_checksum = os->os_dedup_checksum; + boolean_t dedup; + boolean_t dedup_verify = os->os_dedup_verify; + int copies = os->os_copies; + + /* + * Determine checksum setting. + */ + if (ismd) { + /* + * Metadata always gets checksummed. If the data + * checksum is multi-bit correctable, and it's not a + * ZBT-style checksum, then it's suitable for metadata + * as well. Otherwise, the metadata checksum defaults + * to fletcher4. + */ + if (zio_checksum_table[checksum].ci_correctable < 1 || + zio_checksum_table[checksum].ci_eck) + checksum = ZIO_CHECKSUM_FLETCHER_4; + } else { + checksum = zio_checksum_select(dn->dn_checksum, checksum); + } + + /* + * Determine compression setting. + */ + if (ismd) { + /* + * XXX -- we should design a compression algorithm + * that specializes in arrays of bps. + */ + compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY : + ZIO_COMPRESS_LZJB; + } else { + compress = zio_compress_select(dn->dn_compress, compress); + } + + /* + * Determine dedup setting. If we are in dmu_sync(), we won't + * actually dedup now because that's all done in syncing context; + * but we do want to use the dedup checksum. If the checksum is not + * strong enough to ensure unique signatures, force dedup_verify. + */ + dedup = (!ismd && dedup_checksum != ZIO_CHECKSUM_OFF); + if (dedup) { + checksum = dedup_checksum; + if (!zio_checksum_table[checksum].ci_dedup) + dedup_verify = 1; + } + + if (wp & WP_DMU_SYNC) + dedup = 0; + + if (wp & WP_NOFILL) { + ASSERT(!ismd && level == 0); + checksum = ZIO_CHECKSUM_OFF; + compress = ZIO_COMPRESS_OFF; + dedup = B_FALSE; + } + + zp->zp_checksum = checksum; + zp->zp_compress = compress; + zp->zp_type = type; + zp->zp_level = level; + zp->zp_copies = MIN(copies + ismd, spa_max_replication(os->os_spa)); + zp->zp_dedup = dedup; + zp->zp_dedup_verify = dedup && dedup_verify; +} + int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) { dnode_t *dn; int i, err; - err = dnode_hold(os->os, object, FTAG, &dn); + err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); /* @@ -1085,7 +1376,7 @@ dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) if (i != TXG_SIZE) { dnode_rele(dn, FTAG); txg_wait_synced(dmu_objset_pool(os), 0); - err = dnode_hold(os->os, object, FTAG, &dn); + err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); } @@ -1099,21 +1390,27 @@ dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) void dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) { + dnode_phys_t *dnp; + rw_enter(&dn->dn_struct_rwlock, RW_READER); mutex_enter(&dn->dn_mtx); + dnp = dn->dn_phys; + doi->doi_data_block_size = dn->dn_datablksz; doi->doi_metadata_block_size = dn->dn_indblkshift ? 
1ULL << dn->dn_indblkshift : 0; + doi->doi_type = dn->dn_type; + doi->doi_bonus_type = dn->dn_bonustype; + doi->doi_bonus_size = dn->dn_bonuslen; doi->doi_indirection = dn->dn_nlevels; doi->doi_checksum = dn->dn_checksum; doi->doi_compress = dn->dn_compress; - doi->doi_physical_blks = (DN_USED_BYTES(dn->dn_phys) + - SPA_MINBLOCKSIZE/2) >> SPA_MINBLOCKSHIFT; - doi->doi_max_block_offset = dn->dn_phys->dn_maxblkid; - doi->doi_type = dn->dn_type; - doi->doi_bonus_size = dn->dn_bonuslen; - doi->doi_bonus_type = dn->dn_bonustype; + doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9; + doi->doi_max_offset = (dnp->dn_maxblkid + 1) * dn->dn_datablksz; + doi->doi_fill_count = 0; + for (int i = 0; i < dnp->dn_nblkptr; i++) + doi->doi_fill_count += dnp->dn_blkptr[i].blk_fill; mutex_exit(&dn->dn_mtx); rw_exit(&dn->dn_struct_rwlock); @@ -1127,7 +1424,7 @@ int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi) { dnode_t *dn; - int err = dnode_hold(os->os, object, FTAG, &dn); + int err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); @@ -1213,15 +1510,19 @@ dmu_init(void) { dbuf_init(); dnode_init(); + zfetch_init(); arc_init(); l2arc_init(); + xuio_stat_init(); } void dmu_fini(void) { arc_fini(); + zfetch_fini(); dnode_fini(); dbuf_fini(); l2arc_fini(); + xuio_stat_fini(); } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_object.c b/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_object.c index 1b9247d66e65e..06c0ee490b016 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_object.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_object.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -34,16 +32,15 @@ uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { - objset_impl_t *osi = os->os; uint64_t object; uint64_t L2_dnode_count = DNODES_PER_BLOCK << - (osi->os_meta_dnode->dn_indblkshift - SPA_BLKPTRSHIFT); + (os->os_meta_dnode->dn_indblkshift - SPA_BLKPTRSHIFT); dnode_t *dn = NULL; int restarted = B_FALSE; - mutex_enter(&osi->os_obj_lock); + mutex_enter(&os->os_obj_lock); for (;;) { - object = osi->os_obj_next; + object = os->os_obj_next; /* * Each time we polish off an L2 bp worth of dnodes * (2^13 objects), move to another L2 bp that's still @@ -53,14 +50,14 @@ dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, */ if (P2PHASE(object, L2_dnode_count) == 0) { uint64_t offset = restarted ? object << DNODE_SHIFT : 0; - int error = dnode_next_offset(osi->os_meta_dnode, + int error = dnode_next_offset(os->os_meta_dnode, DNODE_FIND_HOLE, &offset, 2, DNODES_PER_BLOCK >> 2, 0); restarted = B_TRUE; if (error == 0) object = offset >> DNODE_SHIFT; } - osi->os_obj_next = ++object; + os->os_obj_next = ++object; /* * XXX We should check for an i/o error here and return @@ -68,19 +65,19 @@ dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, * dmu_tx_assign(), but there is currently no mechanism * to do so. 
*/ - (void) dnode_hold_impl(os->os, object, DNODE_MUST_BE_FREE, + (void) dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, FTAG, &dn); if (dn) break; if (dmu_object_next(os, &object, B_TRUE, 0) == 0) - osi->os_obj_next = object - 1; + os->os_obj_next = object - 1; } dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx); dnode_rele(dn, FTAG); - mutex_exit(&osi->os_obj_lock); + mutex_exit(&os->os_obj_lock); dmu_tx_add_new_object(tx, os, object); return (object); @@ -96,7 +93,7 @@ dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx)) return (EBADF); - err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_FREE, FTAG, &dn); + err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, FTAG, &dn); if (err) return (err); dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx); @@ -108,22 +105,56 @@ dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot, - int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) + int blocksize, dmu_object_type_t bonustype, int bonuslen) { dnode_t *dn; + dmu_tx_t *tx; + int nblkptr; int err; - if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx)) + if (object == DMU_META_DNODE_OBJECT) return (EBADF); - err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED, + err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, FTAG, &dn); if (err) return (err); + + if (dn->dn_type == ot && dn->dn_datablksz == blocksize && + dn->dn_bonustype == bonustype && dn->dn_bonuslen == bonuslen) { + /* nothing is changing, this is a noop */ + dnode_rele(dn, FTAG); + return (0); + } + + nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT); + + /* + * If we are losing blkptrs or changing the block size this must + * be a new file instance. We must clear out the previous file + * contents before we can change this type of metadata in the dnode. + */ + if (dn->dn_nblkptr > nblkptr || dn->dn_datablksz != blocksize) { + err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END); + if (err) + goto out; + } + + tx = dmu_tx_create(os); + dmu_tx_hold_bonus(tx, object); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) { + dmu_tx_abort(tx); + goto out; + } + dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, tx); + + dmu_tx_commit(tx); +out: dnode_rele(dn, FTAG); - return (0); + return (err); } int @@ -134,7 +165,7 @@ dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx) ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); - err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED, + err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, FTAG, &dn); if (err) return (err); @@ -153,7 +184,7 @@ dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg) uint64_t offset = (*objectp + 1) << DNODE_SHIFT; int error; - error = dnode_next_offset(os->os->os_meta_dnode, + error = dnode_next_offset(os->os_meta_dnode, (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg); *objectp = offset >> DNODE_SHIFT; diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_objset.c b/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_objset.c index f37cc2fc56cb2..ac29deb6c1af2 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_objset.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_objset.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. 
All rights reserved. * Use is subject to license terms. */ @@ -36,22 +36,22 @@ #include #include #include -#include #include #include #include #include +#include spa_t * dmu_objset_spa(objset_t *os) { - return (os->os->os_spa); + return (os->os_spa); } zilog_t * dmu_objset_zil(objset_t *os) { - return (os->os->os_zil); + return (os->os_zil); } dsl_pool_t * @@ -59,82 +59,106 @@ dmu_objset_pool(objset_t *os) { dsl_dataset_t *ds; - if ((ds = os->os->os_dsl_dataset) != NULL && ds->ds_dir) + if ((ds = os->os_dsl_dataset) != NULL && ds->ds_dir) return (ds->ds_dir->dd_pool); else - return (spa_get_dsl(os->os->os_spa)); + return (spa_get_dsl(os->os_spa)); } dsl_dataset_t * dmu_objset_ds(objset_t *os) { - return (os->os->os_dsl_dataset); + return (os->os_dsl_dataset); } dmu_objset_type_t dmu_objset_type(objset_t *os) { - return (os->os->os_phys->os_type); + return (os->os_phys->os_type); } void dmu_objset_name(objset_t *os, char *buf) { - dsl_dataset_name(os->os->os_dsl_dataset, buf); + dsl_dataset_name(os->os_dsl_dataset, buf); } uint64_t dmu_objset_id(objset_t *os) { - dsl_dataset_t *ds = os->os->os_dsl_dataset; + dsl_dataset_t *ds = os->os_dsl_dataset; return (ds ? ds->ds_object : 0); } +uint64_t +dmu_objset_logbias(objset_t *os) +{ + return (os->os_logbias); +} + static void checksum_changed_cb(void *arg, uint64_t newval) { - objset_impl_t *osi = arg; + objset_t *os = arg; /* * Inheritance should have been done by now. */ ASSERT(newval != ZIO_CHECKSUM_INHERIT); - osi->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE); + os->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE); } static void compression_changed_cb(void *arg, uint64_t newval) { - objset_impl_t *osi = arg; + objset_t *os = arg; /* * Inheritance and range checking should have been done by now. */ ASSERT(newval != ZIO_COMPRESS_INHERIT); - osi->os_compress = zio_compress_select(newval, ZIO_COMPRESS_ON_VALUE); + os->os_compress = zio_compress_select(newval, ZIO_COMPRESS_ON_VALUE); } static void copies_changed_cb(void *arg, uint64_t newval) { - objset_impl_t *osi = arg; + objset_t *os = arg; /* * Inheritance and range checking should have been done by now. */ ASSERT(newval > 0); - ASSERT(newval <= spa_max_replication(osi->os_spa)); + ASSERT(newval <= spa_max_replication(os->os_spa)); + + os->os_copies = newval; +} + +static void +dedup_changed_cb(void *arg, uint64_t newval) +{ + objset_t *os = arg; + spa_t *spa = os->os_spa; + enum zio_checksum checksum; + + /* + * Inheritance should have been done by now. + */ + ASSERT(newval != ZIO_CHECKSUM_INHERIT); + + checksum = zio_checksum_dedup_select(spa, newval, ZIO_CHECKSUM_OFF); - osi->os_copies = newval; + os->os_dedup_checksum = checksum & ZIO_CHECKSUM_MASK; + os->os_dedup_verify = !!(checksum & ZIO_CHECKSUM_VERIFY); } static void primary_cache_changed_cb(void *arg, uint64_t newval) { - objset_impl_t *osi = arg; + objset_t *os = arg; /* * Inheritance and range checking should have been done by now. @@ -142,13 +166,13 @@ primary_cache_changed_cb(void *arg, uint64_t newval) ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE || newval == ZFS_CACHE_METADATA); - osi->os_primary_cache = newval; + os->os_primary_cache = newval; } static void secondary_cache_changed_cb(void *arg, uint64_t newval) { - objset_impl_t *osi = arg; + objset_t *os = arg; /* * Inheritance and range checking should have been done by now. 
@@ -156,7 +180,19 @@ secondary_cache_changed_cb(void *arg, uint64_t newval) ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE || newval == ZFS_CACHE_METADATA); - osi->os_secondary_cache = newval; + os->os_secondary_cache = newval; +} + +static void +logbias_changed_cb(void *arg, uint64_t newval) +{ + objset_t *os = arg; + + ASSERT(newval == ZFS_LOGBIAS_LATENCY || + newval == ZFS_LOGBIAS_THROUGHPUT); + os->os_logbias = newval; + if (os->os_zil) + zil_set_logbias(os->os_zil, newval); } void @@ -164,58 +200,79 @@ dmu_objset_byteswap(void *buf, size_t size) { objset_phys_t *osp = buf; - ASSERT(size == sizeof (objset_phys_t)); + ASSERT(size == OBJSET_OLD_PHYS_SIZE || size == sizeof (objset_phys_t)); dnode_byteswap(&osp->os_meta_dnode); byteswap_uint64_array(&osp->os_zil_header, sizeof (zil_header_t)); osp->os_type = BSWAP_64(osp->os_type); + osp->os_flags = BSWAP_64(osp->os_flags); + if (size == sizeof (objset_phys_t)) { + dnode_byteswap(&osp->os_userused_dnode); + dnode_byteswap(&osp->os_groupused_dnode); + } } int dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, - objset_impl_t **osip) + objset_t **osp) { - objset_impl_t *osi; + objset_t *os; int i, err; ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock)); - osi = kmem_zalloc(sizeof (objset_impl_t), KM_SLEEP); - osi->os.os = osi; - osi->os_dsl_dataset = ds; - osi->os_spa = spa; - osi->os_rootbp = bp; - if (!BP_IS_HOLE(osi->os_rootbp)) { + os = kmem_zalloc(sizeof (objset_t), KM_SLEEP); + os->os_dsl_dataset = ds; + os->os_spa = spa; + os->os_rootbp = bp; + if (!BP_IS_HOLE(os->os_rootbp)) { uint32_t aflags = ARC_WAIT; zbookmark_t zb; - zb.zb_objset = ds ? ds->ds_object : 0; - zb.zb_object = 0; - zb.zb_level = -1; - zb.zb_blkid = 0; - if (DMU_OS_IS_L2CACHEABLE(osi)) + SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, + ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); + + if (DMU_OS_IS_L2CACHEABLE(os)) aflags |= ARC_L2CACHE; - dprintf_bp(osi->os_rootbp, "reading %s", ""); + dprintf_bp(os->os_rootbp, "reading %s", ""); /* * NB: when bprewrite scrub can change the bp, * and this is called from dmu_objset_open_ds_os, the bp * could change, and we'll need a lock. */ - err = arc_read_nolock(NULL, spa, osi->os_rootbp, - arc_getbuf_func, &osi->os_phys_buf, + err = arc_read_nolock(NULL, spa, os->os_rootbp, + arc_getbuf_func, &os->os_phys_buf, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb); if (err) { - kmem_free(osi, sizeof (objset_impl_t)); + kmem_free(os, sizeof (objset_t)); /* convert checksum errors into IO errors */ if (err == ECKSUM) err = EIO; return (err); } - osi->os_phys = osi->os_phys_buf->b_data; + + /* Increase the blocksize if we are permitted. */ + if (spa_version(spa) >= SPA_VERSION_USERSPACE && + arc_buf_size(os->os_phys_buf) < sizeof (objset_phys_t)) { + arc_buf_t *buf = arc_buf_alloc(spa, + sizeof (objset_phys_t), &os->os_phys_buf, + ARC_BUFC_METADATA); + bzero(buf->b_data, sizeof (objset_phys_t)); + bcopy(os->os_phys_buf->b_data, buf->b_data, + arc_buf_size(os->os_phys_buf)); + (void) arc_buf_remove_ref(os->os_phys_buf, + &os->os_phys_buf); + os->os_phys_buf = buf; + } + + os->os_phys = os->os_phys_buf->b_data; + os->os_flags = os->os_phys->os_flags; } else { - osi->os_phys_buf = arc_buf_alloc(spa, sizeof (objset_phys_t), - &osi->os_phys_buf, ARC_BUFC_METADATA); - osi->os_phys = osi->os_phys_buf->b_data; - bzero(osi->os_phys, sizeof (objset_phys_t)); + int size = spa_version(spa) >= SPA_VERSION_USERSPACE ? 
+ sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE; + os->os_phys_buf = arc_buf_alloc(spa, size, + &os->os_phys_buf, ARC_BUFC_METADATA); + os->os_phys = os->os_phys_buf->b_data; + bzero(os->os_phys, size); } /* @@ -226,173 +283,167 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, */ if (ds) { err = dsl_prop_register(ds, "primarycache", - primary_cache_changed_cb, osi); + primary_cache_changed_cb, os); if (err == 0) err = dsl_prop_register(ds, "secondarycache", - secondary_cache_changed_cb, osi); + secondary_cache_changed_cb, os); if (!dsl_dataset_is_snapshot(ds)) { if (err == 0) err = dsl_prop_register(ds, "checksum", - checksum_changed_cb, osi); + checksum_changed_cb, os); if (err == 0) err = dsl_prop_register(ds, "compression", - compression_changed_cb, osi); + compression_changed_cb, os); if (err == 0) err = dsl_prop_register(ds, "copies", - copies_changed_cb, osi); + copies_changed_cb, os); + if (err == 0) + err = dsl_prop_register(ds, "dedup", + dedup_changed_cb, os); + if (err == 0) + err = dsl_prop_register(ds, "logbias", + logbias_changed_cb, os); } if (err) { - VERIFY(arc_buf_remove_ref(osi->os_phys_buf, - &osi->os_phys_buf) == 1); - kmem_free(osi, sizeof (objset_impl_t)); + VERIFY(arc_buf_remove_ref(os->os_phys_buf, + &os->os_phys_buf) == 1); + kmem_free(os, sizeof (objset_t)); return (err); } } else if (ds == NULL) { /* It's the meta-objset. */ - osi->os_checksum = ZIO_CHECKSUM_FLETCHER_4; - osi->os_compress = ZIO_COMPRESS_LZJB; - osi->os_copies = spa_max_replication(spa); - osi->os_primary_cache = ZFS_CACHE_ALL; - osi->os_secondary_cache = ZFS_CACHE_ALL; + os->os_checksum = ZIO_CHECKSUM_FLETCHER_4; + os->os_compress = ZIO_COMPRESS_LZJB; + os->os_copies = spa_max_replication(spa); + os->os_dedup_checksum = ZIO_CHECKSUM_OFF; + os->os_dedup_verify = 0; + os->os_logbias = 0; + os->os_primary_cache = ZFS_CACHE_ALL; + os->os_secondary_cache = ZFS_CACHE_ALL; } - osi->os_zil_header = osi->os_phys->os_zil_header; - osi->os_zil = zil_alloc(&osi->os, &osi->os_zil_header); + os->os_zil_header = os->os_phys->os_zil_header; + os->os_zil = zil_alloc(os, &os->os_zil_header); for (i = 0; i < TXG_SIZE; i++) { - list_create(&osi->os_dirty_dnodes[i], sizeof (dnode_t), + list_create(&os->os_dirty_dnodes[i], sizeof (dnode_t), offsetof(dnode_t, dn_dirty_link[i])); - list_create(&osi->os_free_dnodes[i], sizeof (dnode_t), + list_create(&os->os_free_dnodes[i], sizeof (dnode_t), offsetof(dnode_t, dn_dirty_link[i])); } - list_create(&osi->os_dnodes, sizeof (dnode_t), + list_create(&os->os_dnodes, sizeof (dnode_t), offsetof(dnode_t, dn_link)); - list_create(&osi->os_downgraded_dbufs, sizeof (dmu_buf_impl_t), + list_create(&os->os_downgraded_dbufs, sizeof (dmu_buf_impl_t), offsetof(dmu_buf_impl_t, db_link)); - mutex_init(&osi->os_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&osi->os_obj_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&osi->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL); - - osi->os_meta_dnode = dnode_special_open(osi, - &osi->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT); + mutex_init(&os->os_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL); + + os->os_meta_dnode = dnode_special_open(os, + &os->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT); + if (arc_buf_size(os->os_phys_buf) >= sizeof (objset_phys_t)) { + os->os_userused_dnode = dnode_special_open(os, + &os->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT); + os->os_groupused_dnode = dnode_special_open(os, + 
&os->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT); + } /* * We should be the only thread trying to do this because we * have ds_opening_lock */ if (ds) { - VERIFY(NULL == dsl_dataset_set_user_ptr(ds, osi, - dmu_objset_evict)); + mutex_enter(&ds->ds_lock); + ASSERT(ds->ds_objset == NULL); + ds->ds_objset = os; + mutex_exit(&ds->ds_lock); } - *osip = osi; + *osp = os; return (0); } -static int -dmu_objset_open_ds_os(dsl_dataset_t *ds, objset_t *os, dmu_objset_type_t type) +int +dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp) { - objset_impl_t *osi; + int err = 0; mutex_enter(&ds->ds_opening_lock); - osi = dsl_dataset_get_user_ptr(ds); - if (osi == NULL) { - int err; - + *osp = ds->ds_objset; + if (*osp == NULL) { err = dmu_objset_open_impl(dsl_dataset_get_spa(ds), - ds, &ds->ds_phys->ds_bp, &osi); - if (err) { - mutex_exit(&ds->ds_opening_lock); - return (err); - } + ds, &ds->ds_phys->ds_bp, osp); } mutex_exit(&ds->ds_opening_lock); - - os->os = osi; - os->os_mode = DS_MODE_NOHOLD; - - if (type != DMU_OST_ANY && type != os->os->os_phys->os_type) - return (EINVAL); - return (0); + return (err); } +/* called from zpl */ int -dmu_objset_open_ds(dsl_dataset_t *ds, dmu_objset_type_t type, objset_t **osp) +dmu_objset_hold(const char *name, void *tag, objset_t **osp) { - objset_t *os; + dsl_dataset_t *ds; int err; - os = kmem_alloc(sizeof (objset_t), KM_SLEEP); - err = dmu_objset_open_ds_os(ds, os, type); + err = dsl_dataset_hold(name, tag, &ds); if (err) - kmem_free(os, sizeof (objset_t)); - else - *osp = os; + return (err); + + err = dmu_objset_from_ds(ds, osp); + if (err) + dsl_dataset_rele(ds, tag); + return (err); } /* called from zpl */ int -dmu_objset_open(const char *name, dmu_objset_type_t type, int mode, - objset_t **osp) +dmu_objset_own(const char *name, dmu_objset_type_t type, + boolean_t readonly, void *tag, objset_t **osp) { - objset_t *os; dsl_dataset_t *ds; int err; - ASSERT(DS_MODE_TYPE(mode) == DS_MODE_USER || - DS_MODE_TYPE(mode) == DS_MODE_OWNER); - - os = kmem_alloc(sizeof (objset_t), KM_SLEEP); - if (DS_MODE_TYPE(mode) == DS_MODE_USER) - err = dsl_dataset_hold(name, os, &ds); - else - err = dsl_dataset_own(name, mode, os, &ds); - if (err) { - kmem_free(os, sizeof (objset_t)); + err = dsl_dataset_own(name, B_FALSE, tag, &ds); + if (err) return (err); - } - err = dmu_objset_open_ds_os(ds, os, type); + err = dmu_objset_from_ds(ds, osp); if (err) { - if (DS_MODE_TYPE(mode) == DS_MODE_USER) - dsl_dataset_rele(ds, os); - else - dsl_dataset_disown(ds, os); - kmem_free(os, sizeof (objset_t)); - } else { - os->os_mode = mode; - *osp = os; + dsl_dataset_disown(ds, tag); + } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) { + dmu_objset_disown(*osp, tag); + return (EINVAL); + } else if (!readonly && dsl_dataset_is_snapshot(ds)) { + dmu_objset_disown(*osp, tag); + return (EROFS); } return (err); } void -dmu_objset_close(objset_t *os) +dmu_objset_rele(objset_t *os, void *tag) { - ASSERT(DS_MODE_TYPE(os->os_mode) == DS_MODE_USER || - DS_MODE_TYPE(os->os_mode) == DS_MODE_OWNER || - DS_MODE_TYPE(os->os_mode) == DS_MODE_NOHOLD); - - if (DS_MODE_TYPE(os->os_mode) == DS_MODE_USER) - dsl_dataset_rele(os->os->os_dsl_dataset, os); - else if (DS_MODE_TYPE(os->os_mode) == DS_MODE_OWNER) - dsl_dataset_disown(os->os->os_dsl_dataset, os); - kmem_free(os, sizeof (objset_t)); + dsl_dataset_rele(os->os_dsl_dataset, tag); +} + +void +dmu_objset_disown(objset_t *os, void *tag) +{ + dsl_dataset_disown(os->os_dsl_dataset, tag); } int dmu_objset_evict_dbufs(objset_t *os) { - 
objset_impl_t *osi = os->os; dnode_t *dn; - mutex_enter(&osi->os_lock); + mutex_enter(&os->os_lock); /* process the mdn last, since the other dnodes have holds on it */ - list_remove(&osi->os_dnodes, osi->os_meta_dnode); - list_insert_tail(&osi->os_dnodes, osi->os_meta_dnode); + list_remove(&os->os_dnodes, os->os_meta_dnode); + list_insert_tail(&os->os_dnodes, os->os_meta_dnode); /* * Find the first dnode with holds. We have to do this dance @@ -400,91 +451,98 @@ dmu_objset_evict_dbufs(objset_t *os) * hold. If there are no holds then it has no dbufs so OK to * skip. */ - for (dn = list_head(&osi->os_dnodes); + for (dn = list_head(&os->os_dnodes); dn && !dnode_add_ref(dn, FTAG); - dn = list_next(&osi->os_dnodes, dn)) + dn = list_next(&os->os_dnodes, dn)) continue; while (dn) { dnode_t *next_dn = dn; do { - next_dn = list_next(&osi->os_dnodes, next_dn); + next_dn = list_next(&os->os_dnodes, next_dn); } while (next_dn && !dnode_add_ref(next_dn, FTAG)); - mutex_exit(&osi->os_lock); + mutex_exit(&os->os_lock); dnode_evict_dbufs(dn); dnode_rele(dn, FTAG); - mutex_enter(&osi->os_lock); + mutex_enter(&os->os_lock); dn = next_dn; } - mutex_exit(&osi->os_lock); - return (list_head(&osi->os_dnodes) != osi->os_meta_dnode); + mutex_exit(&os->os_lock); + return (list_head(&os->os_dnodes) != os->os_meta_dnode); } void -dmu_objset_evict(dsl_dataset_t *ds, void *arg) +dmu_objset_evict(objset_t *os) { - objset_impl_t *osi = arg; - objset_t os; - int i; + dsl_dataset_t *ds = os->os_dsl_dataset; - for (i = 0; i < TXG_SIZE; i++) { - ASSERT(list_head(&osi->os_dirty_dnodes[i]) == NULL); - ASSERT(list_head(&osi->os_free_dnodes[i]) == NULL); - } + for (int t = 0; t < TXG_SIZE; t++) + ASSERT(!dmu_objset_is_dirty(os, t)); if (ds) { if (!dsl_dataset_is_snapshot(ds)) { VERIFY(0 == dsl_prop_unregister(ds, "checksum", - checksum_changed_cb, osi)); + checksum_changed_cb, os)); VERIFY(0 == dsl_prop_unregister(ds, "compression", - compression_changed_cb, osi)); + compression_changed_cb, os)); VERIFY(0 == dsl_prop_unregister(ds, "copies", - copies_changed_cb, osi)); + copies_changed_cb, os)); + VERIFY(0 == dsl_prop_unregister(ds, "dedup", + dedup_changed_cb, os)); + VERIFY(0 == dsl_prop_unregister(ds, "logbias", + logbias_changed_cb, os)); } VERIFY(0 == dsl_prop_unregister(ds, "primarycache", - primary_cache_changed_cb, osi)); + primary_cache_changed_cb, os)); VERIFY(0 == dsl_prop_unregister(ds, "secondarycache", - secondary_cache_changed_cb, osi)); + secondary_cache_changed_cb, os)); } /* * We should need only a single pass over the dnode list, since * nothing can be added to the list at this point. 
*/ - os.os = osi; - (void) dmu_objset_evict_dbufs(&os); + (void) dmu_objset_evict_dbufs(os); - ASSERT3P(list_head(&osi->os_dnodes), ==, osi->os_meta_dnode); - ASSERT3P(list_tail(&osi->os_dnodes), ==, osi->os_meta_dnode); - ASSERT3P(list_head(&osi->os_meta_dnode->dn_dbufs), ==, NULL); + dnode_special_close(os->os_meta_dnode); + if (os->os_userused_dnode) { + dnode_special_close(os->os_userused_dnode); + dnode_special_close(os->os_groupused_dnode); + } + zil_free(os->os_zil); - dnode_special_close(osi->os_meta_dnode); - zil_free(osi->os_zil); + ASSERT3P(list_head(&os->os_dnodes), ==, NULL); - VERIFY(arc_buf_remove_ref(osi->os_phys_buf, &osi->os_phys_buf) == 1); - mutex_destroy(&osi->os_lock); - mutex_destroy(&osi->os_obj_lock); - mutex_destroy(&osi->os_user_ptr_lock); - kmem_free(osi, sizeof (objset_impl_t)); + VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf) == 1); + mutex_destroy(&os->os_lock); + mutex_destroy(&os->os_obj_lock); + mutex_destroy(&os->os_user_ptr_lock); + kmem_free(os, sizeof (objset_t)); +} + +timestruc_t +dmu_objset_snap_cmtime(objset_t *os) +{ + return (dsl_dir_snap_cmtime(os->os_dsl_dataset->ds_dir)); } /* called from dsl for meta-objset */ -objset_impl_t * +objset_t * dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, dmu_objset_type_t type, dmu_tx_t *tx) { - objset_impl_t *osi; + objset_t *os; dnode_t *mdn; ASSERT(dmu_tx_is_syncing(tx)); if (ds) mutex_enter(&ds->ds_opening_lock); - VERIFY(0 == dmu_objset_open_impl(spa, ds, bp, &osi)); + VERIFY(0 == dmu_objset_open_impl(spa, ds, bp, &os)); if (ds) mutex_exit(&ds->ds_opening_lock); - mdn = osi->os_meta_dnode; + mdn = os->os_meta_dnode; dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT, DN_MAX_INDBLKSHIFT, DMU_OT_NONE, 0, tx); @@ -519,17 +577,21 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, ASSERT(type != DMU_OST_NONE); ASSERT(type != DMU_OST_ANY); ASSERT(type < DMU_OST_NUMTYPES); - osi->os_phys->os_type = type; + os->os_phys->os_type = type; + if (dmu_objset_userused_enabled(os)) { + os->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE; + os->os_flags = os->os_phys->os_flags; + } dsl_dataset_dirty(ds, tx); - return (osi); + return (os); } struct oscarg { void (*userfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); void *userarg; - dsl_dataset_t *clone_parent; + dsl_dataset_t *clone_origin; const char *lastname; dmu_objset_type_t type; uint64_t flags; @@ -550,17 +612,13 @@ dmu_objset_create_check(void *arg1, void *arg2, dmu_tx_t *tx) if (err != ENOENT) return (err ? err : EEXIST); - if (oa->clone_parent != NULL) { - /* - * You can't clone across pools. - */ - if (oa->clone_parent->ds_dir->dd_pool != dd->dd_pool) + if (oa->clone_origin != NULL) { + /* You can't clone across pools. */ + if (oa->clone_origin->ds_dir->dd_pool != dd->dd_pool) return (EXDEV); - /* - * You can only clone snapshots, not the head datasets. - */ - if (oa->clone_parent->ds_phys->ds_num_children == 0) + /* You can only clone snapshots, not the head datasets. 
*/ + if (!dsl_dataset_is_snapshot(oa->clone_origin)) return (EINVAL); } @@ -572,37 +630,37 @@ dmu_objset_create_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) { dsl_dir_t *dd = arg1; struct oscarg *oa = arg2; - dsl_dataset_t *ds; - blkptr_t *bp; uint64_t dsobj; ASSERT(dmu_tx_is_syncing(tx)); dsobj = dsl_dataset_create_sync(dd, oa->lastname, - oa->clone_parent, oa->flags, cr, tx); + oa->clone_origin, oa->flags, cr, tx); - VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool, dsobj, FTAG, &ds)); - bp = dsl_dataset_get_blkptr(ds); - if (BP_IS_HOLE(bp)) { - objset_impl_t *osi; + if (oa->clone_origin == NULL) { + dsl_dataset_t *ds; + blkptr_t *bp; + objset_t *os; - /* This is an empty dmu_objset; not a clone. */ - osi = dmu_objset_create_impl(dsl_dataset_get_spa(ds), + VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool, dsobj, + FTAG, &ds)); + bp = dsl_dataset_get_blkptr(ds); + ASSERT(BP_IS_HOLE(bp)); + + os = dmu_objset_create_impl(dsl_dataset_get_spa(ds), ds, bp, oa->type, tx); if (oa->userfunc) - oa->userfunc(&osi->os, oa->userarg, cr, tx); + oa->userfunc(os, oa->userarg, cr, tx); + dsl_dataset_rele(ds, FTAG); } spa_history_internal_log(LOG_DS_CREATE, dd->dd_pool->dp_spa, tx, cr, "dataset = %llu", dsobj); - - dsl_dataset_rele(ds, FTAG); } int -dmu_objset_create(const char *name, dmu_objset_type_t type, - objset_t *clone_parent, uint64_t flags, +dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg) { dsl_dir_t *pdd; @@ -619,24 +677,12 @@ dmu_objset_create(const char *name, dmu_objset_type_t type, return (EEXIST); } - dprintf("name=%s\n", name); - oa.userfunc = func; oa.userarg = arg; oa.lastname = tail; oa.type = type; oa.flags = flags; - if (clone_parent != NULL) { - /* - * You can't clone to a different type. - */ - if (clone_parent->os->os_phys->os_type != type) { - dsl_dir_close(pdd, FTAG); - return (EINVAL); - } - oa.clone_parent = clone_parent->os->os_dsl_dataset; - } err = dsl_sync_task_do(pdd->dd_pool, dmu_objset_create_check, dmu_objset_create_sync, pdd, &oa, 5); dsl_dir_close(pdd, FTAG); @@ -644,100 +690,135 @@ dmu_objset_create(const char *name, dmu_objset_type_t type, } int -dmu_objset_destroy(const char *name) +dmu_objset_clone(const char *name, dsl_dataset_t *clone_origin, uint64_t flags) { - objset_t *os; - int error; - - /* - * If it looks like we'll be able to destroy it, and there's - * an unplayed replay log sitting around, destroy the log. - * It would be nicer to do this in dsl_dataset_destroy_sync(), - * but the replay log objset is modified in open context. - */ - error = dmu_objset_open(name, DMU_OST_ANY, - DS_MODE_OWNER|DS_MODE_READONLY|DS_MODE_INCONSISTENT, &os); - if (error == 0) { - dsl_dataset_t *ds = os->os->os_dsl_dataset; - zil_destroy(dmu_objset_zil(os), B_FALSE); + dsl_dir_t *pdd; + const char *tail; + int err = 0; + struct oscarg oa = { 0 }; - error = dsl_dataset_destroy(ds, os); - /* - * dsl_dataset_destroy() closes the ds. - */ - kmem_free(os, sizeof (objset_t)); + ASSERT(strchr(name, '@') == NULL); + err = dsl_dir_open(name, FTAG, &pdd, &tail); + if (err) + return (err); + if (tail == NULL) { + dsl_dir_close(pdd, FTAG); + return (EEXIST); } - return (error); + oa.lastname = tail; + oa.clone_origin = clone_origin; + oa.flags = flags; + + err = dsl_sync_task_do(pdd->dd_pool, dmu_objset_create_check, + dmu_objset_create_sync, pdd, &oa, 5); + dsl_dir_close(pdd, FTAG); + return (err); } -/* - * This will close the objset. 
- */ int -dmu_objset_rollback(objset_t *os) +dmu_objset_destroy(const char *name, boolean_t defer) { - int err; dsl_dataset_t *ds; - - ds = os->os->os_dsl_dataset; - - if (!dsl_dataset_tryown(ds, TRUE, os)) { - dmu_objset_close(os); - return (EBUSY); - } - - err = dsl_dataset_rollback(ds, os->os->os_phys->os_type); + int error; /* - * NB: we close the objset manually because the rollback - * actually implicitly called dmu_objset_evict(), thus freeing - * the objset_impl_t. + * dsl_dataset_destroy() can free any claimed-but-unplayed + * intent log, but if there is an active log, it has blocks that + * are allocated, but may not yet be reflected in the on-disk + * structure. Only the ZIL knows how to free them, so we have + * to call into it here. */ - dsl_dataset_disown(ds, os); - kmem_free(os, sizeof (objset_t)); - return (err); + error = dsl_dataset_own(name, B_TRUE, FTAG, &ds); + if (error == 0) { + objset_t *os; + if (dmu_objset_from_ds(ds, &os) == 0) + zil_destroy(dmu_objset_zil(os), B_FALSE); + error = dsl_dataset_destroy(ds, FTAG, defer); + /* dsl_dataset_destroy() closes the ds. */ + } + + return (error); } struct snaparg { dsl_sync_task_group_t *dstg; char *snapname; char failed[MAXPATHLEN]; - boolean_t checkperms; - list_t objsets; + boolean_t recursive; + nvlist_t *props; }; -struct osnode { - list_node_t node; - objset_t *os; -}; +static int +snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + objset_t *os = arg1; + struct snaparg *sn = arg2; + + /* The props have already been checked by zfs_check_userprops(). */ + + return (dsl_dataset_snapshot_check(os->os_dsl_dataset, + sn->snapname, tx)); +} + +static void +snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + objset_t *os = arg1; + dsl_dataset_t *ds = os->os_dsl_dataset; + struct snaparg *sn = arg2; + + dsl_dataset_snapshot_sync(ds, sn->snapname, cr, tx); + + if (sn->props) { + dsl_props_arg_t pa; + pa.pa_props = sn->props; + pa.pa_source = ZPROP_SRC_LOCAL; + dsl_props_set_sync(ds->ds_prev, &pa, cr, tx); + } +} static int -dmu_objset_snapshot_one(char *name, void *arg) +dmu_objset_snapshot_one(const char *name, void *arg) { struct snaparg *sn = arg; objset_t *os; int err; + char *cp; + + /* + * If the objset starts with a '%', then ignore it unless it was + * explicitly named (ie, not recursive). These hidden datasets + * are always inconsistent, and by not opening them here, we can + * avoid a race with dsl_dir_destroy_check(). + */ + cp = strrchr(name, '/'); + if (cp && cp[1] == '%' && sn->recursive) + return (0); (void) strcpy(sn->failed, name); /* - * Check permissions only when requested. This only applies when - * doing a recursive snapshot. The permission checks for the starting - * dataset have already been performed in zfs_secpolicy_snapshot() + * Check permissions if we are doing a recursive snapshot. 
The + * permission checks for the starting dataset have already been + * performed in zfs_secpolicy_snapshot() */ - if (sn->checkperms == B_TRUE && - (err = zfs_secpolicy_snapshot_perms(name, CRED()))) + if (sn->recursive && (err = zfs_secpolicy_snapshot_perms(name, CRED()))) return (err); - err = dmu_objset_open(name, DMU_OST_ANY, DS_MODE_USER, &os); + err = dmu_objset_hold(name, sn, &os); if (err != 0) return (err); - /* If the objset is in an inconsistent state, return busy */ - if (os->os->os_dsl_dataset->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) { - dmu_objset_close(os); - return (EBUSY); + /* + * If the objset is in an inconsistent state (eg, in the process + * of being destroyed), don't snapshot it. As with %hidden + * datasets, we return EBUSY if this name was explicitly + * requested (ie, not recursive), and otherwise ignore it. + */ + if (os->os_dsl_dataset->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) { + dmu_objset_rele(os, sn); + return (sn->recursive ? 0 : EBUSY); } /* @@ -747,26 +828,21 @@ dmu_objset_snapshot_one(char *name, void *arg) */ err = zil_suspend(dmu_objset_zil(os)); if (err == 0) { - struct osnode *osn; - dsl_sync_task_create(sn->dstg, dsl_dataset_snapshot_check, - dsl_dataset_snapshot_sync, os->os->os_dsl_dataset, - sn->snapname, 3); - osn = kmem_alloc(sizeof (struct osnode), KM_SLEEP); - osn->os = os; - list_insert_tail(&sn->objsets, osn); + dsl_sync_task_create(sn->dstg, snapshot_check, + snapshot_sync, os, sn, 3); } else { - dmu_objset_close(os); + dmu_objset_rele(os, sn); } return (err); } int -dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive) +dmu_objset_snapshot(char *fsname, char *snapname, + nvlist_t *props, boolean_t recursive) { dsl_sync_task_t *dst; - struct osnode *osn; - struct snaparg sn = { 0 }; + struct snaparg sn; spa_t *spa; int err; @@ -778,39 +854,29 @@ dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive) sn.dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); sn.snapname = snapname; - list_create(&sn.objsets, sizeof (struct osnode), - offsetof(struct osnode, node)); + sn.props = props; + sn.recursive = recursive; if (recursive) { - sn.checkperms = B_TRUE; err = dmu_objset_find(fsname, dmu_objset_snapshot_one, &sn, DS_FIND_CHILDREN); } else { - sn.checkperms = B_FALSE; err = dmu_objset_snapshot_one(fsname, &sn); } - if (err) - goto out; - - err = dsl_sync_task_group_wait(sn.dstg); + if (err == 0) + err = dsl_sync_task_group_wait(sn.dstg); for (dst = list_head(&sn.dstg->dstg_tasks); dst; dst = list_next(&sn.dstg->dstg_tasks, dst)) { - dsl_dataset_t *ds = dst->dst_arg1; + objset_t *os = dst->dst_arg1; + dsl_dataset_t *ds = os->os_dsl_dataset; if (dst->dst_err) dsl_dataset_name(ds, sn.failed); + zil_resume(dmu_objset_zil(os)); + dmu_objset_rele(os, &sn); } -out: - while (osn = list_head(&sn.objsets)) { - list_remove(&sn.objsets, osn); - zil_resume(dmu_objset_zil(osn->os)); - dmu_objset_close(osn->os); - kmem_free(osn, sizeof (struct osnode)); - } - list_destroy(&sn.objsets); - if (err) (void) strcpy(fsname, sn.failed); dsl_sync_task_group_destroy(sn.dstg); @@ -819,7 +885,7 @@ dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive) } static void -dmu_objset_sync_dnodes(list_t *list, dmu_tx_t *tx) +dmu_objset_sync_dnodes(list_t *list, list_t *newlist, dmu_tx_t *tx) { dnode_t *dn; @@ -827,25 +893,30 @@ dmu_objset_sync_dnodes(list_t *list, dmu_tx_t *tx) ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); ASSERT(dn->dn_dbuf->db_data_pending); /* - * Initialize dn_zio outside dnode_sync() - * to accomodate 
meta-dnode + * Initialize dn_zio outside dnode_sync() because the + * meta-dnode needs to set it outside dnode_sync(). */ dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio; ASSERT(dn->dn_zio); ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS); list_remove(list, dn); + + if (newlist) { + (void) dnode_add_ref(dn, newlist); + list_insert_tail(newlist, dn); + } + dnode_sync(dn, tx); } } /* ARGSUSED */ static void -ready(zio_t *zio, arc_buf_t *abuf, void *arg) +dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg) { blkptr_t *bp = zio->io_bp; - blkptr_t *bp_orig = &zio->io_bp_orig; - objset_impl_t *os = arg; + objset_t *os = arg; dnode_phys_t *dnp = &os->os_phys->os_meta_dnode; ASSERT(bp == os->os_rootbp); @@ -853,31 +924,45 @@ ready(zio_t *zio, arc_buf_t *abuf, void *arg) ASSERT(BP_GET_LEVEL(bp) == 0); /* - * Update rootbp fill count. + * Update rootbp fill count: it should be the number of objects + * allocated in the object set (not counting the "special" + * objects that are stored in the objset_phys_t -- the meta + * dnode and user/group accounting objects). */ - bp->blk_fill = 1; /* count the meta-dnode */ + bp->blk_fill = 0; for (int i = 0; i < dnp->dn_nblkptr; i++) bp->blk_fill += dnp->dn_blkptr[i].blk_fill; +} + +/* ARGSUSED */ +static void +dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg) +{ + blkptr_t *bp = zio->io_bp; + blkptr_t *bp_orig = &zio->io_bp_orig; + objset_t *os = arg; if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { - ASSERT(DVA_EQUAL(BP_IDENTITY(bp), BP_IDENTITY(bp_orig))); + ASSERT(BP_EQUAL(bp, bp_orig)); } else { - if (zio->io_bp_orig.blk_birth == os->os_synctx->tx_txg) - (void) dsl_dataset_block_kill(os->os_dsl_dataset, - &zio->io_bp_orig, zio, os->os_synctx); - dsl_dataset_block_born(os->os_dsl_dataset, bp, os->os_synctx); + dsl_dataset_t *ds = os->os_dsl_dataset; + dmu_tx_t *tx = os->os_synctx; + + (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); + dsl_dataset_block_born(ds, bp, tx); } } /* called from dsl */ void -dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx) +dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) { int txgoff; zbookmark_t zb; - writeprops_t wp = { 0 }; + zio_prop_t zp; zio_t *zio; list_t *list; + list_t *newlist = NULL; dbuf_dirty_record_t *dr; dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg); @@ -898,37 +983,49 @@ dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx) /* * Create the root block IO */ - zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0; - zb.zb_object = 0; - zb.zb_level = -1; /* for block ordering; it's level 0 on disk */ - zb.zb_blkid = 0; - - wp.wp_type = DMU_OT_OBJSET; - wp.wp_level = 0; /* on-disk BP level; see above */ - wp.wp_copies = os->os_copies; - wp.wp_oschecksum = os->os_checksum; - wp.wp_oscompress = os->os_compress; - - if (BP_IS_OLDER(os->os_rootbp, tx->tx_txg)) { - (void) dsl_dataset_block_kill(os->os_dsl_dataset, - os->os_rootbp, pio, tx); - } - arc_release(os->os_phys_buf, &os->os_phys_buf); - zio = arc_write(pio, os->os_spa, &wp, DMU_OS_IS_L2CACHEABLE(os), - tx->tx_txg, os->os_rootbp, os->os_phys_buf, ready, NULL, os, + + SET_BOOKMARK(&zb, os->os_dsl_dataset ? 
+ os->os_dsl_dataset->ds_object : DMU_META_OBJSET, + ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); + + dmu_write_policy(os, NULL, 0, 0, &zp); + + zio = arc_write(pio, os->os_spa, tx->tx_txg, + os->os_rootbp, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os), &zp, + dmu_objset_write_ready, dmu_objset_write_done, os, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); /* - * Sync meta-dnode - the parent IO for the sync is the root block + * Sync special dnodes - the parent IO for the sync is the root block */ os->os_meta_dnode->dn_zio = zio; dnode_sync(os->os_meta_dnode, tx); + os->os_phys->os_flags = os->os_flags; + + if (os->os_userused_dnode && + os->os_userused_dnode->dn_type != DMU_OT_NONE) { + os->os_userused_dnode->dn_zio = zio; + dnode_sync(os->os_userused_dnode, tx); + os->os_groupused_dnode->dn_zio = zio; + dnode_sync(os->os_groupused_dnode, tx); + } + txgoff = tx->tx_txg & TXG_MASK; - dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], tx); - dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], tx); + if (dmu_objset_userused_enabled(os)) { + newlist = &os->os_synced_dnodes; + /* + * We must create the list here because it uses the + * dn_dirty_link[] of this txg. + */ + list_create(newlist, sizeof (dnode_t), + offsetof(dnode_t, dn_dirty_link[txgoff])); + } + + dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], newlist, tx); + dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], newlist, tx); list = &os->os_meta_dnode->dn_dirty_records[txgoff]; while (dr = list_head(list)) { @@ -945,46 +1042,199 @@ dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx) zio_nowait(zio); } +boolean_t +dmu_objset_is_dirty(objset_t *os, uint64_t txg) +{ + return (!list_is_empty(&os->os_dirty_dnodes[txg & TXG_MASK]) || + !list_is_empty(&os->os_free_dnodes[txg & TXG_MASK])); +} + +static objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES]; + +void +dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb) +{ + used_cbs[ost] = cb; +} + +boolean_t +dmu_objset_userused_enabled(objset_t *os) +{ + return (spa_version(os->os_spa) >= SPA_VERSION_USERSPACE && + used_cbs[os->os_phys->os_type] && + os->os_userused_dnode); +} + +static void +do_userquota_callback(objset_t *os, dnode_phys_t *dnp, + boolean_t subtract, dmu_tx_t *tx) +{ + static const char zerobuf[DN_MAX_BONUSLEN] = {0}; + uint64_t user, group; + + ASSERT(dnp->dn_type != 0 || + (bcmp(DN_BONUS(dnp), zerobuf, DN_MAX_BONUSLEN) == 0 && + DN_USED_BYTES(dnp) == 0)); + + if ((dnp->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED) && + 0 == used_cbs[os->os_phys->os_type](dnp->dn_bonustype, + DN_BONUS(dnp), &user, &group)) { + int64_t delta = DNODE_SIZE + DN_USED_BYTES(dnp); + if (subtract) + delta = -delta; + VERIFY3U(0, ==, zap_increment_int(os, DMU_USERUSED_OBJECT, + user, delta, tx)); + VERIFY3U(0, ==, zap_increment_int(os, DMU_GROUPUSED_OBJECT, + group, delta, tx)); + } +} + +void +dmu_objset_do_userquota_callbacks(objset_t *os, dmu_tx_t *tx) +{ + dnode_t *dn; + list_t *list = &os->os_synced_dnodes; + + ASSERT(list_head(list) == NULL || dmu_objset_userused_enabled(os)); + + while (dn = list_head(list)) { + ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object)); + ASSERT(dn->dn_oldphys); + ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE || + dn->dn_phys->dn_flags & + DNODE_FLAG_USERUSED_ACCOUNTED); + + /* Allocate the user/groupused objects if necessary. 
*/ + if (os->os_userused_dnode->dn_type == DMU_OT_NONE) { + VERIFY(0 == zap_create_claim(os, + DMU_USERUSED_OBJECT, + DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx)); + VERIFY(0 == zap_create_claim(os, + DMU_GROUPUSED_OBJECT, + DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx)); + } + + /* + * We intentionally modify the zap object even if the + * net delta (due to phys-oldphys) is zero. Otherwise + * the block of the zap obj could be shared between + * datasets but need to be different between them after + * a bprewrite. + */ + do_userquota_callback(os, dn->dn_oldphys, B_TRUE, tx); + do_userquota_callback(os, dn->dn_phys, B_FALSE, tx); + + /* + * The mutex is needed here for interlock with dnode_allocate. + */ + mutex_enter(&dn->dn_mtx); + zio_buf_free(dn->dn_oldphys, sizeof (dnode_phys_t)); + dn->dn_oldphys = NULL; + mutex_exit(&dn->dn_mtx); + + list_remove(list, dn); + dnode_rele(dn, list); + } +} + +boolean_t +dmu_objset_userspace_present(objset_t *os) +{ + return (os->os_phys->os_flags & + OBJSET_FLAG_USERACCOUNTING_COMPLETE); +} + +int +dmu_objset_userspace_upgrade(objset_t *os) +{ + uint64_t obj; + int err = 0; + + if (dmu_objset_userspace_present(os)) + return (0); + if (!dmu_objset_userused_enabled(os)) + return (ENOTSUP); + if (dmu_objset_is_snapshot(os)) + return (EINVAL); + + /* + * We simply need to mark every object dirty, so that it will be + * synced out and now accounted. If this is called + * concurrently, or if we already did some work before crashing, + * that's fine, since we track each object's accounted state + * independently. + */ + + for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) { + dmu_tx_t *tx; + dmu_buf_t *db; + int objerr; + + if (issig(JUSTLOOKING) && issig(FORREAL)) + return (EINTR); + + objerr = dmu_bonus_hold(os, obj, FTAG, &db); + if (objerr) + continue; + tx = dmu_tx_create(os); + dmu_tx_hold_bonus(tx, obj); + objerr = dmu_tx_assign(tx, TXG_WAIT); + if (objerr) { + dmu_tx_abort(tx); + continue; + } + dmu_buf_will_dirty(db, tx); + dmu_buf_rele(db, FTAG); + dmu_tx_commit(tx); + } + + os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE; + txg_wait_synced(dmu_objset_pool(os), 0); + return (0); +} + void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, uint64_t *usedobjsp, uint64_t *availobjsp) { - dsl_dataset_space(os->os->os_dsl_dataset, refdbytesp, availbytesp, + dsl_dataset_space(os->os_dsl_dataset, refdbytesp, availbytesp, usedobjsp, availobjsp); } uint64_t dmu_objset_fsid_guid(objset_t *os) { - return (dsl_dataset_fsid_guid(os->os->os_dsl_dataset)); + return (dsl_dataset_fsid_guid(os->os_dsl_dataset)); } void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat) { - stat->dds_type = os->os->os_phys->os_type; - if (os->os->os_dsl_dataset) - dsl_dataset_fast_stat(os->os->os_dsl_dataset, stat); + stat->dds_type = os->os_phys->os_type; + if (os->os_dsl_dataset) + dsl_dataset_fast_stat(os->os_dsl_dataset, stat); } void dmu_objset_stats(objset_t *os, nvlist_t *nv) { - ASSERT(os->os->os_dsl_dataset || - os->os->os_phys->os_type == DMU_OST_META); + ASSERT(os->os_dsl_dataset || + os->os_phys->os_type == DMU_OST_META); - if (os->os->os_dsl_dataset != NULL) - dsl_dataset_stats(os->os->os_dsl_dataset, nv); + if (os->os_dsl_dataset != NULL) + dsl_dataset_stats(os->os_dsl_dataset, nv); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_TYPE, - os->os->os_phys->os_type); + os->os_phys->os_type); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERACCOUNTING, + dmu_objset_userspace_present(os)); } int dmu_objset_is_snapshot(objset_t *os) { - 
if (os->os->os_dsl_dataset != NULL) - return (dsl_dataset_is_snapshot(os->os->os_dsl_dataset)); + if (os->os_dsl_dataset != NULL) + return (dsl_dataset_is_snapshot(os->os_dsl_dataset)); else return (B_FALSE); } @@ -993,7 +1243,7 @@ int dmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen, boolean_t *conflict) { - dsl_dataset_t *ds = os->os->os_dsl_dataset; + dsl_dataset_t *ds = os->os_dsl_dataset; uint64_t ignored; if (ds->ds_phys->ds_snapnames_zapobj == 0) @@ -1008,7 +1258,7 @@ int dmu_snapshot_list_next(objset_t *os, int namelen, char *name, uint64_t *idp, uint64_t *offp, boolean_t *case_conflict) { - dsl_dataset_t *ds = os->os->os_dsl_dataset; + dsl_dataset_t *ds = os->os_dsl_dataset; zap_cursor_t cursor; zap_attribute_t attr; @@ -1045,12 +1295,12 @@ int dmu_dir_list_next(objset_t *os, int namelen, char *name, uint64_t *idp, uint64_t *offp) { - dsl_dir_t *dd = os->os->os_dsl_dataset->ds_dir; + dsl_dir_t *dd = os->os_dsl_dataset->ds_dir; zap_cursor_t cursor; zap_attribute_t attr; /* there is no next dir on a snapshot! */ - if (os->os->os_dsl_dataset->ds_object != + if (os->os_dsl_dataset->ds_object != dd->dd_phys->dd_head_dataset_obj) return (ENOENT); @@ -1079,7 +1329,7 @@ dmu_dir_list_next(objset_t *os, int namelen, char *name, } struct findarg { - int (*func)(char *, void *); + int (*func)(const char *, void *); void *arg; }; @@ -1088,7 +1338,7 @@ static int findfunc(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) { struct findarg *fa = arg; - return (fa->func((char *)dsname, fa->arg)); + return (fa->func(dsname, fa->arg)); } /* @@ -1096,7 +1346,8 @@ findfunc(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) * Perhaps change all callers to use dmu_objset_find_spa()? */ int -dmu_objset_find(char *name, int func(char *, void *), void *arg, int flags) +dmu_objset_find(char *name, int func(const char *, void *), void *arg, + int flags) { struct findarg fa; fa.func = func; @@ -1147,12 +1398,9 @@ dmu_objset_find_spa(spa_t *spa, const char *name, ASSERT(attr->za_integer_length == sizeof (uint64_t)); ASSERT(attr->za_num_integers == 1); - child = kmem_alloc(MAXPATHLEN, KM_SLEEP); - (void) strcpy(child, name); - (void) strcat(child, "/"); - (void) strcat(child, attr->za_name); + child = kmem_asprintf("%s/%s", name, attr->za_name); err = dmu_objset_find_spa(spa, child, func, arg, flags); - kmem_free(child, MAXPATHLEN); + strfree(child); if (err) break; } @@ -1186,13 +1434,11 @@ dmu_objset_find_spa(spa_t *spa, const char *name, sizeof (uint64_t)); ASSERT(attr->za_num_integers == 1); - child = kmem_alloc(MAXPATHLEN, KM_SLEEP); - (void) strcpy(child, name); - (void) strcat(child, "@"); - (void) strcat(child, attr->za_name); + child = kmem_asprintf("%s@%s", + name, attr->za_name); err = func(spa, attr->za_first_integer, child, arg); - kmem_free(child, MAXPATHLEN); + strfree(child); if (err) break; } @@ -1215,46 +1461,45 @@ dmu_objset_find_spa(spa_t *spa, const char *name, /* ARGSUSED */ int -dmu_objset_prefetch(char *name, void *arg) +dmu_objset_prefetch(const char *name, void *arg) { - objset_t *os; dsl_dataset_t *ds; - os = kmem_alloc(sizeof (objset_t), KM_SLEEP); - if (dsl_dataset_hold(name, os, &ds)) { - kmem_free(os, sizeof (objset_t)); + if (dsl_dataset_hold(name, FTAG, &ds)) return (0); - } if (!BP_IS_HOLE(&ds->ds_phys->ds_bp)) { - uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; - zbookmark_t zb; - - zb.zb_objset = ds->ds_object; - zb.zb_object = 0; - zb.zb_level = -1; - zb.zb_blkid = 0; - - (void) arc_read_nolock(NULL, dsl_dataset_get_spa(ds), - 
&ds->ds_phys->ds_bp, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, &zb); + mutex_enter(&ds->ds_opening_lock); + if (ds->ds_objset == NULL) { + uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; + zbookmark_t zb; + + SET_BOOKMARK(&zb, ds->ds_object, ZB_ROOT_OBJECT, + ZB_ROOT_LEVEL, ZB_ROOT_BLKID); + + (void) arc_read_nolock(NULL, dsl_dataset_get_spa(ds), + &ds->ds_phys->ds_bp, NULL, NULL, + ZIO_PRIORITY_ASYNC_READ, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, + &aflags, &zb); + } + mutex_exit(&ds->ds_opening_lock); } - dsl_dataset_rele(ds, os); - kmem_free(os, sizeof (objset_t)); + dsl_dataset_rele(ds, FTAG); return (0); } void dmu_objset_set_user(objset_t *os, void *user_ptr) { - ASSERT(MUTEX_HELD(&os->os->os_user_ptr_lock)); - os->os->os_user_ptr = user_ptr; + ASSERT(MUTEX_HELD(&os->os_user_ptr_lock)); + os->os_user_ptr = user_ptr; } void * dmu_objset_get_user(objset_t *os) { - ASSERT(MUTEX_HELD(&os->os->os_user_ptr_lock)); - return (os->os->os_user_ptr); + ASSERT(MUTEX_HELD(&os->os_user_ptr_lock)); + return (os->os_user_ptr); } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_send.c b/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_send.c index 857b9a343fd2c..b23db0c83c2a9 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_send.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_send.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -33,21 +33,38 @@ #include #include #include +#include #include #include #include #include #include +#include +#include static char *dmu_recv_tag = "dmu_recv_tag"; +/* + * The list of data whose inclusion in a send stream can be pending from + * one call to backup_cb to another. Multiple calls to dump_free() and + * dump_freeobjects() can be aggregated into a single DRR_FREE or + * DRR_FREEOBJECTS replay record. + */ +typedef enum { + PENDING_NONE, + PENDING_FREE, + PENDING_FREEOBJECTS +} pendop_t; + struct backuparg { dmu_replay_record_t *drr; vnode_t *vp; offset_t *off; objset_t *os; zio_cksum_t zc; + uint64_t toguid; int err; + pendop_t pending_op; }; static int @@ -68,33 +85,99 @@ static int dump_free(struct backuparg *ba, uint64_t object, uint64_t offset, uint64_t length) { - /* write a FREE record */ + struct drr_free *drrf = &(ba->drr->drr_u.drr_free); + + /* + * If there is a pending op, but it's not PENDING_FREE, push it out, + * since free block aggregation can only be done for blocks of the + * same type (i.e., DRR_FREE records can only be aggregated with + * other DRR_FREE records. DRR_FREEOBJECTS records can only be + * aggregated with other DRR_FREEOBJECTS records. + */ + if (ba->pending_op != PENDING_NONE && ba->pending_op != PENDING_FREE) { + if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) + return (EINTR); + ba->pending_op = PENDING_NONE; + } + + if (ba->pending_op == PENDING_FREE) { + /* + * There should never be a PENDING_FREE if length is -1 + * (because dump_dnode is the only place where this + * function is called with a -1, and only after flushing + * any pending record). + */ + ASSERT(length != -1ULL); + /* + * Check to see whether this free block can be aggregated + * with pending one. + */ + if (drrf->drr_object == object && drrf->drr_offset + + drrf->drr_length == offset) { + drrf->drr_length += length; + return (0); + } else { + /* not a continuation. 
Push out pending record */ + if (dump_bytes(ba, ba->drr, + sizeof (dmu_replay_record_t)) != 0) + return (EINTR); + ba->pending_op = PENDING_NONE; + } + } + /* create a FREE record and make it pending */ bzero(ba->drr, sizeof (dmu_replay_record_t)); ba->drr->drr_type = DRR_FREE; - ba->drr->drr_u.drr_free.drr_object = object; - ba->drr->drr_u.drr_free.drr_offset = offset; - ba->drr->drr_u.drr_free.drr_length = length; + drrf->drr_object = object; + drrf->drr_offset = offset; + drrf->drr_length = length; + drrf->drr_toguid = ba->toguid; + if (length == -1ULL) { + if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) + return (EINTR); + } else { + ba->pending_op = PENDING_FREE; + } - if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t))) - return (EINTR); return (0); } static int dump_data(struct backuparg *ba, dmu_object_type_t type, - uint64_t object, uint64_t offset, int blksz, void *data) + uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp, void *data) { + struct drr_write *drrw = &(ba->drr->drr_u.drr_write); + + + /* + * If there is any kind of pending aggregation (currently either + * a grouping of free objects or free blocks), push it out to + * the stream, since aggregation can't be done across operations + * of different types. + */ + if (ba->pending_op != PENDING_NONE) { + if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) + return (EINTR); + ba->pending_op = PENDING_NONE; + } /* write a DATA record */ bzero(ba->drr, sizeof (dmu_replay_record_t)); ba->drr->drr_type = DRR_WRITE; - ba->drr->drr_u.drr_write.drr_object = object; - ba->drr->drr_u.drr_write.drr_type = type; - ba->drr->drr_u.drr_write.drr_offset = offset; - ba->drr->drr_u.drr_write.drr_length = blksz; - - if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t))) + drrw->drr_object = object; + drrw->drr_type = type; + drrw->drr_offset = offset; + drrw->drr_length = blksz; + drrw->drr_toguid = ba->toguid; + drrw->drr_checksumtype = BP_GET_CHECKSUM(bp); + if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup) + drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP; + DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp)); + DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp)); + DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp)); + drrw->drr_key.ddk_cksum = bp->blk_cksum; + + if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) return (EINTR); - if (dump_bytes(ba, data, blksz)) + if (dump_bytes(ba, data, blksz) != 0) return (EINTR); return (0); } @@ -102,39 +185,80 @@ dump_data(struct backuparg *ba, dmu_object_type_t type, static int dump_freeobjects(struct backuparg *ba, uint64_t firstobj, uint64_t numobjs) { + struct drr_freeobjects *drrfo = &(ba->drr->drr_u.drr_freeobjects); + + /* + * If there is a pending op, but it's not PENDING_FREEOBJECTS, + * push it out, since free block aggregation can only be done for + * blocks of the same type (i.e., DRR_FREE records can only be + * aggregated with other DRR_FREE records. DRR_FREEOBJECTS records + * can only be aggregated with other DRR_FREEOBJECTS records. + */ + if (ba->pending_op != PENDING_NONE && + ba->pending_op != PENDING_FREEOBJECTS) { + if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) + return (EINTR); + ba->pending_op = PENDING_NONE; + } + if (ba->pending_op == PENDING_FREEOBJECTS) { + /* + * See whether this free object array can be aggregated + * with pending one + */ + if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) { + drrfo->drr_numobjs += numobjs; + return (0); + } else { + /* can't be aggregated. 
Push out pending record */ + if (dump_bytes(ba, ba->drr, + sizeof (dmu_replay_record_t)) != 0) + return (EINTR); + ba->pending_op = PENDING_NONE; + } + } + /* write a FREEOBJECTS record */ bzero(ba->drr, sizeof (dmu_replay_record_t)); ba->drr->drr_type = DRR_FREEOBJECTS; - ba->drr->drr_u.drr_freeobjects.drr_firstobj = firstobj; - ba->drr->drr_u.drr_freeobjects.drr_numobjs = numobjs; + drrfo->drr_firstobj = firstobj; + drrfo->drr_numobjs = numobjs; + drrfo->drr_toguid = ba->toguid; + + ba->pending_op = PENDING_FREEOBJECTS; - if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t))) - return (EINTR); return (0); } static int dump_dnode(struct backuparg *ba, uint64_t object, dnode_phys_t *dnp) { + struct drr_object *drro = &(ba->drr->drr_u.drr_object); + if (dnp == NULL || dnp->dn_type == DMU_OT_NONE) return (dump_freeobjects(ba, object, 1)); + if (ba->pending_op != PENDING_NONE) { + if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) + return (EINTR); + ba->pending_op = PENDING_NONE; + } + /* write an OBJECT record */ bzero(ba->drr, sizeof (dmu_replay_record_t)); ba->drr->drr_type = DRR_OBJECT; - ba->drr->drr_u.drr_object.drr_object = object; - ba->drr->drr_u.drr_object.drr_type = dnp->dn_type; - ba->drr->drr_u.drr_object.drr_bonustype = dnp->dn_bonustype; - ba->drr->drr_u.drr_object.drr_blksz = - dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; - ba->drr->drr_u.drr_object.drr_bonuslen = dnp->dn_bonuslen; - ba->drr->drr_u.drr_object.drr_checksum = dnp->dn_checksum; - ba->drr->drr_u.drr_object.drr_compress = dnp->dn_compress; - - if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t))) + drro->drr_object = object; + drro->drr_type = dnp->dn_type; + drro->drr_bonustype = dnp->dn_bonustype; + drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; + drro->drr_bonuslen = dnp->dn_bonuslen; + drro->drr_checksumtype = dnp->dn_checksum; + drro->drr_compress = dnp->dn_compress; + drro->drr_toguid = ba->toguid; + + if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) return (EINTR); - if (dump_bytes(ba, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8))) + if (dump_bytes(ba, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0) return (EINTR); /* free anything past the end of the file */ @@ -150,9 +274,10 @@ dump_dnode(struct backuparg *ba, uint64_t object, dnode_phys_t *dnp) (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \ (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) +/* ARGSUSED */ static int -backup_cb(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb, - const dnode_phys_t *dnp, void *arg) +backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, + const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) { struct backuparg *ba = arg; dmu_object_type_t type = bp ? 
BP_GET_TYPE(bp) : DMU_OT_NONE; @@ -161,7 +286,10 @@ backup_cb(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb, if (issig(JUSTLOOKING) && issig(FORREAL)) return (EINTR); - if (bp == NULL && zb->zb_object == 0) { + if (zb->zb_object != DMU_META_DNODE_OBJECT && + DMU_OBJECT_IS_SPECIAL(zb->zb_object)) { + return (0); + } else if (bp == NULL && zb->zb_object == DMU_META_DNODE_OBJECT) { uint64_t span = BP_SPAN(dnp, zb->zb_level); uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT; err = dump_freeobjects(ba, dnobj, span >> DNODE_SHIFT); @@ -202,7 +330,7 @@ backup_cb(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb, return (EIO); err = dump_data(ba, type, zb->zb_object, zb->zb_blkid * blksz, - blksz, abuf->b_data); + blksz, bp, abuf->b_data); (void) arc_buf_remove_ref(abuf, &abuf); } @@ -214,8 +342,8 @@ int dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, vnode_t *vp, offset_t *off) { - dsl_dataset_t *ds = tosnap->os->os_dsl_dataset; - dsl_dataset_t *fromds = fromsnap ? fromsnap->os->os_dsl_dataset : NULL; + dsl_dataset_t *ds = tosnap->os_dsl_dataset; + dsl_dataset_t *fromds = fromsnap ? fromsnap->os_dsl_dataset : NULL; dmu_replay_record_t *drr; struct backuparg ba; int err; @@ -252,10 +380,11 @@ dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); drr->drr_type = DRR_BEGIN; drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; - drr->drr_u.drr_begin.drr_version = DMU_BACKUP_STREAM_VERSION; + DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo, + DMU_SUBSTREAM); drr->drr_u.drr_begin.drr_creation_time = ds->ds_phys->ds_creation_time; - drr->drr_u.drr_begin.drr_type = tosnap->os->os_phys->os_type; + drr->drr_u.drr_begin.drr_type = tosnap->os_phys->os_type; if (fromorigin) drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE; drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid; @@ -275,9 +404,11 @@ dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, ba.vp = vp; ba.os = tosnap; ba.off = off; + ba.toguid = ds->ds_phys->ds_guid; ZIO_SET_CHECKSUM(&ba.zc, 0, 0, 0, 0); + ba.pending_op = PENDING_NONE; - if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) { + if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t)) != 0) { kmem_free(drr, sizeof (dmu_replay_record_t)); return (ba.err); } @@ -285,6 +416,10 @@ dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, err = traverse_dataset(ds, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH, backup_cb, &ba); + if (ba.pending_op != PENDING_NONE) + if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t)) != 0) + err = EINTR; + if (err) { if (err == EINTR && ba.err) err = ba.err; @@ -295,8 +430,9 @@ dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, bzero(drr, sizeof (dmu_replay_record_t)); drr->drr_type = DRR_END; drr->drr_u.drr_end.drr_checksum = ba.zc; + drr->drr_u.drr_end.drr_toguid = ba.toguid; - if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) { + if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t)) != 0) { kmem_free(drr, sizeof (dmu_replay_record_t)); return (ba.err); } @@ -319,31 +455,9 @@ struct recvbeginsyncarg { dsl_dataset_t *ds; /* the ds to recv into; returned from the syncfunc */ }; -static dsl_dataset_t * -recv_full_sync_impl(dsl_pool_t *dp, uint64_t dsobj, dmu_objset_type_t type, - cred_t *cr, dmu_tx_t *tx) -{ - dsl_dataset_t *ds; - - /* This should always work, since we just created it */ - /* XXX - create should return an owned ds */ - VERIFY(0 == 
dsl_dataset_own_obj(dp, dsobj, - DS_MODE_INCONSISTENT, dmu_recv_tag, &ds)); - - if (type != DMU_OST_NONE) { - (void) dmu_objset_create_impl(dp->dp_spa, - ds, &ds->ds_phys->ds_bp, type, tx); - } - - spa_history_internal_log(LOG_DS_REPLAY_FULL_SYNC, - dp->dp_spa, tx, cr, "dataset = %lld", dsobj); - - return (ds); -} - /* ARGSUSED */ static int -recv_full_check(void *arg1, void *arg2, dmu_tx_t *tx) +recv_new_check(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dir_t *dd = arg1; struct recvbeginsyncarg *rbsa = arg2; @@ -361,7 +475,7 @@ recv_full_check(void *arg1, void *arg2, dmu_tx_t *tx) /* make sure it's a snap in the same pool */ if (rbsa->origin->ds_dir->dd_pool != dd->dd_pool) return (EXDEV); - if (rbsa->origin->ds_phys->ds_num_children == 0) + if (!dsl_dataset_is_snapshot(rbsa->origin)) return (EINVAL); if (rbsa->origin->ds_phys->ds_guid != rbsa->fromguid) return (ENODEV); @@ -371,77 +485,31 @@ recv_full_check(void *arg1, void *arg2, dmu_tx_t *tx) } static void -recv_full_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +recv_new_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) { dsl_dir_t *dd = arg1; struct recvbeginsyncarg *rbsa = arg2; uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags; uint64_t dsobj; + /* Create and open new dataset. */ dsobj = dsl_dataset_create_sync(dd, strrchr(rbsa->tofs, '/') + 1, rbsa->origin, flags, cr, tx); + VERIFY(0 == dsl_dataset_own_obj(dd->dd_pool, dsobj, + B_TRUE, dmu_recv_tag, &rbsa->ds)); - rbsa->ds = recv_full_sync_impl(dd->dd_pool, dsobj, - rbsa->origin ? DMU_OST_NONE : rbsa->type, cr, tx); -} - -static int -recv_full_existing_check(void *arg1, void *arg2, dmu_tx_t *tx) -{ - dsl_dataset_t *ds = arg1; - struct recvbeginsyncarg *rbsa = arg2; - int err; - - /* must be a head ds */ - if (ds->ds_phys->ds_next_snap_obj != 0) - return (EINVAL); - - /* must not be a clone ds */ - if (dsl_dir_is_clone(ds->ds_dir)) - return (EINVAL); - - err = dsl_dataset_destroy_check(ds, rbsa->tag, tx); - if (err) - return (err); - - if (rbsa->origin) { - /* make sure it's a snap in the same pool */ - if (rbsa->origin->ds_dir->dd_pool != ds->ds_dir->dd_pool) - return (EXDEV); - if (rbsa->origin->ds_phys->ds_num_children == 0) - return (EINVAL); - if (rbsa->origin->ds_phys->ds_guid != rbsa->fromguid) - return (ENODEV); + if (rbsa->origin == NULL) { + (void) dmu_objset_create_impl(dd->dd_pool->dp_spa, + rbsa->ds, &rbsa->ds->ds_phys->ds_bp, rbsa->type, tx); } - return (0); -} - -static void -recv_full_existing_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) -{ - dsl_dataset_t *ds = arg1; - struct recvbeginsyncarg *rbsa = arg2; - dsl_dir_t *dd = ds->ds_dir; - uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags; - uint64_t dsobj; - - /* - * NB: caller must provide an extra hold on the dsl_dir_t, so it - * won't go away when dsl_dataset_destroy_sync() closes the - * dataset. - */ - dsl_dataset_destroy_sync(ds, rbsa->tag, cr, tx); - - dsobj = dsl_dataset_create_sync_dd(dd, rbsa->origin, flags, tx); - - rbsa->ds = recv_full_sync_impl(dd->dd_pool, dsobj, - rbsa->origin ? 
DMU_OST_NONE : rbsa->type, cr, tx); + spa_history_internal_log(LOG_DS_REPLAY_FULL_SYNC, + dd->dd_pool->dp_spa, tx, cr, "dataset = %lld", dsobj); } /* ARGSUSED */ static int -recv_incremental_check(void *arg1, void *arg2, dmu_tx_t *tx) +recv_existing_check(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; struct recvbeginsyncarg *rbsa = arg2; @@ -452,13 +520,43 @@ recv_incremental_check(void *arg1, void *arg2, dmu_tx_t *tx) if (!rbsa->force && dsl_dataset_modified_since_lastsnap(ds)) return (ETXTBSY); - /* must already be a snapshot of this fs */ - if (ds->ds_phys->ds_prev_snap_obj == 0) - return (ENODEV); + if (rbsa->fromguid) { + /* if incremental, most recent snapshot must match fromguid */ + if (ds->ds_prev == NULL) + return (ENODEV); - /* most recent snapshot must match fromguid */ - if (ds->ds_prev->ds_phys->ds_guid != rbsa->fromguid) - return (ENODEV); + /* + * most recent snapshot must match fromguid, or there are no + * changes since the fromguid one + */ + if (ds->ds_prev->ds_phys->ds_guid != rbsa->fromguid) { + uint64_t birth = ds->ds_prev->ds_phys->ds_bp.blk_birth; + uint64_t obj = ds->ds_prev->ds_phys->ds_prev_snap_obj; + while (obj != 0) { + dsl_dataset_t *snap; + err = dsl_dataset_hold_obj(ds->ds_dir->dd_pool, + obj, FTAG, &snap); + if (err) + return (ENODEV); + if (snap->ds_phys->ds_creation_txg < birth) { + dsl_dataset_rele(snap, FTAG); + return (ENODEV); + } + if (snap->ds_phys->ds_guid == rbsa->fromguid) { + dsl_dataset_rele(snap, FTAG); + break; /* it's ok */ + } + obj = snap->ds_phys->ds_prev_snap_obj; + dsl_dataset_rele(snap, FTAG); + } + if (obj == 0) + return (ENODEV); + } + } else { + /* if full, most recent snapshot must be $ORIGIN */ + if (ds->ds_phys->ds_prev_snap_txg >= TXG_INITIAL) + return (ENODEV); + } /* temporary clone name must not exist */ err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, @@ -481,29 +579,28 @@ recv_incremental_check(void *arg1, void *arg2, dmu_tx_t *tx) /* ARGSUSED */ static void -recv_online_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +recv_existing_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) { dsl_dataset_t *ohds = arg1; struct recvbeginsyncarg *rbsa = arg2; dsl_pool_t *dp = ohds->ds_dir->dd_pool; - dsl_dataset_t *ods, *cds; + dsl_dataset_t *cds; uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags; uint64_t dsobj; - /* create the temporary clone */ - VERIFY(0 == dsl_dataset_hold_obj(dp, ohds->ds_phys->ds_prev_snap_obj, - FTAG, &ods)); - dsobj = dsl_dataset_create_sync(ohds->ds_dir, - rbsa->clonelastname, ods, flags, cr, tx); - dsl_dataset_rele(ods, FTAG); + /* create and open the temporary clone */ + dsobj = dsl_dataset_create_sync(ohds->ds_dir, rbsa->clonelastname, + ohds->ds_prev, flags, cr, tx); + VERIFY(0 == dsl_dataset_own_obj(dp, dsobj, B_TRUE, dmu_recv_tag, &cds)); - /* open the temporary clone */ - VERIFY(0 == dsl_dataset_own_obj(dp, dsobj, - DS_MODE_INCONSISTENT, dmu_recv_tag, &cds)); - - /* copy the refquota from the target fs to the clone */ - if (ohds->ds_quota > 0) - dsl_dataset_set_quota_sync(cds, &ohds->ds_quota, cr, tx); + /* + * If we actually created a non-clone, we need to create the + * objset in our new dataset. 
+ */ + if (BP_IS_HOLE(dsl_dataset_get_blkptr(cds))) { + (void) dmu_objset_create_impl(dp->dp_spa, + cds, dsl_dataset_get_blkptr(cds), rbsa->type, tx); + } rbsa->ds = cds; @@ -511,32 +608,18 @@ recv_online_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) dp->dp_spa, tx, cr, "dataset = %lld", dsobj); } -/* ARGSUSED */ -static void -recv_offline_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) -{ - dsl_dataset_t *ds = arg1; - - dmu_buf_will_dirty(ds->ds_dbuf, tx); - ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT; - - spa_history_internal_log(LOG_DS_REPLAY_INC_SYNC, - ds->ds_dir->dd_pool->dp_spa, tx, cr, "dataset = %lld", - ds->ds_object); -} - /* * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin() * succeeds; otherwise we will leak the holds on the datasets. */ int -dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb, - boolean_t force, objset_t *origin, boolean_t online, dmu_recv_cookie_t *drc) +dmu_recv_begin(char *tofs, char *tosnap, char *top_ds, struct drr_begin *drrb, + boolean_t force, objset_t *origin, dmu_recv_cookie_t *drc) { int err = 0; boolean_t byteswap; - struct recvbeginsyncarg rbsa; - uint64_t version; + struct recvbeginsyncarg rbsa = { 0 }; + uint64_t versioninfo; int flags; dsl_dataset_t *ds; @@ -549,22 +632,22 @@ dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb, rbsa.tofs = tofs; rbsa.tosnap = tosnap; - rbsa.origin = origin ? origin->os->os_dsl_dataset : NULL; + rbsa.origin = origin ? origin->os_dsl_dataset : NULL; rbsa.fromguid = drrb->drr_fromguid; rbsa.type = drrb->drr_type; rbsa.tag = FTAG; rbsa.dsflags = 0; - version = drrb->drr_version; + versioninfo = drrb->drr_versioninfo; flags = drrb->drr_flags; if (byteswap) { rbsa.type = BSWAP_32(rbsa.type); rbsa.fromguid = BSWAP_64(rbsa.fromguid); - version = BSWAP_64(version); + versioninfo = BSWAP_64(versioninfo); flags = BSWAP_32(flags); } - if (version != DMU_BACKUP_STREAM_VERSION || + if (DMU_GET_STREAM_HDRTYPE(versioninfo) == DMU_COMPOUNDSTREAM || rbsa.type >= DMU_OST_NUMTYPES || ((flags & DRR_FLAG_CLONE) && origin == NULL)) return (EINVAL); @@ -575,102 +658,72 @@ dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb, bzero(drc, sizeof (dmu_recv_cookie_t)); drc->drc_drrb = drrb; drc->drc_tosnap = tosnap; + drc->drc_top_ds = top_ds; drc->drc_force = force; /* * Process the begin in syncing context. 
*/ - if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE) && !online) { - /* offline incremental receive */ - err = dsl_dataset_own(tofs, 0, dmu_recv_tag, &ds); - if (err) - return (err); - /* - * Only do the rollback if the most recent snapshot - * matches the incremental source - */ - if (force) { - if (ds->ds_prev == NULL || - ds->ds_prev->ds_phys->ds_guid != - rbsa.fromguid) { - dsl_dataset_disown(ds, dmu_recv_tag); - return (ENODEV); - } - (void) dsl_dataset_rollback(ds, DMU_OST_NONE); + /* open the dataset we are logically receiving into */ + err = dsl_dataset_hold(tofs, dmu_recv_tag, &ds); + if (err == 0) { + /* target fs already exists; recv into temp clone */ + + /* Can't recv a clone into an existing fs */ + if (flags & DRR_FLAG_CLONE) { + dsl_dataset_rele(ds, dmu_recv_tag); + return (EINVAL); } - rbsa.force = B_FALSE; - err = dsl_sync_task_do(ds->ds_dir->dd_pool, - recv_incremental_check, - recv_offline_incremental_sync, ds, &rbsa, 1); - if (err) { - dsl_dataset_disown(ds, dmu_recv_tag); - return (err); + + /* must not have an incremental recv already in progress */ + if (!mutex_tryenter(&ds->ds_recvlock)) { + dsl_dataset_rele(ds, dmu_recv_tag); + return (EBUSY); } - drc->drc_logical_ds = drc->drc_real_ds = ds; - } else if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE)) { - /* online incremental receive */ /* tmp clone name is: tofs/%tosnap" */ (void) snprintf(rbsa.clonelastname, sizeof (rbsa.clonelastname), "%%%s", tosnap); - - /* open the dataset we are logically receiving into */ - err = dsl_dataset_hold(tofs, dmu_recv_tag, &ds); - if (err) - return (err); - rbsa.force = force; err = dsl_sync_task_do(ds->ds_dir->dd_pool, - recv_incremental_check, - recv_online_incremental_sync, ds, &rbsa, 5); + recv_existing_check, recv_existing_sync, ds, &rbsa, 5); if (err) { + mutex_exit(&ds->ds_recvlock); dsl_dataset_rele(ds, dmu_recv_tag); return (err); } drc->drc_logical_ds = ds; drc->drc_real_ds = rbsa.ds; - } else { - /* create new fs -- full backup or clone */ - dsl_dir_t *dd = NULL; - const char *tail; + } else if (err == ENOENT) { + /* target fs does not exist; must be a full backup or clone */ + char *cp; - err = dsl_dir_open(tofs, FTAG, &dd, &tail); + /* + * If it's a non-clone incremental, we are missing the + * target fs, so fail the recv. 
+ */ + if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE)) + return (ENOENT); + + /* Open the parent of tofs */ + cp = strrchr(tofs, '/'); + *cp = '\0'; + err = dsl_dataset_hold(tofs, FTAG, &ds); + *cp = '/'; if (err) return (err); - if (tail == NULL) { - if (!force) { - dsl_dir_close(dd, FTAG); - return (EEXIST); - } - rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); - err = dsl_dataset_own_obj(dd->dd_pool, - dd->dd_phys->dd_head_dataset_obj, - DS_MODE_INCONSISTENT, FTAG, &ds); - rw_exit(&dd->dd_pool->dp_config_rwlock); - if (err) { - dsl_dir_close(dd, FTAG); - return (err); - } - - dsl_dataset_make_exclusive(ds, FTAG); - err = dsl_sync_task_do(dd->dd_pool, - recv_full_existing_check, - recv_full_existing_sync, ds, &rbsa, 5); - dsl_dataset_disown(ds, FTAG); - } else { - err = dsl_sync_task_do(dd->dd_pool, recv_full_check, - recv_full_sync, dd, &rbsa, 5); - } - dsl_dir_close(dd, FTAG); + err = dsl_sync_task_do(ds->ds_dir->dd_pool, + recv_new_check, recv_new_sync, ds->ds_dir, &rbsa, 5); + dsl_dataset_rele(ds, FTAG); if (err) return (err); drc->drc_logical_ds = drc->drc_real_ds = rbsa.ds; drc->drc_newfs = B_TRUE; } - return (0); + return (err); } struct restorearg { @@ -681,8 +734,83 @@ struct restorearg { uint64_t voff; int bufsize; /* amount of memory allocated for buf */ zio_cksum_t cksum; + avl_tree_t guid_to_ds_map; }; +typedef struct guid_map_entry { + uint64_t guid; + dsl_dataset_t *gme_ds; + avl_node_t avlnode; +} guid_map_entry_t; + +static int +guid_compare(const void *arg1, const void *arg2) +{ + const guid_map_entry_t *gmep1 = arg1; + const guid_map_entry_t *gmep2 = arg2; + + if (gmep1->guid < gmep2->guid) + return (-1); + else if (gmep1->guid > gmep2->guid) + return (1); + return (0); +} + +/* + * This function is a callback used by dmu_objset_find() (which + * enumerates the object sets) to build an avl tree that maps guids + * to datasets. The resulting table is used when processing DRR_WRITE_BYREF + * send stream records. These records, which are used in dedup'ed + * streams, do not contain data themselves, but refer to a copy + * of the data block that has already been written because it was + * earlier in the stream. That previous copy is identified by the + * guid of the dataset with the referenced data. + */ +int +find_ds_by_guid(const char *name, void *arg) +{ + avl_tree_t *guid_map = arg; + dsl_dataset_t *ds, *snapds; + guid_map_entry_t *gmep; + dsl_pool_t *dp; + int err; + uint64_t lastobj, firstobj; + + if (dsl_dataset_hold(name, FTAG, &ds) != 0) + return (0); + + dp = ds->ds_dir->dd_pool; + rw_enter(&dp->dp_config_rwlock, RW_READER); + firstobj = ds->ds_dir->dd_phys->dd_origin_obj; + lastobj = ds->ds_phys->ds_prev_snap_obj; + + while (lastobj != firstobj) { + err = dsl_dataset_hold_obj(dp, lastobj, guid_map, &snapds); + if (err) { + /* + * Skip this snapshot and move on. It's not + * clear why this would ever happen, but the + * remainder of the snapshot streadm can be + * processed. 
+ */ + rw_exit(&dp->dp_config_rwlock); + dsl_dataset_rele(ds, FTAG); + return (0); + } + + gmep = kmem_alloc(sizeof (guid_map_entry_t), KM_SLEEP); + gmep->guid = snapds->ds_phys->ds_guid; + gmep->gme_ds = snapds; + avl_add(guid_map, gmep); + lastobj = snapds->ds_phys->ds_prev_snap_obj; + } + + rw_exit(&dp->dp_config_rwlock); + dsl_dataset_rele(ds, FTAG); + + return (0); +} + static void * restore_read(struct restorearg *ra, int len) { @@ -727,7 +855,7 @@ backup_byteswap(dmu_replay_record_t *drr) switch (drr->drr_type) { case DRR_BEGIN: DO64(drr_begin.drr_magic); - DO64(drr_begin.drr_version); + DO64(drr_begin.drr_versioninfo); DO64(drr_begin.drr_creation_time); DO32(drr_begin.drr_type); DO32(drr_begin.drr_flags); @@ -741,27 +869,51 @@ backup_byteswap(dmu_replay_record_t *drr) DO32(drr_object.drr_bonustype); DO32(drr_object.drr_blksz); DO32(drr_object.drr_bonuslen); + DO64(drr_object.drr_toguid); break; case DRR_FREEOBJECTS: DO64(drr_freeobjects.drr_firstobj); DO64(drr_freeobjects.drr_numobjs); + DO64(drr_freeobjects.drr_toguid); break; case DRR_WRITE: DO64(drr_write.drr_object); DO32(drr_write.drr_type); DO64(drr_write.drr_offset); DO64(drr_write.drr_length); + DO64(drr_write.drr_toguid); + DO64(drr_write.drr_key.ddk_cksum.zc_word[0]); + DO64(drr_write.drr_key.ddk_cksum.zc_word[1]); + DO64(drr_write.drr_key.ddk_cksum.zc_word[2]); + DO64(drr_write.drr_key.ddk_cksum.zc_word[3]); + DO64(drr_write.drr_key.ddk_prop); + break; + case DRR_WRITE_BYREF: + DO64(drr_write_byref.drr_object); + DO64(drr_write_byref.drr_offset); + DO64(drr_write_byref.drr_length); + DO64(drr_write_byref.drr_toguid); + DO64(drr_write_byref.drr_refguid); + DO64(drr_write_byref.drr_refobject); + DO64(drr_write_byref.drr_refoffset); + DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[0]); + DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[1]); + DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[2]); + DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[3]); + DO64(drr_write_byref.drr_key.ddk_prop); break; case DRR_FREE: DO64(drr_free.drr_object); DO64(drr_free.drr_offset); DO64(drr_free.drr_length); + DO64(drr_free.drr_toguid); break; case DRR_END: DO64(drr_end.drr_checksum.zc_word[0]); DO64(drr_end.drr_checksum.zc_word[1]); DO64(drr_end.drr_checksum.zc_word[2]); DO64(drr_end.drr_checksum.zc_word[3]); + DO64(drr_end.drr_toguid); break; } #undef DO64 @@ -775,15 +927,10 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) dmu_tx_t *tx; void *data = NULL; - err = dmu_object_info(os, drro->drr_object, NULL); - - if (err != 0 && err != ENOENT) - return (EINVAL); - if (drro->drr_type == DMU_OT_NONE || drro->drr_type >= DMU_OT_NUMTYPES || drro->drr_bonustype >= DMU_OT_NUMTYPES || - drro->drr_checksum >= ZIO_CHECKSUM_FUNCTIONS || + drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS || drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS || P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) || drro->drr_blksz < SPA_MINBLOCKSIZE || @@ -792,18 +939,21 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) return (EINVAL); } + err = dmu_object_info(os, drro->drr_object, NULL); + + if (err != 0 && err != ENOENT) + return (EINVAL); + if (drro->drr_bonuslen) { data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8)); if (ra->err) return (ra->err); } - tx = dmu_tx_create(os); - if (err == ENOENT) { /* currently free, want to be allocated */ + tx = dmu_tx_create(os); dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 1); err = dmu_tx_assign(tx, TXG_WAIT); if (err) { dmu_tx_abort(tx); @@ 
-812,30 +962,26 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) err = dmu_object_claim(os, drro->drr_object, drro->drr_type, drro->drr_blksz, drro->drr_bonustype, drro->drr_bonuslen, tx); + dmu_tx_commit(tx); } else { /* currently allocated, want to be allocated */ - dmu_tx_hold_bonus(tx, drro->drr_object); - /* - * We may change blocksize, so need to - * hold_write - */ - dmu_tx_hold_write(tx, drro->drr_object, 0, 1); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err) { - dmu_tx_abort(tx); - return (err); - } - err = dmu_object_reclaim(os, drro->drr_object, drro->drr_type, drro->drr_blksz, - drro->drr_bonustype, drro->drr_bonuslen, tx); + drro->drr_bonustype, drro->drr_bonuslen); } - if (err) { - dmu_tx_commit(tx); + if (err) return (EINVAL); + + tx = dmu_tx_create(os); + dmu_tx_hold_bonus(tx, drro->drr_object); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) { + dmu_tx_abort(tx); + return (err); } - dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksum, tx); + dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksumtype, + tx); dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx); if (data != NULL) { @@ -917,6 +1063,64 @@ restore_write(struct restorearg *ra, objset_t *os, return (0); } +/* + * Handle a DRR_WRITE_BYREF record. This record is used in dedup'ed + * streams to refer to a copy of the data that is already on the + * system because it came in earlier in the stream. This function + * finds the earlier copy of the data, and uses that copy instead of + * data from the stream to fulfill this write. + */ +static int +restore_write_byref(struct restorearg *ra, objset_t *os, + struct drr_write_byref *drrwbr) +{ + dmu_tx_t *tx; + int err; + guid_map_entry_t gmesrch; + guid_map_entry_t *gmep; + avl_index_t where; + objset_t *ref_os = NULL; + dmu_buf_t *dbp; + + if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset) + return (EINVAL); + + /* + * If the GUID of the referenced dataset is different from the + * GUID of the target dataset, find the referenced dataset. + */ + if (drrwbr->drr_toguid != drrwbr->drr_refguid) { + gmesrch.guid = drrwbr->drr_refguid; + if ((gmep = avl_find(&ra->guid_to_ds_map, &gmesrch, + &where)) == NULL) { + return (EINVAL); + } + if (dmu_objset_from_ds(gmep->gme_ds, &ref_os)) + return (EINVAL); + } else { + ref_os = os; + } + + if (err = dmu_buf_hold(ref_os, drrwbr->drr_refobject, + drrwbr->drr_refoffset, FTAG, &dbp)) + return (err); + + tx = dmu_tx_create(os); + + dmu_tx_hold_write(tx, drrwbr->drr_object, + drrwbr->drr_offset, drrwbr->drr_length); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) { + dmu_tx_abort(tx); + return (err); + } + dmu_write(os, drrwbr->drr_object, + drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx); + dmu_buf_rele(dbp, FTAG); + dmu_tx_commit(tx); + return (0); +} + /* ARGSUSED */ static int restore_free(struct restorearg *ra, objset_t *os, @@ -936,26 +1140,6 @@ restore_free(struct restorearg *ra, objset_t *os, return (err); } -void -dmu_recv_abort_cleanup(dmu_recv_cookie_t *drc) -{ - if (drc->drc_newfs || drc->drc_real_ds != drc->drc_logical_ds) { - /* - * online incremental or new fs: destroy the fs (which - * may be a clone) that we created - */ - (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag); - if (drc->drc_real_ds != drc->drc_logical_ds) - dsl_dataset_rele(drc->drc_logical_ds, dmu_recv_tag); - } else { - /* - * offline incremental: rollback to most recent snapshot. 
- */ - (void) dsl_dataset_rollback(drc->drc_real_ds, DMU_OST_NONE); - dsl_dataset_disown(drc->drc_real_ds, dmu_recv_tag); - } -} - /* * NB: callers *must* call dmu_recv_end() if this succeeds. */ @@ -966,6 +1150,8 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp) dmu_replay_record_t *drr; objset_t *os; zio_cksum_t pcksum; + guid_map_entry_t *gmep; + int featureflags; if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) ra.byteswap = TRUE; @@ -990,7 +1176,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp) if (ra.byteswap) { struct drr_begin *drrb = drc->drc_drrb; drrb->drr_magic = BSWAP_64(drrb->drr_magic); - drrb->drr_version = BSWAP_64(drrb->drr_version); + drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo); drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time); drrb->drr_type = BSWAP_32(drrb->drr_type); drrb->drr_toguid = BSWAP_64(drrb->drr_toguid); @@ -1003,16 +1189,29 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp) ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP); /* these were verified in dmu_recv_begin */ - ASSERT(drc->drc_drrb->drr_version == DMU_BACKUP_STREAM_VERSION); + ASSERT(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo) == + DMU_SUBSTREAM); ASSERT(drc->drc_drrb->drr_type < DMU_OST_NUMTYPES); /* * Open the objset we are modifying. */ - VERIFY(dmu_objset_open_ds(drc->drc_real_ds, DMU_OST_ANY, &os) == 0); + VERIFY(dmu_objset_from_ds(drc->drc_real_ds, &os) == 0); ASSERT(drc->drc_real_ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT); + featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo); + + /* if this stream is dedup'ed, set up the avl tree for guid mapping */ + if (featureflags & DMU_BACKUP_FEATURE_DEDUP) { + avl_create(&ra.guid_to_ds_map, guid_compare, + sizeof (guid_map_entry_t), + offsetof(guid_map_entry_t, avlnode)); + (void) dmu_objset_find(drc->drc_top_ds, find_ds_by_guid, + (void *)&ra.guid_to_ds_map, + DS_FIND_CHILDREN); + } + /* * Read records and process them. */ @@ -1052,6 +1251,13 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp) ra.err = restore_write(&ra, os, &drrw); break; } + case DRR_WRITE_BYREF: + { + struct drr_write_byref drrwbr = + drr->drr_u.drr_write_byref; + ra.err = restore_write_byref(&ra, os, &drrwbr); + break; + } case DRR_FREE: { struct drr_free drrf = drr->drr_u.drr_free; @@ -1079,15 +1285,29 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp) ASSERT(ra.err != 0); out: - dmu_objset_close(os); - if (ra.err != 0) { /* - * rollback or destroy what we created, so we don't - * leave it in the restoring state. + * destroy what we created, so we don't leave it in the + * inconsistent restoring state. 
*/ txg_wait_synced(drc->drc_real_ds->ds_dir->dd_pool, 0); - dmu_recv_abort_cleanup(drc); + + (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, + B_FALSE); + if (drc->drc_real_ds != drc->drc_logical_ds) { + mutex_exit(&drc->drc_logical_ds->ds_recvlock); + dsl_dataset_rele(drc->drc_logical_ds, dmu_recv_tag); + } + } + + if (featureflags & DMU_BACKUP_FEATURE_DEDUP) { + void *cookie = NULL; + + while (gmep = avl_destroy_nodes(&ra.guid_to_ds_map, &cookie)) { + dsl_dataset_rele(gmep->gme_ds, &ra.guid_to_ds_map); + kmem_free(gmep, sizeof (guid_map_entry_t)); + } + avl_destroy(&ra.guid_to_ds_map); } kmem_free(ra.buf, ra.bufsize); @@ -1128,35 +1348,31 @@ recv_end_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; } -int -dmu_recv_end(dmu_recv_cookie_t *drc) +static int +dmu_recv_existing_end(dmu_recv_cookie_t *drc) { struct recvendsyncarg resa; dsl_dataset_t *ds = drc->drc_logical_ds; int err; /* - * XXX hack; seems the ds is still dirty and - * dsl_pool_zil_clean() expects it to have a ds_user_ptr - * (and zil), but clone_swap() can close it. + * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean() + * expects it to have a ds_user_ptr (and zil), but clone_swap() + * can close it. */ txg_wait_synced(ds->ds_dir->dd_pool, 0); - if (ds != drc->drc_real_ds) { - /* we are doing an online recv */ - if (dsl_dataset_tryown(ds, FALSE, dmu_recv_tag)) { - err = dsl_dataset_clone_swap(drc->drc_real_ds, ds, - drc->drc_force); - if (err) - dsl_dataset_disown(ds, dmu_recv_tag); - } else { - err = EBUSY; - dsl_dataset_rele(ds, dmu_recv_tag); - } - /* dsl_dataset_destroy() will disown the ds */ - (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag); + if (dsl_dataset_tryown(ds, FALSE, dmu_recv_tag)) { + err = dsl_dataset_clone_swap(drc->drc_real_ds, ds, + drc->drc_force); if (err) - return (err); + goto out; + } else { + mutex_exit(&ds->ds_recvlock); + dsl_dataset_rele(ds, dmu_recv_tag); + (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, + B_FALSE); + return (EBUSY); } resa.creation_time = drc->drc_drrb->drr_creation_time; @@ -1166,16 +1382,52 @@ dmu_recv_end(dmu_recv_cookie_t *drc) err = dsl_sync_task_do(ds->ds_dir->dd_pool, recv_end_check, recv_end_sync, ds, &resa, 3); if (err) { - if (drc->drc_newfs) { - ASSERT(ds == drc->drc_real_ds); - (void) dsl_dataset_destroy(ds, dmu_recv_tag); - return (err); - } else { - (void) dsl_dataset_rollback(ds, DMU_OST_NONE); - } + /* swap back */ + (void) dsl_dataset_clone_swap(drc->drc_real_ds, ds, B_TRUE); } - /* release the hold from dmu_recv_begin */ +out: + mutex_exit(&ds->ds_recvlock); dsl_dataset_disown(ds, dmu_recv_tag); + (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, B_FALSE); return (err); } + +static int +dmu_recv_new_end(dmu_recv_cookie_t *drc) +{ + struct recvendsyncarg resa; + dsl_dataset_t *ds = drc->drc_logical_ds; + int err; + + /* + * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean() + * expects it to have a ds_user_ptr (and zil), but clone_swap() + * can close it. 
+ */ + txg_wait_synced(ds->ds_dir->dd_pool, 0); + + resa.creation_time = drc->drc_drrb->drr_creation_time; + resa.toguid = drc->drc_drrb->drr_toguid; + resa.tosnap = drc->drc_tosnap; + + err = dsl_sync_task_do(ds->ds_dir->dd_pool, + recv_end_check, recv_end_sync, ds, &resa, 3); + if (err) { + /* clean up the fs we just recv'd into */ + (void) dsl_dataset_destroy(ds, dmu_recv_tag, B_FALSE); + } else { + /* release the hold from dmu_recv_begin */ + dsl_dataset_disown(ds, dmu_recv_tag); + } + return (err); +} + +int +dmu_recv_end(dmu_recv_cookie_t *drc) +{ + if (drc->drc_logical_ds != drc->drc_real_ds) + return (dmu_recv_existing_end(drc)); + else + return (dmu_recv_new_end(drc)); +} diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_traverse.c b/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_traverse.c index 5124014707731..692feb6809b1b 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_traverse.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_traverse.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -35,14 +35,6 @@ #include #include -#define SET_BOOKMARK(zb, objset, object, level, blkid) \ -{ \ - (zb)->zb_objset = objset; \ - (zb)->zb_object = object; \ - (zb)->zb_level = level; \ - (zb)->zb_blkid = blkid; \ -} - struct prefetch_data { kmutex_t pd_mtx; kcondvar_t pd_cv; @@ -64,28 +56,32 @@ struct traverse_data { void *td_arg; }; +static int traverse_dnode(struct traverse_data *td, const dnode_phys_t *dnp, + arc_buf_t *buf, uint64_t objset, uint64_t object); + /* ARGSUSED */ -static void +static int traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) { struct traverse_data *td = arg; zbookmark_t zb; if (bp->blk_birth == 0) - return; + return (0); if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(td->td_spa)) - return; + return (0); - zb.zb_objset = td->td_objset; - zb.zb_object = 0; - zb.zb_level = -1; - zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ]; - VERIFY(0 == td->td_func(td->td_spa, bp, &zb, NULL, td->td_arg)); + SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, + bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); + + (void) td->td_func(td->td_spa, zilog, bp, &zb, NULL, td->td_arg); + + return (0); } /* ARGSUSED */ -static void +static int traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) { struct traverse_data *td = arg; @@ -96,17 +92,18 @@ traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) zbookmark_t zb; if (bp->blk_birth == 0) - return; + return (0); if (claim_txg == 0 || bp->blk_birth < claim_txg) - return; + return (0); + + SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid, ZB_ZIL_LEVEL, + lr->lr_offset / BP_GET_LSIZE(bp)); - zb.zb_objset = td->td_objset; - zb.zb_object = lr->lr_foid; - zb.zb_level = BP_GET_LEVEL(bp); - zb.zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp); - VERIFY(0 == td->td_func(td->td_spa, bp, &zb, NULL, td->td_arg)); + (void) td->td_func(td->td_spa, zilog, bp, &zb, NULL, + td->td_arg); } + return (0); } static void @@ -117,9 +114,9 @@ traverse_zil(struct traverse_data *td, zil_header_t *zh) /* * We only want to visit blocks that have been claimed but not yet - * replayed (or, in read-only mode, blocks that *would* be claimed). + * replayed; plus, in read-only mode, blocks that are already stable. 
*/ - if (claim_txg == 0 && (spa_mode & FWRITE)) + if (claim_txg == 0 && spa_writeable(td->td_spa)) return; zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh); @@ -135,12 +132,13 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp, arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb) { zbookmark_t czb; - int err = 0; + int err = 0, lasterr = 0; arc_buf_t *buf = NULL; struct prefetch_data *pd = td->td_pfd; + boolean_t hard = td->td_flags & TRAVERSE_HARD; if (bp->blk_birth == 0) { - err = td->td_func(td->td_spa, NULL, zb, dnp, td->td_arg); + err = td->td_func(td->td_spa, NULL, NULL, zb, dnp, td->td_arg); return (err); } @@ -160,7 +158,7 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp, } if (td->td_flags & TRAVERSE_PRE) { - err = td->td_func(td->td_spa, bp, zb, dnp, td->td_arg); + err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg); if (err) return (err); } @@ -184,12 +182,15 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp, zb->zb_level - 1, zb->zb_blkid * epb + i); err = traverse_visitbp(td, dnp, buf, cbp, &czb); - if (err) - break; + if (err) { + if (!hard) + break; + lasterr = err; + } } } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { uint32_t flags = ARC_WAIT; - int i, j; + int i; int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; err = arc_read(NULL, td->td_spa, bp, pbuf, @@ -200,21 +201,19 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp, /* recursively visitbp() blocks below this */ dnp = buf->b_data; - for (i = 0; i < epb && err == 0; i++, dnp++) { - for (j = 0; j < dnp->dn_nblkptr; j++) { - SET_BOOKMARK(&czb, zb->zb_objset, - zb->zb_blkid * epb + i, - dnp->dn_nlevels - 1, j); - err = traverse_visitbp(td, dnp, buf, - (blkptr_t *)&dnp->dn_blkptr[j], &czb); - if (err) + for (i = 0; i < epb; i++, dnp++) { + err = traverse_dnode(td, dnp, buf, zb->zb_objset, + zb->zb_blkid * epb + i); + if (err) { + if (!hard) break; + lasterr = err; } } } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { uint32_t flags = ARC_WAIT; objset_phys_t *osp; - int j; + dnode_phys_t *dnp; err = arc_read_nolock(NULL, td->td_spa, bp, arc_getbuf_func, &buf, @@ -223,36 +222,65 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp, return (err); osp = buf->b_data; - /* - * traverse_zil is just here for zdb's leak checking. - * For other consumers, there will be no ZIL blocks. 
- */ traverse_zil(td, &osp->os_zil_header); - for (j = 0; j < osp->os_meta_dnode.dn_nblkptr; j++) { - SET_BOOKMARK(&czb, zb->zb_objset, 0, - osp->os_meta_dnode.dn_nlevels - 1, j); - err = traverse_visitbp(td, &osp->os_meta_dnode, buf, - (blkptr_t *)&osp->os_meta_dnode.dn_blkptr[j], - &czb); - if (err) - break; + dnp = &osp->os_meta_dnode; + err = traverse_dnode(td, dnp, buf, zb->zb_objset, + DMU_META_DNODE_OBJECT); + if (err && hard) { + lasterr = err; + err = 0; + } + if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) { + dnp = &osp->os_userused_dnode; + err = traverse_dnode(td, dnp, buf, zb->zb_objset, + DMU_USERUSED_OBJECT); + } + if (err && hard) { + lasterr = err; + err = 0; + } + if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) { + dnp = &osp->os_groupused_dnode; + err = traverse_dnode(td, dnp, buf, zb->zb_objset, + DMU_GROUPUSED_OBJECT); } } if (buf) (void) arc_buf_remove_ref(buf, &buf); - if (err == 0 && (td->td_flags & TRAVERSE_POST)) - err = td->td_func(td->td_spa, bp, zb, dnp, td->td_arg); + if (err == 0 && lasterr == 0 && (td->td_flags & TRAVERSE_POST)) + err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg); - return (err); + return (err != 0 ? err : lasterr); +} + +static int +traverse_dnode(struct traverse_data *td, const dnode_phys_t *dnp, + arc_buf_t *buf, uint64_t objset, uint64_t object) +{ + int j, err = 0, lasterr = 0; + zbookmark_t czb; + boolean_t hard = (td->td_flags & TRAVERSE_HARD); + + for (j = 0; j < dnp->dn_nblkptr; j++) { + SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); + err = traverse_visitbp(td, dnp, buf, + (blkptr_t *)&dnp->dn_blkptr[j], &czb); + if (err) { + if (!hard) + break; + lasterr = err; + } + } + return (err != 0 ? err : lasterr); } /* ARGSUSED */ static int -traverse_prefetcher(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb, - const dnode_phys_t *dnp, void *arg) +traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, + const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) { struct prefetch_data *pfd = arg; uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; @@ -262,7 +290,8 @@ traverse_prefetcher(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb, return (EINTR); if (bp == NULL || !((pfd->pd_flags & TRAVERSE_PREFETCH_DATA) || - BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0)) + BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0) || + BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) return (0); mutex_enter(&pfd->pd_mtx); @@ -291,7 +320,8 @@ traverse_prefetch_thread(void *arg) td.td_arg = td_main->td_pfd; td.td_pfd = NULL; - SET_BOOKMARK(&czb, td.td_objset, 0, -1, 0); + SET_BOOKMARK(&czb, td.td_objset, + ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); (void) traverse_visitbp(&td, NULL, NULL, td.td_rootbp, &czb); mutex_enter(&td_main->td_pfd->pd_mtx); @@ -332,7 +362,8 @@ traverse_impl(spa_t *spa, uint64_t objset, blkptr_t *rootbp, &td, TQ_NOQUEUE)) pd.pd_exited = B_TRUE; - SET_BOOKMARK(&czb, objset, 0, -1, 0); + SET_BOOKMARK(&czb, objset, + ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); err = traverse_visitbp(&td, NULL, NULL, rootbp, &czb); mutex_enter(&pd.pd_mtx); @@ -364,43 +395,59 @@ traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags, * NB: pool must not be changing on-disk (eg, from zdb or sync context). 
*/ int -traverse_pool(spa_t *spa, blkptr_cb_t func, void *arg) +traverse_pool(spa_t *spa, uint64_t txg_start, int flags, + blkptr_cb_t func, void *arg) { - int err; + int err, lasterr = 0; uint64_t obj; dsl_pool_t *dp = spa_get_dsl(spa); objset_t *mos = dp->dp_meta_objset; + boolean_t hard = (flags & TRAVERSE_HARD); /* visit the MOS */ err = traverse_impl(spa, 0, spa_get_rootblkptr(spa), - 0, TRAVERSE_PRE, func, arg); + txg_start, flags, func, arg); if (err) return (err); /* visit each dataset */ - for (obj = 1; err == 0; err = dmu_object_next(mos, &obj, FALSE, 0)) { + for (obj = 1; err == 0 || (err != ESRCH && hard); + err = dmu_object_next(mos, &obj, FALSE, txg_start)) { dmu_object_info_t doi; err = dmu_object_info(mos, obj, &doi); - if (err) - return (err); + if (err) { + if (!hard) + return (err); + lasterr = err; + continue; + } if (doi.doi_type == DMU_OT_DSL_DATASET) { dsl_dataset_t *ds; + uint64_t txg = txg_start; + rw_enter(&dp->dp_config_rwlock, RW_READER); err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds); rw_exit(&dp->dp_config_rwlock); - if (err) - return (err); - err = traverse_dataset(ds, - ds->ds_phys->ds_prev_snap_txg, TRAVERSE_PRE, - func, arg); + if (err) { + if (!hard) + return (err); + lasterr = err; + continue; + } + if (ds->ds_phys->ds_prev_snap_txg > txg) + txg = ds->ds_phys->ds_prev_snap_txg; + err = traverse_dataset(ds, txg, flags, func, arg); dsl_dataset_rele(ds, FTAG); - if (err) - return (err); + if (err) { + if (!hard) + return (err); + lasterr = err; + } } } if (err == ESRCH) err = 0; - return (err); + return (err != 0 ? err : lasterr); } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_tx.c b/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_tx.c index bf560e5657c1c..87907a6e33bae 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_tx.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_tx.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -48,6 +48,8 @@ dmu_tx_create_dd(dsl_dir_t *dd) tx->tx_pool = dd->dd_pool; list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t), offsetof(dmu_tx_hold_t, txh_node)); + list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t), + offsetof(dmu_tx_callback_t, dcb_node)); #ifdef ZFS_DEBUG refcount_create(&tx->tx_space_written); refcount_create(&tx->tx_space_freed); @@ -58,9 +60,9 @@ dmu_tx_create_dd(dsl_dir_t *dd) dmu_tx_t * dmu_tx_create(objset_t *os) { - dmu_tx_t *tx = dmu_tx_create_dd(os->os->os_dsl_dataset->ds_dir); + dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir); tx->tx_objset = os; - tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os->os_dsl_dataset); + tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os_dsl_dataset); return (tx); } @@ -98,7 +100,7 @@ dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object, int err; if (object != DMU_NEW_OBJECT) { - err = dnode_hold(os->os, object, tx, &dn); + err = dnode_hold(os, object, tx, &dn); if (err) { tx->tx_err = err; return (NULL); @@ -160,6 +162,50 @@ dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid) return (err); } +static void +dmu_tx_count_twig(dmu_tx_hold_t *txh, dnode_t *dn, dmu_buf_impl_t *db, + int level, uint64_t blkid, boolean_t freeable, uint64_t *history) +{ + objset_t *os = dn->dn_objset; + dsl_dataset_t *ds = os->os_dsl_dataset; + int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + dmu_buf_impl_t *parent = NULL; + blkptr_t *bp = NULL; + uint64_t space; + + if (level >= dn->dn_nlevels || history[level] == blkid) + return; + + history[level] = blkid; + + space = (level == 0) ? dn->dn_datablksz : (1ULL << dn->dn_indblkshift); + + if (db == NULL || db == dn->dn_dbuf) { + ASSERT(level != 0); + db = NULL; + } else { + ASSERT(db->db_dnode == dn); + ASSERT(db->db_level == level); + ASSERT(db->db.db_size == space); + ASSERT(db->db_blkid == blkid); + bp = db->db_blkptr; + parent = db->db_parent; + } + + freeable = (bp && (freeable || + dsl_dataset_block_freeable(ds, bp->blk_birth))); + + if (freeable) + txh->txh_space_tooverwrite += space; + else + txh->txh_space_towrite += space; + if (bp) + txh->txh_space_tounref += bp_get_dsize(os->os_spa, bp); + + dmu_tx_count_twig(txh, dn, parent, level + 1, + blkid >> epbs, freeable, history); +} + /* ARGSUSED */ static void dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) @@ -177,18 +223,24 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) min_ibs = DN_MIN_INDBLKSHIFT; max_ibs = DN_MAX_INDBLKSHIFT; - /* - * For i/o error checking, read the first and last level-0 - * blocks (if they are not aligned), and all the level-1 blocks. - */ - if (dn) { + uint64_t history[DN_MAX_LEVELS]; + int nlvls = dn->dn_nlevels; + int delta; + + /* + * For i/o error checking, read the first and last level-0 + * blocks (if they are not aligned), and all the level-1 blocks. + */ if (dn->dn_maxblkid == 0) { - if ((off > 0 || len < dn->dn_datablksz) && - off < dn->dn_datablksz) { + delta = dn->dn_datablksz; + start = (off < dn->dn_datablksz) ? 0 : 1; + end = (off+len <= dn->dn_datablksz) ? 
0 : 1; + if (start == 0 && (off > 0 || len < dn->dn_datablksz)) { err = dmu_tx_check_ioerr(NULL, dn, 0, 0); if (err) goto out; + delta -= off; } } else { zio_t *zio = zio_root(dn->dn_objset->os_spa, @@ -213,10 +265,9 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) } /* level-1 blocks */ - if (dn->dn_nlevels > 1) { - start >>= dn->dn_indblkshift - SPA_BLKPTRSHIFT; - end >>= dn->dn_indblkshift - SPA_BLKPTRSHIFT; - for (i = start+1; i < end; i++) { + if (nlvls > 1) { + int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + for (i = (start>>shft)+1; i < end>>shft; i++) { err = dmu_tx_check_ioerr(zio, dn, 1, i); if (err) goto out; @@ -226,20 +277,59 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) err = zio_wait(zio); if (err) goto out; + delta = P2NPHASE(off, dn->dn_datablksz); } - } - /* - * If there's more than one block, the blocksize can't change, - * so we can make a more precise estimate. Alternatively, - * if the dnode's ibs is larger than max_ibs, always use that. - * This ensures that if we reduce DN_MAX_INDBLKSHIFT, - * the code will still work correctly on existing pools. - */ - if (dn && (dn->dn_maxblkid != 0 || dn->dn_indblkshift > max_ibs)) { - min_ibs = max_ibs = dn->dn_indblkshift; - if (dn->dn_datablkshift != 0) + if (dn->dn_maxblkid > 0) { + /* + * The blocksize can't change, + * so we can make a more precise estimate. + */ + ASSERT(dn->dn_datablkshift != 0); min_bs = max_bs = dn->dn_datablkshift; + min_ibs = max_ibs = dn->dn_indblkshift; + } else if (dn->dn_indblkshift > max_ibs) { + /* + * This ensures that if we reduce DN_MAX_INDBLKSHIFT, + * the code will still work correctly on older pools. + */ + min_ibs = max_ibs = dn->dn_indblkshift; + } + + /* + * If this write is not off the end of the file + * we need to account for overwrites/unref. + */ + if (start <= dn->dn_maxblkid) { + for (int l = 0; l < DN_MAX_LEVELS; l++) + history[l] = -1ULL; + } + while (start <= dn->dn_maxblkid) { + dmu_buf_impl_t *db; + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + db = dbuf_hold_level(dn, 0, start, FTAG); + rw_exit(&dn->dn_struct_rwlock); + dmu_tx_count_twig(txh, dn, db, 0, start, B_FALSE, + history); + dbuf_rele(db, FTAG); + if (++start > end) { + /* + * Account for new indirects appearing + * before this IO gets assigned into a txg. + */ + bits = 64 - min_bs; + epbs = min_ibs - SPA_BLKPTRSHIFT; + for (bits -= epbs * (nlvls - 1); + bits >= 0; bits -= epbs) + txh->txh_fudge += 1ULL << max_ibs; + goto out; + } + off += delta; + if (len >= delta) + len -= delta; + delta = dn->dn_datablksz; + } } /* @@ -262,20 +352,22 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) for (bits = 64 - min_bs; bits >= 0; bits -= epbs) { start >>= epbs; end >>= epbs; - /* - * If we increase the number of levels of indirection, - * we'll need new blkid=0 indirect blocks. If start == 0, - * we're already accounting for that blocks; and if end == 0, - * we can't increase the number of levels beyond that. - */ - if (start != 0 && end != 0) - txh->txh_space_towrite += 1ULL << max_ibs; + ASSERT3U(end, >=, start); txh->txh_space_towrite += (end - start + 1) << max_ibs; + if (start != 0) { + /* + * We also need a new blkid=0 indirect block + * to reference any existing file data. 
+ */ + txh->txh_space_towrite += 1ULL << max_ibs; + } } - ASSERT(txh->txh_space_towrite < 2 * DMU_MAX_ACCESS); - out: + if (txh->txh_space_towrite + txh->txh_space_tooverwrite > + 2 * DMU_MAX_ACCESS) + err = EFBIG; + if (err) txh->txh_tx->tx_err = err; } @@ -284,7 +376,7 @@ static void dmu_tx_count_dnode(dmu_tx_hold_t *txh) { dnode_t *dn = txh->txh_dnode; - dnode_t *mdn = txh->txh_tx->tx_objset->os->os_meta_dnode; + dnode_t *mdn = txh->txh_tx->tx_objset->os_meta_dnode; uint64_t space = mdn->dn_datablksz + ((mdn->dn_nlevels-1) << mdn->dn_indblkshift); @@ -292,6 +384,7 @@ dmu_tx_count_dnode(dmu_tx_hold_t *txh) dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, dn->dn_dbuf->db_blkptr->blk_birth)) { txh->txh_space_tooverwrite += space; + txh->txh_space_tounref += space; } else { txh->txh_space_towrite += space; if (dn && dn->dn_dbuf->db_blkptr) @@ -366,7 +459,7 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) bp += blkid + i; if (dsl_dataset_block_freeable(ds, bp->blk_birth)) { dprintf_bp(bp, "can free old%s", ""); - space += bp_get_dasize(spa, bp); + space += bp_get_dsize(spa, bp); } unref += BP_GET_ASIZE(bp); } @@ -425,11 +518,15 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) dbuf = dbuf_hold_level(dn, 1, blkid >> epbs, FTAG); txh->txh_memory_tohold += dbuf->db.db_size; - if (txh->txh_memory_tohold > DMU_MAX_ACCESS) { - txh->txh_tx->tx_err = E2BIG; - dbuf_rele(dbuf, FTAG); - break; - } + + /* + * We don't check memory_tohold against DMU_MAX_ACCESS because + * memory_tohold is an over-estimation (especially the >L1 + * indirect blocks), so it could fail. Callers should have + * already verified that they will not be holding too much + * memory. + */ + err = dbuf_read(dbuf, NULL, DB_RF_HAVESTRUCT | DB_RF_CANFAIL); if (err != 0) { txh->txh_tx->tx_err = err; @@ -443,7 +540,7 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) for (i = 0; i < tochk; i++) { if (dsl_dataset_block_freeable(ds, bp[i].blk_birth)) { dprintf_bp(&bp[i], "can free old%s", ""); - space += bp_get_dasize(spa, &bp[i]); + space += bp_get_dsize(spa, &bp[i]); } unref += BP_GET_ASIZE(bp); } @@ -488,6 +585,8 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) if (len != DMU_OBJECT_END) dmu_tx_count_write(txh, off+len, 1); + dmu_tx_count_dnode(txh); + if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz) return; if (len == DMU_OBJECT_END) @@ -530,12 +629,11 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) } } - dmu_tx_count_dnode(txh); dmu_tx_count_free(txh, off, len); } void -dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name) +dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name) { dmu_tx_hold_t *txh; dnode_t *dn; @@ -584,9 +682,9 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name) txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE; } else { txh->txh_space_towrite += SPA_MAXBLOCKSIZE; - txh->txh_space_tounref += - BP_GET_ASIZE(dn->dn_phys->dn_blkptr); } + if (dn->dn_phys->dn_blkptr[0].blk_birth) + txh->txh_space_tounref += SPA_MAXBLOCKSIZE; return; } @@ -595,7 +693,7 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name) * access the name in this fat-zap so that we'll check * for i/o errors to the leaf blocks, etc. 
*/ - err = zap_lookup(&dn->dn_objset->os, dn->dn_object, name, + err = zap_lookup(dn->dn_objset, dn->dn_object, name, 8, 0, NULL); if (err == EIO) { tx->tx_err = err; @@ -603,12 +701,8 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name) } } - /* - * 3 blocks overwritten: target leaf, ptrtbl block, header block - * 3 new blocks written if adding: new split leaf, 2 grown ptrtbl blocks - */ - dmu_tx_count_write(txh, dn->dn_maxblkid * dn->dn_datablksz, - (3 + (add ? 3 : 0)) << dn->dn_datablkshift); + err = zap_count_write(dn->dn_objset, dn->dn_object, name, add, + &txh->txh_space_towrite, &txh->txh_space_tooverwrite); /* * If the modified blocks are scattered to the four winds, @@ -616,7 +710,10 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name) */ epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; for (nblocks = dn->dn_maxblkid >> epbs; nblocks != 0; nblocks >>= epbs) - txh->txh_space_towrite += 3 << dn->dn_indblkshift; + if (dn->dn_objset->os_dsl_dataset->ds_phys->ds_prev_snap_obj) + txh->txh_space_towrite += 3 << dn->dn_indblkshift; + else + txh->txh_space_tooverwrite += 3 << dn->dn_indblkshift; } void @@ -679,7 +776,7 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) dnode_t *dn = db->db_dnode; ASSERT(tx->tx_txg != 0); - ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset->os); + ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset); ASSERT3U(dn->dn_object, ==, db->db.db_object); if (tx->tx_anyobj) @@ -839,7 +936,7 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) * assume that we won't be able to free or overwrite anything. */ if (tx->tx_objset && - dsl_dataset_prev_snap_txg(tx->tx_objset->os->os_dsl_dataset) > + dsl_dataset_prev_snap_txg(tx->tx_objset->os_dsl_dataset) > tx->tx_lastsnap_txg) { towrite += tooverwrite; tooverwrite = tofree = 0; @@ -1020,8 +1117,13 @@ dmu_tx_commit(dmu_tx_t *tx) if (tx->tx_tempreserve_cookie) dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx); + if (!list_is_empty(&tx->tx_callbacks)) + txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks); + if (tx->tx_anyobj == FALSE) txg_rele_to_sync(&tx->tx_txgh); + + list_destroy(&tx->tx_callbacks); list_destroy(&tx->tx_holds); #ifdef ZFS_DEBUG dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n", @@ -1050,6 +1152,14 @@ dmu_tx_abort(dmu_tx_t *tx) if (dn != NULL) dnode_rele(dn, tx); } + + /* + * Call any registered callbacks with an error code. + */ + if (!list_is_empty(&tx->tx_callbacks)) + dmu_tx_do_callbacks(&tx->tx_callbacks, ECANCELED); + + list_destroy(&tx->tx_callbacks); list_destroy(&tx->tx_holds); #ifdef ZFS_DEBUG refcount_destroy_many(&tx->tx_space_written, @@ -1066,3 +1176,31 @@ dmu_tx_get_txg(dmu_tx_t *tx) ASSERT(tx->tx_txg != 0); return (tx->tx_txg); } + +void +dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data) +{ + dmu_tx_callback_t *dcb; + + dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP); + + dcb->dcb_func = func; + dcb->dcb_data = data; + + list_insert_tail(&tx->tx_callbacks, dcb); +} + +/* + * Call all the commit callbacks on a list, with a given error code. 
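The new tx_callbacks list, dmu_tx_callback_register() and dmu_tx_do_callbacks() shown above let a caller find out when its transaction either commits (error 0) or is abandoned (ECANCELED from dmu_tx_abort()). A self-contained sketch of that register-once/fire-once pattern, using a plain singly linked list instead of the kernel list_t; the names here are illustrative, not the ZFS API:

#include <stdlib.h>

typedef void tx_cb_func_t(void *arg, int error);

typedef struct tx_cb {
	struct tx_cb	*next;
	tx_cb_func_t	*func;
	void		*arg;
} tx_cb_t;

/* Open context: remember a callback; it will run exactly once. */
static void
tx_cb_register(tx_cb_t **head, tx_cb_func_t *func, void *arg)
{
	tx_cb_t *cb = malloc(sizeof (*cb));

	cb->func = func;
	cb->arg = arg;
	cb->next = *head;
	*head = cb;
}

/* Commit or abort path: fire every callback with the outcome, then free it. */
static void
tx_cb_fire(tx_cb_t **head, int error)
{
	while (*head != NULL) {
		tx_cb_t *cb = *head;

		*head = cb->next;
		cb->func(cb->arg, error);
		free(cb);
	}
}

In the patch the commit side hands the list to txg_register_callbacks(), so on the success path the calls are deferred until the transaction group has synced.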
+ */ +void +dmu_tx_do_callbacks(list_t *cb_list, int error) +{ + dmu_tx_callback_t *dcb; + + while (dcb = list_head(cb_list)) { + list_remove(cb_list, dcb); + dcb->dcb_func(dcb->dcb_data, error); + kmem_free(dcb, sizeof (dmu_tx_callback_t)); + } +} diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_zfetch.c b/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_zfetch.c index 4d79fe98e17ee..37037c30f6235 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_zfetch.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_zfetch.c @@ -19,18 +19,17 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include #include #include #include +#include /* * I'm against tune-ables, but these should probably exist as tweakable globals @@ -59,6 +58,41 @@ static zstream_t *dmu_zfetch_stream_reclaim(zfetch_t *); static void dmu_zfetch_stream_remove(zfetch_t *, zstream_t *); static int dmu_zfetch_streams_equal(zstream_t *, zstream_t *); +typedef struct zfetch_stats { + kstat_named_t zfetchstat_hits; + kstat_named_t zfetchstat_misses; + kstat_named_t zfetchstat_colinear_hits; + kstat_named_t zfetchstat_colinear_misses; + kstat_named_t zfetchstat_stride_hits; + kstat_named_t zfetchstat_stride_misses; + kstat_named_t zfetchstat_reclaim_successes; + kstat_named_t zfetchstat_reclaim_failures; + kstat_named_t zfetchstat_stream_resets; + kstat_named_t zfetchstat_stream_noresets; + kstat_named_t zfetchstat_bogus_streams; +} zfetch_stats_t; + +static zfetch_stats_t zfetch_stats = { + { "hits", KSTAT_DATA_UINT64 }, + { "misses", KSTAT_DATA_UINT64 }, + { "colinear_hits", KSTAT_DATA_UINT64 }, + { "colinear_misses", KSTAT_DATA_UINT64 }, + { "stride_hits", KSTAT_DATA_UINT64 }, + { "stride_misses", KSTAT_DATA_UINT64 }, + { "reclaim_successes", KSTAT_DATA_UINT64 }, + { "reclaim_failures", KSTAT_DATA_UINT64 }, + { "streams_resets", KSTAT_DATA_UINT64 }, + { "streams_noresets", KSTAT_DATA_UINT64 }, + { "bogus_streams", KSTAT_DATA_UINT64 }, +}; + +#define ZFETCHSTAT_INCR(stat, val) \ + atomic_add_64(&zfetch_stats.stat.value.ui64, (val)); + +#define ZFETCHSTAT_BUMP(stat) ZFETCHSTAT_INCR(stat, 1); + +kstat_t *zfetch_ksp; + /* * Given a zfetch structure and a zstream structure, determine whether the * blocks to be read are part of a co-linear pair of existing prefetch @@ -192,7 +226,30 @@ dmu_zfetch_dofetch(zfetch_t *zf, zstream_t *zs) break; } zs->zst_ph_offset = prefetch_tail; - zs->zst_last = lbolt; + zs->zst_last = ddi_get_lbolt(); +} + +void +zfetch_init(void) +{ + + zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc", + KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + + if (zfetch_ksp != NULL) { + zfetch_ksp->ks_data = &zfetch_stats; + kstat_install(zfetch_ksp); + } +} + +void +zfetch_fini(void) +{ + if (zfetch_ksp != NULL) { + kstat_delete(zfetch_ksp); + zfetch_ksp = NULL; + } } /* @@ -265,7 +322,7 @@ dmu_zfetch_fetchsz(dnode_t *dn, uint64_t blkid, uint64_t nblks) } /* - * given a zfetch and a zsearch structure, see if there is an associated zstream + * given a zfetch and a zstream structure, see if there is an associated zstream * for this block read. 
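The dmu_zfetch.c changes above add a zfetch_stats_t of kstat_named_t counters, bump them with atomic_add_64() through the ZFETCHSTAT_BUMP/ZFETCHSTAT_INCR macros, and publish them as a virtual kstat in zfetch_init(). Roughly the same counter pattern in portable C11, without the kstat framework and with made-up field names:

#include <stdatomic.h>
#include <stdio.h>

/* One atomic per statistic; readers may sample them at any time. */
typedef struct prefetch_stats {
	atomic_ulong	hits;
	atomic_ulong	misses;
	atomic_ulong	reclaim_failures;
} prefetch_stats_t;

static prefetch_stats_t pstats;

#define	PSTAT_BUMP(field)	atomic_fetch_add(&pstats.field, 1)

static void
pstat_print(void)
{
	printf("hits=%lu misses=%lu reclaim_failures=%lu\n",
	    atomic_load(&pstats.hits), atomic_load(&pstats.misses),
	    atomic_load(&pstats.reclaim_failures));
}

The hit/miss bumps added to dmu_zfetch() and dmu_zfetch_find() are the interesting part: they make it possible to see from outside whether the prefetcher is earning its keep.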
If so, it starts a prefetch for the stream it * located and returns true, otherwise it returns false */ @@ -297,6 +354,7 @@ dmu_zfetch_find(zfetch_t *zf, zstream_t *zh, int prefetched) */ if (zs->zst_len == 0) { /* bogus stream */ + ZFETCHSTAT_BUMP(zfetchstat_bogus_streams); continue; } @@ -306,9 +364,14 @@ dmu_zfetch_find(zfetch_t *zf, zstream_t *zh, int prefetched) */ if (zh->zst_offset >= zs->zst_offset && zh->zst_offset < zs->zst_offset + zs->zst_len) { - /* already fetched */ - rc = 1; - goto out; + if (prefetched) { + /* already fetched */ + ZFETCHSTAT_BUMP(zfetchstat_stride_hits); + rc = 1; + goto out; + } else { + ZFETCHSTAT_BUMP(zfetchstat_stride_misses); + } } /* @@ -413,6 +476,7 @@ dmu_zfetch_find(zfetch_t *zf, zstream_t *zh, int prefetched) if (reset) { zstream_t *remove = zs; + ZFETCHSTAT_BUMP(zfetchstat_stream_resets); rc = 0; mutex_exit(&zs->zst_lock); rw_exit(&zf->zf_rwlock); @@ -431,6 +495,7 @@ dmu_zfetch_find(zfetch_t *zf, zstream_t *zh, int prefetched) } } } else { + ZFETCHSTAT_BUMP(zfetchstat_stream_noresets); rc = 1; dmu_zfetch_dofetch(zf, zs); mutex_exit(&zs->zst_lock); @@ -487,13 +552,12 @@ dmu_zfetch_stream_insert(zfetch_t *zf, zstream_t *zs) zs_next = list_next(&zf->zf_stream, zs_walk); if (dmu_zfetch_streams_equal(zs_walk, zs)) { - return (0); + return (0); } } list_insert_head(&zf->zf_stream, zs); zf->zf_stream_cnt++; - return (1); } @@ -513,7 +577,7 @@ dmu_zfetch_stream_reclaim(zfetch_t *zf) for (zs = list_head(&zf->zf_stream); zs; zs = list_next(&zf->zf_stream, zs)) { - if (((lbolt - zs->zst_last) / hz) > zfetch_min_sec_reap) + if (((ddi_get_lbolt() - zs->zst_last)/hz) > zfetch_min_sec_reap) break; } @@ -597,8 +661,15 @@ dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched) P2ALIGN(offset, blksz)) >> blkshft; fetched = dmu_zfetch_find(zf, &zst, prefetched); - if (!fetched) { - fetched = dmu_zfetch_colinear(zf, &zst); + if (fetched) { + ZFETCHSTAT_BUMP(zfetchstat_hits); + } else { + ZFETCHSTAT_BUMP(zfetchstat_misses); + if (fetched = dmu_zfetch_colinear(zf, &zst)) { + ZFETCHSTAT_BUMP(zfetchstat_colinear_hits); + } else { + ZFETCHSTAT_BUMP(zfetchstat_colinear_misses); + } } if (!fetched) { @@ -608,11 +679,14 @@ dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched) * we still couldn't find a stream, drop the lock, and allocate * one if possible. Otherwise, give up and go home. 
*/ - if (newstream == NULL) { + if (newstream) { + ZFETCHSTAT_BUMP(zfetchstat_reclaim_successes); + } else { uint64_t maxblocks; uint32_t max_streams; uint32_t cur_streams; + ZFETCHSTAT_BUMP(zfetchstat_reclaim_failures); cur_streams = zf->zf_stream_cnt; maxblocks = zf->zf_dnode->dn_maxblkid; @@ -625,7 +699,6 @@ dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched) if (cur_streams >= max_streams) { return; } - newstream = kmem_zalloc(sizeof (zstream_t), KM_SLEEP); } @@ -635,7 +708,7 @@ dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched) newstream->zst_ph_offset = zst.zst_len + zst.zst_offset; newstream->zst_cap = zst.zst_len; newstream->zst_direction = ZFETCH_FORWARD; - newstream->zst_last = lbolt; + newstream->zst_last = ddi_get_lbolt(); mutex_init(&newstream->zst_lock, NULL, MUTEX_DEFAULT, NULL); diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/dnode.c b/external/cddl/osnet/dist/uts/common/fs/zfs/dnode.c index e77834d60dcc3..d15fe8d86243b 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/dnode.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/dnode.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -56,6 +56,8 @@ dnode_cons(void *arg, void *unused, int kmflag) rw_init(&dn->dn_struct_rwlock, NULL, RW_DEFAULT, NULL); mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL); mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL); + cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL); + refcount_create(&dn->dn_holds); refcount_create(&dn->dn_tx_holds); @@ -84,6 +86,7 @@ dnode_dest(void *arg, void *unused) rw_destroy(&dn->dn_struct_rwlock); mutex_destroy(&dn->dn_mtx); mutex_destroy(&dn->dn_dbufs_mtx); + cv_destroy(&dn->dn_notxholds); refcount_destroy(&dn->dn_holds); refcount_destroy(&dn->dn_tx_holds); @@ -153,7 +156,7 @@ dnode_verify(dnode_t *dn) } if (dn->dn_phys->dn_type != DMU_OT_NONE) ASSERT3U(dn->dn_phys->dn_nlevels, <=, dn->dn_nlevels); - ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || dn->dn_dbuf != NULL); + ASSERT(DMU_OBJECT_IS_SPECIAL(dn->dn_object) || dn->dn_dbuf != NULL); if (dn->dn_dbuf != NULL) { ASSERT3P(dn->dn_phys, ==, (dnode_phys_t *)dn->dn_dbuf->db.db_data + @@ -269,7 +272,7 @@ dnode_setdblksz(dnode_t *dn, int size) } static dnode_t * -dnode_create(objset_impl_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db, +dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db, uint64_t object) { dnode_t *dn = kmem_cache_alloc(dnode_cache, KM_SLEEP); @@ -299,14 +302,14 @@ dnode_create(objset_impl_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db, list_insert_head(&os->os_dnodes, dn); mutex_exit(&os->os_lock); - arc_space_consume(sizeof (dnode_t)); + arc_space_consume(sizeof (dnode_t), ARC_SPACE_OTHER); return (dn); } static void dnode_destroy(dnode_t *dn) { - objset_impl_t *os = dn->dn_objset; + objset_t *os = dn->dn_objset; #ifdef ZFS_DEBUG int i; @@ -318,6 +321,7 @@ dnode_destroy(dnode_t *dn) } ASSERT(NULL == list_head(&dn->dn_dbufs)); #endif + ASSERT(dn->dn_oldphys == NULL); mutex_enter(&os->os_lock); list_remove(&os->os_dnodes, dn); @@ -334,7 +338,7 @@ dnode_destroy(dnode_t *dn) dn->dn_bonus = NULL; } kmem_cache_free(dnode_cache, dn); - arc_space_return(sizeof (dnode_t)); + arc_space_return(sizeof (dnode_t), ARC_SPACE_OTHER); } void @@ -414,8 +418,7 @@ void dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t 
*tx) { - int i, old_nblkptr; - dmu_buf_impl_t *db = NULL; + int nblkptr; ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE); ASSERT3U(blocksize, <=, SPA_MAXBLOCKSIZE); @@ -427,57 +430,40 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, ASSERT3U(bonustype, <, DMU_OT_NUMTYPES); ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN); - for (i = 0; i < TXG_SIZE; i++) - ASSERT(!list_link_active(&dn->dn_dirty_link[i])); - /* clean up any unreferenced dbufs */ dnode_evict_dbufs(dn); - ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL); - /* - * XXX I should really have a generation number to tell if we - * need to do this... - */ - if (blocksize != dn->dn_datablksz || - dn->dn_bonustype != bonustype || dn->dn_bonuslen != bonuslen) { - /* free all old data */ - dnode_free_range(dn, 0, -1ULL, tx); - } - - /* change blocksize */ rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - if (blocksize != dn->dn_datablksz && - (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) || - list_head(&dn->dn_dbufs) != NULL)) { - db = dbuf_hold(dn, 0, FTAG); - dbuf_new_size(db, blocksize, tx); - } - dnode_setdblksz(dn, blocksize); dnode_setdirty(dn, tx); - dn->dn_next_bonuslen[tx->tx_txg&TXG_MASK] = bonuslen; - dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = blocksize; + if (dn->dn_datablksz != blocksize) { + /* change blocksize */ + ASSERT(dn->dn_maxblkid == 0 && + (BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) || + dnode_block_freed(dn, 0))); + dnode_setdblksz(dn, blocksize); + dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = blocksize; + } + if (dn->dn_bonuslen != bonuslen) + dn->dn_next_bonuslen[tx->tx_txg&TXG_MASK] = bonuslen; + nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT); + if (dn->dn_nblkptr != nblkptr) + dn->dn_next_nblkptr[tx->tx_txg&TXG_MASK] = nblkptr; rw_exit(&dn->dn_struct_rwlock); - if (db) - dbuf_rele(db, FTAG); /* change type */ dn->dn_type = ot; /* change bonus size and type */ mutex_enter(&dn->dn_mtx); - old_nblkptr = dn->dn_nblkptr; dn->dn_bonustype = bonustype; dn->dn_bonuslen = bonuslen; - dn->dn_nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT); + dn->dn_nblkptr = nblkptr; dn->dn_checksum = ZIO_CHECKSUM_INHERIT; dn->dn_compress = ZIO_COMPRESS_INHERIT; ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR); - /* XXX - for now, we can't make nblkptr smaller */ - ASSERT3U(dn->dn_nblkptr, >=, old_nblkptr); - - /* fix up the bonus db_size if dn_nblkptr has changed */ - if (dn->dn_bonus && dn->dn_bonuslen != old_nblkptr) { + /* fix up the bonus db_size */ + if (dn->dn_bonus) { dn->dn_bonus->db.db_size = DN_MAX_BONUSLEN - (dn->dn_nblkptr-1) * sizeof (blkptr_t); ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size); @@ -502,7 +488,7 @@ dnode_special_close(dnode_t *dn) } dnode_t * -dnode_special_open(objset_impl_t *os, dnode_phys_t *dnp, uint64_t object) +dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object) { dnode_t *dn = dnode_create(os, dnp, NULL, object); DNODE_VERIFY(dn); @@ -549,7 +535,7 @@ dnode_buf_pageout(dmu_buf_t *db, void *arg) * succeeds even for free dnodes. */ int -dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag, +dnode_hold_impl(objset_t *os, uint64_t object, int flag, void *tag, dnode_t **dnp) { int epb, idx, err; @@ -566,6 +552,22 @@ dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag, */ ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0); + if (object == DMU_USERUSED_OBJECT || object == DMU_GROUPUSED_OBJECT) { + dn = (object == DMU_USERUSED_OBJECT) ? 
+ os->os_userused_dnode : os->os_groupused_dnode; + if (dn == NULL) + return (ENOENT); + type = dn->dn_type; + if ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) + return (ENOENT); + if ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE) + return (EEXIST); + DNODE_VERIFY(dn); + (void) refcount_add(&dn->dn_holds, tag); + *dnp = dn; + return (0); + } + if (object == 0 || object >= DN_MAX_OBJECT) return (EINVAL); @@ -624,7 +626,8 @@ dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag, type = dn->dn_type; if (dn->dn_free_txg || ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) || - ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE)) { + ((flag & DNODE_MUST_BE_FREE) && + (type != DMU_OT_NONE || dn->dn_oldphys))) { mutex_exit(&dn->dn_mtx); dbuf_rele(db, FTAG); return (type == DMU_OT_NONE ? ENOENT : EEXIST); @@ -647,7 +650,7 @@ dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag, * Return held dnode if the object is allocated, NULL if not. */ int -dnode_hold(objset_impl_t *os, uint64_t object, void *tag, dnode_t **dnp) +dnode_hold(objset_t *os, uint64_t object, void *tag, dnode_t **dnp) { return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, tag, dnp)); } @@ -686,11 +689,13 @@ dnode_rele(dnode_t *dn, void *tag) void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx) { - objset_impl_t *os = dn->dn_objset; + objset_t *os = dn->dn_objset; uint64_t txg = tx->tx_txg; - if (dn->dn_object == DMU_META_DNODE_OBJECT) + if (DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { + dsl_dataset_dirty(os->os_dsl_dataset, tx); return; + } DNODE_VERIFY(dn); @@ -1186,11 +1191,6 @@ dnode_block_freed(dnode_t *dn, uint64_t blkid) if (dn->dn_free_txg) return (TRUE); - /* - * If dn_datablkshift is not set, then there's only a single - * block, in which case there will never be a free range so it - * won't matter. - */ range_tofind.fr_blkid = blkid; mutex_enter(&dn->dn_mtx); for (i = 0; i < TXG_SIZE; i++) { @@ -1248,7 +1248,7 @@ dnode_diduse_space(dnode_t *dn, int64_t delta) void dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx) { - objset_impl_t *os = dn->dn_objset; + objset_t *os = dn->dn_objset; dsl_dataset_t *ds = os->os_dsl_dataset; if (space > 0) @@ -1260,6 +1260,22 @@ dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx) dmu_tx_willuse_space(tx, space); } +/* + * This function scans a block at the indicated "level" looking for + * a hole or data (depending on 'flags'). If level > 0, then we are + * scanning an indirect block looking at its pointers. If level == 0, + * then we are looking at a block of dnodes. If we don't find what we + * are looking for in the block, we return ESRCH. Otherwise, return + * with *offset pointing to the beginning (if searching forwards) or + * end (if searching backwards) of the range covered by the block + * pointer we matched on (or dnode). + * + * The basic search algorithm used below by dnode_next_offset() is to + * use this function to search up the block tree (widen the search) until + * we find something (i.e., we don't return ESRCH) and then search back + * down the tree (narrow the search) until we reach our original search + * level. 
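The block comment added above for dnode_next_offset_level() describes the overall search: widen up the indirect tree until a match is found, then narrow back down to the requested level, with ESRCH meaning nothing in the current block matched. The level-0 case boils down to scanning a block of dnodes in either direction until one matches the hole/data test ((dnp[i].dn_type == DMU_OT_NONE) == hole). A toy version of just that scan over an array of in-use flags:

#include <stdbool.h>

/*
 * Walk 'nslots' flags starting at 'start', forwards or backwards, and
 * return the first index whose state matches what we want: a free slot
 * when 'hole' is set, an in-use slot otherwise.  -1 plays the role of
 * ESRCH when the block holds no match.
 */
static int
scan_block(const bool *in_use, int nslots, int start, bool hole,
    bool backwards)
{
	int inc = backwards ? -1 : 1;

	for (int i = start; i >= 0 && i < nslots; i += inc) {
		if (in_use[i] != hole)
			return (i);
	}
	return (-1);
}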
+ */ static int dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, int lvl, uint64_t blkfill, uint64_t txg) @@ -1275,7 +1291,7 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, dprintf("probing object %llu offset %llx level %d of %u\n", dn->dn_object, *offset, lvl, dn->dn_phys->dn_nlevels); - hole = flags & DNODE_FIND_HOLE; + hole = ((flags & DNODE_FIND_HOLE) != 0); inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1; ASSERT(txg == 0 || !hole); @@ -1322,16 +1338,7 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, for (i = (*offset >> span) & (blkfill - 1); i >= 0 && i < blkfill; i += inc) { - boolean_t newcontents = B_TRUE; - if (txg) { - int j; - newcontents = B_FALSE; - for (j = 0; j < dnp[i].dn_nblkptr; j++) { - if (dnp[i].dn_blkptr[j].blk_birth > txg) - newcontents = B_TRUE; - } - } - if (!dnp[i].dn_type == hole && newcontents) + if ((dnp[i].dn_type == DMU_OT_NONE) == hole) break; *offset += (1ULL << span) * inc; } @@ -1339,6 +1346,7 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, error = ESRCH; } else { blkptr_t *bp = data; + uint64_t start = *offset; span = (lvl - 1) * epbs + dn->dn_datablkshift; minfill = 0; maxfill = blkfill << ((lvl - 1) * epbs); @@ -1348,18 +1356,25 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, else minfill++; - for (i = (*offset >> span) & ((1ULL << epbs) - 1); + *offset = *offset >> span; + for (i = BF64_GET(*offset, 0, epbs); i >= 0 && i < epb; i += inc) { if (bp[i].blk_fill >= minfill && bp[i].blk_fill <= maxfill && (hole || bp[i].blk_birth > txg)) break; - if (inc < 0 && *offset < (1ULL << span)) - *offset = 0; - else - *offset += (1ULL << span) * inc; + if (inc > 0 || *offset > 0) + *offset += inc; + } + *offset = *offset << span; + if (inc < 0) { + /* traversing backwards; position offset at the end */ + ASSERT3U(*offset, <=, start); + *offset = MIN(*offset + (1ULL << span) - 1, start); + } else if (*offset < start) { + *offset = start; } - if (i < 0 || i == epb) + if (i < 0 || i >= epb) error = ESRCH; } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/dnode_sync.c b/external/cddl/osnet/dist/uts/common/fs/zfs/dnode_sync.c index 779cfc96f9e3c..b2d121ee60483 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/dnode_sync.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/dnode_sync.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -122,7 +120,7 @@ free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx) if (BP_IS_HOLE(bp)) continue; - bytesfreed += dsl_dataset_block_kill(ds, bp, dn->dn_zio, tx); + bytesfreed += dsl_dataset_block_kill(ds, bp, tx, B_FALSE); ASSERT3U(bytesfreed, <=, DN_USED_BYTES(dn->dn_phys)); bzero(bp, sizeof (blkptr_t)); blocks_freed += 1; @@ -426,6 +424,9 @@ dnode_undirty_dbufs(list_t *list) dmu_buf_impl_t *db = dr->dr_dbuf; uint64_t txg = dr->dr_txg; + if (db->db_level != 0) + dnode_undirty_dbufs(&dr->dt.di.dr_children); + mutex_enter(&db->db_mtx); /* XXX - use dbuf_undirty()? 
*/ list_remove(list, dr); @@ -436,13 +437,9 @@ dnode_undirty_dbufs(list_t *list) ASSERT(db->db_blkid == DB_BONUS_BLKID || dr->dt.dl.dr_data == db->db_buf); dbuf_unoverride(dr); - mutex_exit(&db->db_mtx); - } else { - mutex_exit(&db->db_mtx); - dnode_undirty_dbufs(&dr->dt.di.dr_children); } kmem_free(dr, sizeof (dbuf_dirty_record_t)); - dbuf_rele(db, (void *)(uintptr_t)txg); + dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); } } @@ -506,9 +503,6 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) /* * Write out the dnode's dirty buffers. - * - * NOTE: The dnode is kept in memory by being dirty. Once the - * dirty bit is cleared, it may be evicted. Beware of this! */ void dnode_sync(dnode_t *dn, dmu_tx_t *tx) @@ -517,33 +511,40 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) dnode_phys_t *dnp = dn->dn_phys; int txgoff = tx->tx_txg & TXG_MASK; list_t *list = &dn->dn_dirty_records[txgoff]; + static const dnode_phys_t zerodn = { 0 }; ASSERT(dmu_tx_is_syncing(tx)); ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg); + ASSERT(dnp->dn_type != DMU_OT_NONE || + bcmp(dnp, &zerodn, DNODE_SIZE) == 0); DNODE_VERIFY(dn); ASSERT(dn->dn_dbuf == NULL || arc_released(dn->dn_dbuf->db_buf)); + if (dmu_objset_userused_enabled(dn->dn_objset) && + !DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { + ASSERT(dn->dn_oldphys == NULL); + dn->dn_oldphys = zio_buf_alloc(sizeof (dnode_phys_t)); + *dn->dn_oldphys = *dn->dn_phys; /* struct assignment */ + dn->dn_phys->dn_flags |= DNODE_FLAG_USERUSED_ACCOUNTED; + } else { + /* Once we account for it, we should always account for it. */ + ASSERT(!(dn->dn_phys->dn_flags & + DNODE_FLAG_USERUSED_ACCOUNTED)); + } + mutex_enter(&dn->dn_mtx); if (dn->dn_allocated_txg == tx->tx_txg) { /* The dnode is newly allocated or reallocated */ if (dnp->dn_type == DMU_OT_NONE) { /* this is a first alloc, not a realloc */ - /* XXX shouldn't the phys already be zeroed? 
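dnode_reallocate() above now records blocksize, bonus length and block-pointer count changes in per-txg slots (dn_next_blksz[txg & TXG_MASK], dn_next_nblkptr[txgoff], and so on), and dnode_sync() applies whichever slot belongs to the txg it is syncing. A stripped-down sketch of that stage-in-open-context, apply-in-syncing-context pattern, reduced to a single field with an invented struct name:

#include <stdint.h>

#define	TXG_SIZE	4
#define	TXG_MASK	(TXG_SIZE - 1)

typedef struct obj {
	uint64_t	blksz;			/* current on-disk value */
	uint64_t	next_blksz[TXG_SIZE];	/* staged change per open txg */
} obj_t;

/* Open context: stage the change in the slot for this transaction's txg. */
static void
obj_set_blocksize(obj_t *o, uint64_t txg, uint64_t newsize)
{
	o->next_blksz[txg & TXG_MASK] = newsize;
}

/* Syncing context: apply and clear the slot; zero means nothing staged. */
static void
obj_sync(obj_t *o, uint64_t txg)
{
	int slot = txg & TXG_MASK;

	if (o->next_blksz[slot] != 0) {
		o->blksz = o->next_blksz[slot];
		o->next_blksz[slot] = 0;
	}
}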
*/ - bzero(dnp, DNODE_CORE_SIZE); dnp->dn_nlevels = 1; + dnp->dn_nblkptr = dn->dn_nblkptr; } - if (dn->dn_nblkptr > dnp->dn_nblkptr) { - /* zero the new blkptrs we are gaining */ - bzero(dnp->dn_blkptr + dnp->dn_nblkptr, - sizeof (blkptr_t) * - (dn->dn_nblkptr - dnp->dn_nblkptr)); - } dnp->dn_type = dn->dn_type; dnp->dn_bonustype = dn->dn_bonustype; dnp->dn_bonuslen = dn->dn_bonuslen; - dnp->dn_nblkptr = dn->dn_nblkptr; } ASSERT(dnp->dn_nlevels > 1 || @@ -603,6 +604,30 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) return; } + if (dn->dn_next_nblkptr[txgoff]) { + /* this should only happen on a realloc */ + ASSERT(dn->dn_allocated_txg == tx->tx_txg); + if (dn->dn_next_nblkptr[txgoff] > dnp->dn_nblkptr) { + /* zero the new blkptrs we are gaining */ + bzero(dnp->dn_blkptr + dnp->dn_nblkptr, + sizeof (blkptr_t) * + (dn->dn_next_nblkptr[txgoff] - dnp->dn_nblkptr)); +#ifdef ZFS_DEBUG + } else { + int i; + ASSERT(dn->dn_next_nblkptr[txgoff] < dnp->dn_nblkptr); + /* the blkptrs we are losing better be unallocated */ + for (i = dn->dn_next_nblkptr[txgoff]; + i < dnp->dn_nblkptr; i++) + ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[i])); +#endif + } + mutex_enter(&dn->dn_mtx); + dnp->dn_nblkptr = dn->dn_next_nblkptr[txgoff]; + dn->dn_next_nblkptr[txgoff] = 0; + mutex_exit(&dn->dn_mtx); + } + if (dn->dn_next_nlevels[txgoff]) { dnode_increase_indirection(dn, tx); dn->dn_next_nlevels[txgoff] = 0; @@ -610,7 +635,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) dbuf_sync_list(list, tx); - if (dn->dn_object != DMU_META_DNODE_OBJECT) { + if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { ASSERT3P(list_head(list), ==, NULL); dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg); } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_dataset.c b/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_dataset.c index 93ea8aa111731..30b3811a8ae9c 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_dataset.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_dataset.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -38,14 +38,12 @@ #include #include #include -#include +#include static char *dsl_reaper = "the grim reaper"; static dsl_checkfunc_t dsl_dataset_destroy_begin_check; static dsl_syncfunc_t dsl_dataset_destroy_begin_sync; -static dsl_checkfunc_t dsl_dataset_rollback_check; -static dsl_syncfunc_t dsl_dataset_rollback_sync; static dsl_syncfunc_t dsl_dataset_set_reservation_sync; #define DS_REF_MAX (1ULL << 62) @@ -76,9 +74,9 @@ parent_delta(dsl_dataset_t *ds, int64_t delta) } void -dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) +dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) { - int used = bp_get_dasize(tx->tx_pool->dp_spa, bp); + int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp); int compressed = BP_GET_PSIZE(bp); int uncompressed = BP_GET_UCSIZE(bp); int64_t delta; @@ -119,29 +117,26 @@ dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) } int -dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio, - dmu_tx_t *tx) +dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, + boolean_t async) { - int used = bp_get_dasize(tx->tx_pool->dp_spa, bp); - int compressed = BP_GET_PSIZE(bp); - int uncompressed = BP_GET_UCSIZE(bp); - - ASSERT(pio != NULL); - ASSERT(dmu_tx_is_syncing(tx)); - /* No block pointer => nothing to free */ if (BP_IS_HOLE(bp)) return (0); + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(bp->blk_birth <= tx->tx_txg); + + int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp); + int compressed = BP_GET_PSIZE(bp); + int uncompressed = BP_GET_UCSIZE(bp); + ASSERT(used > 0); if (ds == NULL) { - int err; /* * Account for the meta-objset space in its placeholder * dataset. */ - err = dsl_free(pio, tx->tx_pool, - tx->tx_txg, bp, NULL, NULL, ARC_NOWAIT); - ASSERT(err == 0); + dsl_free(tx->tx_pool, tx->tx_txg, bp); dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD, -used, -compressed, -uncompressed, tx); @@ -154,13 +149,10 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio, dmu_buf_will_dirty(ds->ds_dbuf, tx); if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) { - int err; int64_t delta; dprintf_bp(bp, "freeing: %s", ""); - err = dsl_free(pio, tx->tx_pool, - tx->tx_txg, bp, NULL, NULL, ARC_NOWAIT); - ASSERT(err == 0); + dsl_free(tx->tx_pool, tx->tx_txg, bp); mutex_enter(&ds->ds_dir->dd_lock); mutex_enter(&ds->ds_lock); @@ -176,7 +168,18 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio, mutex_exit(&ds->ds_dir->dd_lock); } else { dprintf_bp(bp, "putting on dead list: %s", ""); - VERIFY(0 == bplist_enqueue(&ds->ds_deadlist, bp, tx)); + if (async) { + /* + * We are here as part of zio's write done callback, + * which means we're a zio interrupt thread. We can't + * call bplist_enqueue() now because it may block + * waiting for I/O. Instead, put bp on the deferred + * queue and let dsl_pool_sync() finish the job. 
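dsl_dataset_block_kill() above grows an 'async' argument because it can now run from a zio done callback, that is, interrupt context, where bplist_enqueue() must not block on I/O; instead the dead block pointer is parked on a deferred queue that dsl_pool_sync() drains later in syncing context. A generic userland sketch of that split (a pthread mutex stands in for the kernel lock, dl->lock must be set up with pthread_mutex_init() before use, and the element type is invented):

#include <pthread.h>
#include <stdint.h>
#include <stdlib.h>

typedef struct dead_blk {
	struct dead_blk	*next;
	uint64_t	birth_txg;	/* stand-in for the real block pointer */
} dead_blk_t;

typedef struct deadlist {
	pthread_mutex_t	lock;
	dead_blk_t	*deferred;
} deadlist_t;

/* "Interrupt" path: only link the entry; never sleep or start I/O here. */
static void
deadlist_enqueue_deferred(deadlist_t *dl, dead_blk_t *db)
{
	pthread_mutex_lock(&dl->lock);
	db->next = dl->deferred;
	dl->deferred = db;
	pthread_mutex_unlock(&dl->lock);
}

/* Sync path: take the whole list, then do the potentially blocking work. */
static void
deadlist_drain(deadlist_t *dl, void (*commit)(dead_blk_t *))
{
	pthread_mutex_lock(&dl->lock);
	dead_blk_t *list = dl->deferred;
	dl->deferred = NULL;
	pthread_mutex_unlock(&dl->lock);

	while (list != NULL) {
		dead_blk_t *db = list;

		list = db->next;
		commit(db);	/* e.g. append to the on-disk deadlist */
		free(db);
	}
}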
+ */ + bplist_enqueue_deferred(&ds->ds_deadlist, bp); + } else { + VERIFY(0 == bplist_enqueue(&ds->ds_deadlist, bp, tx)); + } ASSERT3U(ds->ds_prev->ds_object, ==, ds->ds_phys->ds_prev_snap_obj); ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0); @@ -229,7 +232,7 @@ dsl_dataset_prev_snap_txg(dsl_dataset_t *ds) return (MAX(ds->ds_phys->ds_prev_snap_txg, trysnap)); } -int +boolean_t dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth) { return (blk_birth > dsl_dataset_prev_snap_txg(ds)); @@ -243,12 +246,10 @@ dsl_dataset_evict(dmu_buf_t *db, void *dsv) ASSERT(ds->ds_owner == NULL || DSL_DATASET_IS_DESTROYED(ds)); - dprintf_ds(ds, "evicting %s\n", ""); - unique_remove(ds->ds_fsid_guid); - if (ds->ds_user_ptr != NULL) - ds->ds_user_evict_func(ds, ds->ds_user_ptr); + if (ds->ds_objset != NULL) + dmu_objset_evict(ds->ds_objset); if (ds->ds_prev) { dsl_dataset_drop_ref(ds->ds_prev, ds); @@ -262,10 +263,11 @@ dsl_dataset_evict(dmu_buf_t *db, void *dsv) ASSERT(!list_link_active(&ds->ds_synced_link)); mutex_destroy(&ds->ds_lock); + mutex_destroy(&ds->ds_recvlock); mutex_destroy(&ds->ds_opening_lock); - mutex_destroy(&ds->ds_deadlist.bpl_lock); rw_destroy(&ds->ds_rwlock); cv_destroy(&ds->ds_exclusive_cv); + bplist_fini(&ds->ds_deadlist); kmem_free(ds, sizeof (dsl_dataset_t)); } @@ -323,6 +325,8 @@ dsl_dataset_snap_remove(dsl_dataset_t *ds, char *name, dmu_tx_t *tx) matchtype_t mt; int err; + dsl_dir_snap_cmtime_update(ds->ds_dir); + if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET) mt = MT_FIRST; else @@ -359,11 +363,11 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, ds->ds_phys = dbuf->db_data; mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&ds->ds_recvlock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&ds->ds_deadlist.bpl_lock, NULL, MUTEX_DEFAULT, - NULL); rw_init(&ds->ds_rwlock, 0, 0, 0); cv_init(&ds->ds_exclusive_cv, NULL, CV_DEFAULT, NULL); + bplist_init(&ds->ds_deadlist); err = bplist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj); @@ -377,10 +381,11 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, * just opened it. 
*/ mutex_destroy(&ds->ds_lock); + mutex_destroy(&ds->ds_recvlock); mutex_destroy(&ds->ds_opening_lock); - mutex_destroy(&ds->ds_deadlist.bpl_lock); rw_destroy(&ds->ds_rwlock); cv_destroy(&ds->ds_exclusive_cv); + bplist_fini(&ds->ds_deadlist); kmem_free(ds, sizeof (dsl_dataset_t)); dmu_buf_rele(dbuf, tag); return (err); @@ -406,8 +411,15 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, dsl_dataset_rele(origin, FTAG); } } - } else if (zfs_flags & ZFS_DEBUG_SNAPNAMES) { - err = dsl_dataset_get_snapname(ds); + } else { + if (zfs_flags & ZFS_DEBUG_SNAPNAMES) + err = dsl_dataset_get_snapname(ds); + if (err == 0 && ds->ds_phys->ds_userrefs_obj != 0) { + err = zap_count( + ds->ds_dir->dd_pool->dp_meta_objset, + ds->ds_phys->ds_userrefs_obj, + &ds->ds_userrefs); + } } if (err == 0 && !dsl_dataset_is_snapshot(ds)) { @@ -448,10 +460,11 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, dsl_dataset_drop_ref(ds->ds_prev, ds); dsl_dir_close(ds->ds_dir, ds); mutex_destroy(&ds->ds_lock); + mutex_destroy(&ds->ds_recvlock); mutex_destroy(&ds->ds_opening_lock); - mutex_destroy(&ds->ds_deadlist.bpl_lock); rw_destroy(&ds->ds_rwlock); cv_destroy(&ds->ds_exclusive_cv); + bplist_fini(&ds->ds_deadlist); kmem_free(ds, sizeof (dsl_dataset_t)); if (err) { dmu_buf_rele(dbuf, tag); @@ -519,7 +532,15 @@ dsl_dataset_hold_ref(dsl_dataset_t *ds, void *tag) rw_enter(&dp->dp_config_rwlock, RW_READER); return (ENOENT); } + /* + * The dp_config_rwlock lives above the ds_lock. And + * we need to check DSL_DATASET_IS_DESTROYED() while + * holding the ds_lock, so we have to drop and reacquire + * the ds_lock here. + */ + mutex_exit(&ds->ds_lock); rw_enter(&dp->dp_config_rwlock, RW_READER); + mutex_enter(&ds->ds_lock); } mutex_exit(&ds->ds_lock); return (0); @@ -537,17 +558,15 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, } int -dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, int flags, void *owner, - dsl_dataset_t **dsp) +dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, boolean_t inconsistentok, + void *tag, dsl_dataset_t **dsp) { - int err = dsl_dataset_hold_obj(dp, dsobj, owner, dsp); - - ASSERT(DS_MODE_TYPE(flags) != DS_MODE_USER); - + int err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp); if (err) return (err); - if (!dsl_dataset_tryown(*dsp, DS_MODE_IS_INCONSISTENT(flags), owner)) { - dsl_dataset_rele(*dsp, owner); + if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) { + dsl_dataset_rele(*dsp, tag); + *dsp = NULL; return (EBUSY); } return (0); @@ -613,18 +632,14 @@ dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp) } int -dsl_dataset_own(const char *name, int flags, void *owner, dsl_dataset_t **dsp) +dsl_dataset_own(const char *name, boolean_t inconsistentok, + void *tag, dsl_dataset_t **dsp) { - int err = dsl_dataset_hold(name, owner, dsp); + int err = dsl_dataset_hold(name, tag, dsp); if (err) return (err); - if ((*dsp)->ds_phys->ds_num_children > 0 && - !DS_MODE_IS_READONLY(flags)) { - dsl_dataset_rele(*dsp, owner); - return (EROFS); - } - if (!dsl_dataset_tryown(*dsp, DS_MODE_IS_INCONSISTENT(flags), owner)) { - dsl_dataset_rele(*dsp, owner); + if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) { + dsl_dataset_rele(*dsp, tag); return (EBUSY); } return (0); @@ -696,9 +711,9 @@ dsl_dataset_rele(dsl_dataset_t *ds, void *tag) } void -dsl_dataset_disown(dsl_dataset_t *ds, void *owner) +dsl_dataset_disown(dsl_dataset_t *ds, void *tag) { - ASSERT((ds->ds_owner == owner && ds->ds_dbuf) || + ASSERT((ds->ds_owner == tag && ds->ds_dbuf) || 
(DSL_DATASET_IS_DESTROYED(ds) && ds->ds_dbuf == NULL)); mutex_enter(&ds->ds_lock); @@ -709,20 +724,20 @@ dsl_dataset_disown(dsl_dataset_t *ds, void *owner) } mutex_exit(&ds->ds_lock); if (ds->ds_dbuf) - dsl_dataset_drop_ref(ds, owner); + dsl_dataset_drop_ref(ds, tag); else dsl_dataset_evict(ds->ds_dbuf, ds); } boolean_t -dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, void *owner) +dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, void *tag) { boolean_t gotit = FALSE; mutex_enter(&ds->ds_lock); if (ds->ds_owner == NULL && (!DS_IS_INCONSISTENT(ds) || inconsistentok)) { - ds->ds_owner = owner; + ds->ds_owner = tag; if (!dsl_pool_sync_context(ds->ds_dir->dd_pool)) rw_exit(&ds->ds_rwlock); gotit = TRUE; @@ -844,30 +859,33 @@ struct destroyarg { dsl_sync_task_group_t *dstg; char *snapname; char *failed; + boolean_t defer; }; static int -dsl_snapshot_destroy_one(char *name, void *arg) +dsl_snapshot_destroy_one(const char *name, void *arg) { struct destroyarg *da = arg; dsl_dataset_t *ds; - char *cp; int err; + char *dsname; - (void) strcat(name, "@"); - (void) strcat(name, da->snapname); - err = dsl_dataset_own(name, DS_MODE_READONLY | DS_MODE_INCONSISTENT, - da->dstg, &ds); - cp = strchr(name, '@'); - *cp = '\0'; + dsname = kmem_asprintf("%s@%s", name, da->snapname); + err = dsl_dataset_own(dsname, B_TRUE, da->dstg, &ds); + strfree(dsname); if (err == 0) { + struct dsl_ds_destroyarg *dsda; + dsl_dataset_make_exclusive(ds, da->dstg); - if (ds->ds_user_ptr) { - ds->ds_user_evict_func(ds, ds->ds_user_ptr); - ds->ds_user_ptr = NULL; + if (ds->ds_objset != NULL) { + dmu_objset_evict(ds->ds_objset); + ds->ds_objset = NULL; } + dsda = kmem_zalloc(sizeof (struct dsl_ds_destroyarg), KM_SLEEP); + dsda->ds = ds; + dsda->defer = da->defer; dsl_sync_task_create(da->dstg, dsl_dataset_destroy_check, - dsl_dataset_destroy_sync, ds, da->dstg, 0); + dsl_dataset_destroy_sync, dsda, da->dstg, 0); } else if (err == ENOENT) { err = 0; } else { @@ -881,7 +899,7 @@ dsl_snapshot_destroy_one(char *name, void *arg) */ #pragma weak dmu_snapshots_destroy = dsl_snapshots_destroy int -dsl_snapshots_destroy(char *fsname, char *snapname) +dsl_snapshots_destroy(char *fsname, char *snapname, boolean_t defer) { int err; struct destroyarg da; @@ -894,6 +912,7 @@ dsl_snapshots_destroy(char *fsname, char *snapname) da.dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); da.snapname = snapname; da.failed = fsname; + da.defer = defer; err = dmu_objset_find(fsname, dsl_snapshot_destroy_one, &da, DS_FIND_CHILDREN); @@ -903,7 +922,9 @@ dsl_snapshots_destroy(char *fsname, char *snapname) for (dst = list_head(&da.dstg->dstg_tasks); dst; dst = list_next(&da.dstg->dstg_tasks, dst)) { - dsl_dataset_t *ds = dst->dst_arg1; + struct dsl_ds_destroyarg *dsda = dst->dst_arg1; + dsl_dataset_t *ds = dsda->ds; + /* * Return the file system name that triggered the error */ @@ -911,7 +932,9 @@ dsl_snapshots_destroy(char *fsname, char *snapname) dsl_dataset_name(ds, fsname); *strchr(fsname, '@') = '\0'; } + ASSERT3P(dsda->rm_origin, ==, NULL); dsl_dataset_disown(ds, da.dstg); + kmem_free(dsda, sizeof (struct dsl_ds_destroyarg)); } dsl_sync_task_group_destroy(da.dstg); @@ -919,34 +942,103 @@ dsl_snapshots_destroy(char *fsname, char *snapname) return (err); } +static boolean_t +dsl_dataset_might_destroy_origin(dsl_dataset_t *ds) +{ + boolean_t might_destroy = B_FALSE; + + mutex_enter(&ds->ds_lock); + if (ds->ds_phys->ds_num_children == 2 && ds->ds_userrefs == 0 && + DS_IS_DEFER_DESTROY(ds)) + might_destroy = B_TRUE; + 
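The deferred-destroy machinery above (DS_FLAG_DEFER_DESTROY, ds_userrefs, dsl_dataset_might_destroy_origin()) backs "zfs destroy -d": if user holds or clones still pin a snapshot, the destroy only marks it, and the snapshot is actually removed once the last hold or clone goes away. A toy model of the decision dsl_dataset_destroy_sync() makes in its defer branch; the field and function names are simplified stand-ins:

#include <stdbool.h>
#include <stdint.h>

typedef struct snap {
	uint64_t	num_children;	/* > 1 means clones still reference it */
	uint64_t	userrefs;	/* holds taken with "zfs hold" */
	bool		defer_destroy;	/* DS_FLAG_DEFER_DESTROY analogue */
} snap_t;

/*
 * Returns true when the snapshot can be destroyed right now; otherwise
 * it is only marked, to be reaped when the last hold/clone disappears.
 */
static bool
snap_destroy_deferred(snap_t *s)
{
	if (s->userrefs > 0 || s->num_children > 1) {
		s->defer_destroy = true;
		return (false);
	}
	return (true);
}

dsl_dataset_might_destroy_origin() is the mirror image: when a clone is destroyed, an origin left with no other children, no holds and this flag set gets cleaned up as part of the same sync task group.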
mutex_exit(&ds->ds_lock); + + return (might_destroy); +} + +/* + * If we're removing a clone, and these three conditions are true: + * 1) the clone's origin has no other children + * 2) the clone's origin has no user references + * 3) the clone's origin has been marked for deferred destruction + * Then, prepare to remove the origin as part of this sync task group. + */ +static int +dsl_dataset_origin_rm_prep(struct dsl_ds_destroyarg *dsda, void *tag) +{ + dsl_dataset_t *ds = dsda->ds; + dsl_dataset_t *origin = ds->ds_prev; + + if (dsl_dataset_might_destroy_origin(origin)) { + char *name; + int namelen; + int error; + + namelen = dsl_dataset_namelen(origin) + 1; + name = kmem_alloc(namelen, KM_SLEEP); + dsl_dataset_name(origin, name); +#ifdef _KERNEL + error = zfs_unmount_snap(name, NULL); + if (error) { + kmem_free(name, namelen); + return (error); + } +#endif + error = dsl_dataset_own(name, B_TRUE, tag, &origin); + kmem_free(name, namelen); + if (error) + return (error); + dsda->rm_origin = origin; + dsl_dataset_make_exclusive(origin, tag); + + if (origin->ds_objset != NULL) { + dmu_objset_evict(origin->ds_objset); + origin->ds_objset = NULL; + } + } + + return (0); +} + /* * ds must be opened as OWNER. On return (whether successful or not), * ds will be closed and caller can no longer dereference it. */ int -dsl_dataset_destroy(dsl_dataset_t *ds, void *tag) +dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer) { int err; dsl_sync_task_group_t *dstg; objset_t *os; dsl_dir_t *dd; uint64_t obj; + struct dsl_ds_destroyarg dsda = { 0 }; + dsl_dataset_t dummy_ds = { 0 }; + + dsda.ds = ds; if (dsl_dataset_is_snapshot(ds)) { /* Destroying a snapshot is simpler */ dsl_dataset_make_exclusive(ds, tag); - if (ds->ds_user_ptr) { - ds->ds_user_evict_func(ds, ds->ds_user_ptr); - ds->ds_user_ptr = NULL; + if (ds->ds_objset != NULL) { + dmu_objset_evict(ds->ds_objset); + ds->ds_objset = NULL; } + dsda.defer = defer; err = dsl_sync_task_do(ds->ds_dir->dd_pool, dsl_dataset_destroy_check, dsl_dataset_destroy_sync, - ds, tag, 0); + &dsda, tag, 0); + ASSERT3P(dsda.rm_origin, ==, NULL); + goto out; + } else if (defer) { + err = EINVAL; goto out; } dd = ds->ds_dir; + dummy_ds.ds_dir = dd; + dummy_ds.ds_object = ds->ds_object; /* * Check for errors and mark this ds as inconsistent, in @@ -957,7 +1049,7 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag) if (err) goto out; - err = dmu_objset_open_ds(ds, DMU_OST_ANY, &os); + err = dmu_objset_from_ds(ds, &os); if (err) goto out; @@ -974,7 +1066,27 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag) (void) dmu_free_object(os, obj); } - dmu_objset_close(os); + /* + * We need to sync out all in-flight IO before we try to evict + * (the dataset evict func is trying to clear the cached entries + * for this dataset in the ARC). + */ + txg_wait_synced(dd->dd_pool, 0); + + /* + * If we managed to free all the objects in open + * context, the user space accounting should be zero. 
+ */ + if (ds->ds_phys->ds_bp.blk_fill == 0 && + dmu_objset_userused_enabled(os)) { + uint64_t count; + + ASSERT(zap_count(os, DMU_USERUSED_OBJECT, &count) != 0 || + count == 0); + ASSERT(zap_count(os, DMU_GROUPUSED_OBJECT, &count) != 0 || + count == 0); + } + if (err != ESRCH) goto out; @@ -985,7 +1097,7 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag) if (err) goto out; - if (ds->ds_user_ptr) { + if (ds->ds_objset) { /* * We need to sync out all in-flight IO before we try * to evict (the dataset evict func is trying to clear @@ -998,17 +1110,49 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag) * Blow away the dsl_dir + head dataset. */ dsl_dataset_make_exclusive(ds, tag); - if (ds->ds_user_ptr) { - ds->ds_user_evict_func(ds, ds->ds_user_ptr); - ds->ds_user_ptr = NULL; - } - dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool); - dsl_sync_task_create(dstg, dsl_dataset_destroy_check, - dsl_dataset_destroy_sync, ds, tag, 0); - dsl_sync_task_create(dstg, dsl_dir_destroy_check, - dsl_dir_destroy_sync, dd, FTAG, 0); - err = dsl_sync_task_group_wait(dstg); - dsl_sync_task_group_destroy(dstg); + if (ds->ds_objset) { + dmu_objset_evict(ds->ds_objset); + ds->ds_objset = NULL; + } + + /* + * If we're removing a clone, we might also need to remove its + * origin. + */ + do { + dsda.need_prep = B_FALSE; + if (dsl_dir_is_clone(dd)) { + err = dsl_dataset_origin_rm_prep(&dsda, tag); + if (err) { + dsl_dir_close(dd, FTAG); + goto out; + } + } + + dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool); + dsl_sync_task_create(dstg, dsl_dataset_destroy_check, + dsl_dataset_destroy_sync, &dsda, tag, 0); + dsl_sync_task_create(dstg, dsl_dir_destroy_check, + dsl_dir_destroy_sync, &dummy_ds, FTAG, 0); + err = dsl_sync_task_group_wait(dstg); + dsl_sync_task_group_destroy(dstg); + + /* + * We could be racing against 'zfs release' or 'zfs destroy -d' + * on the origin snap, in which case we can get EBUSY if we + * needed to destroy the origin snap but were not ready to + * do so. 
+ */ + if (dsda.need_prep) { + ASSERT(err == EBUSY); + ASSERT(dsl_dir_is_clone(dd)); + ASSERT(dsda.rm_origin == NULL); + } + } while (dsda.need_prep); + + if (dsda.rm_origin != NULL) + dsl_dataset_disown(dsda.rm_origin, tag); + /* if it is successful, dsl_dir_destroy_sync will close the dd */ if (err) dsl_dir_close(dd, FTAG); @@ -1017,48 +1161,6 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag) return (err); } -int -dsl_dataset_rollback(dsl_dataset_t *ds, dmu_objset_type_t ost) -{ - int err; - - ASSERT(ds->ds_owner); - - dsl_dataset_make_exclusive(ds, ds->ds_owner); - err = dsl_sync_task_do(ds->ds_dir->dd_pool, - dsl_dataset_rollback_check, dsl_dataset_rollback_sync, - ds, &ost, 0); - /* drop exclusive access */ - mutex_enter(&ds->ds_lock); - rw_exit(&ds->ds_rwlock); - cv_broadcast(&ds->ds_exclusive_cv); - mutex_exit(&ds->ds_lock); - return (err); -} - -void * -dsl_dataset_set_user_ptr(dsl_dataset_t *ds, - void *p, dsl_dataset_evict_func_t func) -{ - void *old; - - mutex_enter(&ds->ds_lock); - old = ds->ds_user_ptr; - if (old == NULL) { - ds->ds_user_ptr = p; - ds->ds_user_evict_func = func; - } - mutex_exit(&ds->ds_lock); - return (old); -} - -void * -dsl_dataset_get_user_ptr(dsl_dataset_t *ds) -{ - return (ds->ds_user_ptr); -} - - blkptr_t * dsl_dataset_get_blkptr(dsl_dataset_t *ds) { @@ -1092,7 +1194,7 @@ dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx) if (ds == NULL) /* this is the meta-objset */ return; - ASSERT(ds->ds_user_ptr != NULL); + ASSERT(ds->ds_objset != NULL); if (ds->ds_phys->ds_next_snap_obj != 0) panic("dirtying snapshot!"); @@ -1150,165 +1252,34 @@ dsl_dataset_unique(dsl_dataset_t *ds) struct killarg { dsl_dataset_t *ds; - zio_t *zio; dmu_tx_t *tx; }; /* ARGSUSED */ static int -kill_blkptr(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb, - const dnode_phys_t *dnp, void *arg) +kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, + const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) { struct killarg *ka = arg; + dmu_tx_t *tx = ka->tx; if (bp == NULL) return (0); - ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg); - (void) dsl_dataset_block_kill(ka->ds, bp, ka->zio, ka->tx); - - return (0); -} - -/* ARGSUSED */ -static int -dsl_dataset_rollback_check(void *arg1, void *arg2, dmu_tx_t *tx) -{ - dsl_dataset_t *ds = arg1; - dmu_objset_type_t *ost = arg2; - - /* - * We can only roll back to emptyness if it is a ZPL objset. - */ - if (*ost != DMU_OST_ZFS && ds->ds_phys->ds_prev_snap_txg == 0) - return (EINVAL); - - /* - * This must not be a snapshot. - */ - if (ds->ds_phys->ds_next_snap_obj != 0) - return (EINVAL); - - /* - * If we made changes this txg, traverse_dataset won't find - * them. Try again. - */ - if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg) - return (EAGAIN); - - return (0); -} - -/* ARGSUSED */ -static void -dsl_dataset_rollback_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) -{ - dsl_dataset_t *ds = arg1; - dmu_objset_type_t *ost = arg2; - objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; - - dmu_buf_will_dirty(ds->ds_dbuf, tx); - - /* - * Before the roll back destroy the zil. - */ - if (ds->ds_user_ptr != NULL) { - zil_rollback_destroy( - ((objset_impl_t *)ds->ds_user_ptr)->os_zil, tx); - + if (zb->zb_level == ZB_ZIL_LEVEL) { + ASSERT(zilog != NULL); /* - * We need to make sure that the objset_impl_t is reopened after - * we do the rollback, otherwise it will have the wrong - * objset_phys_t. Normally this would happen when this - * dataset-open is closed, thus causing the - * dataset to be immediately evicted. 
But when doing "zfs recv - * -F", we reopen the objset before that, so that there is no - * window where the dataset is closed and inconsistent. + * It's a block in the intent log. It has no + * accounting, so just free it. */ - ds->ds_user_evict_func(ds, ds->ds_user_ptr); - ds->ds_user_ptr = NULL; - } - - /* Transfer space that was freed since last snap back to the head. */ - { - uint64_t used; - - VERIFY(0 == bplist_space_birthrange(&ds->ds_deadlist, - ds->ds_origin_txg, UINT64_MAX, &used)); - dsl_dir_transfer_space(ds->ds_dir, used, - DD_USED_SNAP, DD_USED_HEAD, tx); - } - - /* Zero out the deadlist. */ - bplist_close(&ds->ds_deadlist); - bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx); - ds->ds_phys->ds_deadlist_obj = - bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx); - VERIFY(0 == bplist_open(&ds->ds_deadlist, mos, - ds->ds_phys->ds_deadlist_obj)); - - { - /* Free blkptrs that we gave birth to */ - zio_t *zio; - struct killarg ka; - - zio = zio_root(tx->tx_pool->dp_spa, NULL, NULL, - ZIO_FLAG_MUSTSUCCEED); - ka.ds = ds; - ka.zio = zio; - ka.tx = tx; - (void) traverse_dataset(ds, ds->ds_phys->ds_prev_snap_txg, - TRAVERSE_POST, kill_blkptr, &ka); - (void) zio_wait(zio); - } - - ASSERT(!(ds->ds_phys->ds_flags & DS_FLAG_UNIQUE_ACCURATE) || - ds->ds_phys->ds_unique_bytes == 0); - - if (ds->ds_prev && ds->ds_prev != ds->ds_dir->dd_pool->dp_origin_snap) { - /* Change our contents to that of the prev snapshot */ - - ASSERT3U(ds->ds_prev->ds_object, ==, - ds->ds_phys->ds_prev_snap_obj); - ASSERT3U(ds->ds_phys->ds_used_bytes, <=, - ds->ds_prev->ds_phys->ds_used_bytes); - - ds->ds_phys->ds_bp = ds->ds_prev->ds_phys->ds_bp; - ds->ds_phys->ds_used_bytes = - ds->ds_prev->ds_phys->ds_used_bytes; - ds->ds_phys->ds_compressed_bytes = - ds->ds_prev->ds_phys->ds_compressed_bytes; - ds->ds_phys->ds_uncompressed_bytes = - ds->ds_prev->ds_phys->ds_uncompressed_bytes; - ds->ds_phys->ds_flags = ds->ds_prev->ds_phys->ds_flags; - - if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) { - dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); - ds->ds_prev->ds_phys->ds_unique_bytes = 0; - } + dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp); } else { - objset_impl_t *osi; - - ASSERT3U(ds->ds_phys->ds_used_bytes, ==, 0); - ASSERT3U(ds->ds_phys->ds_compressed_bytes, ==, 0); - ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, ==, 0); - - bzero(&ds->ds_phys->ds_bp, sizeof (blkptr_t)); - ds->ds_phys->ds_flags = 0; - ds->ds_phys->ds_unique_bytes = 0; - if (spa_version(ds->ds_dir->dd_pool->dp_spa) >= - SPA_VERSION_UNIQUE_ACCURATE) - ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; - - osi = dmu_objset_create_impl(ds->ds_dir->dd_pool->dp_spa, ds, - &ds->ds_phys->ds_bp, *ost, tx); -#ifdef _KERNEL - zfs_create_fs(&osi->os, kcred, NULL, tx); -#endif + ASSERT(zilog == NULL); + ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg); + (void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE); } - spa_history_internal_log(LOG_DS_ROLLBACK, ds->ds_dir->dd_pool->dp_spa, - tx, cr, "dataset = %llu", ds->ds_object); + return (0); } /* ARGSUSED */ @@ -1327,7 +1298,7 @@ dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx) */ if (ds->ds_prev != NULL && ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) - return (EINVAL); + return (EBUSY); /* * This is really a dsl_dir thing, but check it here so that @@ -1358,18 +1329,63 @@ dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) cr, "dataset = %llu", ds->ds_object); } +static int +dsl_dataset_origin_check(struct dsl_ds_destroyarg 
*dsda, void *tag, + dmu_tx_t *tx) +{ + dsl_dataset_t *ds = dsda->ds; + dsl_dataset_t *ds_prev = ds->ds_prev; + + if (dsl_dataset_might_destroy_origin(ds_prev)) { + struct dsl_ds_destroyarg ndsda = {0}; + + /* + * If we're not prepared to remove the origin, don't remove + * the clone either. + */ + if (dsda->rm_origin == NULL) { + dsda->need_prep = B_TRUE; + return (EBUSY); + } + + ndsda.ds = ds_prev; + ndsda.is_origin_rm = B_TRUE; + return (dsl_dataset_destroy_check(&ndsda, tag, tx)); + } + + /* + * If we're not going to remove the origin after all, + * undo the open context setup. + */ + if (dsda->rm_origin != NULL) { + dsl_dataset_disown(dsda->rm_origin, tag); + dsda->rm_origin = NULL; + } + + return (0); +} + /* ARGSUSED */ int dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx) { - dsl_dataset_t *ds = arg1; + struct dsl_ds_destroyarg *dsda = arg1; + dsl_dataset_t *ds = dsda->ds; /* we have an owner hold, so noone else can destroy us */ ASSERT(!DSL_DATASET_IS_DESTROYED(ds)); - /* Can't delete a branch point. */ - if (ds->ds_phys->ds_num_children > 1) - return (EEXIST); + /* + * Only allow deferred destroy on pools that support it. + * NOTE: deferred destroy is only supported on snapshots. + */ + if (dsda->defer) { + if (spa_version(ds->ds_dir->dd_pool->dp_spa) < + SPA_VERSION_USERREFS) + return (ENOTSUP); + ASSERT(dsl_dataset_is_snapshot(ds)); + return (0); + } /* * Can't delete a head dataset if there are snapshots of it. @@ -1378,7 +1394,7 @@ dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx) */ if (ds->ds_prev != NULL && ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) - return (EINVAL); + return (EBUSY); /* * If we made changes this txg, traverse_dsl_dataset won't find @@ -1387,6 +1403,31 @@ dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx) if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg) return (EAGAIN); + if (dsl_dataset_is_snapshot(ds)) { + /* + * If this snapshot has an elevated user reference count, + * we can't destroy it yet. + */ + if (ds->ds_userrefs > 0 && !dsda->releasing) + return (EBUSY); + + mutex_enter(&ds->ds_lock); + /* + * Can't delete a branch point. However, if we're destroying + * a clone and removing its origin due to it having a user + * hold count of 0 and having been marked for deferred destroy, + * it's OK for the origin to have a single clone. + */ + if (ds->ds_phys->ds_num_children > + (dsda->is_origin_rm ? 2 : 1)) { + mutex_exit(&ds->ds_lock); + return (EEXIST); + } + mutex_exit(&ds->ds_lock); + } else if (dsl_dir_is_clone(ds->ds_dir)) { + return (dsl_dataset_origin_check(dsda, arg2, tx)); + } + /* XXX we should do some i/o error checking... */ return (0); } @@ -1431,11 +1472,38 @@ dsl_dataset_drain_refs(dsl_dataset_t *ds, void *tag) cv_destroy(&arg.cv); } +static void +remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, dmu_tx_t *tx) +{ + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + uint64_t count; + int err; + + ASSERT(ds->ds_phys->ds_num_children >= 2); + err = zap_remove_int(mos, ds->ds_phys->ds_next_clones_obj, obj, tx); + /* + * The err should not be ENOENT, but a bug in a previous version + * of the code could cause upgrade_clones_cb() to not set + * ds_next_snap_obj when it should, leading to a missing entry. + * If we knew that the pool was created after + * SPA_VERSION_NEXT_CLONES, we could assert that it isn't + * ENOENT. However, at least we can check that we don't have + * too many entries in the next_clones_obj even after failing to + * remove this one. 
+ */ + if (err != ENOENT) { + VERIFY3U(err, ==, 0); + } + ASSERT3U(0, ==, zap_count(mos, ds->ds_phys->ds_next_clones_obj, + &count)); + ASSERT3U(count, <=, ds->ds_phys->ds_num_children - 2); +} + void dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) { - dsl_dataset_t *ds = arg1; - zio_t *zio; + struct dsl_ds_destroyarg *dsda = arg1; + dsl_dataset_t *ds = dsda->ds; int err; int after_branch_point = FALSE; dsl_pool_t *dp = ds->ds_dir->dd_pool; @@ -1444,11 +1512,20 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) uint64_t obj; ASSERT(ds->ds_owner); - ASSERT3U(ds->ds_phys->ds_num_children, <=, 1); + ASSERT(dsda->defer || ds->ds_phys->ds_num_children <= 1); ASSERT(ds->ds_prev == NULL || ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object); ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg); + if (dsda->defer) { + ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); + if (ds->ds_userrefs > 0 || ds->ds_phys->ds_num_children > 1) { + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ds->ds_phys->ds_flags |= DS_FLAG_DEFER_DESTROY; + return; + } + } + /* signal any waiters that this dataset is going away */ mutex_enter(&ds->ds_lock); ds->ds_owner = dsl_reaper; @@ -1457,8 +1534,15 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) /* Remove our reservation */ if (ds->ds_reserved != 0) { - uint64_t val = 0; - dsl_dataset_set_reservation_sync(ds, &val, cr, tx); + dsl_prop_setarg_t psa; + uint64_t value = 0; + + dsl_prop_setarg_init_uint64(&psa, "refreservation", + (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED), + &value); + psa.psa_effective_value = 0; /* predict default value */ + + dsl_dataset_set_reservation_sync(ds, &psa, cr, tx); ASSERT3U(ds->ds_reserved, ==, 0); } @@ -1481,8 +1565,7 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) dmu_buf_will_dirty(ds_prev->ds_dbuf, tx); if (after_branch_point && ds_prev->ds_phys->ds_next_clones_obj != 0) { - VERIFY(0 == zap_remove_int(mos, - ds_prev->ds_phys->ds_next_clones_obj, obj, tx)); + remove_from_next_clones(ds_prev, obj, tx); if (ds->ds_phys->ds_next_snap_obj != 0) { VERIFY(0 == zap_add_int(mos, ds_prev->ds_phys->ds_next_clones_obj, @@ -1494,14 +1577,26 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) /* This clone is toast. */ ASSERT(ds_prev->ds_phys->ds_num_children > 1); ds_prev->ds_phys->ds_num_children--; + + /* + * If the clone's origin has no other clones, no + * user holds, and has been marked for deferred + * deletion, then we should have done the necessary + * destroy setup for it. + */ + if (ds_prev->ds_phys->ds_num_children == 1 && + ds_prev->ds_userrefs == 0 && + DS_IS_DEFER_DESTROY(ds_prev)) { + ASSERT3P(dsda->rm_origin, !=, NULL); + } else { + ASSERT3P(dsda->rm_origin, ==, NULL); + } } else if (!after_branch_point) { ds_prev->ds_phys->ds_next_snap_obj = ds->ds_phys->ds_next_snap_obj; } } - zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); - if (ds->ds_phys->ds_next_snap_obj != 0) { blkptr_t bp; dsl_dataset_t *ds_next; @@ -1539,15 +1634,13 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) bp.blk_birth > ds_prev->ds_phys->ds_prev_snap_txg) { ds_prev->ds_phys->ds_unique_bytes += - bp_get_dasize(dp->dp_spa, &bp); + bp_get_dsize_sync(dp->dp_spa, &bp); } } else { - used += bp_get_dasize(dp->dp_spa, &bp); + used += bp_get_dsize_sync(dp->dp_spa, &bp); compressed += BP_GET_PSIZE(&bp); uncompressed += BP_GET_UCSIZE(&bp); - /* XXX check return value? 
*/ - (void) dsl_free(zio, dp, tx->tx_txg, - &bp, NULL, NULL, ARC_NOWAIT); + dsl_free(dp, tx->tx_txg, &bp); } } @@ -1649,17 +1742,18 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) * freed all the objects in open context. */ ka.ds = ds; - ka.zio = zio; ka.tx = tx; err = traverse_dataset(ds, ds->ds_phys->ds_prev_snap_txg, TRAVERSE_POST, kill_blkptr, &ka); ASSERT3U(err, ==, 0); - ASSERT(spa_version(dp->dp_spa) < SPA_VERSION_UNIQUE_ACCURATE || + ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || ds->ds_phys->ds_unique_bytes == 0); - } - err = zio_wait(zio); - ASSERT3U(err, ==, 0); + if (ds->ds_prev != NULL) { + dsl_dataset_rele(ds->ds_prev, ds); + ds->ds_prev = ds_prev = NULL; + } + } if (ds->ds_dir->dd_phys->dd_head_dataset_obj == ds->ds_object) { /* Erase the link in the dir */ @@ -1706,10 +1800,22 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) } if (ds->ds_phys->ds_props_obj != 0) VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_props_obj, tx)); + if (ds->ds_phys->ds_userrefs_obj != 0) + VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_userrefs_obj, tx)); dsl_dir_close(ds->ds_dir, ds); ds->ds_dir = NULL; dsl_dataset_drain_refs(ds, tag); VERIFY(0 == dmu_object_free(mos, obj, tx)); + + if (dsda->rm_origin) { + /* + * Remove the origin of the clone we just destroyed. + */ + struct dsl_ds_destroyarg ndsda = {0}; + + ndsda.ds = dsda->rm_origin; + dsl_dataset_destroy_sync(&ndsda, tag, cr, tx); + } } static int @@ -1838,8 +1944,8 @@ dsl_dataset_snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) ds->ds_prev->ds_phys->ds_creation_txg); ds->ds_prev->ds_phys->ds_next_snap_obj = dsobj; } else if (next_clones_obj != 0) { - VERIFY3U(0, ==, zap_remove_int(mos, - next_clones_obj, dsphys->ds_next_snap_obj, tx)); + remove_from_next_clones(ds->ds_prev, + dsphys->ds_next_snap_obj, tx); VERIFY3U(0, ==, zap_add_int(mos, next_clones_obj, dsobj, tx)); } @@ -1881,6 +1987,8 @@ dsl_dataset_snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) dsl_pool_ds_snapshotted(ds, tx); + dsl_dir_snap_cmtime_update(ds->ds_dir); + spa_history_internal_log(LOG_DS_SNAPSHOT, dp->dp_spa, tx, cr, "dataset = %llu", dsobj); } @@ -1889,7 +1997,7 @@ void dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) { ASSERT(dmu_tx_is_syncing(tx)); - ASSERT(ds->ds_user_ptr != NULL); + ASSERT(ds->ds_objset != NULL); ASSERT(ds->ds_phys->ds_next_snap_obj == 0); /* @@ -1900,7 +2008,7 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) ds->ds_phys->ds_fsid_guid = ds->ds_fsid_guid; dsl_dir_dirty(ds->ds_dir, tx); - dmu_objset_sync(ds->ds_user_ptr, zio, tx); + dmu_objset_sync(ds->ds_objset, zio, tx); } void @@ -1924,6 +2032,14 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) ds->ds_reserved); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID, ds->ds_phys->ds_guid); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE, + dsl_dataset_unique(ds)); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID, + ds->ds_object); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS, + ds->ds_userrefs); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY, + DS_IS_DEFER_DESTROY(ds) ? 1 : 0); if (ds->ds_phys->ds_next_snap_obj) { /* @@ -1948,6 +2064,9 @@ dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat) if (ds->ds_phys->ds_next_snap_obj) { stat->dds_is_snapshot = B_TRUE; stat->dds_num_clones = ds->ds_phys->ds_num_children - 1; + } else { + stat->dds_is_snapshot = B_FALSE; + stat->dds_num_clones = 0; } /* clone origin is really a dsl_dir thing... 
*/ @@ -1959,6 +2078,8 @@ dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat) ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &ods)); dsl_dataset_name(ods, stat->dds_origin); dsl_dataset_drop_ref(ods, FTAG); + } else { + stat->dds_origin[0] = '\0'; } rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); } @@ -2078,43 +2199,36 @@ struct renamesnaparg { }; static int -dsl_snapshot_rename_one(char *name, void *arg) +dsl_snapshot_rename_one(const char *name, void *arg) { struct renamesnaparg *ra = arg; dsl_dataset_t *ds = NULL; - char *cp; + char *snapname; int err; - cp = name + strlen(name); - *cp = '@'; - (void) strcpy(cp + 1, ra->oldsnap); + snapname = kmem_asprintf("%s@%s", name, ra->oldsnap); + (void) strlcpy(ra->failed, snapname, sizeof (ra->failed)); /* * For recursive snapshot renames the parent won't be changing * so we just pass name for both the to/from argument. */ - err = zfs_secpolicy_rename_perms(name, name, CRED()); - if (err == ENOENT) { - return (0); - } else if (err) { - (void) strcpy(ra->failed, name); - return (err); + err = zfs_secpolicy_rename_perms(snapname, snapname, CRED()); + if (err != 0) { + strfree(snapname); + return (err == ENOENT ? 0 : err); } #ifdef _KERNEL /* * For all filesystems undergoing rename, we'll need to unmount it. */ - (void) zfs_unmount_snap(name, NULL); + (void) zfs_unmount_snap(snapname, NULL); #endif - err = dsl_dataset_hold(name, ra->dstg, &ds); - *cp = '\0'; - if (err == ENOENT) { - return (0); - } else if (err) { - (void) strcpy(ra->failed, name); - return (err); - } + err = dsl_dataset_hold(snapname, ra->dstg, &ds); + strfree(snapname); + if (err != 0) + return (err == ENOENT ? 0 : err); dsl_sync_task_create(ra->dstg, dsl_dataset_snapshot_rename_check, dsl_dataset_snapshot_rename_sync, ds, ra->newsnap, 0); @@ -2130,7 +2244,7 @@ dsl_recursive_rename(char *oldname, const char *newname) dsl_sync_task_t *dst; spa_t *spa; char *cp, *fsname = spa_strdup(oldname); - int len = strlen(oldname); + int len = strlen(oldname) + 1; /* truncate the snapshot name to get the fsname */ cp = strchr(fsname, '@'); @@ -2138,7 +2252,7 @@ dsl_recursive_rename(char *oldname, const char *newname) err = spa_open(fsname, &spa, FTAG); if (err) { - kmem_free(fsname, len + 1); + kmem_free(fsname, len); return (err); } ra = kmem_alloc(sizeof (struct renamesnaparg), KM_SLEEP); @@ -2150,7 +2264,7 @@ dsl_recursive_rename(char *oldname, const char *newname) err = dmu_objset_find(fsname, dsl_snapshot_rename_one, ra, DS_FIND_CHILDREN); - kmem_free(fsname, len + 1); + kmem_free(fsname, len); if (err == 0) { err = dsl_sync_task_group_wait(ra->dstg); @@ -2161,14 +2275,15 @@ dsl_recursive_rename(char *oldname, const char *newname) dsl_dataset_t *ds = dst->dst_arg1; if (dst->dst_err) { dsl_dir_name(ds->ds_dir, ra->failed); - (void) strcat(ra->failed, "@"); - (void) strcat(ra->failed, ra->newsnap); + (void) strlcat(ra->failed, "@", sizeof (ra->failed)); + (void) strlcat(ra->failed, ra->newsnap, + sizeof (ra->failed)); } dsl_dataset_rele(ds, ra->dstg); } if (err) - (void) strcpy(oldname, ra->failed); + (void) strlcpy(oldname, ra->failed, sizeof (ra->failed)); dsl_sync_task_group_destroy(ra->dstg); kmem_free(ra, sizeof (struct renamesnaparg)); @@ -2177,7 +2292,7 @@ dsl_recursive_rename(char *oldname, const char *newname) } static int -dsl_valid_rename(char *oldname, void *arg) +dsl_valid_rename(const char *oldname, void *arg) { int delta = *(int *)arg; @@ -2199,6 +2314,7 @@ dsl_dataset_rename(char *oldname, const char *newname, boolean_t recursive) err = dsl_dir_open(oldname, FTAG, 
&dd, &tail); if (err) return (err); + if (tail == NULL) { int delta = strlen(newname) - strlen(oldname); @@ -2207,13 +2323,23 @@ dsl_dataset_rename(char *oldname, const char *newname, boolean_t recursive) err = dmu_objset_find(oldname, dsl_valid_rename, &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); - if (!err) + if (!err) { + /* + * If there are more than 2 references there may be + * holds hanging around that haven't been cleared + * out yet. + */ + if (dmu_buf_refcount(dd->dd_dbuf) > 2) + txg_wait_synced(dd->dd_pool, 0); + err = dsl_dir_rename(dd, newname); + } dsl_dir_close(dd, FTAG); return (err); } + if (tail[0] != '@') { - /* the name ended in a nonexistant component */ + /* the name ended in a nonexistent component */ dsl_dir_close(dd, FTAG); return (ENOENT); } @@ -2254,6 +2380,7 @@ struct promotearg { list_t shared_snaps, origin_snaps, clone_snaps; dsl_dataset_t *origin_origin, *origin_head; uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap; + char *err_ds; }; static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep); @@ -2313,10 +2440,12 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) /* Check that the snapshot name does not conflict */ VERIFY(0 == dsl_dataset_get_snapname(ds)); err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val); - if (err == 0) - return (EEXIST); + if (err == 0) { + err = EEXIST; + goto out; + } if (err != ENOENT) - return (err); + goto out; /* The very first snapshot does not have a deadlist */ if (ds->ds_phys->ds_prev_snap_obj == 0) @@ -2324,7 +2453,7 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) if (err = bplist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp)) - return (err); + goto out; pa->used += dlused; pa->comp += dlcomp; pa->uncomp += dluncomp; @@ -2382,6 +2511,9 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) } return (0); +out: + pa->err_ds = snap->ds->ds_snapname; + return (err); } static void @@ -2419,9 +2551,7 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) /* change the origin's next clone */ if (origin_ds->ds_phys->ds_next_clones_obj) { - VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, - origin_ds->ds_phys->ds_next_clones_obj, - origin_ds->ds_phys->ds_next_snap_obj, tx)); + remove_from_next_clones(origin_ds, snap->ds->ds_object, tx); VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, origin_ds->ds_phys->ds_next_clones_obj, oldnext_obj, tx)); @@ -2442,9 +2572,9 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) dsl_dataset_t *ds = snap->ds; /* unregister props as dsl_dir is changing */ - if (ds->ds_user_ptr) { - ds->ds_user_evict_func(ds, ds->ds_user_ptr); - ds->ds_user_ptr = NULL; + if (ds->ds_objset) { + dmu_objset_evict(ds->ds_objset); + ds->ds_objset = NULL; } /* move snap name entry */ VERIFY(0 == dsl_dataset_get_snapname(ds)); @@ -2572,7 +2702,7 @@ snaplist_destroy(list_t *l, boolean_t own) { struct promotenode *snap; - if (!list_link_active(&l->list_head)) + if (!l || !list_link_active(&l->list_head)) return; while ((snap = list_tail(l)) != NULL) { @@ -2596,7 +2726,7 @@ snaplist_destroy(list_t *l, boolean_t own) * NULL, indicating that the clone is not a clone of a clone). 
*/ int -dsl_dataset_promote(const char *name) +dsl_dataset_promote(const char *name, char *conflsnap) { dsl_dataset_t *ds; dsl_dir_t *dd; @@ -2667,7 +2797,9 @@ dsl_dataset_promote(const char *name) if (err == 0) { err = dsl_sync_task_do(dp, dsl_dataset_promote_check, dsl_dataset_promote_sync, ds, &pa, - 2 + 2 * doi.doi_physical_blks); + 2 + 2 * doi.doi_physical_blocks_512); + if (err && pa.err_ds && conflsnap) + (void) strncpy(conflsnap, pa.err_ds, MAXNAMELEN); } snaplist_destroy(&pa.shared_snaps, B_TRUE); @@ -2701,9 +2833,11 @@ dsl_dataset_clone_swap_check(void *arg1, void *arg2, dmu_tx_t *tx) if (csa->cds->ds_prev != csa->ohds->ds_prev) return (EINVAL); - /* cds should be the clone */ - if (csa->cds->ds_prev->ds_phys->ds_next_snap_obj != - csa->ohds->ds_object) + /* cds should be the clone (unless they are unrelated) */ + if (csa->cds->ds_prev != NULL && + csa->cds->ds_prev != csa->cds->ds_dir->dd_pool->dp_origin_snap && + csa->ohds->ds_object != + csa->cds->ds_prev->ds_phys->ds_next_snap_obj) return (EINVAL); /* the clone should be a child of the origin */ @@ -2726,6 +2860,10 @@ dsl_dataset_clone_swap_check(void *arg1, void *arg2, dmu_tx_t *tx) dsl_dir_space_available(csa->ohds->ds_dir, NULL, 0, TRUE)) return (ENOSPC); + if (csa->ohds->ds_quota != 0 && + csa->cds->ds_phys->ds_unique_bytes > csa->ohds->ds_quota) + return (EDQUOT); + return (0); } @@ -2737,27 +2875,32 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) dsl_pool_t *dp = csa->cds->ds_dir->dd_pool; ASSERT(csa->cds->ds_reserved == 0); - ASSERT(csa->cds->ds_quota == csa->ohds->ds_quota); + ASSERT(csa->ohds->ds_quota == 0 || + csa->cds->ds_phys->ds_unique_bytes <= csa->ohds->ds_quota); dmu_buf_will_dirty(csa->cds->ds_dbuf, tx); dmu_buf_will_dirty(csa->ohds->ds_dbuf, tx); - dmu_buf_will_dirty(csa->cds->ds_prev->ds_dbuf, tx); - if (csa->cds->ds_user_ptr != NULL) { - csa->cds->ds_user_evict_func(csa->cds, csa->cds->ds_user_ptr); - csa->cds->ds_user_ptr = NULL; + if (csa->cds->ds_objset != NULL) { + dmu_objset_evict(csa->cds->ds_objset); + csa->cds->ds_objset = NULL; } - if (csa->ohds->ds_user_ptr != NULL) { - csa->ohds->ds_user_evict_func(csa->ohds, - csa->ohds->ds_user_ptr); - csa->ohds->ds_user_ptr = NULL; + if (csa->ohds->ds_objset != NULL) { + dmu_objset_evict(csa->ohds->ds_objset); + csa->ohds->ds_objset = NULL; } - /* reset origin's unique bytes */ - VERIFY(0 == bplist_space_birthrange(&csa->cds->ds_deadlist, - csa->cds->ds_prev->ds_phys->ds_prev_snap_txg, UINT64_MAX, - &csa->cds->ds_prev->ds_phys->ds_unique_bytes)); + /* + * Reset origin's unique bytes, if it exists. + */ + if (csa->cds->ds_prev) { + dsl_dataset_t *origin = csa->cds->ds_prev; + dmu_buf_will_dirty(origin->ds_dbuf, tx); + VERIFY(0 == bplist_space_birthrange(&csa->cds->ds_deadlist, + origin->ds_phys->ds_prev_snap_txg, UINT64_MAX, + &origin->ds_phys->ds_unique_bytes)); + } /* swap blkptrs */ { @@ -2843,8 +2986,10 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) } /* - * Swap 'clone' with its origin head file system. Used at the end - * of "online recv" to swizzle the file system to the new version. + * Swap 'clone' with its origin head datasets. Used at the end of "zfs + * recv" into an existing fs to swizzle the file system to the new + * version, and by "zfs rollback". Can also be used to swap two + * independent head datasets if neither has any snapshots. 
*/ int dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head, @@ -2953,62 +3098,70 @@ static int dsl_dataset_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; - uint64_t *quotap = arg2; - uint64_t new_quota = *quotap; + dsl_prop_setarg_t *psa = arg2; + int err; if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_REFQUOTA) return (ENOTSUP); - if (new_quota == 0) + if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0) + return (err); + + if (psa->psa_effective_value == 0) return (0); - if (new_quota < ds->ds_phys->ds_used_bytes || - new_quota < ds->ds_reserved) + if (psa->psa_effective_value < ds->ds_phys->ds_used_bytes || + psa->psa_effective_value < ds->ds_reserved) return (ENOSPC); return (0); } -/* ARGSUSED */ +extern void dsl_prop_set_sync(void *, void *, cred_t *, dmu_tx_t *); + void dsl_dataset_set_quota_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; - uint64_t *quotap = arg2; - uint64_t new_quota = *quotap; + dsl_prop_setarg_t *psa = arg2; + uint64_t effective_value = psa->psa_effective_value; - dmu_buf_will_dirty(ds->ds_dbuf, tx); + dsl_prop_set_sync(ds, psa, cr, tx); + DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa); - ds->ds_quota = new_quota; - - dsl_prop_set_uint64_sync(ds->ds_dir, "refquota", new_quota, cr, tx); + if (ds->ds_quota != effective_value) { + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ds->ds_quota = effective_value; - spa_history_internal_log(LOG_DS_REFQUOTA, ds->ds_dir->dd_pool->dp_spa, - tx, cr, "%lld dataset = %llu ", - (longlong_t)new_quota, ds->ds_object); + spa_history_internal_log(LOG_DS_REFQUOTA, + ds->ds_dir->dd_pool->dp_spa, tx, cr, "%lld dataset = %llu ", + (longlong_t)ds->ds_quota, ds->ds_object); + } } int -dsl_dataset_set_quota(const char *dsname, uint64_t quota) +dsl_dataset_set_quota(const char *dsname, zprop_source_t source, uint64_t quota) { dsl_dataset_t *ds; + dsl_prop_setarg_t psa; int err; + dsl_prop_setarg_init_uint64(&psa, "refquota", source, "a); + err = dsl_dataset_hold(dsname, FTAG, &ds); if (err) return (err); - if (quota != ds->ds_quota) { - /* - * If someone removes a file, then tries to set the quota, we - * want to make sure the file freeing takes effect. - */ - txg_wait_open(ds->ds_dir->dd_pool, 0); + /* + * If someone removes a file, then tries to set the quota, we + * want to make sure the file freeing takes effect. 
+ */ + txg_wait_open(ds->ds_dir->dd_pool, 0); + + err = dsl_sync_task_do(ds->ds_dir->dd_pool, + dsl_dataset_set_quota_check, dsl_dataset_set_quota_sync, + ds, &psa, 0); - err = dsl_sync_task_do(ds->ds_dir->dd_pool, - dsl_dataset_set_quota_check, dsl_dataset_set_quota_sync, - ds, "a, 0); - } dsl_dataset_rele(ds, FTAG); return (err); } @@ -3017,13 +3170,10 @@ static int dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; - uint64_t *reservationp = arg2; - uint64_t new_reservation = *reservationp; - int64_t delta; + dsl_prop_setarg_t *psa = arg2; + uint64_t effective_value; uint64_t unique; - - if (new_reservation > INT64_MAX) - return (EOVERFLOW); + int err; if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_REFRESERVATION) @@ -3032,6 +3182,11 @@ dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) if (dsl_dataset_is_snapshot(ds)) return (EINVAL); + if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0) + return (err); + + effective_value = psa->psa_effective_value; + /* * If we are doing the preliminary check in open context, the * space estimates may be inaccurate. @@ -3041,15 +3196,18 @@ dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) mutex_enter(&ds->ds_lock); unique = dsl_dataset_unique(ds); - delta = MAX(unique, new_reservation) - MAX(unique, ds->ds_reserved); mutex_exit(&ds->ds_lock); - if (delta > 0 && - delta > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) - return (ENOSPC); - if (delta > 0 && ds->ds_quota > 0 && - new_reservation > ds->ds_quota) - return (ENOSPC); + if (MAX(unique, effective_value) > MAX(unique, ds->ds_reserved)) { + uint64_t delta = MAX(unique, effective_value) - + MAX(unique, ds->ds_reserved); + + if (delta > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) + return (ENOSPC); + if (ds->ds_quota > 0 && + effective_value > ds->ds_quota) + return (ENOSPC); + } return (0); } @@ -3060,44 +3218,546 @@ dsl_dataset_set_reservation_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; - uint64_t *reservationp = arg2; - uint64_t new_reservation = *reservationp; + dsl_prop_setarg_t *psa = arg2; + uint64_t effective_value = psa->psa_effective_value; uint64_t unique; int64_t delta; + dsl_prop_set_sync(ds, psa, cr, tx); + DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa); + dmu_buf_will_dirty(ds->ds_dbuf, tx); mutex_enter(&ds->ds_dir->dd_lock); mutex_enter(&ds->ds_lock); unique = dsl_dataset_unique(ds); - delta = MAX(0, (int64_t)(new_reservation - unique)) - + delta = MAX(0, (int64_t)(effective_value - unique)) - MAX(0, (int64_t)(ds->ds_reserved - unique)); - ds->ds_reserved = new_reservation; + ds->ds_reserved = effective_value; mutex_exit(&ds->ds_lock); dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx); mutex_exit(&ds->ds_dir->dd_lock); - dsl_prop_set_uint64_sync(ds->ds_dir, "refreservation", - new_reservation, cr, tx); spa_history_internal_log(LOG_DS_REFRESERV, ds->ds_dir->dd_pool->dp_spa, tx, cr, "%lld dataset = %llu", - (longlong_t)new_reservation, ds->ds_object); + (longlong_t)effective_value, ds->ds_object); } int -dsl_dataset_set_reservation(const char *dsname, uint64_t reservation) +dsl_dataset_set_reservation(const char *dsname, zprop_source_t source, + uint64_t reservation) { dsl_dataset_t *ds; + dsl_prop_setarg_t psa; int err; + dsl_prop_setarg_init_uint64(&psa, "refreservation", source, + &reservation); + err = dsl_dataset_hold(dsname, FTAG, &ds); if (err) return (err); err = dsl_sync_task_do(ds->ds_dir->dd_pool, 
dsl_dataset_set_reservation_check, - dsl_dataset_set_reservation_sync, ds, &reservation, 0); + dsl_dataset_set_reservation_sync, ds, &psa, 0); + dsl_dataset_rele(ds, FTAG); return (err); } + +struct dsl_ds_holdarg { + dsl_sync_task_group_t *dstg; + char *htag; + char *snapname; + boolean_t recursive; + boolean_t gotone; + boolean_t temphold; + char failed[MAXPATHLEN]; +}; + +/* + * The max length of a temporary tag prefix is the number of hex digits + * required to express UINT64_MAX plus one for the hyphen. + */ +#define MAX_TAG_PREFIX_LEN 17 + +static int +dsl_dataset_user_hold_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + struct dsl_ds_holdarg *ha = arg2; + char *htag = ha->htag; + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + int error = 0; + + if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS) + return (ENOTSUP); + + if (!dsl_dataset_is_snapshot(ds)) + return (EINVAL); + + /* tags must be unique */ + mutex_enter(&ds->ds_lock); + if (ds->ds_phys->ds_userrefs_obj) { + error = zap_lookup(mos, ds->ds_phys->ds_userrefs_obj, htag, + 8, 1, tx); + if (error == 0) + error = EEXIST; + else if (error == ENOENT) + error = 0; + } + mutex_exit(&ds->ds_lock); + + if (error == 0 && ha->temphold && + strlen(htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN) + error = E2BIG; + + return (error); +} + +static void +dsl_dataset_user_hold_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + struct dsl_ds_holdarg *ha = arg2; + char *htag = ha->htag; + dsl_pool_t *dp = ds->ds_dir->dd_pool; + objset_t *mos = dp->dp_meta_objset; + uint64_t now = gethrestime_sec(); + uint64_t zapobj; + + mutex_enter(&ds->ds_lock); + if (ds->ds_phys->ds_userrefs_obj == 0) { + /* + * This is the first user hold for this dataset. Create + * the userrefs zap object. 
+ */ + dmu_buf_will_dirty(ds->ds_dbuf, tx); + zapobj = ds->ds_phys->ds_userrefs_obj = + zap_create(mos, DMU_OT_USERREFS, DMU_OT_NONE, 0, tx); + } else { + zapobj = ds->ds_phys->ds_userrefs_obj; + } + ds->ds_userrefs++; + mutex_exit(&ds->ds_lock); + + VERIFY(0 == zap_add(mos, zapobj, htag, 8, 1, &now, tx)); + + if (ha->temphold) { + VERIFY(0 == dsl_pool_user_hold(dp, ds->ds_object, + htag, &now, tx)); + } + + spa_history_internal_log(LOG_DS_USER_HOLD, + dp->dp_spa, tx, cr, "<%s> temp = %d dataset = %llu", htag, + (int)ha->temphold, ds->ds_object); +} + +static int +dsl_dataset_user_hold_one(const char *dsname, void *arg) +{ + struct dsl_ds_holdarg *ha = arg; + dsl_dataset_t *ds; + int error; + char *name; + + /* alloc a buffer to hold dsname@snapname plus terminating NULL */ + name = kmem_asprintf("%s@%s", dsname, ha->snapname); + error = dsl_dataset_hold(name, ha->dstg, &ds); + strfree(name); + if (error == 0) { + ha->gotone = B_TRUE; + dsl_sync_task_create(ha->dstg, dsl_dataset_user_hold_check, + dsl_dataset_user_hold_sync, ds, ha, 0); + } else if (error == ENOENT && ha->recursive) { + error = 0; + } else { + (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); + } + return (error); +} + +int +dsl_dataset_user_hold(char *dsname, char *snapname, char *htag, + boolean_t recursive, boolean_t temphold) +{ + struct dsl_ds_holdarg *ha; + dsl_sync_task_t *dst; + spa_t *spa; + int error; + + ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP); + + (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); + + error = spa_open(dsname, &spa, FTAG); + if (error) { + kmem_free(ha, sizeof (struct dsl_ds_holdarg)); + return (error); + } + + ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); + ha->htag = htag; + ha->snapname = snapname; + ha->recursive = recursive; + ha->temphold = temphold; + if (recursive) { + error = dmu_objset_find(dsname, dsl_dataset_user_hold_one, + ha, DS_FIND_CHILDREN); + } else { + error = dsl_dataset_user_hold_one(dsname, ha); + } + if (error == 0) + error = dsl_sync_task_group_wait(ha->dstg); + + for (dst = list_head(&ha->dstg->dstg_tasks); dst; + dst = list_next(&ha->dstg->dstg_tasks, dst)) { + dsl_dataset_t *ds = dst->dst_arg1; + + if (dst->dst_err) { + dsl_dataset_name(ds, ha->failed); + *strchr(ha->failed, '@') = '\0'; + } + dsl_dataset_rele(ds, ha->dstg); + } + + if (error == 0 && recursive && !ha->gotone) + error = ENOENT; + + if (error) + (void) strlcpy(dsname, ha->failed, sizeof (ha->failed)); + + dsl_sync_task_group_destroy(ha->dstg); + kmem_free(ha, sizeof (struct dsl_ds_holdarg)); + spa_close(spa, FTAG); + return (error); +} + +struct dsl_ds_releasearg { + dsl_dataset_t *ds; + const char *htag; + boolean_t own; /* do we own or just hold ds? 
*/ +}; + +static int +dsl_dataset_release_might_destroy(dsl_dataset_t *ds, const char *htag, + boolean_t *might_destroy) +{ + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + uint64_t zapobj; + uint64_t tmp; + int error; + + *might_destroy = B_FALSE; + + mutex_enter(&ds->ds_lock); + zapobj = ds->ds_phys->ds_userrefs_obj; + if (zapobj == 0) { + /* The tag can't possibly exist */ + mutex_exit(&ds->ds_lock); + return (ESRCH); + } + + /* Make sure the tag exists */ + error = zap_lookup(mos, zapobj, htag, 8, 1, &tmp); + if (error) { + mutex_exit(&ds->ds_lock); + if (error == ENOENT) + error = ESRCH; + return (error); + } + + if (ds->ds_userrefs == 1 && ds->ds_phys->ds_num_children == 1 && + DS_IS_DEFER_DESTROY(ds)) + *might_destroy = B_TRUE; + + mutex_exit(&ds->ds_lock); + return (0); +} + +static int +dsl_dataset_user_release_check(void *arg1, void *tag, dmu_tx_t *tx) +{ + struct dsl_ds_releasearg *ra = arg1; + dsl_dataset_t *ds = ra->ds; + boolean_t might_destroy; + int error; + + if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS) + return (ENOTSUP); + + error = dsl_dataset_release_might_destroy(ds, ra->htag, &might_destroy); + if (error) + return (error); + + if (might_destroy) { + struct dsl_ds_destroyarg dsda = {0}; + + if (dmu_tx_is_syncing(tx)) { + /* + * If we're not prepared to remove the snapshot, + * we can't allow the release to happen right now. + */ + if (!ra->own) + return (EBUSY); + if (ds->ds_objset) { + dmu_objset_evict(ds->ds_objset); + ds->ds_objset = NULL; + } + } + dsda.ds = ds; + dsda.releasing = B_TRUE; + return (dsl_dataset_destroy_check(&dsda, tag, tx)); + } + + return (0); +} + +static void +dsl_dataset_user_release_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) +{ + struct dsl_ds_releasearg *ra = arg1; + dsl_dataset_t *ds = ra->ds; + dsl_pool_t *dp = ds->ds_dir->dd_pool; + objset_t *mos = dp->dp_meta_objset; + uint64_t zapobj; + uint64_t dsobj = ds->ds_object; + uint64_t refs; + int error; + + mutex_enter(&ds->ds_lock); + ds->ds_userrefs--; + refs = ds->ds_userrefs; + mutex_exit(&ds->ds_lock); + error = dsl_pool_user_release(dp, ds->ds_object, ra->htag, tx); + VERIFY(error == 0 || error == ENOENT); + zapobj = ds->ds_phys->ds_userrefs_obj; + VERIFY(0 == zap_remove(mos, zapobj, ra->htag, tx)); + if (ds->ds_userrefs == 0 && ds->ds_phys->ds_num_children == 1 && + DS_IS_DEFER_DESTROY(ds)) { + struct dsl_ds_destroyarg dsda = {0}; + + ASSERT(ra->own); + dsda.ds = ds; + dsda.releasing = B_TRUE; + /* We already did the destroy_check */ + dsl_dataset_destroy_sync(&dsda, tag, cr, tx); + } + + spa_history_internal_log(LOG_DS_USER_RELEASE, + dp->dp_spa, tx, cr, "<%s> %lld dataset = %llu", + ra->htag, (longlong_t)refs, dsobj); +} + +static int +dsl_dataset_user_release_one(const char *dsname, void *arg) +{ + struct dsl_ds_holdarg *ha = arg; + struct dsl_ds_releasearg *ra; + dsl_dataset_t *ds; + int error; + void *dtag = ha->dstg; + char *name; + boolean_t own = B_FALSE; + boolean_t might_destroy; + + /* alloc a buffer to hold dsname@snapname, plus the terminating NULL */ + name = kmem_asprintf("%s@%s", dsname, ha->snapname); + error = dsl_dataset_hold(name, dtag, &ds); + strfree(name); + if (error == ENOENT && ha->recursive) + return (0); + (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); + if (error) + return (error); + + ha->gotone = B_TRUE; + + ASSERT(dsl_dataset_is_snapshot(ds)); + + error = dsl_dataset_release_might_destroy(ds, ha->htag, &might_destroy); + if (error) { + dsl_dataset_rele(ds, dtag); + return (error); + } + + if 
(might_destroy) { +#ifdef _KERNEL + error = zfs_unmount_snap(name, NULL); + if (error) { + dsl_dataset_rele(ds, dtag); + return (error); + } +#endif + if (!dsl_dataset_tryown(ds, B_TRUE, dtag)) { + dsl_dataset_rele(ds, dtag); + return (EBUSY); + } else { + own = B_TRUE; + dsl_dataset_make_exclusive(ds, dtag); + } + } + + ra = kmem_alloc(sizeof (struct dsl_ds_releasearg), KM_SLEEP); + ra->ds = ds; + ra->htag = ha->htag; + ra->own = own; + dsl_sync_task_create(ha->dstg, dsl_dataset_user_release_check, + dsl_dataset_user_release_sync, ra, dtag, 0); + + return (0); +} + +int +dsl_dataset_user_release(char *dsname, char *snapname, char *htag, + boolean_t recursive) +{ + struct dsl_ds_holdarg *ha; + dsl_sync_task_t *dst; + spa_t *spa; + int error; + +top: + ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP); + + (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); + + error = spa_open(dsname, &spa, FTAG); + if (error) { + kmem_free(ha, sizeof (struct dsl_ds_holdarg)); + return (error); + } + + ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); + ha->htag = htag; + ha->snapname = snapname; + ha->recursive = recursive; + if (recursive) { + error = dmu_objset_find(dsname, dsl_dataset_user_release_one, + ha, DS_FIND_CHILDREN); + } else { + error = dsl_dataset_user_release_one(dsname, ha); + } + if (error == 0) + error = dsl_sync_task_group_wait(ha->dstg); + + for (dst = list_head(&ha->dstg->dstg_tasks); dst; + dst = list_next(&ha->dstg->dstg_tasks, dst)) { + struct dsl_ds_releasearg *ra = dst->dst_arg1; + dsl_dataset_t *ds = ra->ds; + + if (dst->dst_err) + dsl_dataset_name(ds, ha->failed); + + if (ra->own) + dsl_dataset_disown(ds, ha->dstg); + else + dsl_dataset_rele(ds, ha->dstg); + + kmem_free(ra, sizeof (struct dsl_ds_releasearg)); + } + + if (error == 0 && recursive && !ha->gotone) + error = ENOENT; + + if (error && error != EBUSY) + (void) strlcpy(dsname, ha->failed, sizeof (ha->failed)); + + dsl_sync_task_group_destroy(ha->dstg); + kmem_free(ha, sizeof (struct dsl_ds_holdarg)); + spa_close(spa, FTAG); + + /* + * We can get EBUSY if we were racing with deferred destroy and + * dsl_dataset_user_release_check() hadn't done the necessary + * open context setup. We can also get EBUSY if we're racing + * with destroy and that thread is the ds_owner. Either way + * the busy condition should be transient, and we should retry + * the release operation. + */ + if (error == EBUSY) + goto top; + + return (error); +} + +/* + * Called at spa_load time to release a stale temporary user hold. 
+ */ +int +dsl_dataset_user_release_tmp(dsl_pool_t *dp, uint64_t dsobj, char *htag) +{ + dsl_dataset_t *ds; + char *snap; + char *name; + int namelen; + int error; + + rw_enter(&dp->dp_config_rwlock, RW_READER); + error = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); + rw_exit(&dp->dp_config_rwlock); + if (error) + return (error); + namelen = dsl_dataset_namelen(ds)+1; + name = kmem_alloc(namelen, KM_SLEEP); + dsl_dataset_name(ds, name); + dsl_dataset_rele(ds, FTAG); + + snap = strchr(name, '@'); + *snap = '\0'; + ++snap; + return (dsl_dataset_user_release(name, snap, htag, B_FALSE)); +} + +int +dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp) +{ + dsl_dataset_t *ds; + int err; + + err = dsl_dataset_hold(dsname, FTAG, &ds); + if (err) + return (err); + + VERIFY(0 == nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP)); + if (ds->ds_phys->ds_userrefs_obj != 0) { + zap_attribute_t *za; + zap_cursor_t zc; + + za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); + for (zap_cursor_init(&zc, ds->ds_dir->dd_pool->dp_meta_objset, + ds->ds_phys->ds_userrefs_obj); + zap_cursor_retrieve(&zc, za) == 0; + zap_cursor_advance(&zc)) { + VERIFY(0 == nvlist_add_uint64(*nvp, za->za_name, + za->za_first_integer)); + } + zap_cursor_fini(&zc); + kmem_free(za, sizeof (zap_attribute_t)); + } + dsl_dataset_rele(ds, FTAG); + return (0); +} + +/* + * Note, this fuction is used as the callback for dmu_objset_find(). We + * always return 0 so that we will continue to find and process + * inconsistent datasets, even if we encounter an error trying to + * process one of them. + */ +/* ARGSUSED */ +int +dsl_destroy_inconsistent(const char *dsname, void *arg) +{ + dsl_dataset_t *ds; + + if (dsl_dataset_own(dsname, B_TRUE, FTAG, &ds) == 0) { + if (DS_IS_INCONSISTENT(ds)) + (void) dsl_dataset_destroy(ds, FTAG, B_FALSE); + else + dsl_dataset_disown(ds, FTAG); + } + return (0); +} diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_deleg.c b/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_deleg.c index da5d15787570f..04053fdf206ec 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_deleg.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_deleg.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -66,8 +66,6 @@ * The ZAP OBJ is referred to as the jump object. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -77,8 +75,6 @@ #include #include #include -#include -#include /* for the default checksum value */ #include #include #include @@ -540,7 +536,7 @@ dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr) dsl_pool_t *dp; void *cookie; int error; - char checkflag = ZFS_DELEG_LOCAL; + char checkflag; objset_t *mos; avl_tree_t permsets; perm_set_t *setnode; @@ -563,6 +559,16 @@ dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr) return (EPERM); } + if (dsl_dataset_is_snapshot(ds)) { + /* + * Snapshots are treated as descendents only, + * local permissions do not apply. 
+ */ + checkflag = ZFS_DELEG_DESCENDENT; + } else { + checkflag = ZFS_DELEG_LOCAL; + } + avl_create(&permsets, perm_set_compare, sizeof (perm_set_t), offsetof(perm_set_t, p_node)); @@ -581,7 +587,7 @@ dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr) if (dsl_prop_get_dd(dd, zfs_prop_to_name(ZFS_PROP_ZONED), - 8, 1, &zoned, NULL) != 0) + 8, 1, &zoned, NULL, B_FALSE) != 0) break; if (!zoned) break; @@ -731,5 +737,5 @@ dsl_deleg_destroy(objset_t *mos, uint64_t zapobj, dmu_tx_t *tx) boolean_t dsl_delegation_on(objset_t *os) { - return (os->os->os_spa->spa_delegation); + return (!!spa_delegation(os->os_spa)); } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_dir.c b/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_dir.c index 48d87f97f6698..a70fa8e4e9c11 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_dir.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_dir.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -96,7 +97,6 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, #endif if (dd == NULL) { dsl_dir_t *winner; - int err; dd = kmem_zalloc(sizeof (dsl_dir_t), KM_SLEEP); dd->dd_object = ddobj; @@ -108,6 +108,8 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, list_create(&dd->dd_prop_cbs, sizeof (dsl_prop_cb_record_t), offsetof(dsl_prop_cb_record_t, cbr_node)); + dsl_dir_snap_cmtime_update(dd); + if (dd->dd_phys->dd_parent_obj) { err = dsl_dir_open_obj(dp, dd->dd_phys->dd_parent_obj, NULL, dd, &dd->dd_parent); @@ -227,24 +229,11 @@ dsl_dir_namelen(dsl_dir_t *dd) return (result); } -int -dsl_dir_is_private(dsl_dir_t *dd) -{ - int rv = FALSE; - - if (dd->dd_parent && dsl_dir_is_private(dd->dd_parent)) - rv = TRUE; - if (dataset_name_hidden(dd->dd_myname)) - rv = TRUE; - return (rv); -} - - static int getcomponent(const char *path, char *component, const char **nextp) { char *p; - if (path == NULL) + if ((path == NULL) || (path[0] == '\0')) return (ENOENT); /* This would be a good place to reserve some namespace... */ p = strpbrk(path, "/@"); @@ -441,7 +430,8 @@ dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name, int dsl_dir_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx) { - dsl_dir_t *dd = arg1; + dsl_dataset_t *ds = arg1; + dsl_dir_t *dd = ds->ds_dir; dsl_pool_t *dp = dd->dd_pool; objset_t *mos = dp->dp_meta_objset; int err; @@ -470,17 +460,25 @@ dsl_dir_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx) void dsl_dir_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) { - dsl_dir_t *dd = arg1; + dsl_dataset_t *ds = arg1; + dsl_dir_t *dd = ds->ds_dir; objset_t *mos = dd->dd_pool->dp_meta_objset; - uint64_t val, obj; + dsl_prop_setarg_t psa; + uint64_t value = 0; + uint64_t obj; dd_used_t t; ASSERT(RW_WRITE_HELD(&dd->dd_pool->dp_config_rwlock)); ASSERT(dd->dd_phys->dd_head_dataset_obj == 0); /* Remove our reservation. 
*/ - val = 0; - dsl_dir_set_reservation_sync(dd, &val, cr, tx); + dsl_prop_setarg_init_uint64(&psa, "reservation", + (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED), + &value); + psa.psa_effective_value = 0; /* predict default value */ + + dsl_dir_set_reservation_sync(ds, &psa, cr, tx); + ASSERT3U(dd->dd_phys->dd_used_bytes, ==, 0); ASSERT3U(dd->dd_phys->dd_reserved, ==, 0); for (t = 0; t < DD_USED_NUM; t++) @@ -662,7 +660,7 @@ dsl_dir_space_available(dsl_dir_t *dd, * dsl_pool_adjustedsize()), something is very * wrong. */ - ASSERT3U(used, <=, spa_get_space(dd->dd_pool->dp_spa)); + ASSERT3U(used, <=, spa_get_dspace(dd->dd_pool->dp_spa)); } else { /* * the lesser of the space provided by our parent and @@ -690,8 +688,9 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, { uint64_t txg = tx->tx_txg; uint64_t est_inflight, used_on_disk, quota, parent_rsrv; + uint64_t deferred = 0; struct tempreserve *tr; - int enospc = EDQUOT; + int retval = EDQUOT; int txgidx = txg & TXG_MASK; int i; uint64_t ref_rsrv = 0; @@ -717,7 +716,7 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, */ if (first && tx->tx_objset) { int error; - dsl_dataset_t *ds = tx->tx_objset->os->os_dsl_dataset; + dsl_dataset_t *ds = tx->tx_objset->os_dsl_dataset; error = dsl_dataset_check_quota(ds, checkrefquota, asize, est_inflight, &used_on_disk, &ref_rsrv); @@ -737,7 +736,8 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, quota = dd->dd_phys->dd_quota; /* - * Adjust the quota against the actual pool size at the root. + * Adjust the quota against the actual pool size at the root + * minus any outstanding deferred frees. * To ensure that it's possible to remove files from a full * pool without inducing transient overcommits, we throttle * netfree transactions against a quota that is slightly larger, @@ -746,10 +746,12 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, * removes to get through. */ if (dd->dd_parent == NULL) { + spa_t *spa = dd->dd_pool->dp_spa; uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree); - if (poolsize < quota) { - quota = poolsize; - enospc = ENOSPC; + deferred = metaslab_class_get_deferred(spa_normal_class(spa)); + if (poolsize - deferred < quota) { + quota = poolsize - deferred; + retval = ENOSPC; } } @@ -759,15 +761,16 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, * on-disk is over quota and there are no pending changes (which * may free up space for us). 
*/ - if (used_on_disk + est_inflight > quota) { - if (est_inflight > 0 || used_on_disk < quota) - enospc = ERESTART; + if (used_on_disk + est_inflight >= quota) { + if (est_inflight > 0 || used_on_disk < quota || + (retval == ENOSPC && used_on_disk < quota + deferred)) + retval = ERESTART; dprintf_dd(dd, "failing: used=%lluK inflight = %lluK " "quota=%lluK tr=%lluK err=%d\n", used_on_disk>>10, est_inflight>>10, - quota>>10, asize>>10, enospc); + quota>>10, asize>>10, retval); mutex_exit(&dd->dd_lock); - return (enospc); + return (retval); } /* We need to up our estimated delta before dropping dd_lock */ @@ -1001,13 +1004,16 @@ dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta, static int dsl_dir_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx) { - dsl_dir_t *dd = arg1; - uint64_t *quotap = arg2; - uint64_t new_quota = *quotap; - int err = 0; + dsl_dataset_t *ds = arg1; + dsl_dir_t *dd = ds->ds_dir; + dsl_prop_setarg_t *psa = arg2; + int err; uint64_t towrite; - if (new_quota == 0) + if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0) + return (err); + + if (psa->psa_effective_value == 0) return (0); mutex_enter(&dd->dd_lock); @@ -1019,68 +1025,89 @@ dsl_dir_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx) */ towrite = dsl_dir_space_towrite(dd); if ((dmu_tx_is_syncing(tx) || towrite == 0) && - (new_quota < dd->dd_phys->dd_reserved || - new_quota < dd->dd_phys->dd_used_bytes + towrite)) { + (psa->psa_effective_value < dd->dd_phys->dd_reserved || + psa->psa_effective_value < dd->dd_phys->dd_used_bytes + towrite)) { err = ENOSPC; } mutex_exit(&dd->dd_lock); return (err); } +extern void dsl_prop_set_sync(void *, void *, cred_t *, dmu_tx_t *); + /* ARGSUSED */ static void dsl_dir_set_quota_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) { - dsl_dir_t *dd = arg1; - uint64_t *quotap = arg2; - uint64_t new_quota = *quotap; + dsl_dataset_t *ds = arg1; + dsl_dir_t *dd = ds->ds_dir; + dsl_prop_setarg_t *psa = arg2; + uint64_t effective_value = psa->psa_effective_value; + + dsl_prop_set_sync(ds, psa, cr, tx); + DSL_PROP_CHECK_PREDICTION(dd, psa); dmu_buf_will_dirty(dd->dd_dbuf, tx); mutex_enter(&dd->dd_lock); - dd->dd_phys->dd_quota = new_quota; + dd->dd_phys->dd_quota = effective_value; mutex_exit(&dd->dd_lock); spa_history_internal_log(LOG_DS_QUOTA, dd->dd_pool->dp_spa, tx, cr, "%lld dataset = %llu ", - (longlong_t)new_quota, dd->dd_phys->dd_head_dataset_obj); + (longlong_t)effective_value, dd->dd_phys->dd_head_dataset_obj); } int -dsl_dir_set_quota(const char *ddname, uint64_t quota) +dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota) { dsl_dir_t *dd; + dsl_dataset_t *ds; + dsl_prop_setarg_t psa; int err; - err = dsl_dir_open(ddname, FTAG, &dd, NULL); + dsl_prop_setarg_init_uint64(&psa, "quota", source, "a); + + err = dsl_dataset_hold(ddname, FTAG, &ds); if (err) return (err); - if (quota != dd->dd_phys->dd_quota) { - /* - * If someone removes a file, then tries to set the quota, we - * want to make sure the file freeing takes effect. - */ - txg_wait_open(dd->dd_pool, 0); - - err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_quota_check, - dsl_dir_set_quota_sync, dd, "a, 0); + err = dsl_dir_open(ddname, FTAG, &dd, NULL); + if (err) { + dsl_dataset_rele(ds, FTAG); + return (err); } + + ASSERT(ds->ds_dir == dd); + + /* + * If someone removes a file, then tries to set the quota, we want to + * make sure the file freeing takes effect. 
+ */ + txg_wait_open(dd->dd_pool, 0); + + err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_quota_check, + dsl_dir_set_quota_sync, ds, &psa, 0); + dsl_dir_close(dd, FTAG); + dsl_dataset_rele(ds, FTAG); return (err); } int dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) { - dsl_dir_t *dd = arg1; - uint64_t *reservationp = arg2; - uint64_t new_reservation = *reservationp; + dsl_dataset_t *ds = arg1; + dsl_dir_t *dd = ds->ds_dir; + dsl_prop_setarg_t *psa = arg2; + uint64_t effective_value; uint64_t used, avail; - int64_t delta; + int err; - if (new_reservation > INT64_MAX) - return (EOVERFLOW); + if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0) + return (err); + + effective_value = psa->psa_effective_value; /* * If we are doing the preliminary check in open context, the @@ -1091,8 +1118,6 @@ dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) mutex_enter(&dd->dd_lock); used = dd->dd_phys->dd_used_bytes; - delta = MAX(used, new_reservation) - - MAX(used, dd->dd_phys->dd_reserved); mutex_exit(&dd->dd_lock); if (dd->dd_parent) { @@ -1102,11 +1127,17 @@ dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used; } - if (delta > 0 && delta > avail) - return (ENOSPC); - if (delta > 0 && dd->dd_phys->dd_quota > 0 && - new_reservation > dd->dd_phys->dd_quota) - return (ENOSPC); + if (MAX(used, effective_value) > MAX(used, dd->dd_phys->dd_reserved)) { + uint64_t delta = MAX(used, effective_value) - + MAX(used, dd->dd_phys->dd_reserved); + + if (delta > avail) + return (ENOSPC); + if (dd->dd_phys->dd_quota > 0 && + effective_value > dd->dd_phys->dd_quota) + return (ENOSPC); + } + return (0); } @@ -1114,19 +1145,23 @@ dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) static void dsl_dir_set_reservation_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) { - dsl_dir_t *dd = arg1; - uint64_t *reservationp = arg2; - uint64_t new_reservation = *reservationp; + dsl_dataset_t *ds = arg1; + dsl_dir_t *dd = ds->ds_dir; + dsl_prop_setarg_t *psa = arg2; + uint64_t effective_value = psa->psa_effective_value; uint64_t used; int64_t delta; + dsl_prop_set_sync(ds, psa, cr, tx); + DSL_PROP_CHECK_PREDICTION(dd, psa); + dmu_buf_will_dirty(dd->dd_dbuf, tx); mutex_enter(&dd->dd_lock); used = dd->dd_phys->dd_used_bytes; - delta = MAX(used, new_reservation) - + delta = MAX(used, effective_value) - MAX(used, dd->dd_phys->dd_reserved); - dd->dd_phys->dd_reserved = new_reservation; + dd->dd_phys->dd_reserved = effective_value; if (dd->dd_parent != NULL) { /* Roll up this additional usage into our ancestors */ @@ -1137,21 +1172,37 @@ dsl_dir_set_reservation_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) spa_history_internal_log(LOG_DS_RESERVATION, dd->dd_pool->dp_spa, tx, cr, "%lld dataset = %llu", - (longlong_t)new_reservation, dd->dd_phys->dd_head_dataset_obj); + (longlong_t)effective_value, dd->dd_phys->dd_head_dataset_obj); } int -dsl_dir_set_reservation(const char *ddname, uint64_t reservation) +dsl_dir_set_reservation(const char *ddname, zprop_source_t source, + uint64_t reservation) { dsl_dir_t *dd; + dsl_dataset_t *ds; + dsl_prop_setarg_t psa; int err; - err = dsl_dir_open(ddname, FTAG, &dd, NULL); + dsl_prop_setarg_init_uint64(&psa, "reservation", source, &reservation); + + err = dsl_dataset_hold(ddname, FTAG, &ds); if (err) return (err); + + err = dsl_dir_open(ddname, FTAG, &dd, NULL); + if (err) { + dsl_dataset_rele(ds, FTAG); + return (err); + } + + ASSERT(ds->ds_dir == dd); + 
err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_reservation_check, - dsl_dir_set_reservation_sync, dd, &reservation, 0); + dsl_dir_set_reservation_sync, ds, &psa, 0); + dsl_dir_close(dd, FTAG); + dsl_dataset_rele(ds, FTAG); return (err); } @@ -1329,3 +1380,26 @@ dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space) return (0); } + +timestruc_t +dsl_dir_snap_cmtime(dsl_dir_t *dd) +{ + timestruc_t t; + + mutex_enter(&dd->dd_lock); + t = dd->dd_snap_cmtime; + mutex_exit(&dd->dd_lock); + + return (t); +} + +void +dsl_dir_snap_cmtime_update(dsl_dir_t *dd) +{ + timestruc_t t; + + gethrestime(&t); + mutex_enter(&dd->dd_lock); + dd->dd_snap_cmtime = t; + mutex_exit(&dd->dd_lock); +} diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_pool.c b/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_pool.c index dacc57c81c254..a4ca02e54fa83 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_pool.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_pool.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -39,7 +39,7 @@ int zfs_no_write_throttle = 0; int zfs_write_limit_shift = 3; /* 1/8th of physical memory */ -int zfs_txg_synctime = 5; /* target secs to sync a txg */ +int zfs_txg_synctime_ms = 5000; /* target millisecs to sync a txg */ uint64_t zfs_write_limit_min = 32 << 20; /* min write limit is 32MB */ uint64_t zfs_write_limit_max = 0; /* max data payload per txg */ @@ -90,6 +90,9 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg) mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&dp->dp_scrub_cancel_lock, NULL, MUTEX_DEFAULT, NULL); + dp->dp_vnrele_taskq = taskq_create("zfs_vn_rele_taskq", 1, minclsyspri, + 1, 4, 0); + return (dp); } @@ -100,13 +103,12 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) dsl_pool_t *dp = dsl_pool_open_impl(spa, txg); dsl_dir_t *dd; dsl_dataset_t *ds; - objset_impl_t *osi; rw_enter(&dp->dp_config_rwlock, RW_WRITER); - err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp, &osi); + err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp, + &dp->dp_meta_objset); if (err) goto out; - dp->dp_meta_objset = &osi->os; err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, @@ -129,16 +131,25 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) goto out; err = dsl_dataset_hold_obj(dp, dd->dd_phys->dd_head_dataset_obj, FTAG, &ds); + if (err == 0) { + err = dsl_dataset_hold_obj(dp, + ds->ds_phys->ds_prev_snap_obj, dp, + &dp->dp_origin_snap); + dsl_dataset_rele(ds, FTAG); + } + dsl_dir_close(dd, dp); if (err) goto out; - err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, - dp, &dp->dp_origin_snap); - if (err) - goto out; - dsl_dataset_rele(ds, FTAG); - dsl_dir_close(dd, dp); } + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1, + &dp->dp_tmp_userrefs_obj); + if (err == ENOENT) + err = 0; + if (err) + goto out; + /* get scrub status */ err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCRUB_FUNC, sizeof (uint32_t), 1, @@ -160,10 +171,22 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) if (err) goto out; err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4, + DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), + sizeof (dp->dp_scrub_bookmark) / sizeof 
(uint64_t), &dp->dp_scrub_bookmark); if (err) goto out; + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_DDT_BOOKMARK, sizeof (uint64_t), + sizeof (dp->dp_scrub_ddt_bookmark) / sizeof (uint64_t), + &dp->dp_scrub_ddt_bookmark); + if (err && err != ENOENT) + goto out; + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_DDT_CLASS_MAX, sizeof (uint64_t), 1, + &dp->dp_scrub_ddt_class_max); + if (err && err != ENOENT) + goto out; err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1, &spa->spa_scrub_errors); @@ -215,7 +238,7 @@ dsl_pool_close(dsl_pool_t *dp) /* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */ if (dp->dp_meta_objset) - dmu_objset_evict(NULL, dp->dp_meta_objset->os); + dmu_objset_evict(dp->dp_meta_objset); txg_list_destroy(&dp->dp_dirty_datasets); txg_list_destroy(&dp->dp_dirty_dirs); @@ -226,6 +249,7 @@ dsl_pool_close(dsl_pool_t *dp) rw_destroy(&dp->dp_config_rwlock); mutex_destroy(&dp->dp_lock); mutex_destroy(&dp->dp_scrub_cancel_lock); + taskq_destroy(dp->dp_vnrele_taskq); if (dp->dp_blkstats) kmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t)); kmem_free(dp, sizeof (dsl_pool_t)); @@ -237,13 +261,13 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg) int err; dsl_pool_t *dp = dsl_pool_open_impl(spa, txg); dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg); - objset_impl_t *osip; + objset_t *os; dsl_dataset_t *ds; uint64_t dsobj; /* create and open the MOS (meta-objset) */ - dp->dp_meta_objset = &dmu_objset_create_impl(spa, - NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx)->os; + dp->dp_meta_objset = dmu_objset_create_impl(spa, + NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx); /* create the pool directory */ err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, @@ -268,10 +292,10 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg) /* create the root objset */ VERIFY(0 == dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); - osip = dmu_objset_create_impl(dp->dp_spa, ds, + os = dmu_objset_create_impl(dp->dp_spa, ds, dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx); #ifdef _KERNEL - zfs_create_fs(&osip->os, kcred, zplprops, tx); + zfs_create_fs(os, kcred, zplprops, tx); #endif dsl_dataset_rele(ds, FTAG); @@ -288,7 +312,7 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) dsl_dir_t *dd; dsl_dataset_t *ds; dsl_sync_task_group_t *dstg; - objset_impl_t *mosi = dp->dp_meta_objset->os; + objset_t *mos = dp->dp_meta_objset; hrtime_t start, write_time; uint64_t data_written; int err; @@ -296,24 +320,61 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) tx = dmu_tx_create_assigned(dp, txg); dp->dp_read_overhead = 0; + start = gethrtime(); + zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) { - if (!list_link_active(&ds->ds_synced_link)) - list_insert_tail(&dp->dp_synced_datasets, ds); - else - dmu_buf_rele(ds->ds_dbuf, ds); + /* + * We must not sync any non-MOS datasets twice, because + * we may have taken a snapshot of them. However, we + * may sync newly-created datasets on pass 2. 
+ */ + ASSERT(!list_link_active(&ds->ds_synced_link)); + list_insert_tail(&dp->dp_synced_datasets, ds); dsl_dataset_sync(ds, zio, tx); } DTRACE_PROBE(pool_sync__1setup); - - start = gethrtime(); err = zio_wait(zio); + write_time = gethrtime() - start; ASSERT(err == 0); DTRACE_PROBE(pool_sync__2rootzio); - while (dstg = txg_list_remove(&dp->dp_sync_tasks, txg)) + for (ds = list_head(&dp->dp_synced_datasets); ds; + ds = list_next(&dp->dp_synced_datasets, ds)) + dmu_objset_do_userquota_callbacks(ds->ds_objset, tx); + + /* + * Sync the datasets again to push out the changes due to + * userquota updates. This must be done before we process the + * sync tasks, because that could cause a snapshot of a dataset + * whose ds_bp will be rewritten when we do this 2nd sync. + */ + zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); + while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) { + ASSERT(list_link_active(&ds->ds_synced_link)); + dmu_buf_rele(ds->ds_dbuf, ds); + dsl_dataset_sync(ds, zio, tx); + } + err = zio_wait(zio); + + /* + * If anything was added to a deadlist during a zio done callback, + * it had to be put on the deferred queue. Enqueue it for real now. + */ + for (ds = list_head(&dp->dp_synced_datasets); ds; + ds = list_next(&dp->dp_synced_datasets, ds)) + bplist_sync(&ds->ds_deadlist, + bplist_enqueue_cb, &ds->ds_deadlist, tx); + + while (dstg = txg_list_remove(&dp->dp_sync_tasks, txg)) { + /* + * No more sync tasks should have been added while we + * were syncing. + */ + ASSERT(spa_sync_pass(dp->dp_spa) == 1); dsl_sync_task_group_sync(dstg, tx); + } DTRACE_PROBE(pool_sync__3task); start = gethrtime(); @@ -321,14 +382,18 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) dsl_dir_sync(dd, tx); write_time += gethrtime() - start; - if (spa_sync_pass(dp->dp_spa) == 1) + if (spa_sync_pass(dp->dp_spa) == 1) { + dp->dp_scrub_prefetch_zio_root = zio_root(dp->dp_spa, NULL, + NULL, ZIO_FLAG_CANFAIL); dsl_pool_scrub_sync(dp, tx); + (void) zio_wait(dp->dp_scrub_prefetch_zio_root); + } start = gethrtime(); - if (list_head(&mosi->os_dirty_dnodes[txg & TXG_MASK]) != NULL || - list_head(&mosi->os_free_dnodes[txg & TXG_MASK]) != NULL) { + if (list_head(&mos->os_dirty_dnodes[txg & TXG_MASK]) != NULL || + list_head(&mos->os_free_dnodes[txg & TXG_MASK]) != NULL) { zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); - dmu_objset_sync(mosi, zio, tx); + dmu_objset_sync(mos, zio, tx); err = zio_wait(zio); ASSERT(err == 0); dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", ""); @@ -366,10 +431,14 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) * amount of write traffic allowed into each transaction group. 
* Weight the throughput calculation towards the current value: * thru = 3/4 old_thru + 1/4 new_thru + * + * Note: write_time is in nanosecs, so write_time/MICROSEC + * yields millisecs */ ASSERT(zfs_write_limit_min > 0); - if (data_written > zfs_write_limit_min / 8 && write_time > 0) { - uint64_t throughput = (data_written * NANOSEC) / write_time; + if (data_written > zfs_write_limit_min / 8 && write_time > MICROSEC) { + uint64_t throughput = data_written / (write_time / MICROSEC); + if (dp->dp_throughput) dp->dp_throughput = throughput / 4 + 3 * dp->dp_throughput / 4; @@ -377,21 +446,24 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) dp->dp_throughput = throughput; dp->dp_write_limit = MIN(zfs_write_limit_inflated, MAX(zfs_write_limit_min, - dp->dp_throughput * zfs_txg_synctime)); + dp->dp_throughput * zfs_txg_synctime_ms)); } } void -dsl_pool_zil_clean(dsl_pool_t *dp) +dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg) { dsl_dataset_t *ds; + objset_t *os; while (ds = list_head(&dp->dp_synced_datasets)) { list_remove(&dp->dp_synced_datasets, ds); - ASSERT(ds->ds_user_ptr != NULL); - zil_clean(((objset_impl_t *)ds->ds_user_ptr)->os_zil); + os = ds->ds_objset; + zil_clean(os->os_zil); + ASSERT(!dmu_objset_is_dirty(os, txg)); dmu_buf_rele(ds->ds_dbuf, ds); } + ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg)); } /* @@ -568,6 +640,7 @@ upgrade_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) ASSERT(ds->ds_phys->ds_prev_snap_obj == prev->ds_object); if (prev->ds_phys->ds_next_clones_obj == 0) { + dmu_buf_will_dirty(prev->ds_dbuf, tx); prev->ds_phys->ds_next_clones_obj = zap_create(dp->dp_meta_objset, DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx); @@ -587,8 +660,8 @@ dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx) ASSERT(dmu_tx_is_syncing(tx)); ASSERT(dp->dp_origin_snap != NULL); - (void) dmu_objset_find_spa(dp->dp_spa, NULL, upgrade_clones_cb, - tx, DS_FIND_CHILDREN); + VERIFY3U(0, ==, dmu_objset_find_spa(dp->dp_spa, NULL, upgrade_clones_cb, + tx, DS_FIND_CHILDREN)); } void @@ -611,3 +684,114 @@ dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx) dsl_dataset_rele(ds, FTAG); rw_exit(&dp->dp_config_rwlock); } + +taskq_t * +dsl_pool_vnrele_taskq(dsl_pool_t *dp) +{ + return (dp->dp_vnrele_taskq); +} + +/* + * Walk through the pool-wide zap object of temporary snapshot user holds + * and release them. + */ +void +dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp) +{ + zap_attribute_t za; + zap_cursor_t zc; + objset_t *mos = dp->dp_meta_objset; + uint64_t zapobj = dp->dp_tmp_userrefs_obj; + + if (zapobj == 0) + return; + ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); + + for (zap_cursor_init(&zc, mos, zapobj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + char *htag; + uint64_t dsobj; + + htag = strchr(za.za_name, '-'); + *htag = '\0'; + ++htag; + dsobj = strtonum(za.za_name, NULL); + (void) dsl_dataset_user_release_tmp(dp, dsobj, htag); + } + zap_cursor_fini(&zc); +} + +/* + * Create the pool-wide zap object for storing temporary snapshot holds. 
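+ *
+ * A sketch of an entry's layout (hypothetical name, matching the
+ * "%llx-%s" format used by dsl_pool_user_hold_rele_impl() below):
+ *
+ *	"1a3-mytag" -> caller-supplied uint64_t (the hold's timestamp)
+ *
+ * dsl_pool_clean_tmp_userrefs() above splits such a name at the first
+ * '-' to recover the dataset object number and the tag.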
+ */ +void +dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx) +{ + objset_t *mos = dp->dp_meta_objset; + + ASSERT(dp->dp_tmp_userrefs_obj == 0); + ASSERT(dmu_tx_is_syncing(tx)); + + dp->dp_tmp_userrefs_obj = zap_create(mos, DMU_OT_USERREFS, + DMU_OT_NONE, 0, tx); + + VERIFY(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, + sizeof (uint64_t), 1, &dp->dp_tmp_userrefs_obj, tx) == 0); +} + +static int +dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj, + const char *tag, uint64_t *now, dmu_tx_t *tx, boolean_t holding) +{ + objset_t *mos = dp->dp_meta_objset; + uint64_t zapobj = dp->dp_tmp_userrefs_obj; + char *name; + int error; + + ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); + ASSERT(dmu_tx_is_syncing(tx)); + + /* + * If the pool was created prior to SPA_VERSION_USERREFS, the + * zap object for temporary holds might not exist yet. + */ + if (zapobj == 0) { + if (holding) { + dsl_pool_user_hold_create_obj(dp, tx); + zapobj = dp->dp_tmp_userrefs_obj; + } else { + return (ENOENT); + } + } + + name = kmem_asprintf("%llx-%s", (u_longlong_t)dsobj, tag); + if (holding) + error = zap_add(mos, zapobj, name, 8, 1, now, tx); + else + error = zap_remove(mos, zapobj, name, tx); + strfree(name); + + return (error); +} + +/* + * Add a temporary hold for the given dataset object and tag. + */ +int +dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, const char *tag, + uint64_t *now, dmu_tx_t *tx) +{ + return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, now, tx, B_TRUE)); +} + +/* + * Release a temporary hold for the given dataset object and tag. + */ +int +dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag, + dmu_tx_t *tx) +{ + return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, NULL, + tx, B_FALSE)); +} diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_prop.c b/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_prop.c index 212acbbc59688..f27305c953229 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_prop.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_prop.c @@ -19,12 +19,11 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - +#include #include #include #include @@ -33,14 +32,16 @@ #include #include #include -#include /* for the default checksum value */ #include #include #include "zfs_prop.h" +#define ZPROP_INHERIT_SUFFIX "$inherit" +#define ZPROP_RECVD_SUFFIX "$recvd" + static int -dodefault(const char *propname, int intsz, int numint, void *buf) +dodefault(const char *propname, int intsz, int numints, void *buf) { zfs_prop_t prop; @@ -57,9 +58,9 @@ dodefault(const char *propname, int intsz, int numint, void *buf) if (intsz != 1) return (EOVERFLOW); (void) strncpy(buf, zfs_prop_default_string(prop), - numint); + numints); } else { - if (intsz != 8 || numint < 1) + if (intsz != 8 || numints < 1) return (EOVERFLOW); *(uint64_t *)buf = zfs_prop_default_numeric(prop); @@ -70,11 +71,16 @@ dodefault(const char *propname, int intsz, int numint, void *buf) int dsl_prop_get_dd(dsl_dir_t *dd, const char *propname, - int intsz, int numint, void *buf, char *setpoint) + int intsz, int numints, void *buf, char *setpoint, boolean_t snapshot) { int err = ENOENT; + dsl_dir_t *target = dd; objset_t *mos = dd->dd_pool->dp_meta_objset; zfs_prop_t prop; + boolean_t inheritable; + boolean_t inheriting = B_FALSE; + char *inheritstr; + char *recvdstr; ASSERT(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock)); @@ -82,51 +88,135 @@ dsl_prop_get_dd(dsl_dir_t *dd, const char *propname, setpoint[0] = '\0'; prop = zfs_name_to_prop(propname); + inheritable = (prop == ZPROP_INVAL || zfs_prop_inheritable(prop)); + inheritstr = kmem_asprintf("%s%s", propname, ZPROP_INHERIT_SUFFIX); + recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX); /* - * Note: dd may be NULL, therefore we shouldn't dereference it - * ouside this loop. + * Note: dd may become NULL, therefore we shouldn't dereference it + * after this loop. */ for (; dd != NULL; dd = dd->dd_parent) { ASSERT(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock)); - err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj, - propname, intsz, numint, buf); + + if (dd != target || snapshot) { + if (!inheritable) + break; + inheriting = B_TRUE; + } + + /* Check for a local value. */ + err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj, propname, + intsz, numints, buf); if (err != ENOENT) { - if (setpoint) + if (setpoint != NULL && err == 0) dsl_dir_name(dd, setpoint); break; } /* - * Break out of this loop for non-inheritable properties. + * Skip the check for a received value if there is an explicit + * inheritance entry. */ - if (prop != ZPROP_INVAL && !zfs_prop_inheritable(prop)) + err = zap_contains(mos, dd->dd_phys->dd_props_zapobj, + inheritstr); + if (err != 0 && err != ENOENT) break; + + if (err == ENOENT) { + /* Check for a received value. */ + err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj, + recvdstr, intsz, numints, buf); + if (err != ENOENT) { + if (setpoint != NULL && err == 0) { + if (inheriting) { + dsl_dir_name(dd, setpoint); + } else { + (void) strcpy(setpoint, + ZPROP_SOURCE_VAL_RECVD); + } + } + break; + } + } + + /* + * If we found an explicit inheritance entry, err is zero even + * though we haven't yet found the value, so reinitializing err + * at the end of the loop (instead of at the beginning) ensures + * that err has a valid post-loop value. 
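+ *
+ * To summarize the loop (a restatement, not a behavior change), the
+ * lookup order at each dsl_dir is:
+ *
+ *	1. <prop>		local value		-> done
+ *	2. <prop>$inherit	explicitly inherited	-> skip $recvd here
+ *	3. <prop>$recvd		received value		-> done
+ *	4. otherwise walk up to dd_parent (inheritable properties only),
+ *	   falling back to dodefault() if nothing is found anywhere.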
+ */ + err = ENOENT; } + if (err == ENOENT) - err = dodefault(propname, intsz, numint, buf); + err = dodefault(propname, intsz, numints, buf); + + strfree(inheritstr); + strfree(recvdstr); return (err); } int dsl_prop_get_ds(dsl_dataset_t *ds, const char *propname, - int intsz, int numint, void *buf, char *setpoint) + int intsz, int numints, void *buf, char *setpoint) { + zfs_prop_t prop = zfs_name_to_prop(propname); + boolean_t inheritable; + boolean_t snapshot; + uint64_t zapobj; + ASSERT(RW_LOCK_HELD(&ds->ds_dir->dd_pool->dp_config_rwlock)); + inheritable = (prop == ZPROP_INVAL || zfs_prop_inheritable(prop)); + snapshot = (ds->ds_phys != NULL && dsl_dataset_is_snapshot(ds)); + zapobj = (ds->ds_phys == NULL ? 0 : ds->ds_phys->ds_props_obj); - if (ds->ds_phys->ds_props_obj) { - int err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, - ds->ds_phys->ds_props_obj, propname, intsz, numint, buf); + if (zapobj != 0) { + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + int err; + + ASSERT(snapshot); + + /* Check for a local value. */ + err = zap_lookup(mos, zapobj, propname, intsz, numints, buf); if (err != ENOENT) { - if (setpoint) + if (setpoint != NULL && err == 0) dsl_dataset_name(ds, setpoint); return (err); } + + /* + * Skip the check for a received value if there is an explicit + * inheritance entry. + */ + if (inheritable) { + char *inheritstr = kmem_asprintf("%s%s", propname, + ZPROP_INHERIT_SUFFIX); + err = zap_contains(mos, zapobj, inheritstr); + strfree(inheritstr); + if (err != 0 && err != ENOENT) + return (err); + } + + if (err == ENOENT) { + /* Check for a received value. */ + char *recvdstr = kmem_asprintf("%s%s", propname, + ZPROP_RECVD_SUFFIX); + err = zap_lookup(mos, zapobj, recvdstr, + intsz, numints, buf); + strfree(recvdstr); + if (err != ENOENT) { + if (setpoint != NULL && err == 0) + (void) strcpy(setpoint, + ZPROP_SOURCE_VAL_RECVD); + return (err); + } + } } return (dsl_prop_get_dd(ds->ds_dir, propname, - intsz, numint, buf, setpoint)); + intsz, numints, buf, setpoint, snapshot)); } /* @@ -212,6 +302,137 @@ dsl_prop_get_integer(const char *ddname, const char *propname, return (dsl_prop_get(ddname, propname, 8, 1, valuep, setpoint)); } +void +dsl_prop_setarg_init_uint64(dsl_prop_setarg_t *psa, const char *propname, + zprop_source_t source, uint64_t *value) +{ + psa->psa_name = propname; + psa->psa_source = source; + psa->psa_intsz = 8; + psa->psa_numints = 1; + psa->psa_value = value; + + psa->psa_effective_value = -1ULL; +} + +/* + * Predict the effective value of the given special property if it were set with + * the given value and source. This is not a general purpose function. It exists + * only to handle the special requirements of the quota and reservation + * properties. The fact that these properties are non-inheritable greatly + * simplifies the prediction logic. + * + * Returns 0 on success, a positive error code on failure, or -1 if called with + * a property not handled by this function. 
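+ *
+ * For example (restating the switch below with hypothetical values):
+ * receiving quota=10G on a dataset that already has a local quota=20G
+ * predicts an effective value of 20G, since the local setting still
+ * wins; "zfs inherit -S quota" (ZPROP_SRC_NONE) predicts the received
+ * value if one exists and 0 otherwise.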
+ */ +int +dsl_prop_predict_sync(dsl_dir_t *dd, dsl_prop_setarg_t *psa) +{ + const char *propname = psa->psa_name; + zfs_prop_t prop = zfs_name_to_prop(propname); + zprop_source_t source = psa->psa_source; + objset_t *mos; + uint64_t zapobj; + uint64_t version; + char *recvdstr; + int err = 0; + + switch (prop) { + case ZFS_PROP_QUOTA: + case ZFS_PROP_RESERVATION: + case ZFS_PROP_REFQUOTA: + case ZFS_PROP_REFRESERVATION: + break; + default: + return (-1); + } + + mos = dd->dd_pool->dp_meta_objset; + zapobj = dd->dd_phys->dd_props_zapobj; + recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX); + + version = spa_version(dd->dd_pool->dp_spa); + if (version < SPA_VERSION_RECVD_PROPS) { + if (source & ZPROP_SRC_NONE) + source = ZPROP_SRC_NONE; + else if (source & ZPROP_SRC_RECEIVED) + source = ZPROP_SRC_LOCAL; + } + + switch (source) { + case ZPROP_SRC_NONE: + /* Revert to the received value, if any. */ + err = zap_lookup(mos, zapobj, recvdstr, 8, 1, + &psa->psa_effective_value); + if (err == ENOENT) + psa->psa_effective_value = 0; + break; + case ZPROP_SRC_LOCAL: + psa->psa_effective_value = *(uint64_t *)psa->psa_value; + break; + case ZPROP_SRC_RECEIVED: + /* + * If there's no local setting, then the new received value will + * be the effective value. + */ + err = zap_lookup(mos, zapobj, propname, 8, 1, + &psa->psa_effective_value); + if (err == ENOENT) + psa->psa_effective_value = *(uint64_t *)psa->psa_value; + break; + case (ZPROP_SRC_NONE | ZPROP_SRC_RECEIVED): + /* + * We're clearing the received value, so the local setting (if + * it exists) remains the effective value. + */ + err = zap_lookup(mos, zapobj, propname, 8, 1, + &psa->psa_effective_value); + if (err == ENOENT) + psa->psa_effective_value = 0; + break; + default: + cmn_err(CE_PANIC, "unexpected property source: %d", source); + } + + strfree(recvdstr); + + if (err == ENOENT) + return (0); + + return (err); +} + +#ifdef ZFS_DEBUG +void +dsl_prop_check_prediction(dsl_dir_t *dd, dsl_prop_setarg_t *psa) +{ + zfs_prop_t prop = zfs_name_to_prop(psa->psa_name); + uint64_t intval; + char setpoint[MAXNAMELEN]; + uint64_t version = spa_version(dd->dd_pool->dp_spa); + int err; + + if (version < SPA_VERSION_RECVD_PROPS) { + switch (prop) { + case ZFS_PROP_QUOTA: + case ZFS_PROP_RESERVATION: + return; + } + } + + err = dsl_prop_get_dd(dd, psa->psa_name, 8, 1, &intval, + setpoint, B_FALSE); + if (err == 0 && intval != psa->psa_effective_value) { + cmn_err(CE_PANIC, "%s property, source: %x, " + "predicted effective value: %llu, " + "actual effective value: %llu (setpoint: %s)", + psa->psa_name, psa->psa_source, + (unsigned long long)psa->psa_effective_value, + (unsigned long long)intval, setpoint); + } +} +#endif + /* * Unregister this callback. Return 0 on success, ENOENT if ddname is * invalid, ENOMSG if no matching callback registered. @@ -279,7 +500,6 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj, zap_cursor_t zc; zap_attribute_t *za; int err; - uint64_t dummyval; ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock)); err = dsl_dir_open_obj(dp, ddobj, NULL, FTAG, &dd); @@ -291,8 +511,7 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj, * If the prop is set here, then this change is not * being inherited here or below; stop the recursion. 
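 *
 * For example (hypothetical names): a change on pool/a is propagated
 * to callbacks registered on pool/a/b and pool/a/c, but if pool/a/b
 * has its own local setting the recursion stops there and pool/a/b's
 * descendants are not notified.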
*/ - err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj, propname, - 8, 1, &dummyval); + err = zap_contains(mos, dd->dd_phys->dd_props_zapobj, propname); if (err == 0) { dsl_dir_close(dd, FTAG); return; @@ -312,8 +531,7 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj, * If the property is set on this ds, then it is not * inherited here; don't call the callback. */ - if (propobj && 0 == zap_lookup(mos, propobj, propname, - 8, 1, &dummyval)) + if (propobj && 0 == zap_contains(mos, propobj, propname)) continue; cbr->cbr_func(cbr->cbr_arg, value); @@ -333,30 +551,28 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj, dsl_dir_close(dd, FTAG); } -struct prop_set_arg { - const char *name; - int intsz; - int numints; - const void *buf; -}; - - -static void +void dsl_prop_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; - struct prop_set_arg *psa = arg2; + dsl_prop_setarg_t *psa = arg2; objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; - uint64_t zapobj, intval; + uint64_t zapobj, intval, dummy; int isint; char valbuf[32]; - char *valstr; + char *valstr = NULL; + char *inheritstr; + char *recvdstr; + char *tbuf = NULL; + int err; + uint64_t version = spa_version(ds->ds_dir->dd_pool->dp_spa); + const char *propname = psa->psa_name; + zprop_source_t source = psa->psa_source; - isint = (dodefault(psa->name, 8, 1, &intval) == 0); + isint = (dodefault(propname, 8, 1, &intval) == 0); - if (dsl_dataset_is_snapshot(ds)) { - ASSERT(spa_version(ds->ds_dir->dd_pool->dp_spa) >= - SPA_VERSION_SNAP_PROPS); + if (ds->ds_phys != NULL && dsl_dataset_is_snapshot(ds)) { + ASSERT(version >= SPA_VERSION_SNAP_PROPS); if (ds->ds_phys->ds_props_obj == 0) { dmu_buf_will_dirty(ds->ds_dbuf, tx); ds->ds_phys->ds_props_obj = @@ -368,22 +584,96 @@ dsl_prop_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) zapobj = ds->ds_dir->dd_phys->dd_props_zapobj; } - if (psa->numints == 0) { - int err = zap_remove(mos, zapobj, psa->name, tx); + if (version < SPA_VERSION_RECVD_PROPS) { + zfs_prop_t prop = zfs_name_to_prop(propname); + if (prop == ZFS_PROP_QUOTA || prop == ZFS_PROP_RESERVATION) + return; + + if (source & ZPROP_SRC_NONE) + source = ZPROP_SRC_NONE; + else if (source & ZPROP_SRC_RECEIVED) + source = ZPROP_SRC_LOCAL; + } + + inheritstr = kmem_asprintf("%s%s", propname, ZPROP_INHERIT_SUFFIX); + recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX); + + switch (source) { + case ZPROP_SRC_NONE: + /* + * revert to received value, if any (inherit -S) + * - remove propname + * - remove propname$inherit + */ + err = zap_remove(mos, zapobj, propname, tx); + ASSERT(err == 0 || err == ENOENT); + err = zap_remove(mos, zapobj, inheritstr, tx); + ASSERT(err == 0 || err == ENOENT); + break; + case ZPROP_SRC_LOCAL: + /* + * remove propname$inherit + * set propname -> value + */ + err = zap_remove(mos, zapobj, inheritstr, tx); ASSERT(err == 0 || err == ENOENT); - if (isint) { - VERIFY(0 == dsl_prop_get_ds(ds, - psa->name, 8, 1, &intval, NULL)); + VERIFY(0 == zap_update(mos, zapobj, propname, + psa->psa_intsz, psa->psa_numints, psa->psa_value, tx)); + break; + case ZPROP_SRC_INHERITED: + /* + * explicitly inherit + * - remove propname + * - set propname$inherit + */ + err = zap_remove(mos, zapobj, propname, tx); + ASSERT(err == 0 || err == ENOENT); + if (version >= SPA_VERSION_RECVD_PROPS && + zap_contains(mos, zapobj, ZPROP_HAS_RECVD) == 0) { + dummy = 0; + err = zap_update(mos, zapobj, inheritstr, + 8, 1, &dummy, tx); + ASSERT(err == 0); } - } else { - VERIFY(0 == 
zap_update(mos, zapobj, psa->name, - psa->intsz, psa->numints, psa->buf, tx)); - if (isint) - intval = *(uint64_t *)psa->buf; + break; + case ZPROP_SRC_RECEIVED: + /* + * set propname$recvd -> value + */ + err = zap_update(mos, zapobj, recvdstr, + psa->psa_intsz, psa->psa_numints, psa->psa_value, tx); + ASSERT(err == 0); + break; + case (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED): + /* + * clear local and received settings + * - remove propname + * - remove propname$inherit + * - remove propname$recvd + */ + err = zap_remove(mos, zapobj, propname, tx); + ASSERT(err == 0 || err == ENOENT); + err = zap_remove(mos, zapobj, inheritstr, tx); + ASSERT(err == 0 || err == ENOENT); + /* FALLTHRU */ + case (ZPROP_SRC_NONE | ZPROP_SRC_RECEIVED): + /* + * remove propname$recvd + */ + err = zap_remove(mos, zapobj, recvdstr, tx); + ASSERT(err == 0 || err == ENOENT); + break; + default: + cmn_err(CE_PANIC, "unexpected property source: %d", source); } + strfree(inheritstr); + strfree(recvdstr); + if (isint) { - if (dsl_dataset_is_snapshot(ds)) { + VERIFY(0 == dsl_prop_get_ds(ds, propname, 8, 1, &intval, NULL)); + + if (ds->ds_phys != NULL && dsl_dataset_is_snapshot(ds)) { dsl_prop_cb_record_t *cbr; /* * It's a snapshot; nothing can inherit this @@ -394,29 +684,84 @@ dsl_prop_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) for (cbr = list_head(&ds->ds_dir->dd_prop_cbs); cbr; cbr = list_next(&ds->ds_dir->dd_prop_cbs, cbr)) { if (cbr->cbr_ds == ds && - strcmp(cbr->cbr_propname, psa->name) == 0) + strcmp(cbr->cbr_propname, propname) == 0) cbr->cbr_func(cbr->cbr_arg, intval); } mutex_exit(&ds->ds_dir->dd_lock); } else { dsl_prop_changed_notify(ds->ds_dir->dd_pool, - ds->ds_dir->dd_object, psa->name, intval, TRUE); + ds->ds_dir->dd_object, propname, intval, TRUE); } - } - if (isint) { + (void) snprintf(valbuf, sizeof (valbuf), "%lld", (longlong_t)intval); valstr = valbuf; } else { - valstr = (char *)psa->buf; + if (source == ZPROP_SRC_LOCAL) { + valstr = (char *)psa->psa_value; + } else { + tbuf = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP); + if (dsl_prop_get_ds(ds, propname, 1, + ZAP_MAXVALUELEN, tbuf, NULL) == 0) + valstr = tbuf; + } } - spa_history_internal_log((psa->numints == 0) ? LOG_DS_INHERIT : + + spa_history_internal_log((source == ZPROP_SRC_NONE || + source == ZPROP_SRC_INHERITED) ? LOG_DS_INHERIT : LOG_DS_PROPSET, ds->ds_dir->dd_pool->dp_spa, tx, cr, - "%s=%s dataset = %llu", psa->name, valstr, ds->ds_object); + "%s=%s dataset = %llu", propname, + (valstr == NULL ? "" : valstr), ds->ds_object); + + if (tbuf != NULL) + kmem_free(tbuf, ZAP_MAXVALUELEN); } void -dsl_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val, +dsl_props_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + dsl_props_arg_t *pa = arg2; + nvlist_t *props = pa->pa_props; + dsl_prop_setarg_t psa; + nvpair_t *elem = NULL; + + psa.psa_source = pa->pa_source; + + while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { + nvpair_t *pair = elem; + + psa.psa_name = nvpair_name(pair); + + if (nvpair_type(pair) == DATA_TYPE_NVLIST) { + /* + * dsl_prop_get_all_impl() returns properties in this + * format. 
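+ *
+ * That is, a pair may be a nested nvlist of the form (sketch):
+ *
+ *	"compression" -> { ZPROP_VALUE -> "on", ZPROP_SOURCE -> <setpoint> }
+ *
+ * in which case the embedded ZPROP_VALUE pair is the one applied below.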
+ */ + nvlist_t *attrs; + VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); + VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, + &pair) == 0); + } + + if (nvpair_type(pair) == DATA_TYPE_STRING) { + VERIFY(nvpair_value_string(pair, + (char **)&psa.psa_value) == 0); + psa.psa_intsz = 1; + psa.psa_numints = strlen(psa.psa_value) + 1; + } else { + uint64_t intval; + VERIFY(nvpair_value_uint64(pair, &intval) == 0); + psa.psa_intsz = sizeof (intval); + psa.psa_numints = 1; + psa.psa_value = &intval; + } + dsl_prop_set_sync(ds, &psa, cr, tx); + } +} + +void +dsl_dir_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val, cred_t *cr, dmu_tx_t *tx) { objset_t *mos = dd->dd_pool->dp_meta_objset; @@ -434,12 +779,13 @@ dsl_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val, } int -dsl_prop_set(const char *dsname, const char *propname, +dsl_prop_set(const char *dsname, const char *propname, zprop_source_t source, int intsz, int numints, const void *buf) { dsl_dataset_t *ds; + uint64_t version; int err; - struct prop_set_arg psa; + dsl_prop_setarg_t psa; /* * We must do these checks before we get to the syncfunc, since @@ -447,23 +793,30 @@ dsl_prop_set(const char *dsname, const char *propname, */ if (strlen(propname) >= ZAP_MAXNAMELEN) return (ENAMETOOLONG); - if (intsz * numints >= ZAP_MAXVALUELEN) - return (E2BIG); err = dsl_dataset_hold(dsname, FTAG, &ds); if (err) return (err); + version = spa_version(ds->ds_dir->dd_pool->dp_spa); + if (intsz * numints >= (version < SPA_VERSION_STMF_PROP ? + ZAP_OLDMAXVALUELEN : ZAP_MAXVALUELEN)) { + dsl_dataset_rele(ds, FTAG); + return (E2BIG); + } if (dsl_dataset_is_snapshot(ds) && - spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_SNAP_PROPS) { + version < SPA_VERSION_SNAP_PROPS) { dsl_dataset_rele(ds, FTAG); return (ENOTSUP); } - psa.name = propname; - psa.intsz = intsz; - psa.numints = numints; - psa.buf = buf; + psa.psa_name = propname; + psa.psa_source = source; + psa.psa_intsz = intsz; + psa.psa_numints = numints; + psa.psa_value = buf; + psa.psa_effective_value = -1ULL; + err = dsl_sync_task_do(ds->ds_dir->dd_pool, NULL, dsl_prop_set_sync, ds, &psa, 2); @@ -471,122 +824,319 @@ dsl_prop_set(const char *dsname, const char *propname, return (err); } -/* - * Iterate over all properties for this dataset and return them in an nvlist. - */ int -dsl_prop_get_all(objset_t *os, nvlist_t **nvp, boolean_t local) +dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *props) { - dsl_dataset_t *ds = os->os->os_dsl_dataset; - dsl_dir_t *dd = ds->ds_dir; - boolean_t snapshot = dsl_dataset_is_snapshot(ds); - int err = 0; - dsl_pool_t *dp = dd->dd_pool; - objset_t *mos = dp->dp_meta_objset; - uint64_t propobj = ds->ds_phys->ds_props_obj; + dsl_dataset_t *ds; + uint64_t version; + nvpair_t *elem = NULL; + dsl_props_arg_t pa; + int err; - VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); + if (err = dsl_dataset_hold(dsname, FTAG, &ds)) + return (err); + /* + * Do these checks before the syncfunc, since it can't fail. + */ + version = spa_version(ds->ds_dir->dd_pool->dp_spa); + while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { + if (strlen(nvpair_name(elem)) >= ZAP_MAXNAMELEN) { + dsl_dataset_rele(ds, FTAG); + return (ENAMETOOLONG); + } + if (nvpair_type(elem) == DATA_TYPE_STRING) { + char *valstr; + VERIFY(nvpair_value_string(elem, &valstr) == 0); + if (strlen(valstr) >= (version < + SPA_VERSION_STMF_PROP ? 
+ ZAP_OLDMAXVALUELEN : ZAP_MAXVALUELEN)) { + dsl_dataset_rele(ds, FTAG); + return (E2BIG); + } + } + } - if (local && snapshot && !propobj) - return (0); + if (dsl_dataset_is_snapshot(ds) && + version < SPA_VERSION_SNAP_PROPS) { + dsl_dataset_rele(ds, FTAG); + return (ENOTSUP); + } - rw_enter(&dp->dp_config_rwlock, RW_READER); - while (dd != NULL) { - char setpoint[MAXNAMELEN]; - zap_cursor_t zc; - zap_attribute_t za; - dsl_dir_t *dd_next; - - if (propobj) { - dsl_dataset_name(ds, setpoint); - dd_next = dd; - } else { - dsl_dir_name(dd, setpoint); - propobj = dd->dd_phys->dd_props_zapobj; - dd_next = dd->dd_parent; - } + pa.pa_props = props; + pa.pa_source = source; - for (zap_cursor_init(&zc, mos, propobj); - (err = zap_cursor_retrieve(&zc, &za)) == 0; - zap_cursor_advance(&zc)) { - nvlist_t *propval; - zfs_prop_t prop = zfs_name_to_prop(za.za_name); + err = dsl_sync_task_do(ds->ds_dir->dd_pool, + NULL, dsl_props_set_sync, ds, &pa, 2); - /* Skip non-inheritable properties. */ - if (prop != ZPROP_INVAL && - !zfs_prop_inheritable(prop) && - (dd != ds->ds_dir || (snapshot && dd != dd_next))) - continue; + dsl_dataset_rele(ds, FTAG); + return (err); +} - /* Skip properties not valid for this type. */ - if (snapshot && prop != ZPROP_INVAL && - !zfs_prop_valid_for_type(prop, ZFS_TYPE_SNAPSHOT)) +typedef enum dsl_prop_getflags { + DSL_PROP_GET_INHERITING = 0x1, /* searching parent of target ds */ + DSL_PROP_GET_SNAPSHOT = 0x2, /* snapshot dataset */ + DSL_PROP_GET_LOCAL = 0x4, /* local properties */ + DSL_PROP_GET_RECEIVED = 0x8 /* received properties */ +} dsl_prop_getflags_t; + +static int +dsl_prop_get_all_impl(objset_t *mos, uint64_t propobj, + const char *setpoint, dsl_prop_getflags_t flags, nvlist_t *nv) +{ + zap_cursor_t zc; + zap_attribute_t za; + int err = 0; + + for (zap_cursor_init(&zc, mos, propobj); + (err = zap_cursor_retrieve(&zc, &za)) == 0; + zap_cursor_advance(&zc)) { + nvlist_t *propval; + zfs_prop_t prop; + char buf[ZAP_MAXNAMELEN]; + char *valstr; + const char *suffix; + const char *propname; + const char *source; + + suffix = strchr(za.za_name, '$'); + + if (suffix == NULL) { + /* + * Skip local properties if we only want received + * properties. + */ + if (flags & DSL_PROP_GET_RECEIVED) continue; - /* Skip properties already defined */ - if (nvlist_lookup_nvlist(*nvp, za.za_name, - &propval) == 0) + propname = za.za_name; + source = setpoint; + } else if (strcmp(suffix, ZPROP_INHERIT_SUFFIX) == 0) { + /* Skip explicitly inherited entries. */ + continue; + } else if (strcmp(suffix, ZPROP_RECVD_SUFFIX) == 0) { + if (flags & DSL_PROP_GET_LOCAL) continue; - VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, - KM_SLEEP) == 0); - if (za.za_integer_length == 1) { - /* - * String property - */ - char *tmp = kmem_alloc(za.za_num_integers, - KM_SLEEP); - err = zap_lookup(mos, propobj, - za.za_name, 1, za.za_num_integers, tmp); - if (err != 0) { - kmem_free(tmp, za.za_num_integers); + (void) strncpy(buf, za.za_name, (suffix - za.za_name)); + buf[suffix - za.za_name] = '\0'; + propname = buf; + + if (!(flags & DSL_PROP_GET_RECEIVED)) { + /* Skip if locally overridden. */ + err = zap_contains(mos, propobj, propname); + if (err == 0) + continue; + if (err != ENOENT) + break; + + /* Skip if explicitly inherited. 
*/ + valstr = kmem_asprintf("%s%s", propname, + ZPROP_INHERIT_SUFFIX); + err = zap_contains(mos, propobj, valstr); + strfree(valstr); + if (err == 0) + continue; + if (err != ENOENT) break; - } - VERIFY(nvlist_add_string(propval, ZPROP_VALUE, - tmp) == 0); - kmem_free(tmp, za.za_num_integers); - } else { - /* - * Integer property - */ - ASSERT(za.za_integer_length == 8); - (void) nvlist_add_uint64(propval, ZPROP_VALUE, - za.za_first_integer); } - VERIFY(nvlist_add_string(propval, ZPROP_SOURCE, - setpoint) == 0); - VERIFY(nvlist_add_nvlist(*nvp, za.za_name, - propval) == 0); - nvlist_free(propval); + source = ((flags & DSL_PROP_GET_INHERITING) ? + setpoint : ZPROP_SOURCE_VAL_RECVD); + } else { + /* + * For backward compatibility, skip suffixes we don't + * recognize. + */ + continue; } - zap_cursor_fini(&zc); - if (err != ENOENT) - break; + prop = zfs_name_to_prop(propname); + + /* Skip non-inheritable properties. */ + if ((flags & DSL_PROP_GET_INHERITING) && prop != ZPROP_INVAL && + !zfs_prop_inheritable(prop)) + continue; + + /* Skip properties not valid for this type. */ + if ((flags & DSL_PROP_GET_SNAPSHOT) && prop != ZPROP_INVAL && + !zfs_prop_valid_for_type(prop, ZFS_TYPE_SNAPSHOT)) + continue; + + /* Skip properties already defined. */ + if (nvlist_exists(nv, propname)) + continue; + + VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); + if (za.za_integer_length == 1) { + /* + * String property + */ + char *tmp = kmem_alloc(za.za_num_integers, + KM_SLEEP); + err = zap_lookup(mos, propobj, + za.za_name, 1, za.za_num_integers, tmp); + if (err != 0) { + kmem_free(tmp, za.za_num_integers); + break; + } + VERIFY(nvlist_add_string(propval, ZPROP_VALUE, + tmp) == 0); + kmem_free(tmp, za.za_num_integers); + } else { + /* + * Integer property + */ + ASSERT(za.za_integer_length == 8); + (void) nvlist_add_uint64(propval, ZPROP_VALUE, + za.za_first_integer); + } + + VERIFY(nvlist_add_string(propval, ZPROP_SOURCE, source) == 0); + VERIFY(nvlist_add_nvlist(nv, propname, propval) == 0); + nvlist_free(propval); + } + zap_cursor_fini(&zc); + if (err == ENOENT) err = 0; - /* - * If we are just after the props that have been set - * locally, then we are done after the first iteration. - */ - if (local) + return (err); +} + +/* + * Iterate over all properties for this dataset and return them in an nvlist. 
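+ *
+ * A sketch of the walk (matching the code below): for a snapshot the
+ * per-snapshot props object is visited first, then the containing
+ * dsl_dir and each of its ancestors.  DSL_PROP_GET_INHERITING is set
+ * as soon as the walk leaves the dataset's own dsl_dir, so that
+ * non-inheritable ancestor properties are skipped, and the LOCAL or
+ * RECEIVED flags stop the walk at that same point.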
+ */ +static int +dsl_prop_get_all_ds(dsl_dataset_t *ds, nvlist_t **nvp, + dsl_prop_getflags_t flags) +{ + dsl_dir_t *dd = ds->ds_dir; + dsl_pool_t *dp = dd->dd_pool; + objset_t *mos = dp->dp_meta_objset; + int err = 0; + char setpoint[MAXNAMELEN]; + + VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + if (dsl_dataset_is_snapshot(ds)) + flags |= DSL_PROP_GET_SNAPSHOT; + + rw_enter(&dp->dp_config_rwlock, RW_READER); + + if (ds->ds_phys->ds_props_obj != 0) { + ASSERT(flags & DSL_PROP_GET_SNAPSHOT); + dsl_dataset_name(ds, setpoint); + err = dsl_prop_get_all_impl(mos, ds->ds_phys->ds_props_obj, + setpoint, flags, *nvp); + if (err) + goto out; + } + + for (; dd != NULL; dd = dd->dd_parent) { + if (dd != ds->ds_dir || (flags & DSL_PROP_GET_SNAPSHOT)) { + if (flags & (DSL_PROP_GET_LOCAL | + DSL_PROP_GET_RECEIVED)) + break; + flags |= DSL_PROP_GET_INHERITING; + } + dsl_dir_name(dd, setpoint); + err = dsl_prop_get_all_impl(mos, dd->dd_phys->dd_props_zapobj, + setpoint, flags, *nvp); + if (err) break; - dd = dd_next; - propobj = 0; } +out: rw_exit(&dp->dp_config_rwlock); - return (err); } +boolean_t +dsl_prop_get_hasrecvd(objset_t *os) +{ + dsl_dataset_t *ds = os->os_dsl_dataset; + int rc; + uint64_t dummy; + + rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); + rc = dsl_prop_get_ds(ds, ZPROP_HAS_RECVD, 8, 1, &dummy, NULL); + rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); + ASSERT(rc != 0 || spa_version(os->os_spa) >= SPA_VERSION_RECVD_PROPS); + return (rc == 0); +} + +static void +dsl_prop_set_hasrecvd_impl(objset_t *os, zprop_source_t source) +{ + dsl_dataset_t *ds = os->os_dsl_dataset; + uint64_t dummy = 0; + dsl_prop_setarg_t psa; + + if (spa_version(os->os_spa) < SPA_VERSION_RECVD_PROPS) + return; + + dsl_prop_setarg_init_uint64(&psa, ZPROP_HAS_RECVD, source, &dummy); + + (void) dsl_sync_task_do(ds->ds_dir->dd_pool, NULL, + dsl_prop_set_sync, ds, &psa, 2); +} + +/* + * Call after successfully receiving properties to ensure that only the first + * receive on or after SPA_VERSION_RECVD_PROPS blows away local properties. + */ +void +dsl_prop_set_hasrecvd(objset_t *os) +{ + if (dsl_prop_get_hasrecvd(os)) { + ASSERT(spa_version(os->os_spa) >= SPA_VERSION_RECVD_PROPS); + return; + } + dsl_prop_set_hasrecvd_impl(os, ZPROP_SRC_LOCAL); +} + +void +dsl_prop_unset_hasrecvd(objset_t *os) +{ + dsl_prop_set_hasrecvd_impl(os, ZPROP_SRC_NONE); +} + +int +dsl_prop_get_all(objset_t *os, nvlist_t **nvp) +{ + return (dsl_prop_get_all_ds(os->os_dsl_dataset, nvp, 0)); +} + +int +dsl_prop_get_received(objset_t *os, nvlist_t **nvp) +{ + /* + * Received properties are not distinguishable from local properties + * until the dataset has received properties on or after + * SPA_VERSION_RECVD_PROPS. + */ + dsl_prop_getflags_t flags = (dsl_prop_get_hasrecvd(os) ? + DSL_PROP_GET_RECEIVED : DSL_PROP_GET_LOCAL); + return (dsl_prop_get_all_ds(os->os_dsl_dataset, nvp, flags)); +} + void dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value) { nvlist_t *propval; + const char *propname = zfs_prop_to_name(prop); + uint64_t default_value; + + if (nvlist_lookup_nvlist(nv, propname, &propval) == 0) { + VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, value) == 0); + return; + } VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, value) == 0); - VERIFY(nvlist_add_nvlist(nv, zfs_prop_to_name(prop), propval) == 0); + /* Indicate the default source if we can. 
*/ + if (dodefault(propname, 8, 1, &default_value) == 0 && + value == default_value) { + VERIFY(nvlist_add_string(propval, ZPROP_SOURCE, "") == 0); + } + VERIFY(nvlist_add_nvlist(nv, propname, propval) == 0); nvlist_free(propval); } @@ -594,9 +1144,15 @@ void dsl_prop_nvlist_add_string(nvlist_t *nv, zfs_prop_t prop, const char *value) { nvlist_t *propval; + const char *propname = zfs_prop_to_name(prop); + + if (nvlist_lookup_nvlist(nv, propname, &propval) == 0) { + VERIFY(nvlist_add_string(propval, ZPROP_VALUE, value) == 0); + return; + } VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_string(propval, ZPROP_VALUE, value) == 0); - VERIFY(nvlist_add_nvlist(nv, zfs_prop_to_name(prop), propval) == 0); + VERIFY(nvlist_add_nvlist(nv, propname, propval) == 0); nvlist_free(propval); } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_scrub.c b/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_scrub.c index 950a91f783a47..cf7f0f42684b7 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_scrub.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_scrub.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -40,15 +40,21 @@ #include #include #include +#include +#include typedef int (scrub_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *); static scrub_cb_t dsl_pool_scrub_clean_cb; static dsl_syncfunc_t dsl_pool_scrub_cancel_sync; +static void scrub_visitdnode(dsl_pool_t *dp, dnode_phys_t *dnp, arc_buf_t *buf, + uint64_t objset, uint64_t object); -int zfs_scrub_min_time = 1; /* scrub for at least 1 sec each txg */ -int zfs_resilver_min_time = 3; /* resilver for at least 3 sec each txg */ +int zfs_scrub_min_time_ms = 1000; /* min millisecs to scrub per txg */ +int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */ boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */ +boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable srub prefetching */ +enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE; extern int zfs_txg_timeout; @@ -57,14 +63,6 @@ static scrub_cb_t *scrub_funcs[SCRUB_FUNC_NUMFUNCS] = { dsl_pool_scrub_clean_cb }; -#define SET_BOOKMARK(zb, objset, object, level, blkid) \ -{ \ - (zb)->zb_objset = objset; \ - (zb)->zb_object = object; \ - (zb)->zb_level = level; \ - (zb)->zb_blkid = blkid; \ -} - /* ARGSUSED */ static void dsl_pool_scrub_setup_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) @@ -82,6 +80,7 @@ dsl_pool_scrub_setup_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) dp->dp_scrub_min_txg = 0; dp->dp_scrub_max_txg = tx->tx_txg; + dp->dp_scrub_ddt_class_max = zfs_scrub_ddt_class_max; if (*funcp == SCRUB_FUNC_CLEAN) { vdev_t *rvd = dp->dp_spa->spa_root_vdev; @@ -95,6 +94,9 @@ dsl_pool_scrub_setup_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) ESC_ZFS_RESILVER_START); dp->dp_scrub_max_txg = MIN(dp->dp_scrub_max_txg, tx->tx_txg); + } else { + spa_event_notify(dp->dp_spa, NULL, + ESC_ZFS_SCRUB_START); } /* zero out the scrub stats in all vdev_stat_t's */ @@ -102,6 +104,14 @@ dsl_pool_scrub_setup_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER : POOL_SCRUB_EVERYTHING, B_FALSE); + /* + * If this is an incremental scrub, limit the DDT scrub phase + * to just the auto-ditto class (for correctness); the rest + * of the scrub should go faster using top-down pruning. 
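+ *
+ * (enum ddt_class places DDT_CLASS_DITTO first, i.e. the highest
+ * replication class, and DDT_CLASS_UNIQUE last, so clamping
+ * dp_scrub_ddt_class_max to DDT_CLASS_DITTO restricts the DDT walk,
+ * which visits classes <= dp_scrub_ddt_class_max, to just the ditto
+ * blocks.)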
+ */ + if (dp->dp_scrub_min_txg > TXG_INITIAL) + dp->dp_scrub_ddt_class_max = DDT_CLASS_DITTO; + dp->dp_spa->spa_scrub_started = B_TRUE; } @@ -120,6 +130,7 @@ dsl_pool_scrub_setup_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) dp->dp_scrub_queue_obj = zap_create(dp->dp_meta_objset, ot ? ot : DMU_OT_SCRUB_QUEUE, DMU_OT_NONE, 0, tx); bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t)); + bzero(&dp->dp_scrub_ddt_bookmark, sizeof (ddt_bookmark_t)); dp->dp_scrub_restart = B_FALSE; dp->dp_spa->spa_scrub_errors = 0; @@ -136,8 +147,16 @@ dsl_pool_scrub_setup_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) DMU_POOL_SCRUB_MAX_TXG, sizeof (uint64_t), 1, &dp->dp_scrub_max_txg, tx)); VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4, + DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), + sizeof (dp->dp_scrub_bookmark) / sizeof (uint64_t), &dp->dp_scrub_bookmark, tx)); + VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_DDT_BOOKMARK, sizeof (uint64_t), + sizeof (dp->dp_scrub_ddt_bookmark) / sizeof (uint64_t), + &dp->dp_scrub_ddt_bookmark, tx)); + VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_DDT_CLASS_MAX, sizeof (uint64_t), 1, + &dp->dp_scrub_ddt_class_max, tx)); VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1, &dp->dp_spa->spa_scrub_errors, tx)); @@ -186,6 +205,7 @@ dsl_pool_scrub_cancel_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) dp->dp_scrub_queue_obj, tx)); dp->dp_scrub_queue_obj = 0; bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t)); + bzero(&dp->dp_scrub_ddt_bookmark, sizeof (ddt_bookmark_t)); VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCRUB_QUEUE, tx)); @@ -200,6 +220,11 @@ dsl_pool_scrub_cancel_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCRUB_ERRORS, tx)); + (void) zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_DDT_BOOKMARK, tx); + (void) zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_DDT_CLASS_MAX, tx); + spa_history_internal_log(LOG_POOL_SCRUB_DONE, dp->dp_spa, tx, cr, "complete=%u", *completep); @@ -212,8 +237,9 @@ dsl_pool_scrub_cancel_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) */ vdev_dtl_reassess(dp->dp_spa->spa_root_vdev, tx->tx_txg, *completep ? dp->dp_scrub_max_txg : 0, B_TRUE); - if (dp->dp_scrub_min_txg && *completep) - spa_event_notify(dp->dp_spa, NULL, ESC_ZFS_RESILVER_FINISH); + if (*completep) + spa_event_notify(dp->dp_spa, NULL, dp->dp_scrub_min_txg ? + ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH); spa_errlog_rotate(dp->dp_spa); /* @@ -235,15 +261,13 @@ dsl_pool_scrub_cancel(dsl_pool_t *dp) dsl_pool_scrub_cancel_sync, dp, &complete, 3)); } -int -dsl_free(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp, - zio_done_func_t *done, void *private, uint32_t arc_flags) +void +dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp) { /* * This function will be used by bp-rewrite wad to intercept frees. 
*/ - return (arc_free(pio, dp->dp_spa, txg, (blkptr_t *)bpp, - done, private, arc_flags)); + zio_free(dp->dp_spa, txg, bpp); } static boolean_t @@ -261,14 +285,14 @@ bookmark_is_before(dnode_phys_t *dnp, const zbookmark_t *zb1, uint64_t zb1nextL0, zb2thisobj; ASSERT(zb1->zb_objset == zb2->zb_objset); - ASSERT(zb1->zb_object != -1ULL); + ASSERT(zb1->zb_object != DMU_DEADLIST_OBJECT); ASSERT(zb2->zb_level == 0); /* * A bookmark in the deadlist is considered to be after * everything else. */ - if (zb2->zb_object == -1ULL) + if (zb2->zb_object == DMU_DEADLIST_OBJECT) return (B_TRUE); /* The objset_phys_t isn't before anything. */ @@ -281,7 +305,7 @@ bookmark_is_before(dnode_phys_t *dnp, const zbookmark_t *zb1, zb2thisobj = zb2->zb_object ? zb2->zb_object : zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT); - if (zb1->zb_object == 0) { + if (zb1->zb_object == DMU_META_DNODE_OBJECT) { uint64_t nextobj = zb1nextL0 * (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT; return (nextobj <= zb2thisobj); @@ -291,15 +315,15 @@ bookmark_is_before(dnode_phys_t *dnp, const zbookmark_t *zb1, return (B_TRUE); if (zb1->zb_object > zb2thisobj) return (B_FALSE); - if (zb2->zb_object == 0) + if (zb2->zb_object == DMU_META_DNODE_OBJECT) return (B_FALSE); return (zb1nextL0 <= zb2->zb_blkid); } static boolean_t -scrub_pause(dsl_pool_t *dp, const zbookmark_t *zb) +scrub_pause(dsl_pool_t *dp, const zbookmark_t *zb, const ddt_bookmark_t *ddb) { - int elapsed_ticks; + uint64_t elapsed_nanosecs; int mintime; if (dp->dp_scrub_pausing) @@ -309,19 +333,31 @@ scrub_pause(dsl_pool_t *dp, const zbookmark_t *zb) return (B_FALSE); /* we're resuming */ /* We only know how to resume from level-0 blocks. */ - if (zb->zb_level != 0) + if (zb != NULL && zb->zb_level != 0) return (B_FALSE); - mintime = dp->dp_scrub_isresilver ? zfs_resilver_min_time : - zfs_scrub_min_time; - elapsed_ticks = lbolt64 - dp->dp_scrub_start_time; - if (elapsed_ticks > hz * zfs_txg_timeout || - (elapsed_ticks > hz * mintime && txg_sync_waiting(dp))) { - dprintf("pausing at %llx/%llx/%llx/%llx\n", - (longlong_t)zb->zb_objset, (longlong_t)zb->zb_object, - (longlong_t)zb->zb_level, (longlong_t)zb->zb_blkid); + mintime = dp->dp_scrub_isresilver ? 
zfs_resilver_min_time_ms : + zfs_scrub_min_time_ms; + elapsed_nanosecs = gethrtime() - dp->dp_scrub_start_time; + if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout || + (elapsed_nanosecs / MICROSEC > mintime && txg_sync_waiting(dp))) { + if (zb) { + dprintf("pausing at bookmark %llx/%llx/%llx/%llx\n", + (longlong_t)zb->zb_objset, + (longlong_t)zb->zb_object, + (longlong_t)zb->zb_level, + (longlong_t)zb->zb_blkid); + dp->dp_scrub_bookmark = *zb; + } + if (ddb) { + dprintf("pausing at DDT bookmark %llx/%llx/%llx/%llx\n", + (longlong_t)ddb->ddb_class, + (longlong_t)ddb->ddb_type, + (longlong_t)ddb->ddb_checksum, + (longlong_t)ddb->ddb_cursor); + ASSERT(&dp->dp_scrub_ddt_bookmark == ddb); + } dp->dp_scrub_pausing = B_TRUE; - dp->dp_scrub_bookmark = *zb; return (B_TRUE); } return (B_FALSE); @@ -333,7 +369,7 @@ typedef struct zil_traverse_arg { } zil_traverse_arg_t; /* ARGSUSED */ -static void +static int traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) { zil_traverse_arg_t *zta = arg; @@ -342,20 +378,26 @@ traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) zbookmark_t zb; if (bp->blk_birth <= dp->dp_scrub_min_txg) - return; + return (0); + /* + * One block ("stubby") can be allocated a long time ago; we + * want to visit that one because it has been allocated + * (on-disk) even if it hasn't been claimed (even though for + * plain scrub there's nothing to do to it). + */ if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa)) - return; + return (0); + + SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], + ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); - zb.zb_objset = zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET]; - zb.zb_object = 0; - zb.zb_level = -1; - zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ]; VERIFY(0 == scrub_funcs[dp->dp_scrub_func](dp, bp, &zb)); + return (0); } /* ARGSUSED */ -static void +static int traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) { if (lrc->lrc_txtype == TX_WRITE) { @@ -367,17 +409,23 @@ traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) zbookmark_t zb; if (bp->blk_birth <= dp->dp_scrub_min_txg) - return; + return (0); + /* + * birth can be < claim_txg if this record's txg is + * already txg sync'ed (but this log block contains + * other records that are not synced) + */ if (claim_txg == 0 || bp->blk_birth < claim_txg) - return; + return (0); + + SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], + lr->lr_foid, ZB_ZIL_LEVEL, + lr->lr_offset / BP_GET_LSIZE(bp)); - zb.zb_objset = zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET]; - zb.zb_object = lr->lr_foid; - zb.zb_level = BP_GET_LEVEL(bp); - zb.zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp); VERIFY(0 == scrub_funcs[dp->dp_scrub_func](dp, bp, &zb)); } + return (0); } static void @@ -391,7 +439,7 @@ traverse_zil(dsl_pool_t *dp, zil_header_t *zh) * We only want to visit blocks that have been claimed but not yet * replayed (or, in read-only mode, blocks that *would* be claimed). 
*/ - if (claim_txg == 0 && (spa_mode & FWRITE)) + if (claim_txg == 0 && spa_writeable(dp->dp_spa)) return; zilog = zil_alloc(dp->dp_meta_objset, zh); @@ -402,6 +450,27 @@ traverse_zil(dsl_pool_t *dp, zil_header_t *zh) zil_free(zilog); } +static void +scrub_prefetch(dsl_pool_t *dp, arc_buf_t *buf, blkptr_t *bp, uint64_t objset, + uint64_t object, uint64_t blkid) +{ + zbookmark_t czb; + uint32_t flags = ARC_NOWAIT | ARC_PREFETCH; + + if (zfs_no_scrub_prefetch) + return; + + if (BP_IS_HOLE(bp) || bp->blk_birth <= dp->dp_scrub_min_txg || + (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE)) + return; + + SET_BOOKMARK(&czb, objset, object, BP_GET_LEVEL(bp), blkid); + + (void) arc_read(dp->dp_scrub_prefetch_zio_root, dp->dp_spa, bp, + buf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, + &flags, &czb); +} + static void scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp, arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb) @@ -409,13 +478,10 @@ scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp, int err; arc_buf_t *buf = NULL; - if (bp->blk_birth == 0) - return; - if (bp->blk_birth <= dp->dp_scrub_min_txg) return; - if (scrub_pause(dp, zb)) + if (scrub_pause(dp, zb, NULL)) return; if (!bookmark_is_zero(&dp->dp_scrub_bookmark)) { @@ -443,6 +509,13 @@ scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp, } } + /* + * If dsl_pool_scrub_ddt() has aready scrubbed this block, + * don't scrub it again. + */ + if (!ddt_class_contains(dp->dp_spa, dp->dp_scrub_ddt_class_max, bp)) + (void) scrub_funcs[dp->dp_scrub_func](dp, bp, zb); + if (BP_GET_LEVEL(bp) > 0) { uint32_t flags = ARC_WAIT; int i; @@ -458,9 +531,11 @@ scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp, mutex_exit(&dp->dp_spa->spa_scrub_lock); return; } - cbp = buf->b_data; - - for (i = 0; i < epb; i++, cbp++) { + for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) { + scrub_prefetch(dp, buf, cbp, zb->zb_objset, + zb->zb_object, zb->zb_blkid * epb + i); + } + for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) { zbookmark_t czb; SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, @@ -470,7 +545,7 @@ scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp, } } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { uint32_t flags = ARC_WAIT; - dnode_phys_t *child_dnp; + dnode_phys_t *cdnp; int i, j; int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; @@ -483,23 +558,20 @@ scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp, mutex_exit(&dp->dp_spa->spa_scrub_lock); return; } - child_dnp = buf->b_data; - - for (i = 0; i < epb; i++, child_dnp++) { - for (j = 0; j < child_dnp->dn_nblkptr; j++) { - zbookmark_t czb; - - SET_BOOKMARK(&czb, zb->zb_objset, - zb->zb_blkid * epb + i, - child_dnp->dn_nlevels - 1, j); - scrub_visitbp(dp, child_dnp, buf, - &child_dnp->dn_blkptr[j], &czb); + for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) { + for (j = 0; j < cdnp->dn_nblkptr; j++) { + blkptr_t *cbp = &cdnp->dn_blkptr[j]; + scrub_prefetch(dp, buf, cbp, zb->zb_objset, + zb->zb_blkid * epb + i, j); } } + for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) { + scrub_visitdnode(dp, cdnp, buf, zb->zb_objset, + zb->zb_blkid * epb + i); + } } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { uint32_t flags = ARC_WAIT; objset_phys_t *osp; - int j; err = arc_read_nolock(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf, @@ -515,27 +587,41 @@ scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp, traverse_zil(dp, &osp->os_zil_header); - for (j = 0; j < osp->os_meta_dnode.dn_nblkptr; j++) { - zbookmark_t czb; - - SET_BOOKMARK(&czb, zb->zb_objset, 0, - osp->os_meta_dnode.dn_nlevels - 1, j); - scrub_visitbp(dp, 
&osp->os_meta_dnode, buf, - &osp->os_meta_dnode.dn_blkptr[j], &czb); + scrub_visitdnode(dp, &osp->os_meta_dnode, + buf, zb->zb_objset, DMU_META_DNODE_OBJECT); + if (arc_buf_size(buf) >= sizeof (objset_phys_t)) { + scrub_visitdnode(dp, &osp->os_userused_dnode, + buf, zb->zb_objset, DMU_USERUSED_OBJECT); + scrub_visitdnode(dp, &osp->os_groupused_dnode, + buf, zb->zb_objset, DMU_GROUPUSED_OBJECT); } } - (void) scrub_funcs[dp->dp_scrub_func](dp, bp, zb); if (buf) (void) arc_buf_remove_ref(buf, &buf); } +static void +scrub_visitdnode(dsl_pool_t *dp, dnode_phys_t *dnp, arc_buf_t *buf, + uint64_t objset, uint64_t object) +{ + int j; + + for (j = 0; j < dnp->dn_nblkptr; j++) { + zbookmark_t czb; + + SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); + scrub_visitbp(dp, dnp, buf, &dnp->dn_blkptr[j], &czb); + } +} + static void scrub_visit_rootbp(dsl_pool_t *dp, dsl_dataset_t *ds, blkptr_t *bp) { zbookmark_t zb; - SET_BOOKMARK(&zb, ds ? ds->ds_object : 0, 0, -1, 0); + SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, + ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); scrub_visitbp(dp, NULL, NULL, bp, &zb); } @@ -548,7 +634,8 @@ dsl_pool_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx) return; if (dp->dp_scrub_bookmark.zb_objset == ds->ds_object) { - SET_BOOKMARK(&dp->dp_scrub_bookmark, -1, 0, 0, 0); + SET_BOOKMARK(&dp->dp_scrub_bookmark, + ZB_DESTROYED_OBJSET, 0, 0, 0); } else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, ds->ds_object, tx) != 0) { return; @@ -677,17 +764,34 @@ scrub_visitds(dsl_pool_t *dp, uint64_t dsobj, dmu_tx_t *tx) ds->ds_phys->ds_next_snap_obj, tx) == 0); } if (ds->ds_phys->ds_num_children > 1) { - if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) { + boolean_t usenext = B_FALSE; + if (ds->ds_phys->ds_next_clones_obj != 0) { + uint64_t count; + /* + * A bug in a previous version of the code could + * cause upgrade_clones_cb() to not set + * ds_next_snap_obj when it should, leading to a + * missing entry. Therefore we can only use the + * next_clones_obj when its count is correct. + */ + int err = zap_count(dp->dp_meta_objset, + ds->ds_phys->ds_next_clones_obj, &count); + if (err == 0 && + count == ds->ds_phys->ds_num_children - 1) + usenext = B_TRUE; + } + + if (usenext) { + VERIFY(zap_join(dp->dp_meta_objset, + ds->ds_phys->ds_next_clones_obj, + dp->dp_scrub_queue_obj, tx) == 0); + } else { struct enqueue_clones_arg eca; eca.tx = tx; eca.originobj = ds->ds_object; (void) dmu_objset_find_spa(ds->ds_dir->dd_pool->dp_spa, NULL, enqueue_clones_cb, &eca, DS_FIND_CHILDREN); - } else { - VERIFY(zap_join(dp->dp_meta_objset, - ds->ds_phys->ds_next_clones_obj, - dp->dp_scrub_queue_obj, tx) == 0); } } @@ -737,9 +841,78 @@ enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) return (0); } +/* + * Scrub/dedup interaction. + * + * If there are N references to a deduped block, we don't want to scrub it + * N times -- ideally, we should scrub it exactly once. + * + * We leverage the fact that the dde's replication class (enum ddt_class) + * is ordered from highest replication class (DDT_CLASS_DITTO) to lowest + * (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order. + * + * To prevent excess scrubbing, the scrub begins by walking the DDT + * to find all blocks with refcnt > 1, and scrubs each of these once. + * Since there are two replication classes which contain blocks with + * refcnt > 1, we scrub the highest replication class (DDT_CLASS_DITTO) first. + * Finally the top-down scrub begins, only visiting blocks with refcnt == 1. 
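+ *
+ * Putting the two phases together, the overall visit order is: all
+ * DDT_CLASS_DITTO entries, then all DDT_CLASS_DUPLICATE entries
+ * (dsl_pool_scrub_ddt()), then the normal top-down walk, in which
+ * scrub_visitbp() consults ddt_class_contains() so that blocks
+ * already covered by the DDT phase are not scrubbed a second time.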
+ * + * There would be nothing more to say if a block's refcnt couldn't change + * during a scrub, but of course it can so we must account for changes + * in a block's replication class. + * + * Here's an example of what can occur: + * + * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1 + * when visited during the top-down scrub phase, it will be scrubbed twice. + * This negates our scrub optimization, but is otherwise harmless. + * + * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1 + * on each visit during the top-down scrub phase, it will never be scrubbed. + * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's + * reference class transitions to a higher level (i.e DDT_CLASS_UNIQUE to + * DDT_CLASS_DUPLICATE); if it transitions from refcnt == 1 to refcnt > 1 + * while a scrub is in progress, it scrubs the block right then. + */ +static void +dsl_pool_scrub_ddt(dsl_pool_t *dp) +{ + ddt_bookmark_t *ddb = &dp->dp_scrub_ddt_bookmark; + ddt_entry_t dde; + int error; + + while ((error = ddt_walk(dp->dp_spa, ddb, &dde)) == 0) { + if (ddb->ddb_class > dp->dp_scrub_ddt_class_max) + return; + dsl_pool_scrub_ddt_entry(dp, ddb->ddb_checksum, &dde); + if (scrub_pause(dp, NULL, ddb)) + return; + } + ASSERT(error == ENOENT); + ASSERT(ddb->ddb_class > dp->dp_scrub_ddt_class_max); +} + +void +dsl_pool_scrub_ddt_entry(dsl_pool_t *dp, enum zio_checksum checksum, + const ddt_entry_t *dde) +{ + const ddt_key_t *ddk = &dde->dde_key; + const ddt_phys_t *ddp = dde->dde_phys; + blkptr_t blk; + zbookmark_t zb = { 0 }; + + for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { + if (ddp->ddp_phys_birth == 0) + continue; + ddt_bp_create(checksum, ddk, ddp, &blk); + scrub_funcs[dp->dp_scrub_func](dp, &blk, &zb); + } +} + void dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx) { + spa_t *spa = dp->dp_spa; zap_cursor_t zc; zap_attribute_t za; boolean_t complete = B_TRUE; @@ -747,8 +920,10 @@ dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx) if (dp->dp_scrub_func == SCRUB_FUNC_NONE) return; - /* If the spa is not fully loaded, don't bother. */ - if (dp->dp_spa->spa_load_state != SPA_LOAD_NONE) + /* + * If the pool is not loaded, or is trying to unload, leave it alone. + */ + if (spa_load_state(spa) != SPA_LOAD_NONE || spa_shutting_down(spa)) return; if (dp->dp_scrub_restart) { @@ -757,41 +932,47 @@ dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx) dsl_pool_scrub_setup_sync(dp, &func, kcred, tx); } - if (dp->dp_spa->spa_root_vdev->vdev_stat.vs_scrub_type == 0) { + if (spa->spa_root_vdev->vdev_stat.vs_scrub_type == 0) { /* * We must have resumed after rebooting; reset the vdev * stats to know that we're doing a scrub (although it * will think we're just starting now). */ - vdev_scrub_stat_update(dp->dp_spa->spa_root_vdev, + vdev_scrub_stat_update(spa->spa_root_vdev, dp->dp_scrub_min_txg ? 
POOL_SCRUB_RESILVER : POOL_SCRUB_EVERYTHING, B_FALSE); } dp->dp_scrub_pausing = B_FALSE; - dp->dp_scrub_start_time = lbolt64; + dp->dp_scrub_start_time = gethrtime(); dp->dp_scrub_isresilver = (dp->dp_scrub_min_txg != 0); - dp->dp_spa->spa_scrub_active = B_TRUE; + spa->spa_scrub_active = B_TRUE; + + if (dp->dp_scrub_ddt_bookmark.ddb_class <= dp->dp_scrub_ddt_class_max) { + dsl_pool_scrub_ddt(dp); + if (dp->dp_scrub_pausing) + goto out; + } - if (dp->dp_scrub_bookmark.zb_objset == 0) { + if (dp->dp_scrub_bookmark.zb_objset == DMU_META_OBJSET) { /* First do the MOS & ORIGIN */ scrub_visit_rootbp(dp, NULL, &dp->dp_meta_rootbp); if (dp->dp_scrub_pausing) goto out; - if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) { - VERIFY(0 == dmu_objset_find_spa(dp->dp_spa, + if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) { + VERIFY(0 == dmu_objset_find_spa(spa, NULL, enqueue_cb, tx, DS_FIND_CHILDREN)); } else { scrub_visitds(dp, dp->dp_origin_snap->ds_object, tx); } ASSERT(!dp->dp_scrub_pausing); - } else if (dp->dp_scrub_bookmark.zb_objset != -1ULL) { + } else if (dp->dp_scrub_bookmark.zb_objset != ZB_DESTROYED_OBJSET) { /* - * If we were paused, continue from here. Note if the - * ds we were paused on was deleted, the zb_objset will - * be -1, so we will skip this and find a new objset - * below. + * If we were paused, continue from here. Note if the ds + * we were paused on was destroyed, the zb_objset will be + * ZB_DESTROYED_OBJSET, so we will skip this and find a new + * objset below. */ scrub_visitds(dp, dp->dp_scrub_bookmark.zb_objset, tx); if (dp->dp_scrub_pausing) @@ -823,22 +1004,20 @@ dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx) dsl_pool_scrub_cancel_sync(dp, &complete, kcred, tx); return; out: - VERIFY(0 == zap_update(dp->dp_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4, + VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), + sizeof (dp->dp_scrub_bookmark) / sizeof (uint64_t), &dp->dp_scrub_bookmark, tx)); - VERIFY(0 == zap_update(dp->dp_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, + VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_DDT_BOOKMARK, sizeof (uint64_t), + sizeof (dp->dp_scrub_ddt_bookmark) / sizeof (uint64_t), + &dp->dp_scrub_ddt_bookmark, tx)); + VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_DDT_CLASS_MAX, sizeof (uint64_t), 1, + &dp->dp_scrub_ddt_class_max, tx)); + VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1, - &dp->dp_spa->spa_scrub_errors, tx)); - - /* XXX this is scrub-clean specific */ - mutex_enter(&dp->dp_spa->spa_scrub_lock); - while (dp->dp_spa->spa_scrub_inflight > 0) { - cv_wait(&dp->dp_spa->spa_scrub_io_cv, - &dp->dp_spa->spa_scrub_lock); - } - mutex_exit(&dp->dp_spa->spa_scrub_lock); + &spa->spa_scrub_errors, tx)); } void @@ -920,13 +1099,17 @@ static int dsl_pool_scrub_clean_cb(dsl_pool_t *dp, const blkptr_t *bp, const zbookmark_t *zb) { - size_t size = BP_GET_LSIZE(bp); - int d; + size_t size = BP_GET_PSIZE(bp); spa_t *spa = dp->dp_spa; + uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp); boolean_t needs_io; - int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL; + int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL; int zio_priority; + if (phys_birth <= dp->dp_scrub_min_txg || + phys_birth >= dp->dp_scrub_max_txg) + return (0); + count_block(dp->dp_blkstats, bp); if (dp->dp_scrub_isresilver 
== 0) { @@ -942,10 +1125,10 @@ dsl_pool_scrub_clean_cb(dsl_pool_t *dp, } /* If it's an intent log block, failure is expected. */ - if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET) + if (zb->zb_level == ZB_ZIL_LEVEL) zio_flags |= ZIO_FLAG_SPECULATIVE; - for (d = 0; d < BP_GET_NDVAS(bp); d++) { + for (int d = 0; d < BP_GET_NDVAS(bp); d++) { vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[d])); @@ -963,16 +1146,17 @@ dsl_pool_scrub_clean_cb(dsl_pool_t *dp, if (DVA_GET_GANG(&bp->blk_dva[d])) { /* * Gang members may be spread across multiple - * vdevs, so the best we can do is look at the - * pool-wide DTL. + * vdevs, so the best estimate we have is the + * scrub range, which has already been checked. * XXX -- it would be better to change our - * allocation policy to ensure that this can't - * happen. + * allocation policy to ensure that all + * gang members reside on the same vdev. */ - vd = spa->spa_root_vdev; + needs_io = B_TRUE; + } else { + needs_io = vdev_dtl_contains(vd, DTL_PARTIAL, + phys_birth, 1); } - needs_io = vdev_dtl_contains(&vd->vdev_dtl_map, - bp->blk_birth, 1); } } @@ -997,18 +1181,20 @@ dsl_pool_scrub_clean_cb(dsl_pool_t *dp, int dsl_pool_scrub_clean(dsl_pool_t *dp) { + spa_t *spa = dp->dp_spa; + /* - * Purge all vdev caches. We do this here rather than in sync - * context because this requires a writer lock on the spa_config - * lock, which we can't do from sync context. The + * Purge all vdev caches and probe all devices. We do this here + * rather than in sync context because this requires a writer lock + * on the spa_config lock, which we can't do from sync context. The * spa_scrub_reopen flag indicates that vdev_open() should not * attempt to start another scrub. */ - spa_config_enter(dp->dp_spa, SCL_ALL, FTAG, RW_WRITER); - dp->dp_spa->spa_scrub_reopen = B_TRUE; - vdev_reopen(dp->dp_spa->spa_root_vdev); - dp->dp_spa->spa_scrub_reopen = B_FALSE; - spa_config_exit(dp->dp_spa, SCL_ALL, FTAG); + spa_vdev_state_enter(spa, SCL_NONE); + spa->spa_scrub_reopen = B_TRUE; + vdev_reopen(spa->spa_root_vdev); + spa->spa_scrub_reopen = B_FALSE; + (void) spa_vdev_state_exit(spa, NULL, 0); return (dsl_pool_scrub_setup(dp, SCRUB_FUNC_CLEAN)); } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_synctask.c b/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_synctask.c index 21100225abf73..cdea979890ffa 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_synctask.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_synctask.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -118,8 +116,10 @@ dsl_sync_task_group_wait(dsl_sync_task_group_t *dstg) txg_wait_synced(dstg->dstg_pool, txg); - if (dstg->dstg_err == EAGAIN) + if (dstg->dstg_err == EAGAIN) { + txg_wait_synced(dstg->dstg_pool, txg + TXG_DEFER_SIZE); goto top; + } return (dstg->dstg_err); } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/lzjb.c b/external/cddl/osnet/dist/uts/common/fs/zfs/lzjb.c index 7fcde8475e003..10952f472b333 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/lzjb.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/lzjb.c @@ -20,18 +20,18 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * We keep our own copy of this algorithm for 2 main reasons: - * 1. If we didn't, anyone modifying common/os/compress.c would + * 1. If we didn't, anyone modifying common/os/compress.c would * directly break our on disk format - * 2. Our version of lzjb does not have a number of checks that the + * 2. Our version of lzjb does not have a number of checks that the * common/os version needs and uses + * 3. We initialize the lempel to ensure deterministic results, + * so that identical blocks can always be deduplicated. * In particular, we are adding the "feature" that compress() can * take a destination buffer size and return -1 if the data will not * compress to d_len or less. @@ -43,7 +43,7 @@ #define MATCH_MIN 3 #define MATCH_MAX ((1 << MATCH_BITS) + (MATCH_MIN - 1)) #define OFFSET_MASK ((1 << (16 - MATCH_BITS)) - 1) -#define LEMPEL_SIZE 256 +#define LEMPEL_SIZE 1024 /*ARGSUSED*/ size_t @@ -53,20 +53,14 @@ lzjb_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) uchar_t *dst = d_start; uchar_t *cpy, *copymap; int copymask = 1 << (NBBY - 1); - int mlen, offset; + int mlen, offset, hash; uint16_t *hp; - uint16_t lempel[LEMPEL_SIZE]; /* uninitialized; see above */ + uint16_t lempel[LEMPEL_SIZE] = { 0 }; while (src < (uchar_t *)s_start + s_len) { if ((copymask <<= 1) == (1 << NBBY)) { - if (dst >= (uchar_t *)d_start + d_len - 1 - 2 * NBBY) { - if (d_len != s_len) - return (s_len); - mlen = s_len; - for (src = s_start, dst = d_start; mlen; mlen--) - *dst++ = *src++; + if (dst >= (uchar_t *)d_start + d_len - 1 - 2 * NBBY) return (s_len); - } copymask = 1; copymap = dst; *dst++ = 0; @@ -75,8 +69,10 @@ lzjb_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) *dst++ = *src++; continue; } - hp = &lempel[((src[0] + 13) ^ (src[1] - 13) ^ src[2]) & - (LEMPEL_SIZE - 1)]; + hash = (src[0] << 16) + (src[1] << 8) + src[2]; + hash += hash >> 9; + hash += hash >> 5; + hp = &lempel[hash & (LEMPEL_SIZE - 1)]; offset = (intptr_t)(src - *hp) & OFFSET_MASK; *hp = (uint16_t)(uintptr_t)src; cpy = src - offset; diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/metaslab.c b/external/cddl/osnet/dist/uts/common/fs/zfs/metaslab.c index 87727fac2dbed..233fd9b336158 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/metaslab.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/metaslab.c @@ -19,12 +19,11 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #include -#include #include #include #include @@ -35,19 +34,58 @@ uint64_t metaslab_aliquot = 512ULL << 10; uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ +/* + * Metaslab debugging: when set, keeps all space maps in core to verify frees. + */ +static int metaslab_debug = 0; + +/* + * Minimum size which forces the dynamic allocator to change + * its allocation strategy. Once the space map cannot satisfy + * an allocation of this size then it switches to using a more + * aggressive strategy (i.e. search by size rather than offset). + */ +uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE; + +/* + * The minimum free space, in percent, which must be available + * in a space map to continue allocations in a first-fit fashion. + * Once the space_map's free space drops below this level we dynamically + * switch to using best-fit allocations.
+ */ +int metaslab_df_free_pct = 4; + +/* + * A metaslab is considered "free" if it contains a contiguous + * segment which is greater than metaslab_min_alloc_size. + */ +uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS; + +/* + * Max number of space_maps to prefetch. + */ +int metaslab_prefetch_limit = SPA_DVAS_PER_BP; + +/* + * Percentage bonus multiplier for metaslabs that are in the bonus area. + */ +int metaslab_smo_bonus_pct = 150; + /* * ========================================================================== * Metaslab classes * ========================================================================== */ metaslab_class_t * -metaslab_class_create(void) +metaslab_class_create(spa_t *spa, space_map_ops_t *ops) { metaslab_class_t *mc; mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP); + mc->mc_spa = spa; mc->mc_rotor = NULL; + mc->mc_ops = ops; return (mc); } @@ -55,58 +93,73 @@ metaslab_class_create(void) void metaslab_class_destroy(metaslab_class_t *mc) { - metaslab_group_t *mg; - - while ((mg = mc->mc_rotor) != NULL) { - metaslab_class_remove(mc, mg); - metaslab_group_destroy(mg); - } + ASSERT(mc->mc_rotor == NULL); + ASSERT(mc->mc_alloc == 0); + ASSERT(mc->mc_deferred == 0); + ASSERT(mc->mc_space == 0); + ASSERT(mc->mc_dspace == 0); kmem_free(mc, sizeof (metaslab_class_t)); } -void -metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg) +int +metaslab_class_validate(metaslab_class_t *mc) { - metaslab_group_t *mgprev, *mgnext; + metaslab_group_t *mg; + vdev_t *vd; - ASSERT(mg->mg_class == NULL); + /* + * Must hold one of the spa_config locks. + */ + ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) || + spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER)); - if ((mgprev = mc->mc_rotor) == NULL) { - mg->mg_prev = mg; - mg->mg_next = mg; - } else { - mgnext = mgprev->mg_next; - mg->mg_prev = mgprev; - mg->mg_next = mgnext; - mgprev->mg_next = mg; - mgnext->mg_prev = mg; - } - mc->mc_rotor = mg; - mg->mg_class = mc; + if ((mg = mc->mc_rotor) == NULL) + return (0); + + do { + vd = mg->mg_vd; + ASSERT(vd->vdev_mg != NULL); + ASSERT3P(vd->vdev_top, ==, vd); + ASSERT3P(mg->mg_class, ==, mc); + ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops); + } while ((mg = mg->mg_next) != mc->mc_rotor); + + return (0); } void -metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg) +metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta, + int64_t defer_delta, int64_t space_delta, int64_t dspace_delta) { - metaslab_group_t *mgprev, *mgnext; + atomic_add_64(&mc->mc_alloc, alloc_delta); + atomic_add_64(&mc->mc_deferred, defer_delta); + atomic_add_64(&mc->mc_space, space_delta); + atomic_add_64(&mc->mc_dspace, dspace_delta); +} - ASSERT(mg->mg_class == mc); +uint64_t +metaslab_class_get_alloc(metaslab_class_t *mc) +{ + return (mc->mc_alloc); +} - mgprev = mg->mg_prev; - mgnext = mg->mg_next; +uint64_t +metaslab_class_get_deferred(metaslab_class_t *mc) +{ + return (mc->mc_deferred); +} - if (mg == mgnext) { - mc->mc_rotor = NULL; - } else { - mc->mc_rotor = mgnext; - mgprev->mg_next = mgnext; - mgnext->mg_prev = mgprev; - } +uint64_t +metaslab_class_get_space(metaslab_class_t *mc) +{ + return (mc->mc_space); +} - mg->mg_prev = NULL; - mg->mg_next = NULL; - mg->mg_class = NULL; +uint64_t +metaslab_class_get_dspace(metaslab_class_t *mc) +{ + return (spa_deflate(mc->mc_spa) ? 
mc->mc_dspace : mc->mc_space); } /* @@ -147,9 +200,9 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd) mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); avl_create(&mg->mg_metaslab_tree, metaslab_compare, sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node)); - mg->mg_aliquot = metaslab_aliquot * MAX(1, vd->vdev_children); mg->mg_vd = vd; - metaslab_class_add(mc, mg); + mg->mg_class = mc; + mg->mg_activation_count = 0; return (mg); } @@ -157,11 +210,82 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd) void metaslab_group_destroy(metaslab_group_t *mg) { + ASSERT(mg->mg_prev == NULL); + ASSERT(mg->mg_next == NULL); + /* + * We may have gone below zero with the activation count + * either because we never activated in the first place or + * because we're done, and possibly removing the vdev. + */ + ASSERT(mg->mg_activation_count <= 0); + avl_destroy(&mg->mg_metaslab_tree); mutex_destroy(&mg->mg_lock); kmem_free(mg, sizeof (metaslab_group_t)); } +void +metaslab_group_activate(metaslab_group_t *mg) +{ + metaslab_class_t *mc = mg->mg_class; + metaslab_group_t *mgprev, *mgnext; + + ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER)); + + ASSERT(mc->mc_rotor != mg); + ASSERT(mg->mg_prev == NULL); + ASSERT(mg->mg_next == NULL); + ASSERT(mg->mg_activation_count <= 0); + + if (++mg->mg_activation_count <= 0) + return; + + mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children); + + if ((mgprev = mc->mc_rotor) == NULL) { + mg->mg_prev = mg; + mg->mg_next = mg; + } else { + mgnext = mgprev->mg_next; + mg->mg_prev = mgprev; + mg->mg_next = mgnext; + mgprev->mg_next = mg; + mgnext->mg_prev = mg; + } + mc->mc_rotor = mg; +} + +void +metaslab_group_passivate(metaslab_group_t *mg) +{ + metaslab_class_t *mc = mg->mg_class; + metaslab_group_t *mgprev, *mgnext; + + ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER)); + + if (--mg->mg_activation_count != 0) { + ASSERT(mc->mc_rotor != mg); + ASSERT(mg->mg_prev == NULL); + ASSERT(mg->mg_next == NULL); + ASSERT(mg->mg_activation_count < 0); + return; + } + + mgprev = mg->mg_prev; + mgnext = mg->mg_next; + + if (mg == mgnext) { + mc->mc_rotor = NULL; + } else { + mc->mc_rotor = mgnext; + mgprev->mg_next = mgnext; + mgnext->mg_prev = mgprev; + } + + mg->mg_prev = NULL; + mg->mg_next = NULL; +} + static void metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp) { @@ -203,29 +327,39 @@ metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) /* * ========================================================================== - * The first-fit block allocator + * Common allocator routines * ========================================================================== */ -static void -metaslab_ff_load(space_map_t *sm) +static int +metaslab_segsize_compare(const void *x1, const void *x2) { - ASSERT(sm->sm_ppd == NULL); - sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP); -} + const space_seg_t *s1 = x1; + const space_seg_t *s2 = x2; + uint64_t ss_size1 = s1->ss_end - s1->ss_start; + uint64_t ss_size2 = s2->ss_end - s2->ss_start; -static void -metaslab_ff_unload(space_map_t *sm) -{ - kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t)); - sm->sm_ppd = NULL; + if (ss_size1 < ss_size2) + return (-1); + if (ss_size1 > ss_size2) + return (1); + + if (s1->ss_start < s2->ss_start) + return (-1); + if (s1->ss_start > s2->ss_start) + return (1); + + return (0); } +/* + * This is a helper function that can be used by the allocator to find + * a suitable block to allocate. 
This will search the specified AVL + * tree looking for a block that matches the specified criteria. + */ static uint64_t -metaslab_ff_alloc(space_map_t *sm, uint64_t size) +metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, + uint64_t align) { - avl_tree_t *t = &sm->sm_root; - uint64_t align = size & -size; - uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1; space_seg_t *ss, ssearch; avl_index_t where; @@ -254,31 +388,291 @@ metaslab_ff_alloc(space_map_t *sm, uint64_t size) return (-1ULL); *cursor = 0; - return (metaslab_ff_alloc(sm, size)); + return (metaslab_block_picker(t, cursor, size, align)); +} + +static void +metaslab_pp_load(space_map_t *sm) +{ + space_seg_t *ss; + + ASSERT(sm->sm_ppd == NULL); + sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP); + + sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP); + avl_create(sm->sm_pp_root, metaslab_segsize_compare, + sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node)); + + for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss)) + avl_add(sm->sm_pp_root, ss); +} + +static void +metaslab_pp_unload(space_map_t *sm) +{ + void *cookie = NULL; + + kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t)); + sm->sm_ppd = NULL; + + while (avl_destroy_nodes(sm->sm_pp_root, &cookie) != NULL) { + /* tear down the tree */ + } + + avl_destroy(sm->sm_pp_root); + kmem_free(sm->sm_pp_root, sizeof (avl_tree_t)); + sm->sm_pp_root = NULL; } /* ARGSUSED */ static void -metaslab_ff_claim(space_map_t *sm, uint64_t start, uint64_t size) +metaslab_pp_claim(space_map_t *sm, uint64_t start, uint64_t size) { /* No need to update cursor */ } /* ARGSUSED */ static void -metaslab_ff_free(space_map_t *sm, uint64_t start, uint64_t size) +metaslab_pp_free(space_map_t *sm, uint64_t start, uint64_t size) { /* No need to update cursor */ } +/* + * Return the maximum contiguous segment within the metaslab. + */ +uint64_t +metaslab_pp_maxsize(space_map_t *sm) +{ + avl_tree_t *t = sm->sm_pp_root; + space_seg_t *ss; + + if (t == NULL || (ss = avl_last(t)) == NULL) + return (0ULL); + + return (ss->ss_end - ss->ss_start); +} + +/* + * ========================================================================== + * The first-fit block allocator + * ========================================================================== + */ +static uint64_t +metaslab_ff_alloc(space_map_t *sm, uint64_t size) +{ + avl_tree_t *t = &sm->sm_root; + uint64_t align = size & -size; + uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1; + + return (metaslab_block_picker(t, cursor, size, align)); +} + +/* ARGSUSED */ +boolean_t +metaslab_ff_fragmented(space_map_t *sm) +{ + return (B_TRUE); +} + static space_map_ops_t metaslab_ff_ops = { - metaslab_ff_load, - metaslab_ff_unload, + metaslab_pp_load, + metaslab_pp_unload, metaslab_ff_alloc, - metaslab_ff_claim, - metaslab_ff_free + metaslab_pp_claim, + metaslab_pp_free, + metaslab_pp_maxsize, + metaslab_ff_fragmented }; +/* + * ========================================================================== + * Dynamic block allocator - + * Uses the first fit allocation scheme until space get low and then + * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold + * and metaslab_df_free_pct to determine when to switch the allocation scheme. 
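[Editor's aside; illustration only, not part of the patch. The dynamic ("df") allocator described above stays in first-fit mode until either the largest free segment drops below metaslab_df_alloc_threshold or the map's free percentage drops below metaslab_df_free_pct, and only then falls back to the size-sorted (best-fit) tree; the real implementation follows below. A minimal standalone model of just that decision, in which use_best_fit, df_alloc_threshold and df_free_pct are made-up stand-ins for the space-map state and tunables:]

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-ins for the tunables introduced by the patch. */
static uint64_t df_alloc_threshold = 128ULL << 10;	/* largest-segment cutoff */
static int df_free_pct = 4;				/* free-space cutoff, in percent */

/* Return 1 when allocation should fall back to the size-sorted (best-fit) tree. */
static int
use_best_fit(uint64_t max_free_seg, uint64_t free_space, uint64_t map_size)
{
	int free_pct = (int)(free_space * 100 / map_size);

	return (max_free_seg < df_alloc_threshold || free_pct < df_free_pct);
}

int
main(void)
{
	/* Nearly full map: 2% free, largest run 64K -> best-fit (prints 1). */
	printf("%d\n", use_best_fit(64ULL << 10, 2ULL << 20, 100ULL << 20));
	/* Roomy map: 40% free, largest run 8M -> first-fit (prints 0). */
	printf("%d\n", use_best_fit(8ULL << 20, 40ULL << 20, 100ULL << 20));
	return (0);
}

[End of aside.]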
+ * ========================================================================== + */ +static uint64_t +metaslab_df_alloc(space_map_t *sm, uint64_t size) +{ + avl_tree_t *t = &sm->sm_root; + uint64_t align = size & -size; + uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1; + uint64_t max_size = metaslab_pp_maxsize(sm); + int free_pct = sm->sm_space * 100 / sm->sm_size; + + ASSERT(MUTEX_HELD(sm->sm_lock)); + ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root)); + + if (max_size < size) + return (-1ULL); + + /* + * If we're running low on space switch to using the size + * sorted AVL tree (best-fit). + */ + if (max_size < metaslab_df_alloc_threshold || + free_pct < metaslab_df_free_pct) { + t = sm->sm_pp_root; + *cursor = 0; + } + + return (metaslab_block_picker(t, cursor, size, 1ULL)); +} + +static boolean_t +metaslab_df_fragmented(space_map_t *sm) +{ + uint64_t max_size = metaslab_pp_maxsize(sm); + int free_pct = sm->sm_space * 100 / sm->sm_size; + + if (max_size >= metaslab_df_alloc_threshold && + free_pct >= metaslab_df_free_pct) + return (B_FALSE); + + return (B_TRUE); +} + +static space_map_ops_t metaslab_df_ops = { + metaslab_pp_load, + metaslab_pp_unload, + metaslab_df_alloc, + metaslab_pp_claim, + metaslab_pp_free, + metaslab_pp_maxsize, + metaslab_df_fragmented +}; + +/* + * ========================================================================== + * Other experimental allocators + * ========================================================================== + */ +static uint64_t +metaslab_cdf_alloc(space_map_t *sm, uint64_t size) +{ + avl_tree_t *t = &sm->sm_root; + uint64_t *cursor = (uint64_t *)sm->sm_ppd; + uint64_t *extent_end = (uint64_t *)sm->sm_ppd + 1; + uint64_t max_size = metaslab_pp_maxsize(sm); + uint64_t rsize = size; + uint64_t offset = 0; + + ASSERT(MUTEX_HELD(sm->sm_lock)); + ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root)); + + if (max_size < size) + return (-1ULL); + + ASSERT3U(*extent_end, >=, *cursor); + + /* + * If we're running low on space switch to using the size + * sorted AVL tree (best-fit). 
+ */ + if ((*cursor + size) > *extent_end) { + + t = sm->sm_pp_root; + *cursor = *extent_end = 0; + + if (max_size > 2 * SPA_MAXBLOCKSIZE) + rsize = MIN(metaslab_min_alloc_size, max_size); + offset = metaslab_block_picker(t, extent_end, rsize, 1ULL); + if (offset != -1) + *cursor = offset + size; + } else { + offset = metaslab_block_picker(t, cursor, rsize, 1ULL); + } + ASSERT3U(*cursor, <=, *extent_end); + return (offset); +} + +static boolean_t +metaslab_cdf_fragmented(space_map_t *sm) +{ + uint64_t max_size = metaslab_pp_maxsize(sm); + + if (max_size > (metaslab_min_alloc_size * 10)) + return (B_FALSE); + return (B_TRUE); +} + +static space_map_ops_t metaslab_cdf_ops = { + metaslab_pp_load, + metaslab_pp_unload, + metaslab_cdf_alloc, + metaslab_pp_claim, + metaslab_pp_free, + metaslab_pp_maxsize, + metaslab_cdf_fragmented +}; + +static uint64_t +metaslab_ndf_alloc(space_map_t *sm, uint64_t size) +{ + avl_tree_t *t = &sm->sm_root; + avl_index_t where; + space_seg_t *ss, ssearch; + uint64_t *cursor = (uint64_t *)sm->sm_ppd; + uint64_t max_size = metaslab_pp_maxsize(sm); + + ASSERT(MUTEX_HELD(sm->sm_lock)); + ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root)); + + if (max_size < size) + return (-1ULL); + + ssearch.ss_start = *cursor; + ssearch.ss_end = *cursor + size; + + ss = avl_find(t, &ssearch, &where); + if (ss == NULL || (ss->ss_start + size > ss->ss_end)) { + t = sm->sm_pp_root; + + if (max_size > 2 * SPA_MAXBLOCKSIZE) + size = MIN(metaslab_min_alloc_size, max_size); + + ssearch.ss_start = 0; + ssearch.ss_end = size; + ss = avl_find(t, &ssearch, &where); + if (ss == NULL) + ss = avl_nearest(t, where, AVL_AFTER); + ASSERT(ss != NULL); + } + + if (ss != NULL) { + if (ss->ss_start + size <= ss->ss_end) { + *cursor = ss->ss_start + size; + return (ss->ss_start); + } + } + return (-1ULL); +} + +static boolean_t +metaslab_ndf_fragmented(space_map_t *sm) +{ + uint64_t max_size = metaslab_pp_maxsize(sm); + + if (max_size > (metaslab_min_alloc_size * 10)) + return (B_FALSE); + return (B_TRUE); +} + + +static space_map_ops_t metaslab_ndf_ops = { + metaslab_pp_load, + metaslab_pp_unload, + metaslab_ndf_alloc, + metaslab_pp_claim, + metaslab_pp_free, + metaslab_pp_maxsize, + metaslab_ndf_fragmented +}; + +space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops; + /* * ========================================================================== * Metaslabs @@ -308,6 +702,13 @@ metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo, metaslab_group_add(mg, msp); + if (metaslab_debug && smo->smo_object != 0) { + mutex_enter(&msp->ms_lock); + VERIFY(space_map_load(&msp->ms_map, mg->mg_class->mc_ops, + SM_FREE, smo, spa_meta_objset(vd->vdev_spa)) == 0); + mutex_exit(&msp->ms_lock); + } + /* * If we're opening an existing pool (txg == 0) or creating * a new one (txg == TXG_INITIAL), all space is available now. @@ -318,16 +719,8 @@ metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo, metaslab_sync_done(msp, 0); if (txg != 0) { - /* - * The vdev is dirty, but the metaslab isn't -- it just needs - * to have metaslab_sync_done() invoked from vdev_sync_done(). - * [We could just dirty the metaslab, but that would cause us - * to allocate a space map object for it, which is wasteful - * and would mess up the locality logic in metaslab_weight().] 
- */ - ASSERT(TXG_CLEAN(txg) == spa_last_synced_txg(vd->vdev_spa)); vdev_dirty(vd, 0, NULL, txg); - vdev_dirty(vd, VDD_METASLAB, msp, TXG_CLEAN(txg)); + vdev_dirty(vd, VDD_METASLAB, msp, txg); } return (msp); @@ -337,10 +730,9 @@ void metaslab_fini(metaslab_t *msp) { metaslab_group_t *mg = msp->ms_group; - int t; - vdev_space_update(mg->mg_vd, -msp->ms_map.sm_size, - -msp->ms_smo.smo_alloc, B_TRUE); + vdev_space_update(mg->mg_vd, + -msp->ms_smo.smo_alloc, 0, -msp->ms_map.sm_size); metaslab_group_remove(mg, msp); @@ -349,11 +741,16 @@ metaslab_fini(metaslab_t *msp) space_map_unload(&msp->ms_map); space_map_destroy(&msp->ms_map); - for (t = 0; t < TXG_SIZE; t++) { + for (int t = 0; t < TXG_SIZE; t++) { space_map_destroy(&msp->ms_allocmap[t]); space_map_destroy(&msp->ms_freemap[t]); } + for (int t = 0; t < TXG_DEFER_SIZE; t++) + space_map_destroy(&msp->ms_defermap[t]); + + ASSERT3S(msp->ms_deferspace, ==, 0); + mutex_exit(&msp->ms_lock); mutex_destroy(&msp->ms_lock); @@ -364,7 +761,6 @@ metaslab_fini(metaslab_t *msp) #define METASLAB_WEIGHT_SECONDARY (1ULL << 62) #define METASLAB_ACTIVE_MASK \ (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY) -#define METASLAB_SMO_BONUS_MULTIPLIER 2 static uint64_t metaslab_weight(metaslab_t *msp) @@ -397,37 +793,97 @@ metaslab_weight(metaslab_t *msp) ASSERT(weight >= space && weight <= 2 * space); /* - * For locality, assign higher weight to metaslabs we've used before. + * For locality, assign higher weight to metaslabs which have + * a lower offset than what we've already activated. */ - if (smo->smo_object != 0) - weight *= METASLAB_SMO_BONUS_MULTIPLIER; + if (sm->sm_start <= mg->mg_bonus_area) + weight *= (metaslab_smo_bonus_pct / 100); ASSERT(weight >= space && - weight <= 2 * METASLAB_SMO_BONUS_MULTIPLIER * space); + weight <= 2 * (metaslab_smo_bonus_pct / 100) * space); + + if (sm->sm_loaded && !sm->sm_ops->smop_fragmented(sm)) { + /* + * If this metaslab is one we're actively using, adjust its + * weight to make it preferable to any inactive metaslab so + * we'll polish it off. + */ + weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); + } + return (weight); +} + +static void +metaslab_prefetch(metaslab_group_t *mg) +{ + spa_t *spa = mg->mg_vd->vdev_spa; + metaslab_t *msp; + avl_tree_t *t = &mg->mg_metaslab_tree; + int m; + + mutex_enter(&mg->mg_lock); /* - * If this metaslab is one we're actively using, adjust its weight to - * make it preferable to any inactive metaslab so we'll polish it off. 
+ * Prefetch the next potential metaslabs */ - weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); + for (msp = avl_first(t), m = 0; msp; msp = AVL_NEXT(t, msp), m++) { + space_map_t *sm = &msp->ms_map; + space_map_obj_t *smo = &msp->ms_smo; - return (weight); + /* If we have reached our prefetch limit then we're done */ + if (m >= metaslab_prefetch_limit) + break; + + if (!sm->sm_loaded && smo->smo_object != 0) { + mutex_exit(&mg->mg_lock); + dmu_prefetch(spa_meta_objset(spa), smo->smo_object, + 0ULL, smo->smo_objsize); + mutex_enter(&mg->mg_lock); + } + } + mutex_exit(&mg->mg_lock); } static int -metaslab_activate(metaslab_t *msp, uint64_t activation_weight) +metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size) { + metaslab_group_t *mg = msp->ms_group; space_map_t *sm = &msp->ms_map; + space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops; ASSERT(MUTEX_HELD(&msp->ms_lock)); if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { - int error = space_map_load(sm, &metaslab_ff_ops, - SM_FREE, &msp->ms_smo, - msp->ms_group->mg_vd->vdev_spa->spa_meta_objset); - if (error) { - metaslab_group_sort(msp->ms_group, msp, 0); - return (error); + space_map_load_wait(sm); + if (!sm->sm_loaded) { + int error = space_map_load(sm, sm_ops, SM_FREE, + &msp->ms_smo, + spa_meta_objset(msp->ms_group->mg_vd->vdev_spa)); + if (error) { + metaslab_group_sort(msp->ms_group, msp, 0); + return (error); + } + for (int t = 0; t < TXG_DEFER_SIZE; t++) + space_map_walk(&msp->ms_defermap[t], + space_map_claim, sm); + } + + /* + * Track the bonus area as we activate new metaslabs. + */ + if (sm->sm_start > mg->mg_bonus_area) { + mutex_enter(&mg->mg_lock); + mg->mg_bonus_area = sm->sm_start; + mutex_exit(&mg->mg_lock); + } + + /* + * If we were able to load the map then make sure + * that this map is still able to satisfy our request. + */ + if (msp->ms_weight < size) + return (ENOSPC); + metaslab_group_sort(msp->ms_group, msp, msp->ms_weight | activation_weight); } @@ -458,7 +914,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) { vdev_t *vd = msp->ms_group->mg_vd; spa_t *spa = vd->vdev_spa; - objset_t *mos = spa->spa_meta_objset; + objset_t *mos = spa_meta_objset(spa); space_map_t *allocmap = &msp->ms_allocmap[txg & TXG_MASK]; space_map_t *freemap = &msp->ms_freemap[txg & TXG_MASK]; space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; @@ -466,9 +922,11 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) space_map_obj_t *smo = &msp->ms_smo_syncing; dmu_buf_t *db; dmu_tx_t *tx; - int t; - tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); + ASSERT(!vd->vdev_ishole); + + if (allocmap->sm_space == 0 && freemap->sm_space == 0) + return; /* * The only state that can actually be changing concurrently with @@ -478,12 +936,12 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) * We drop it whenever we call into the DMU, because the DMU * can call down to us (e.g. via zio_free()) at any time. 
*/ - mutex_enter(&msp->ms_lock); + + tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); if (smo->smo_object == 0) { ASSERT(smo->smo_objsize == 0); ASSERT(smo->smo_alloc == 0); - mutex_exit(&msp->ms_lock); smo->smo_object = dmu_object_alloc(mos, DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT, DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx); @@ -491,9 +949,10 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * (sm->sm_start >> vd->vdev_ms_shift), sizeof (uint64_t), &smo->smo_object, tx); - mutex_enter(&msp->ms_lock); } + mutex_enter(&msp->ms_lock); + space_map_walk(freemap, space_map_add, freed_map); if (sm->sm_loaded && spa_sync_pass(spa) == 1 && smo->smo_objsize >= @@ -506,6 +965,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) * This metaslab is 100% allocated, * minus the content of the in-core map (sm), * minus what's been freed this txg (freed_map), + * minus deferred frees (ms_defermap[]), * minus allocations from txgs in the future * (because they haven't been committed yet). */ @@ -517,7 +977,11 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) space_map_walk(sm, space_map_remove, allocmap); space_map_walk(freed_map, space_map_remove, allocmap); - for (t = 1; t < TXG_CONCURRENT_STATES; t++) + for (int t = 0; t < TXG_DEFER_SIZE; t++) + space_map_walk(&msp->ms_defermap[t], + space_map_remove, allocmap); + + for (int t = 1; t < TXG_CONCURRENT_STATES; t++) space_map_walk(&msp->ms_allocmap[(txg + t) & TXG_MASK], space_map_remove, allocmap); @@ -551,9 +1015,12 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) space_map_obj_t *smosync = &msp->ms_smo_syncing; space_map_t *sm = &msp->ms_map; space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; + space_map_t *defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE]; metaslab_group_t *mg = msp->ms_group; vdev_t *vd = mg->mg_vd; - int t; + int64_t alloc_delta, defer_delta; + + ASSERT(!vd->vdev_ishole); mutex_enter(&msp->ms_lock); @@ -562,16 +1029,24 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) * allocmaps and freemaps and add its capacity to the vdev. */ if (freed_map->sm_size == 0) { - for (t = 0; t < TXG_SIZE; t++) { + for (int t = 0; t < TXG_SIZE; t++) { space_map_create(&msp->ms_allocmap[t], sm->sm_start, sm->sm_size, sm->sm_shift, sm->sm_lock); space_map_create(&msp->ms_freemap[t], sm->sm_start, sm->sm_size, sm->sm_shift, sm->sm_lock); } - vdev_space_update(vd, sm->sm_size, 0, B_TRUE); + + for (int t = 0; t < TXG_DEFER_SIZE; t++) + space_map_create(&msp->ms_defermap[t], sm->sm_start, + sm->sm_size, sm->sm_shift, sm->sm_lock); + + vdev_space_update(vd, 0, 0, sm->sm_size); } - vdev_space_update(vd, 0, smosync->smo_alloc - smo->smo_alloc, B_TRUE); + alloc_delta = smosync->smo_alloc - smo->smo_alloc; + defer_delta = freed_map->sm_space - defer_map->sm_space; + + vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0); ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0); ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0); @@ -579,13 +1054,26 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) /* * If there's a space_map_load() in progress, wait for it to complete * so that we have a consistent view of the in-core space map. - * Then, add everything we freed in this txg to the map. + * Then, add defer_map (oldest deferred frees) to this map and + * transfer freed_map (this txg's frees) to defer_map. */ space_map_load_wait(sm); - space_map_vacate(freed_map, sm->sm_loaded ? space_map_free : NULL, sm); + space_map_vacate(defer_map, sm->sm_loaded ? 
space_map_free : NULL, sm); + space_map_vacate(freed_map, space_map_add, defer_map); *smo = *smosync; + msp->ms_deferspace += defer_delta; + ASSERT3S(msp->ms_deferspace, >=, 0); + ASSERT3S(msp->ms_deferspace, <=, sm->sm_size); + if (msp->ms_deferspace != 0) { + /* + * Keep syncing this metaslab until all deferred frees + * are back in circulation. + */ + vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); + } + /* * If the map is loaded but no longer active, evict it as soon as all * future allocations have synced. (If we unloaded it now and then @@ -594,11 +1082,11 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) if (sm->sm_loaded && (msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { int evictable = 1; - for (t = 1; t < TXG_CONCURRENT_STATES; t++) + for (int t = 1; t < TXG_CONCURRENT_STATES; t++) if (msp->ms_allocmap[(txg + t) & TXG_MASK].sm_space) evictable = 0; - if (evictable) + if (evictable && !metaslab_debug) space_map_unload(sm); } @@ -607,6 +1095,32 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) mutex_exit(&msp->ms_lock); } +void +metaslab_sync_reassess(metaslab_group_t *mg) +{ + vdev_t *vd = mg->mg_vd; + + /* + * Re-evaluate all metaslabs which have lower offsets than the + * bonus area. + */ + for (int m = 0; m < vd->vdev_ms_count; m++) { + metaslab_t *msp = vd->vdev_ms[m]; + + if (msp->ms_map.sm_start > mg->mg_bonus_area) + break; + + mutex_enter(&msp->ms_lock); + metaslab_group_sort(mg, msp, metaslab_weight(msp)); + mutex_exit(&msp->ms_lock); + } + + /* + * Prefetch the next potential metaslabs + */ + metaslab_prefetch(mg); +} + static uint64_t metaslab_distance(metaslab_t *msp, dva_t *dva) { @@ -636,11 +1150,16 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg, int i; activation_weight = METASLAB_WEIGHT_PRIMARY; - for (i = 0; i < d; i++) - if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) + for (i = 0; i < d; i++) { + if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { activation_weight = METASLAB_WEIGHT_SECONDARY; + break; + } + } for (;;) { + boolean_t was_active; + mutex_enter(&mg->mg_lock); for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) { if (msp->ms_weight < size) { @@ -648,6 +1167,7 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg, return (-1ULL); } + was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; if (activation_weight == METASLAB_WEIGHT_PRIMARY) break; @@ -673,7 +1193,9 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg, * another thread may have changed the weight while we * were blocked on the metaslab lock. 
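[Editor's aside; illustration only, not part of the patch. The metaslab_sync_done() hunk above hands this txg's frees to a deferred map, indexed by txg % TXG_DEFER_SIZE, and returns only the oldest deferred map to the allocatable space, so a freed block stays out of circulation for TXG_DEFER_SIZE transaction groups. A toy model of that rotation; TXG_DEFER_SIZE is assumed to be 2 here, and sync_done, defer_map and allocatable are made-up names:]

#include <stdint.h>
#include <stdio.h>

#define TXG_DEFER_SIZE	2	/* assumed for this sketch */

static uint64_t defer_map[TXG_DEFER_SIZE];	/* space still held back, per slot */
static uint64_t allocatable;			/* space returned to the free map */

/* Per-txg hand-off, modelled on the space_map_vacate() calls above. */
static void
sync_done(uint64_t txg, uint64_t freed_this_txg)
{
	uint64_t *slot = &defer_map[txg % TXG_DEFER_SIZE];

	allocatable += *slot;		/* oldest deferred frees come back */
	*slot = freed_this_txg;		/* this txg's frees start waiting */
}

int
main(void)
{
	for (uint64_t txg = 1; txg <= 5; txg++) {
		sync_done(txg, 100);	/* free 100 units every txg */
		printf("txg %llu: %llu units allocatable again\n",
		    (unsigned long long)txg, (unsigned long long)allocatable);
	}
	return (0);
}

[End of aside.]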
*/ - if (msp->ms_weight < size) { + if (msp->ms_weight < size || (was_active && + !(msp->ms_weight & METASLAB_ACTIVE_MASK) && + activation_weight == METASLAB_WEIGHT_PRIMARY)) { mutex_exit(&msp->ms_lock); continue; } @@ -686,7 +1208,7 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg, continue; } - if (metaslab_activate(msp, activation_weight) != 0) { + if (metaslab_activate(msp, activation_weight, size) != 0) { mutex_exit(&msp->ms_lock); continue; } @@ -694,7 +1216,7 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg, if ((offset = space_map_alloc(&msp->ms_map, size)) != -1ULL) break; - metaslab_passivate(msp, size - 1); + metaslab_passivate(msp, space_map_maxsize(&msp->ms_map)); mutex_exit(&msp->ms_lock); } @@ -720,6 +1242,8 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, vdev_t *vd; int dshift = 3; int all_zero; + int zio_lock = B_FALSE; + boolean_t allocatable; uint64_t offset = -1ULL; uint64_t asize; uint64_t distance; @@ -729,12 +1253,12 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, /* * For testing, make some blocks above a certain size be gang blocks. */ - if (psize >= metaslab_gang_bang && (lbolt & 3) == 0) + if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0) return (ENOSPC); /* * Start at the rotor and loop through all mgs until we find something. - * Note that there's no locking on mc_rotor or mc_allocated because + * Note that there's no locking on mc_rotor or mc_aliquot because * nothing actually breaks if we miss a few updates -- we just won't * allocate quite as evenly. It all balances out over time. * @@ -756,10 +1280,21 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, */ if (hintdva) { vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d])); - if (flags & METASLAB_HINTBP_AVOID) - mg = vd->vdev_mg->mg_next; - else + + /* + * It's possible the vdev we're using as the hint no + * longer exists (i.e. removed). Consult the rotor when + * all else fails. + */ + if (vd != NULL) { mg = vd->vdev_mg; + + if (flags & METASLAB_HINTBP_AVOID && + mg->mg_next != NULL) + mg = mg->mg_next; + } else { + mg = mc->mc_rotor; + } } else if (d != 0) { vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); mg = vd->vdev_mg->mg_next; @@ -768,21 +1303,33 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, } /* - * If the hint put us into the wrong class, just follow the rotor. + * If the hint put us into the wrong metaslab class, or into a + * metaslab group that has been passivated, just follow the rotor. */ - if (mg->mg_class != mc) + if (mg->mg_class != mc || mg->mg_activation_count <= 0) mg = mc->mc_rotor; rotor = mg; top: all_zero = B_TRUE; do { + ASSERT(mg->mg_activation_count == 1); + vd = mg->mg_vd; + /* * Don't allocate from faulted devices. */ - if (!vdev_allocatable(vd)) + if (zio_lock) { + spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER); + allocatable = vdev_allocatable(vd); + spa_config_exit(spa, SCL_ZIO, FTAG); + } else { + allocatable = vdev_allocatable(vd); + } + if (!allocatable) goto next; + /* * Avoid writing single-copy data to a failing vdev */ @@ -812,32 +1359,28 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, * over- or under-used relative to the pool, * and set an allocation bias to even it out. 
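[Editor's aside; worked example, not part of the patch. In the hunk below, usage is scaled to 0..1024 to avoid floating point. For a vdev that is 75% full (vu = 768) inside a class that is 50% full (cu = 512), mg_bias = (512 - 768) * mg_aliquot / (1024 * 4) = -mg_aliquot / 16, so this vdev is handed a somewhat smaller share of writes; since |cu - vu| can be at most 1024, the bias is bounded by +/- mg_aliquot / 4, i.e. the 25% cap mentioned in the in-code comment.]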
*/ - if (mc->mc_allocated == 0) { + if (mc->mc_aliquot == 0) { vdev_stat_t *vs = &vd->vdev_stat; - uint64_t alloc, space; - int64_t vu, su; - - alloc = spa_get_alloc(spa); - space = spa_get_space(spa); + int64_t vu, cu; /* * Determine percent used in units of 0..1024. * (This is just to avoid floating point.) */ vu = (vs->vs_alloc << 10) / (vs->vs_space + 1); - su = (alloc << 10) / (space + 1); + cu = (mc->mc_alloc << 10) / (mc->mc_space + 1); /* * Bias by at most +/- 25% of the aliquot. */ - mg->mg_bias = ((su - vu) * + mg->mg_bias = ((cu - vu) * (int64_t)mg->mg_aliquot) / (1024 * 4); } - if (atomic_add_64_nv(&mc->mc_allocated, asize) >= + if (atomic_add_64_nv(&mc->mc_aliquot, asize) >= mg->mg_aliquot + mg->mg_bias) { mc->mc_rotor = mg->mg_next; - mc->mc_allocated = 0; + mc->mc_aliquot = 0; } DVA_SET_VDEV(&dva[d], vd->vdev_id); @@ -849,7 +1392,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, } next: mc->mc_rotor = mg->mg_next; - mc->mc_allocated = 0; + mc->mc_aliquot = 0; } while ((mg = mg->mg_next) != rotor); if (!all_zero) { @@ -858,6 +1401,12 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, goto top; } + if (!allocatable && !zio_lock) { + dshift = 3; + zio_lock = B_TRUE; + goto top; + } + bzero(&dva[d], sizeof (dva_t)); return (ENOSPC); @@ -923,7 +1472,7 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) uint64_t size = DVA_GET_ASIZE(dva); vdev_t *vd; metaslab_t *msp; - int error; + int error = 0; ASSERT(DVA_IS_VALID(dva)); @@ -938,7 +1487,12 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) mutex_enter(&msp->ms_lock); - error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY); + if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map.sm_loaded) + error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY, 0); + + if (error == 0 && !space_map_contains(&msp->ms_map, offset, size)) + error = ENOENT; + if (error || txg == 0) { /* txg == 0 indicates dry run */ mutex_exit(&msp->ms_lock); return (error); @@ -946,7 +1500,7 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) space_map_claim(&msp->ms_map, offset, size); - if (spa_mode & FWRITE) { /* don't dirty if we're zdb(1M) */ + if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */ if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0) vdev_dirty(vd, VDD_METASLAB, msp, txg); space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size); @@ -966,6 +1520,7 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, int error = 0; ASSERT(bp->blk_birth == 0); + ASSERT(BP_PHYSICAL_BIRTH(bp) == 0); spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); @@ -995,7 +1550,7 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, spa_config_exit(spa, SCL_ALLOC, FTAG); - bp->blk_birth = txg; + BP_SET_BIRTH(bp, txg, txg); return (0); } @@ -1007,7 +1562,7 @@ metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) int ndvas = BP_GET_NDVAS(bp); ASSERT(!BP_IS_HOLE(bp)); - ASSERT(!now || bp->blk_birth >= spa->spa_syncing_txg); + ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa)); spa_config_enter(spa, SCL_FREE, FTAG, RW_READER); diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/rrwlock.c b/external/cddl/osnet/dist/uts/common/fs/zfs/rrwlock.c index 710685dbc71e2..4cef53f951327 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/rrwlock.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/rrwlock.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. 
All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include @@ -118,7 +116,7 @@ rrn_find_and_remove(rrwlock_t *rrl) rrw_node_t *prev = NULL; if (refcount_count(&rrl->rr_linked_rcount) == 0) - return (NULL); + return (B_FALSE); for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) { if (rn->rn_rrl == rrl) { @@ -159,6 +157,14 @@ static void rrw_enter_read(rrwlock_t *rrl, void *tag) { mutex_enter(&rrl->rr_lock); +#if !defined(DEBUG) && defined(_KERNEL) + if (!rrl->rr_writer && !rrl->rr_writer_wanted) { + rrl->rr_anon_rcount.rc_count++; + mutex_exit(&rrl->rr_lock); + return; + } + DTRACE_PROBE(zfs__rrwfastpath__rdmiss); +#endif ASSERT(rrl->rr_writer != curthread); ASSERT(refcount_count(&rrl->rr_anon_rcount) >= 0); @@ -208,19 +214,28 @@ void rrw_exit(rrwlock_t *rrl, void *tag) { mutex_enter(&rrl->rr_lock); +#if !defined(DEBUG) && defined(_KERNEL) + if (!rrl->rr_writer && rrl->rr_linked_rcount.rc_count == 0) { + rrl->rr_anon_rcount.rc_count--; + if (rrl->rr_anon_rcount.rc_count == 0) + cv_broadcast(&rrl->rr_cv); + mutex_exit(&rrl->rr_lock); + return; + } + DTRACE_PROBE(zfs__rrwfastpath__exitmiss); +#endif ASSERT(!refcount_is_zero(&rrl->rr_anon_rcount) || !refcount_is_zero(&rrl->rr_linked_rcount) || rrl->rr_writer != NULL); if (rrl->rr_writer == NULL) { - if (rrn_find_and_remove(rrl)) { - if (refcount_remove(&rrl->rr_linked_rcount, tag) == 0) - cv_broadcast(&rrl->rr_cv); - - } else { - if (refcount_remove(&rrl->rr_anon_rcount, tag) == 0) - cv_broadcast(&rrl->rr_cv); - } + int64_t count; + if (rrn_find_and_remove(rrl)) + count = refcount_remove(&rrl->rr_linked_rcount, tag); + else + count = refcount_remove(&rrl->rr_anon_rcount, tag); + if (count == 0) + cv_broadcast(&rrl->rr_cv); } else { ASSERT(rrl->rr_writer == curthread); ASSERT(refcount_is_zero(&rrl->rr_anon_rcount) && diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sha256.c b/external/cddl/osnet/dist/uts/common/fs/zfs/sha256.c index ca7076cb6fd99..f515be6bb3042 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sha256.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sha256.c @@ -19,111 +19,32 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ - -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include -#include - -/* - * SHA-256 checksum, as specified in FIPS 180-3, available at: - * http://csrc.nist.gov/publications/PubsFIPS.html - * - * This is a very compact implementation of SHA-256. - * It is designed to be simple and portable, not to be fast. - */ - -/* - * The literal definitions of Ch() and Maj() according to FIPS 180-3 are: - * - * Ch(x, y, z) (x & y) ^ (~x & z) - * Maj(x, y, z) (x & y) ^ (x & z) ^ (y & z) - * - * We use equivalent logical reductions here that require one less op. 
- */ -#define Ch(x, y, z) ((z) ^ ((x) & ((y) ^ (z)))) -#define Maj(x, y, z) (((x) & (y)) ^ ((z) & ((x) ^ (y)))) -#define Rot32(x, s) (((x) >> s) | ((x) << (32 - s))) -#define SIGMA0(x) (Rot32(x, 2) ^ Rot32(x, 13) ^ Rot32(x, 22)) -#define SIGMA1(x) (Rot32(x, 6) ^ Rot32(x, 11) ^ Rot32(x, 25)) -#define sigma0(x) (Rot32(x, 7) ^ Rot32(x, 18) ^ ((x) >> 3)) -#define sigma1(x) (Rot32(x, 17) ^ Rot32(x, 19) ^ ((x) >> 10)) - -static const uint32_t SHA256_K[64] = { - 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, - 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, - 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, - 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, - 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, - 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, - 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, - 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, - 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, - 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, - 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, - 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, - 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, - 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, - 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, - 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 -}; - -static void -SHA256Transform(uint32_t *H, const uint8_t *cp) -{ - uint32_t a, b, c, d, e, f, g, h, t, T1, T2, W[64]; - - for (t = 0; t < 16; t++, cp += 4) - W[t] = (cp[0] << 24) | (cp[1] << 16) | (cp[2] << 8) | cp[3]; - - for (t = 16; t < 64; t++) - W[t] = sigma1(W[t - 2]) + W[t - 7] + - sigma0(W[t - 15]) + W[t - 16]; - - a = H[0]; b = H[1]; c = H[2]; d = H[3]; - e = H[4]; f = H[5]; g = H[6]; h = H[7]; - - for (t = 0; t < 64; t++) { - T1 = h + SIGMA1(e) + Ch(e, f, g) + SHA256_K[t] + W[t]; - T2 = SIGMA0(a) + Maj(a, b, c); - h = g; g = f; f = e; e = d + T1; - d = c; c = b; b = a; a = T1 + T2; - } - - H[0] += a; H[1] += b; H[2] += c; H[3] += d; - H[4] += e; H[5] += f; H[6] += g; H[7] += h; -} +#include void zio_checksum_SHA256(const void *buf, uint64_t size, zio_cksum_t *zcp) { - uint32_t H[8] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, - 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 }; - uint8_t pad[128]; - int i, padsize; - - for (i = 0; i < (size & ~63ULL); i += 64) - SHA256Transform(H, (uint8_t *)buf + i); - - for (padsize = 0; i < size; i++) - pad[padsize++] = *((uint8_t *)buf + i); - - for (pad[padsize++] = 0x80; (padsize & 63) != 56; padsize++) - pad[padsize] = 0; - - for (i = 56; i >= 0; i -= 8) - pad[padsize++] = (size << 3) >> i; - - for (i = 0; i < padsize; i += 64) - SHA256Transform(H, pad + i); - - ZIO_SET_CHECKSUM(zcp, - (uint64_t)H[0] << 32 | H[1], - (uint64_t)H[2] << 32 | H[3], - (uint64_t)H[4] << 32 | H[5], - (uint64_t)H[6] << 32 | H[7]); + SHA2_CTX ctx; + zio_cksum_t tmp; + + SHA2Init(SHA256, &ctx); + SHA2Update(&ctx, buf, size); + SHA2Final(&tmp, &ctx); + + /* + * A prior implementation of this function had a + * private SHA256 implementation that always wrote things out in + * Big Endian and there wasn't a byteswap variant of it. + * To preserve on-disk compatibility we need to force that + * behaviour.
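[Editor's aside; illustration only, not part of the patch. The BE_64() calls below exist because the removed private implementation packed its hash words so that each checksum word equalled eight digest bytes read as a big-endian integer, whereas SHA2Final() stores the digest as a plain byte array, which a little-endian host would reinterpret byte-swapped. A standalone sketch of the mismatch; pack_be and the digest bytes are made up:]

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Pack 8 digest bytes the way the removed code effectively did: big-endian. */
static uint64_t
pack_be(const uint8_t *d)
{
	uint64_t w = 0;

	for (int i = 0; i < 8; i++)
		w = (w << 8) | d[i];
	return (w);
}

int
main(void)
{
	const uint8_t digest[8] = { 0x6a, 0x09, 0xe6, 0x67, 0xbb, 0x67, 0xae, 0x85 };
	uint64_t native, be;

	memcpy(&native, digest, sizeof (native));	/* bytes as left in memory */
	be = pack_be(digest);				/* old on-disk interpretation */

	/* On a little-endian host these differ; a byte swap (BE_64) reconciles them. */
	printf("native 0x%016llx vs big-endian 0x%016llx\n",
	    (unsigned long long)native, (unsigned long long)be);
	return (0);
}

[End of aside.]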
+ */ + zcp->zc_word[0] = BE_64(tmp.zc_word[0]); + zcp->zc_word[1] = BE_64(tmp.zc_word[1]); + zcp->zc_word[2] = BE_64(tmp.zc_word[2]); + zcp->zc_word[3] = BE_64(tmp.zc_word[3]); } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/spa.c b/external/cddl/osnet/dist/uts/common/fs/zfs/spa.c index fb1b96f8b8117..9f2876fcfd9f3 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/spa.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/spa.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -35,13 +35,14 @@ #include #include #include -#include #include #include #include #include +#include #include #include +#include #include #include #include @@ -57,24 +58,77 @@ #include #include #include -#include #include +#include + +#ifdef _KERNEL +#include +#include +#include +#include +#include +#include +#endif /* _KERNEL */ #include "zfs_prop.h" #include "zfs_comutil.h" -int zio_taskq_threads[ZIO_TYPES][ZIO_TASKQ_TYPES] = { - /* ISSUE INTR */ - { 1, 1 }, /* ZIO_TYPE_NULL */ - { 1, 8 }, /* ZIO_TYPE_READ */ - { 8, 1 }, /* ZIO_TYPE_WRITE */ - { 1, 1 }, /* ZIO_TYPE_FREE */ - { 1, 1 }, /* ZIO_TYPE_CLAIM */ - { 1, 1 }, /* ZIO_TYPE_IOCTL */ +typedef enum zti_modes { + zti_mode_fixed, /* value is # of threads (min 1) */ + zti_mode_online_percent, /* value is % of online CPUs */ + zti_mode_batch, /* cpu-intensive; value is ignored */ + zti_mode_null, /* don't create a taskq */ + zti_nmodes +} zti_modes_t; + +#define ZTI_FIX(n) { zti_mode_fixed, (n) } +#define ZTI_PCT(n) { zti_mode_online_percent, (n) } +#define ZTI_BATCH { zti_mode_batch, 0 } +#define ZTI_NULL { zti_mode_null, 0 } + +#define ZTI_ONE ZTI_FIX(1) + +typedef struct zio_taskq_info { + enum zti_modes zti_mode; + uint_t zti_value; +} zio_taskq_info_t; + +static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { + "issue", "issue_high", "intr", "intr_high" +}; + +/* + * Define the taskq threads for the following I/O types: + * NULL, READ, WRITE, FREE, CLAIM, and IOCTL + */ +const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { + /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ + { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, + { ZTI_FIX(8), ZTI_NULL, ZTI_BATCH, ZTI_NULL }, + { ZTI_BATCH, ZTI_FIX(5), ZTI_FIX(8), ZTI_FIX(5) }, + { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, + { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, + { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, }; static void spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx); static boolean_t spa_has_active_shared_spare(spa_t *spa); +static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config, + spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, + char **ereport); + +uint_t zio_taskq_batch_pct = 100; /* 1 thread per cpu in pset */ +id_t zio_taskq_psrset_bind = PS_NONE; +boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */ +uint_t zio_taskq_basedc = 80; /* base duty cycle */ + +boolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */ + +/* + * This (illegal) pool name is used when temporarily importing a spa_t in order + * to get the vdev stats associated with the imported devices. 
+ */ +#define TRYIMPORT_NAME "$import" /* * ========================================================================== @@ -110,38 +164,41 @@ spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval, static void spa_prop_get_config(spa_t *spa, nvlist_t **nvp) { - uint64_t size = spa_get_space(spa); - uint64_t used = spa_get_alloc(spa); + uint64_t size; + uint64_t alloc; uint64_t cap, version; zprop_source_t src = ZPROP_SRC_NONE; spa_config_dirent_t *dp; ASSERT(MUTEX_HELD(&spa->spa_props_lock)); - /* - * readonly properties - */ - spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); - spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); - spa_prop_add_list(*nvp, ZPOOL_PROP_USED, NULL, used, src); - spa_prop_add_list(*nvp, ZPOOL_PROP_AVAILABLE, NULL, size - used, src); - - cap = (size == 0) ? 0 : (used * 100 / size); - spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); + if (spa->spa_root_vdev != NULL) { + alloc = metaslab_class_get_alloc(spa_normal_class(spa)); + size = metaslab_class_get_space(spa_normal_class(spa)); + spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); + spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); + spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); + spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL, + size - alloc, src); + + cap = (size == 0) ? 0 : (alloc * 100 / size); + spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); + + spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL, + ddt_get_pool_dedup_ratio(spa), src); + + spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, + spa->spa_root_vdev->vdev_state, src); + + version = spa_version(spa); + if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) + src = ZPROP_SRC_DEFAULT; + else + src = ZPROP_SRC_LOCAL; + spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src); + } spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); - spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, - spa->spa_root_vdev->vdev_state, src); - - /* - * settable properties that are not stored in the pool property object. - */ - version = spa_version(spa); - if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) - src = ZPROP_SRC_DEFAULT; - else - src = ZPROP_SRC_LOCAL; - spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src); if (spa->spa_root != NULL) spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root, @@ -164,9 +221,9 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp) int spa_prop_get(spa_t *spa, nvlist_t **nvp) { + objset_t *mos = spa->spa_meta_objset; zap_cursor_t zc; zap_attribute_t za; - objset_t *mos = spa->spa_meta_objset; int err; VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); @@ -179,7 +236,7 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp) spa_prop_get_config(spa, nvp); /* If no pool property object, no more prop to get. */ - if (spa->spa_pool_props_object == 0) { + if (mos == NULL || spa->spa_pool_props_object == 0) { mutex_exit(&spa->spa_props_lock); return (0); } @@ -300,12 +357,18 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) case ZPOOL_PROP_DELEGATION: case ZPOOL_PROP_AUTOREPLACE: case ZPOOL_PROP_LISTSNAPS: + case ZPOOL_PROP_AUTOEXPAND: error = nvpair_value_uint64(elem, &intval); if (!error && intval > 1) error = EINVAL; break; case ZPOOL_PROP_BOOTFS: + /* + * If the pool version is less than SPA_VERSION_BOOTFS, + * or the pool is still being created (version == 0), + * the bootfs property cannot be set. 
+ */ if (spa_version(spa) < SPA_VERSION_BOOTFS) { error = ENOTSUP; break; @@ -332,12 +395,14 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) break; } - if (error = dmu_objset_open(strval, DMU_OST_ZFS, - DS_MODE_USER | DS_MODE_READONLY, &os)) + if (error = dmu_objset_hold(strval, FTAG, &os)) break; - /* We don't support gzip bootable datasets */ - if ((error = dsl_prop_get_integer(strval, + /* Must be ZPL and not gzip compressed. */ + + if (dmu_objset_type(os) != DMU_OST_ZFS) { + error = ENOTSUP; + } else if ((error = dsl_prop_get_integer(strval, zfs_prop_to_name(ZFS_PROP_COMPRESSION), &compress, NULL)) == 0 && !BOOTFS_COMPRESS_VALID(compress)) { @@ -345,7 +410,7 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) } else { objnum = dmu_objset_id(os); } - dmu_objset_close(os); + dmu_objset_rele(os, FTAG); } break; @@ -393,6 +458,16 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) strcmp(slash, "/..") == 0) error = EINVAL; break; + + case ZPOOL_PROP_DEDUPDITTO: + if (spa_version(spa) < SPA_VERSION_DEDUP) + error = ENOTSUP; + else + error = nvpair_value_uint64(elem, &intval); + if (error == 0 && + intval != 0 && intval < ZIO_DEDUPDITTO_MIN) + error = EINVAL; + break; } if (error) @@ -412,16 +487,60 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) return (error); } +void +spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync) +{ + char *cachefile; + spa_config_dirent_t *dp; + + if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), + &cachefile) != 0) + return; + + dp = kmem_alloc(sizeof (spa_config_dirent_t), + KM_SLEEP); + + if (cachefile[0] == '\0') + dp->scd_path = spa_strdup(spa_config_path); + else if (strcmp(cachefile, "none") == 0) + dp->scd_path = NULL; + else + dp->scd_path = spa_strdup(cachefile); + + list_insert_head(&spa->spa_config_list, dp); + if (need_sync) + spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); +} + int spa_prop_set(spa_t *spa, nvlist_t *nvp) { int error; + nvpair_t *elem; + boolean_t need_sync = B_FALSE; + zpool_prop_t prop; if ((error = spa_prop_validate(spa, nvp)) != 0) return (error); - return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props, - spa, nvp, 3)); + elem = NULL; + while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) { + if ((prop = zpool_name_to_prop( + nvpair_name(elem))) == ZPROP_INVAL) + return (EINVAL); + + if (prop == ZPOOL_PROP_CACHEFILE || prop == ZPOOL_PROP_ALTROOT) + continue; + + need_sync = B_TRUE; + break; + } + + if (need_sync) + return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props, + spa, nvp, 3)); + else + return (0); } /* @@ -482,26 +601,185 @@ spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) offsetof(spa_error_entry_t, se_avl)); } +static taskq_t * +spa_taskq_create(spa_t *spa, const char *name, enum zti_modes mode, + uint_t value) +{ + uint_t flags = TASKQ_PREPOPULATE; + boolean_t batch = B_FALSE; + + switch (mode) { + case zti_mode_null: + return (NULL); /* no taskq needed */ + + case zti_mode_fixed: + ASSERT3U(value, >=, 1); + value = MAX(value, 1); + break; + + case zti_mode_batch: + batch = B_TRUE; + flags |= TASKQ_THREADS_CPU_PCT; + value = zio_taskq_batch_pct; + break; + + case zti_mode_online_percent: + flags |= TASKQ_THREADS_CPU_PCT; + break; + + default: + panic("unrecognized mode for %s taskq (%u:%u) in " + "spa_activate()", + name, mode, value); + break; + } + + if (zio_taskq_sysdc && spa->spa_proc != &p0) { + if (batch) + flags |= TASKQ_DC_BATCH; + + return (taskq_create_sysdc(name, value, 50, INT_MAX, + spa->spa_proc, zio_taskq_basedc, flags)); + } + 
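/* [Editor's annotation, not part of the patch: this fall-through covers the case where SDC scheduling is disabled or the pool has no dedicated process; the taskq is then created under spa_proc (possibly still &p0) at priority maxclsyspri.] */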
return (taskq_create_proc(name, value, maxclsyspri, 50, INT_MAX, + spa->spa_proc, flags)); +} + +static void +spa_create_zio_taskqs(spa_t *spa) +{ + for (int t = 0; t < ZIO_TYPES; t++) { + for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { + const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; + enum zti_modes mode = ztip->zti_mode; + uint_t value = ztip->zti_value; + char name[32]; + + (void) snprintf(name, sizeof (name), + "%s_%s", zio_type_name[t], zio_taskq_types[q]); + + spa->spa_zio_taskq[t][q] = + spa_taskq_create(spa, name, mode, value); + } + } +} + +#ifdef _KERNEL +static void +spa_thread(void *arg) +{ + callb_cpr_t cprinfo; + + spa_t *spa = arg; + user_t *pu = PTOU(curproc); + + CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr, + spa->spa_name); + + ASSERT(curproc != &p0); + (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs), + "zpool-%s", spa->spa_name); + (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm)); + + /* bind this thread to the requested psrset */ + if (zio_taskq_psrset_bind != PS_NONE) { + pool_lock(); + mutex_enter(&cpu_lock); + mutex_enter(&pidlock); + mutex_enter(&curproc->p_lock); + + if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind, + 0, NULL, NULL) == 0) { + curthread->t_bind_pset = zio_taskq_psrset_bind; + } else { + cmn_err(CE_WARN, + "Couldn't bind process for zfs pool \"%s\" to " + "pset %d\n", spa->spa_name, zio_taskq_psrset_bind); + } + + mutex_exit(&curproc->p_lock); + mutex_exit(&pidlock); + mutex_exit(&cpu_lock); + pool_unlock(); + } + + if (zio_taskq_sysdc) { + sysdc_thread_enter(curthread, 100, 0); + } + + spa->spa_proc = curproc; + spa->spa_did = curthread->t_did; + + spa_create_zio_taskqs(spa); + + mutex_enter(&spa->spa_proc_lock); + ASSERT(spa->spa_proc_state == SPA_PROC_CREATED); + + spa->spa_proc_state = SPA_PROC_ACTIVE; + cv_broadcast(&spa->spa_proc_cv); + + CALLB_CPR_SAFE_BEGIN(&cprinfo); + while (spa->spa_proc_state == SPA_PROC_ACTIVE) + cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); + CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock); + + ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE); + spa->spa_proc_state = SPA_PROC_GONE; + spa->spa_proc = &p0; + cv_broadcast(&spa->spa_proc_cv); + CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */ + + mutex_enter(&curproc->p_lock); + lwp_exit(); +} +#endif + /* * Activate an uninitialized pool. */ static void -spa_activate(spa_t *spa) +spa_activate(spa_t *spa, int mode) { ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); spa->spa_state = POOL_STATE_ACTIVE; - - spa->spa_normal_class = metaslab_class_create(); - spa->spa_log_class = metaslab_class_create(); - - for (int t = 0; t < ZIO_TYPES; t++) { - for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { - spa->spa_zio_taskq[t][q] = taskq_create("spa_zio", - zio_taskq_threads[t][q], maxclsyspri, 50, - INT_MAX, TASKQ_PREPOPULATE); + spa->spa_mode = mode; + + spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops); + spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops); + + /* Try to create a covering process */ + mutex_enter(&spa->spa_proc_lock); + ASSERT(spa->spa_proc_state == SPA_PROC_NONE); + ASSERT(spa->spa_proc == &p0); + spa->spa_did = 0; + + /* Only create a process if we're going to be around a while. 
*/ + if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) { + if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri, + NULL, 0) == 0) { + spa->spa_proc_state = SPA_PROC_CREATED; + while (spa->spa_proc_state == SPA_PROC_CREATED) { + cv_wait(&spa->spa_proc_cv, + &spa->spa_proc_lock); + } + ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); + ASSERT(spa->spa_proc != &p0); + ASSERT(spa->spa_did != 0); + } else { +#ifdef _KERNEL + cmn_err(CE_WARN, + "Couldn't create process for zfs pool \"%s\"\n", + spa->spa_name); +#endif } } + mutex_exit(&spa->spa_proc_lock); + + /* If we didn't create a process, we need to create our taskqs. */ + if (spa->spa_proc == &p0) { + spa_create_zio_taskqs(spa); + } list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), offsetof(vdev_t, vdev_config_dirty_node)); @@ -528,7 +806,7 @@ spa_deactivate(spa_t *spa) ASSERT(spa->spa_sync_on == B_FALSE); ASSERT(spa->spa_dsl_pool == NULL); ASSERT(spa->spa_root_vdev == NULL); - + ASSERT(spa->spa_async_zio_root == NULL); ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); txg_list_destroy(&spa->spa_vdev_txg_list); @@ -538,7 +816,8 @@ spa_deactivate(spa_t *spa) for (int t = 0; t < ZIO_TYPES; t++) { for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { - taskq_destroy(spa->spa_zio_taskq[t][q]); + if (spa->spa_zio_taskq[t][q] != NULL) + taskq_destroy(spa->spa_zio_taskq[t][q]); spa->spa_zio_taskq[t][q] = NULL; } } @@ -559,6 +838,31 @@ spa_deactivate(spa_t *spa) avl_destroy(&spa->spa_errlist_last); spa->spa_state = POOL_STATE_UNINITIALIZED; + + mutex_enter(&spa->spa_proc_lock); + if (spa->spa_proc_state != SPA_PROC_NONE) { + ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); + spa->spa_proc_state = SPA_PROC_DEACTIVATE; + cv_broadcast(&spa->spa_proc_cv); + while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) { + ASSERT(spa->spa_proc != &p0); + cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); + } + ASSERT(spa->spa_proc_state == SPA_PROC_GONE); + spa->spa_proc_state = SPA_PROC_NONE; + } + ASSERT(spa->spa_proc == &p0); + mutex_exit(&spa->spa_proc_lock); + + /* + * We want to make sure spa_thread() has actually exited the ZFS + * module, so that the module can't be unloaded out from underneath + * it. + */ + if (spa->spa_did != 0) { + thread_join(spa->spa_did); + spa->spa_did = 0; + } } /* @@ -572,7 +876,7 @@ spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, int atype) { nvlist_t **child; - uint_t c, children; + uint_t children; int error; if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) @@ -593,7 +897,7 @@ spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, return (EINVAL); } - for (c = 0; c < children; c++) { + for (int c = 0; c < children; c++) { vdev_t *vd; if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, atype)) != 0) { @@ -634,15 +938,10 @@ spa_unload(spa_t *spa) /* * Wait for any outstanding async I/O to complete. */ - mutex_enter(&spa->spa_async_root_lock); - while (spa->spa_async_root_count != 0) - cv_wait(&spa->spa_async_root_cv, &spa->spa_async_root_lock); - mutex_exit(&spa->spa_async_root_lock); - - /* - * Drop and purge level 2 cache - */ - spa_l2cache_drop(spa); + if (spa->spa_async_zio_root != NULL) { + (void) zio_wait(spa->spa_async_zio_root); + spa->spa_async_zio_root = NULL; + } /* * Close the dsl pool. 
@@ -650,8 +949,18 @@ spa_unload(spa_t *spa) if (spa->spa_dsl_pool) { dsl_pool_close(spa->spa_dsl_pool); spa->spa_dsl_pool = NULL; + spa->spa_meta_objset = NULL; } + ddt_unload(spa); + + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + + /* + * Drop and purge level 2 cache + */ + spa_l2cache_drop(spa); + /* * Close all vdevs. */ @@ -686,6 +995,8 @@ spa_unload(spa_t *spa) spa->spa_l2cache.sav_count = 0; spa->spa_async_suspended = 0; + + spa_config_exit(spa, SCL_ALL, FTAG); } /* @@ -775,6 +1086,7 @@ spa_load_spares(spa_t *spa) } vd->vdev_top = vd; + vd->vdev_aux = &spa->spa_spares; if (vdev_open(vd) != 0) continue; @@ -816,7 +1128,7 @@ spa_load_l2cache(spa_t *spa) nvlist_t **l2cache; uint_t nl2cache; int i, j, oldnvdevs; - uint64_t guid, size; + uint64_t guid; vdev_t *vd, **oldvdevs, **newvdevs; spa_aux_vdev_t *sav = &spa->spa_l2cache; @@ -880,12 +1192,8 @@ spa_load_l2cache(spa_t *spa) (void) vdev_validate_aux(vd); - if (!vdev_is_dead(vd)) { - size = vdev_get_rsize(vd); - l2arc_add_vdev(spa, vd, - VDEV_LABEL_START_SIZE, - size - VDEV_LABEL_START_SIZE); - } + if (!vdev_is_dead(vd)) + l2arc_add_vdev(spa, vd); } } @@ -897,12 +1205,9 @@ spa_load_l2cache(spa_t *spa) vd = oldvdevs[i]; if (vd != NULL) { - if ((spa_mode & FWRITE) && - spa_l2cache_exists(vd->vdev_guid, &pool) && - pool != 0ULL && - l2arc_vdev_present(vd)) { + if (spa_l2cache_exists(vd->vdev_guid, &pool) && + pool != 0ULL && l2arc_vdev_present(vd)) l2arc_remove_vdev(vd); - } (void) vdev_close(vd); spa_l2cache_remove(vd); } @@ -951,7 +1256,8 @@ load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) dmu_buf_rele(db, FTAG); packed = kmem_alloc(nvsize, KM_SLEEP); - error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed); + error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed, + DMU_READ_PREFETCH); if (error == 0) error = nvlist_unpack(packed, nvsize, value, 0); kmem_free(packed, nvsize); @@ -966,9 +1272,7 @@ load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) static void spa_check_removed(vdev_t *vd) { - int c; - - for (c = 0; c < vd->vdev_children; c++) + for (int c = 0; c < vd->vdev_children; c++) spa_check_removed(vd->vdev_child[c]); if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) { @@ -977,6 +1281,30 @@ spa_check_removed(vdev_t *vd) } } +/* + * Load the slog device state from the config object since it's possible + * that the label does not contain the most up-to-date information. + */ +void +spa_load_log_state(spa_t *spa, nvlist_t *nv) +{ + vdev_t *ovd, *rvd = spa->spa_root_vdev; + + /* + * Load the original root vdev tree from the passed config. + */ + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + VERIFY(spa_config_parse(spa, &ovd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0); + + for (int c = 0; c < rvd->vdev_children; c++) { + vdev_t *cvd = rvd->vdev_child[c]; + if (cvd->vdev_islog) + vdev_load_log_state(cvd, ovd->vdev_child[c]); + } + vdev_free(ovd); + spa_config_exit(spa, SCL_ALL, FTAG); +} + /* * Check for missing log devices */ @@ -989,140 +1317,458 @@ spa_check_logs(spa_t *spa) case SPA_LOG_UNKNOWN: if (dmu_objset_find(spa->spa_name, zil_check_log_chain, NULL, DS_FIND_CHILDREN)) { - spa->spa_log_state = SPA_LOG_MISSING; + spa_set_log_state(spa, SPA_LOG_MISSING); return (1); } break; - - case SPA_LOG_CLEAR: - (void) dmu_objset_find(spa->spa_name, zil_clear_log_chain, NULL, - DS_FIND_CHILDREN); - break; } - spa->spa_log_state = SPA_LOG_GOOD; return (0); } -/* - * Load an existing storage pool, using the pool's builtin spa_config as a - * source of configuration information. 
- */ -static int -spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) +static boolean_t +spa_passivate_log(spa_t *spa) { - int error = 0; - nvlist_t *nvroot = NULL; - vdev_t *rvd; - uberblock_t *ub = &spa->spa_uberblock; - uint64_t config_cache_txg = spa->spa_config_txg; - uint64_t pool_guid; - uint64_t version; - uint64_t autoreplace = 0; - char *ereport = FM_EREPORT_ZFS_POOL; + vdev_t *rvd = spa->spa_root_vdev; + boolean_t slog_found = B_FALSE; - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); - spa->spa_load_state = state; + if (!spa_has_slogs(spa)) + return (B_FALSE); - if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) || - nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { - error = EINVAL; - goto out; + for (int c = 0; c < rvd->vdev_children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + metaslab_group_t *mg = tvd->vdev_mg; + + if (tvd->vdev_islog) { + metaslab_group_passivate(mg); + slog_found = B_TRUE; + } } - /* - * Versioning wasn't explicitly added to the label until later, so if - * it's not present treat it as the initial version. - */ - if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0) - version = SPA_VERSION_INITIAL; + return (slog_found); +} - (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, - &spa->spa_config_txg); +static void +spa_activate_log(spa_t *spa) +{ + vdev_t *rvd = spa->spa_root_vdev; - if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && - spa_guid_exists(pool_guid, 0)) { - error = EEXIST; - goto out; + ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); + + for (int c = 0; c < rvd->vdev_children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + metaslab_group_t *mg = tvd->vdev_mg; + + if (tvd->vdev_islog) + metaslab_group_activate(mg); } +} - spa->spa_load_guid = pool_guid; +int +spa_offline_log(spa_t *spa) +{ + int error = 0; - /* - * Parse the configuration into a vdev tree. We explicitly set the - * value that will be returned by spa_version() since parsing the - * configuration requires knowing the version number. - */ - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - spa->spa_ubsync.ub_version = version; - error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD); - spa_config_exit(spa, SCL_ALL, FTAG); + if ((error = dmu_objset_find(spa_name(spa), zil_vdev_offline, + NULL, DS_FIND_CHILDREN)) == 0) { - if (error != 0) - goto out; + /* + * We successfully offlined the log device, sync out the + * current txg so that the "stubby" block can be removed + * by zil_sync(). + */ + txg_wait_synced(spa->spa_dsl_pool, 0); + } + return (error); +} - ASSERT(spa->spa_root_vdev == rvd); - ASSERT(spa_guid(spa) == pool_guid); +static void +spa_aux_check_removed(spa_aux_vdev_t *sav) +{ + for (int i = 0; i < sav->sav_count; i++) + spa_check_removed(sav->sav_vdevs[i]); +} - /* - * Try to open all vdevs, loading each label in the process. - */ - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - error = vdev_open(rvd); - spa_config_exit(spa, SCL_ALL, FTAG); - if (error != 0) - goto out; +void +spa_claim_notify(zio_t *zio) +{ + spa_t *spa = zio->io_spa; - /* - * Validate the labels for all leaf vdevs. We need to grab the config - * lock because all label I/O is done with ZIO_FLAG_CONFIG_WRITER. 
- */ - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - error = vdev_validate(rvd); - spa_config_exit(spa, SCL_ALL, FTAG); + if (zio->io_error) + return; - if (error != 0) - goto out; + mutex_enter(&spa->spa_props_lock); /* any mutex will do */ + if (spa->spa_claim_max_txg < zio->io_bp->blk_birth) + spa->spa_claim_max_txg = zio->io_bp->blk_birth; + mutex_exit(&spa->spa_props_lock); +} - if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { - error = ENXIO; - goto out; - } +typedef struct spa_load_error { + uint64_t sle_meta_count; + uint64_t sle_data_count; +} spa_load_error_t; - /* - * Find the best uberblock. - */ - vdev_uberblock_load(NULL, rvd, ub); +static void +spa_load_verify_done(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + spa_load_error_t *sle = zio->io_private; + dmu_object_type_t type = BP_GET_TYPE(bp); + int error = zio->io_error; - /* - * If we weren't able to find a single valid uberblock, return failure. + if (error) { + if ((BP_GET_LEVEL(bp) != 0 || dmu_ot[type].ot_metadata) && + type != DMU_OT_INTENT_LOG) + atomic_add_64(&sle->sle_meta_count, 1); + else + atomic_add_64(&sle->sle_data_count, 1); + } + zio_data_buf_free(zio->io_data, zio->io_size); +} + +/*ARGSUSED*/ +static int +spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, + const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) +{ + if (bp != NULL) { + zio_t *rio = arg; + size_t size = BP_GET_PSIZE(bp); + void *data = zio_data_buf_alloc(size); + + zio_nowait(zio_read(rio, spa, bp, data, size, + spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, + ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | + ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); + } + return (0); +} + +static int +spa_load_verify(spa_t *spa) +{ + zio_t *rio; + spa_load_error_t sle = { 0 }; + zpool_rewind_policy_t policy; + boolean_t verify_ok = B_FALSE; + int error; + + zpool_get_rewind_policy(spa->spa_config, &policy); + + if (policy.zrp_request & ZPOOL_NEVER_REWIND) + return (0); + + rio = zio_root(spa, NULL, &sle, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); + + error = traverse_pool(spa, spa->spa_verify_min_txg, + TRAVERSE_PRE | TRAVERSE_PREFETCH, spa_load_verify_cb, rio); + + (void) zio_wait(rio); + + spa->spa_load_meta_errors = sle.sle_meta_count; + spa->spa_load_data_errors = sle.sle_data_count; + + if (!error && sle.sle_meta_count <= policy.zrp_maxmeta && + sle.sle_data_count <= policy.zrp_maxdata) { + verify_ok = B_TRUE; + spa->spa_load_txg = spa->spa_uberblock.ub_txg; + spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp; + } else { + spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; + } + + if (error) { + if (error != ENXIO && error != EIO) + error = EIO; + return (error); + } + + return (verify_ok ? 0 : EIO); +} + +/* + * Find a value in the pool props object. + */ +static void +spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) +{ + (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, + zpool_prop_to_name(prop), sizeof (uint64_t), 1, val); +} + +/* + * Find a value in the pool directory object. + */ +static int +spa_dir_prop(spa_t *spa, const char *name, uint64_t *val) +{ + return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + name, sizeof (uint64_t), 1, val)); +} + +static int +spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) +{ + vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); + return (err); +} + +/* + * Fix up config after a partly-completed split. This is done with the + * ZPOOL_CONFIG_SPLIT nvlist. 
Both the splitting pool and the split-off + * pool have that entry in their config, but only the splitting one contains + * a list of all the guids of the vdevs that are being split off. + * + * This function determines what to do with that list: either rejoin + * all the disks to the pool, or complete the splitting process. To attempt + * the rejoin, each disk that is offlined is marked online again, and + * we do a reopen() call. If the vdev label for every disk that was + * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) + * then we call vdev_split() on each disk, and complete the split. + * + * Otherwise we leave the config alone, with all the vdevs in place in + * the original pool. + */ +static void +spa_try_repair(spa_t *spa, nvlist_t *config) +{ + uint_t extracted; + uint64_t *glist; + uint_t i, gcount; + nvlist_t *nvl; + vdev_t **vd; + boolean_t attempt_reopen; + + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) + return; + + /* check that the config is complete */ + if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, + &glist, &gcount) != 0) + return; + + vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); + + /* attempt to online all the vdevs & validate */ + attempt_reopen = B_TRUE; + for (i = 0; i < gcount; i++) { + if (glist[i] == 0) /* vdev is hole */ + continue; + + vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); + if (vd[i] == NULL) { + /* + * Don't bother attempting to reopen the disks; + * just do the split. + */ + attempt_reopen = B_FALSE; + } else { + /* attempt to re-online it */ + vd[i]->vdev_offline = B_FALSE; + } + } + + if (attempt_reopen) { + vdev_reopen(spa->spa_root_vdev); + + /* check each device to see what state it's in */ + for (extracted = 0, i = 0; i < gcount; i++) { + if (vd[i] != NULL && + vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) + break; + ++extracted; + } + } + + /* + * If every disk has been moved to the new pool, or if we never + * even attempted to look at them, then we split them off for + * good. */ - if (ub->ub_txg == 0) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = ENXIO; - goto out; + if (!attempt_reopen || gcount == extracted) { + for (i = 0; i < gcount; i++) + if (vd[i] != NULL) + vdev_split(vd[i]); + vdev_reopen(spa->spa_root_vdev); } + kmem_free(vd, gcount * sizeof (vdev_t *)); +} + +static int +spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type, + boolean_t mosconfig) +{ + nvlist_t *config = spa->spa_config; + char *ereport = FM_EREPORT_ZFS_POOL; + int error; + uint64_t pool_guid; + nvlist_t *nvl; + + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) + return (EINVAL); + /* - * If the pool is newer than the code, we can't open it. + * Versioning wasn't explicitly added to the label until later, so if + * it's not present treat it as the initial version. 
*/ - if (ub->ub_version > SPA_VERSION) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_VERSION_NEWER); - error = ENOTSUP; - goto out; + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, + &spa->spa_ubsync.ub_version) != 0) + spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; + + (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, + &spa->spa_config_txg); + + if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && + spa_guid_exists(pool_guid, 0)) { + error = EEXIST; + } else { + spa->spa_load_guid = pool_guid; + + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, + &nvl) == 0) { + VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting, + KM_SLEEP) == 0); + } + + error = spa_load_impl(spa, pool_guid, config, state, type, + mosconfig, &ereport); + } + + spa->spa_minref = refcount_count(&spa->spa_refcount); + if (error && error != EBADF) + zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); + spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; + spa->spa_ena = 0; + + return (error); +} + +/* + * Load an existing storage pool, using the pool's builtin spa_config as a + * source of configuration information. + */ +static int +spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, + spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, + char **ereport) +{ + int error = 0; + nvlist_t *nvconfig, *nvroot = NULL; + vdev_t *rvd; + uberblock_t *ub = &spa->spa_uberblock; + uint64_t config_cache_txg = spa->spa_config_txg; + int orig_mode = spa->spa_mode; + int parse; + + /* + * If this is an untrusted config, access the pool in read-only mode. + * This prevents things like resilvering recently removed devices. + */ + if (!mosconfig) + spa->spa_mode = FREAD; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + spa->spa_load_state = state; + + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot)) + return (EINVAL); + + parse = (type == SPA_IMPORT_EXISTING ? + VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); + + /* + * Create "The Godfather" zio to hold all async IOs + */ + spa->spa_async_zio_root = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); + + /* + * Parse the configuration into a vdev tree. We explicitly set the + * value that will be returned by spa_version() since parsing the + * configuration requires knowing the version number. + */ + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse); + spa_config_exit(spa, SCL_ALL, FTAG); + + if (error != 0) + return (error); + + ASSERT(spa->spa_root_vdev == rvd); + + if (type != SPA_IMPORT_ASSEMBLE) { + ASSERT(spa_guid(spa) == pool_guid); + } + + /* + * Try to open all vdevs, loading each label in the process. + */ + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + error = vdev_open(rvd); + spa_config_exit(spa, SCL_ALL, FTAG); + if (error != 0) + return (error); + + /* + * We need to validate the vdev labels against the configuration that + * we have in hand, which is dependent on the setting of mosconfig. If + * mosconfig is true then we're validating the vdev labels based on + * that config. Otherwise, we're validating against the cached config + * (zpool.cache) that was read when we loaded the zfs module, and then + * later we will recursively call spa_load() and validate against + * the vdev config. + * + * If we're assembling a new pool that's been split off from an + * existing pool, the labels haven't yet been updated so we skip + * validation for now. 
+ */ + if (type != SPA_IMPORT_ASSEMBLE) { + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + error = vdev_validate(rvd); + spa_config_exit(spa, SCL_ALL, FTAG); + + if (error != 0) + return (error); + + if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) + return (ENXIO); } + /* + * Find the best uberblock. + */ + vdev_uberblock_load(NULL, rvd, ub); + + /* + * If we weren't able to find a single valid uberblock, return failure. + */ + if (ub->ub_txg == 0) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); + + /* + * If the pool is newer than the code, we can't open it. + */ + if (ub->ub_version > SPA_VERSION) + return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); + /* * If the vdev guid sum doesn't match the uberblock, we have an * incomplete configuration. */ - if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_BAD_GUID_SUM); - error = ENXIO; - goto out; + if (mosconfig && type != SPA_IMPORT_ASSEMBLE && + rvd->vdev_guid_sum != ub->ub_guid_sum) + return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); + + if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + spa_try_repair(spa, config); + spa_config_exit(spa, SCL_ALL, FTAG); + nvlist_free(spa->spa_config_splitting); + spa->spa_config_splitting = NULL; } /* @@ -1130,219 +1776,165 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) */ spa->spa_state = POOL_STATE_ACTIVE; spa->spa_ubsync = spa->spa_uberblock; - spa->spa_first_txg = spa_last_synced_txg(spa) + 1; + spa->spa_verify_min_txg = spa->spa_extreme_rewind ? + TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; + spa->spa_first_txg = spa->spa_last_ubsync_txg ? + spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; + spa->spa_claim_max_txg = spa->spa_first_txg; + error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); - if (error) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - goto out; - } + if (error) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; - if (zap_lookup(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, - sizeof (uint64_t), 1, &spa->spa_config_object) != 0) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = EIO; - goto out; - } + if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + + if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); if (!mosconfig) { - nvlist_t *newconfig; uint64_t hostid; + nvlist_t *policy = NULL; - if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = EIO; - goto out; - } - - if (!spa_is_root(spa) && nvlist_lookup_uint64(newconfig, + if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig, ZPOOL_CONFIG_HOSTID, &hostid) == 0) { char *hostname; unsigned long myhostid = 0; - VERIFY(nvlist_lookup_string(newconfig, + VERIFY(nvlist_lookup_string(nvconfig, ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); +#ifdef _KERNEL + myhostid = zone_get_hostid(NULL); +#else /* _KERNEL */ + /* + * We're emulating the system's hostid in userland, so + * we can't use zone_get_hostid(). 
+ */ (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); +#endif /* _KERNEL */ if (hostid != 0 && myhostid != 0 && - (unsigned long)hostid != myhostid) { + hostid != myhostid) { cmn_err(CE_WARN, "pool '%s' could not be " "loaded as it was last accessed by " "another system (host: %s hostid: 0x%lx). " "See: http://www.sun.com/msg/ZFS-8000-EY", spa_name(spa), hostname, (unsigned long)hostid); - error = EBADF; - goto out; + return (EBADF); } } + if (nvlist_lookup_nvlist(spa->spa_config, + ZPOOL_REWIND_POLICY, &policy) == 0) + VERIFY(nvlist_add_nvlist(nvconfig, + ZPOOL_REWIND_POLICY, policy) == 0); - spa_config_set(spa, newconfig); + spa_config_set(spa, nvconfig); spa_unload(spa); spa_deactivate(spa); - spa_activate(spa); + spa_activate(spa, orig_mode); - return (spa_load(spa, newconfig, state, B_TRUE)); + return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE)); } - if (zap_lookup(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, - sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = EIO; - goto out; - } + if (spa_dir_prop(spa, DMU_POOL_SYNC_BPLIST, + &spa->spa_deferred_bplist_obj) != 0) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* * Load the bit that tells us to use the new accounting function * (raid-z deflation). If we have an older pool, this will not * be present. */ - error = zap_lookup(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, - sizeof (uint64_t), 1, &spa->spa_deflate); - if (error != 0 && error != ENOENT) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = EIO; - goto out; - } + error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate); + if (error != 0 && error != ENOENT) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* * Load the persistent error log. If we have an older pool, this will * not be present. */ - error = zap_lookup(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST, - sizeof (uint64_t), 1, &spa->spa_errlog_last); - if (error != 0 && error != ENOENT) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = EIO; - goto out; - } + error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last); + if (error != 0 && error != ENOENT) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - error = zap_lookup(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB, - sizeof (uint64_t), 1, &spa->spa_errlog_scrub); - if (error != 0 && error != ENOENT) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = EIO; - goto out; - } + error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, + &spa->spa_errlog_scrub); + if (error != 0 && error != ENOENT) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* * Load the history object. If we have an older pool, this * will not be present. 
*/ - error = zap_lookup(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY, - sizeof (uint64_t), 1, &spa->spa_history); - if (error != 0 && error != ENOENT) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = EIO; - goto out; - } + error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history); + if (error != 0 && error != ENOENT) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + + /* + * If we're assembling the pool from the split-off vdevs of + * an existing pool, we don't want to attach the spares & cache + * devices. + */ /* * Load any hot spares for this pool. */ - error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares.sav_object); - if (error != 0 && error != ENOENT) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = EIO; - goto out; - } - if (error == 0) { + error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object); + if (error != 0 && error != ENOENT) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); if (load_nvlist(spa, spa->spa_spares.sav_object, - &spa->spa_spares.sav_config) != 0) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = EIO; - goto out; - } + &spa->spa_spares.sav_config) != 0) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_spares(spa); spa_config_exit(spa, SCL_ALL, FTAG); + } else if (error == 0) { + spa->spa_spares.sav_sync = B_TRUE; } /* * Load any level 2 ARC devices for this pool. */ - error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_L2CACHE, sizeof (uint64_t), 1, + error = spa_dir_prop(spa, DMU_POOL_L2CACHE, &spa->spa_l2cache.sav_object); - if (error != 0 && error != ENOENT) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = EIO; - goto out; - } - if (error == 0) { + if (error != 0 && error != ENOENT) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); if (load_nvlist(spa, spa->spa_l2cache.sav_object, - &spa->spa_l2cache.sav_config) != 0) { - vdev_set_state(rvd, B_TRUE, - VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = EIO; - goto out; - } + &spa->spa_l2cache.sav_config) != 0) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_l2cache(spa); spa_config_exit(spa, SCL_ALL, FTAG); + } else if (error == 0) { + spa->spa_l2cache.sav_sync = B_TRUE; } - if (spa_check_logs(spa)) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_BAD_LOG); - error = ENXIO; - ereport = FM_EREPORT_ZFS_LOG_REPLAY; - goto out; - } - - spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); - error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object); - - if (error && error != ENOENT) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = EIO; - goto out; - } + error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object); + if (error && error != ENOENT) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); if (error == 0) { - (void) zap_lookup(spa->spa_meta_objset, - spa->spa_pool_props_object, - 
zpool_prop_to_name(ZPOOL_PROP_BOOTFS), - sizeof (uint64_t), 1, &spa->spa_bootfs); - (void) zap_lookup(spa->spa_meta_objset, - spa->spa_pool_props_object, - zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), - sizeof (uint64_t), 1, &autoreplace); - (void) zap_lookup(spa->spa_meta_objset, - spa->spa_pool_props_object, - zpool_prop_to_name(ZPOOL_PROP_DELEGATION), - sizeof (uint64_t), 1, &spa->spa_delegation); - (void) zap_lookup(spa->spa_meta_objset, - spa->spa_pool_props_object, - zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE), - sizeof (uint64_t), 1, &spa->spa_failmode); + uint64_t autoreplace; + + spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); + spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); + spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); + spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); + spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); + spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO, + &spa->spa_dedup_ditto); + + spa->spa_autoreplace = (autoreplace != 0); } /* @@ -1352,8 +1944,18 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) * unopenable vdevs so that the normal autoreplace handler can take * over. */ - if (autoreplace && state != SPA_LOAD_TRYIMPORT) + if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) { spa_check_removed(spa->spa_root_vdev); + /* + * For the import case, this is done in spa_import(), because + * at this point we're using the spare definitions from + * the MOS config, not necessarily from the userland config. + */ + if (state != SPA_LOAD_IMPORT) { + spa_aux_check_removed(&spa->spa_spares); + spa_aux_check_removed(&spa->spa_l2cache); + } + } /* * Load the vdev state for all toplevel vdevs. @@ -1371,43 +1973,91 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) * Check the state of the root vdev. If it can't be opened, it * indicates one or more toplevel vdevs are faulted. */ - if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { - error = ENXIO; - goto out; - } + if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) + return (ENXIO); - if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) { - dmu_tx_t *tx; - int need_update = B_FALSE; - int c; + /* + * Load the DDTs (dedup tables). + */ + error = ddt_load(spa); + if (error != 0) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - /* - * Claim log blocks that haven't been committed yet. - * This must all happen in a single txg. - */ - tx = dmu_tx_create_assigned(spa_get_dsl(spa), + spa_update_dspace(spa); + + if (state != SPA_LOAD_TRYIMPORT) { + error = spa_load_verify(spa); + if (error) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, + error)); + } + + /* + * Load the intent log state and check log integrity. If we're + * assembling a pool from a split, the log is not transferred over. + */ + if (type != SPA_IMPORT_ASSEMBLE) { + VERIFY(nvlist_lookup_nvlist(nvconfig, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0); + spa_load_log_state(spa, nvroot); + nvlist_free(nvconfig); + + if (spa_check_logs(spa)) { + *ereport = FM_EREPORT_ZFS_LOG_REPLAY; + return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO)); + } + } + + if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER || + spa->spa_load_max_txg == UINT64_MAX)) { + dmu_tx_t *tx; + int need_update = B_FALSE; + + ASSERT(state != SPA_LOAD_TRYIMPORT); + + /* + * Claim log blocks that haven't been committed yet. + * This must all happen in a single txg. 
+ * Note: spa_claim_max_txg is updated by spa_claim_notify(), + * invoked from zil_claim_log_block()'s i/o done callback. + * Price of rollback is that we abandon the log. + */ + spa->spa_claiming = B_TRUE; + + tx = dmu_tx_create_assigned(spa_get_dsl(spa), spa_first_txg(spa)); (void) dmu_objset_find(spa_name(spa), zil_claim, tx, DS_FIND_CHILDREN); dmu_tx_commit(tx); + spa->spa_claiming = B_FALSE; + + spa_set_log_state(spa, SPA_LOG_GOOD); spa->spa_sync_on = B_TRUE; txg_sync_start(spa->spa_dsl_pool); /* - * Wait for all claims to sync. + * Wait for all claims to sync. We sync up to the highest + * claimed log block birth time so that claimed log blocks + * don't appear to be from the future. spa_claim_max_txg + * will have been set for us by either zil_check_log_chain() + * (invoked from spa_check_logs()) or zil_claim() above. */ - txg_wait_synced(spa->spa_dsl_pool, 0); + txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); /* * If the config cache is stale, or we have uninitialized * metaslabs (see spa_vdev_add()), then update the config. + * + * If spa_load_verbatim is true, trust the current + * in-core spa_config and update the disk labels. */ if (config_cache_txg != spa->spa_config_txg || - state == SPA_LOAD_IMPORT) + state == SPA_LOAD_IMPORT || spa->spa_load_verbatim || + state == SPA_LOAD_RECOVER) need_update = B_TRUE; - for (c = 0; c < rvd->vdev_children; c++) + for (int c = 0; c < rvd->vdev_children; c++) if (rvd->vdev_child[c]->vdev_ms_array == 0) need_update = B_TRUE; @@ -1417,17 +2067,104 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) */ if (need_update) spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); + + /* + * Check all DTLs to see if anything needs resilvering. + */ + if (vdev_resilver_needed(rvd, NULL, NULL)) + spa_async_request(spa, SPA_ASYNC_RESILVER); + + /* + * Delete any inconsistent datasets. + */ + (void) dmu_objset_find(spa_name(spa), + dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); + + /* + * Clean up any stale temporary dataset userrefs. 
+ */ + dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); } - error = 0; -out: - spa->spa_minref = refcount_count(&spa->spa_refcount); - if (error && error != EBADF) - zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); - spa->spa_load_state = SPA_LOAD_NONE; - spa->spa_ena = 0; + return (0); +} - return (error); +static int +spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig) +{ + spa_unload(spa); + spa_deactivate(spa); + + spa->spa_load_max_txg--; + + spa_activate(spa, spa_mode_global); + spa_async_suspend(spa); + + return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig)); +} + +static int +spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig, + uint64_t max_request, int rewind_flags) +{ + nvlist_t *config = NULL; + int load_error, rewind_error; + uint64_t safe_rewind_txg; + uint64_t min_txg; + + if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { + spa->spa_load_max_txg = spa->spa_load_txg; + spa_set_log_state(spa, SPA_LOG_CLEAR); + } else { + spa->spa_load_max_txg = max_request; + } + + load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING, + mosconfig); + if (load_error == 0) + return (0); + + if (spa->spa_root_vdev != NULL) + config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); + + spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; + spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; + + if (rewind_flags & ZPOOL_NEVER_REWIND) { + nvlist_free(config); + return (load_error); + } + + /* Price of rolling back is discarding txgs, including log */ + if (state == SPA_LOAD_RECOVER) + spa_set_log_state(spa, SPA_LOG_CLEAR); + + spa->spa_load_max_txg = spa->spa_last_ubsync_txg; + safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; + min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? + TXG_INITIAL : safe_rewind_txg; + + /* + * Continue as long as we're finding errors, we're still within + * the acceptable rewind range, and we're still finding uberblocks + */ + while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg && + spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { + if (spa->spa_load_max_txg < safe_rewind_txg) + spa->spa_extreme_rewind = B_TRUE; + rewind_error = spa_load_retry(spa, state, mosconfig); + } + + if (config) + spa_rewind_data_to_nvlist(spa, config); + + spa->spa_extreme_rewind = B_FALSE; + spa->spa_load_max_txg = UINT64_MAX; + + if (config && (rewind_error || state != SPA_LOAD_RECOVER)) + spa_config_set(spa, config); + + return (state == SPA_LOAD_RECOVER ? rewind_error : load_error); } /* @@ -1443,9 +2180,12 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) * ambiguous state. */ static int -spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config) +spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, + nvlist_t **config) { spa_t *spa; + zpool_rewind_policy_t policy; + spa_load_state_t state = SPA_LOAD_OPEN; int error; int locked = B_FALSE; @@ -1467,11 +2207,31 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config) mutex_exit(&spa_namespace_lock); return (ENOENT); } + + zpool_get_rewind_policy(nvpolicy ? 
nvpolicy : spa->spa_config, &policy); + if (policy.zrp_request & ZPOOL_DO_REWIND) + state = SPA_LOAD_RECOVER; + if (spa->spa_state == POOL_STATE_UNINITIALIZED) { - spa_activate(spa); + spa_activate(spa, spa_mode_global); + + if (spa->spa_last_open_failed && (policy.zrp_request & + (ZPOOL_NO_REWIND | ZPOOL_NEVER_REWIND))) { + if (config != NULL && spa->spa_config) + VERIFY(nvlist_dup(spa->spa_config, + config, KM_SLEEP) == 0); + spa_deactivate(spa); + if (locked) + mutex_exit(&spa_namespace_lock); + return (spa->spa_last_open_failed); + } + + if (state != SPA_LOAD_RECOVER) + spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; - error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE); + error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg, + policy.zrp_request); if (error == EBADF) { /* @@ -1496,38 +2256,49 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config) * information: the state of each vdev after the * attempted vdev_open(). Return this to the user. */ - if (config != NULL && spa->spa_root_vdev != NULL) - *config = spa_config_generate(spa, NULL, -1ULL, - B_TRUE); + if (config != NULL && spa->spa_config) + VERIFY(nvlist_dup(spa->spa_config, config, + KM_SLEEP) == 0); spa_unload(spa); spa_deactivate(spa); - spa->spa_last_open_failed = B_TRUE; + spa->spa_last_open_failed = error; if (locked) mutex_exit(&spa_namespace_lock); *spapp = NULL; return (error); - } else { - spa->spa_last_open_failed = B_FALSE; } + } spa_open_ref(spa, tag); - if (locked) - mutex_exit(&spa_namespace_lock); - - *spapp = spa; if (config != NULL) *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); + if (locked) { + spa->spa_last_open_failed = 0; + spa->spa_last_ubsync_txg = 0; + spa->spa_load_txg = 0; + mutex_exit(&spa_namespace_lock); + } + + *spapp = spa; + return (0); } +int +spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy, + nvlist_t **config) +{ + return (spa_open_common(name, spapp, tag, policy, config)); +} + int spa_open(const char *name, spa_t **spapp, void *tag) { - return (spa_open_common(name, spapp, tag, NULL)); + return (spa_open_common(name, spapp, tag, NULL, NULL)); } /* @@ -1572,6 +2343,8 @@ spa_add_spares(spa_t *spa, nvlist_t *config) uint_t vsc; uint64_t pool; + ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); + if (spa->spa_spares.sav_count == 0) return; @@ -1619,11 +2392,11 @@ spa_add_l2cache(spa_t *spa, nvlist_t *config) vdev_stat_t *vs; uint_t vsc; + ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); + if (spa->spa_l2cache.sav_count == 0) return; - spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); - VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, @@ -1657,8 +2430,6 @@ spa_add_l2cache(spa_t *spa, nvlist_t *config) vdev_get_stats(vd, vs); } } - - spa_config_exit(spa, SCL_CONFIG, FTAG); } int @@ -1668,18 +2439,29 @@ spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) spa_t *spa; *config = NULL; - error = spa_open_common(name, &spa, FTAG, config); + error = spa_open_common(name, &spa, FTAG, NULL, config); - if (spa && *config != NULL) { - VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, - spa_get_errlog_size(spa)) == 0); + if (spa != NULL) { + /* + * This still leaves a window of inconsistency where the spares + * or l2cache devices could change and the config would be + * self-inconsistent. 
+ */ + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); - if (spa_suspended(spa)) + if (*config != NULL) { VERIFY(nvlist_add_uint64(*config, - ZPOOL_CONFIG_SUSPENDED, spa->spa_failmode) == 0); + ZPOOL_CONFIG_ERRCOUNT, + spa_get_errlog_size(spa)) == 0); + + if (spa_suspended(spa)) + VERIFY(nvlist_add_uint64(*config, + ZPOOL_CONFIG_SUSPENDED, + spa->spa_failmode) == 0); - spa_add_spares(spa, *config); - spa_add_l2cache(spa, *config); + spa_add_spares(spa, *config); + spa_add_l2cache(spa, *config); + } } /* @@ -1701,8 +2483,10 @@ spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) } } - if (spa != NULL) + if (spa != NULL) { + spa_config_exit(spa, SCL_CONFIG, FTAG); spa_close(spa, FTAG); + } return (error); } @@ -1873,11 +2657,9 @@ spa_l2cache_drop(spa_t *spa) vd = sav->sav_vdevs[i]; ASSERT(vd != NULL); - if ((spa_mode & FWRITE) && - spa_l2cache_exists(vd->vdev_guid, &pool) && pool != 0ULL && - l2arc_vdev_present(vd)) { + if (spa_l2cache_exists(vd->vdev_guid, &pool) && + pool != 0ULL && l2arc_vdev_present(vd)) l2arc_remove_vdev(vd); - } if (vd->vdev_isl2cache) spa_l2cache_remove(vd); vdev_clear_stats(vd); @@ -1897,7 +2679,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, vdev_t *rvd; dsl_pool_t *dp; dmu_tx_t *tx; - int c, error = 0; + int error = 0; uint64_t txg = TXG_INITIAL; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; @@ -1917,13 +2699,10 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, */ (void) nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); - spa = spa_add(pool, altroot); - spa_activate(spa); - - spa->spa_uberblock.ub_txg = txg - 1; + spa = spa_add(pool, NULL, altroot); + spa_activate(spa, spa_mode_global); if (props && (error = spa_prop_validate(spa, props))) { - spa_unload(spa); spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); @@ -1934,9 +2713,18 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, &version) != 0) version = SPA_VERSION; ASSERT(version <= SPA_VERSION); + + spa->spa_first_txg = txg; + spa->spa_uberblock.ub_txg = txg - 1; spa->spa_uberblock.ub_version = version; spa->spa_ubsync = spa->spa_uberblock; + /* + * Create "The Godfather" zio to hold all async IOs + */ + spa->spa_async_zio_root = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); + /* * Create the root vdev. */ @@ -1954,9 +2742,10 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, (error = vdev_create(rvd, txg, B_FALSE)) == 0 && (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) { - for (c = 0; c < rvd->vdev_children; c++) - vdev_init(rvd->vdev_child[c], txg); - vdev_config_dirty(rvd); + for (int c = 0; c < rvd->vdev_children; c++) { + vdev_metaslab_set_size(rvd->vdev_child[c]); + vdev_expand(rvd->vdev_child[c], txg); + } } spa_config_exit(spa, SCL_ALL, FTAG); @@ -2002,6 +2791,13 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); spa->spa_meta_objset = dp->dp_meta_objset; + /* + * Create DDTs (dedup tables). + */ + ddt_create(spa); + + spa_update_dspace(spa); + tx = dmu_tx_create_assigned(dp, txg); /* @@ -2032,14 +2828,14 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, * because sync-to-convergence takes longer if the blocksize * keeps changing. 
*/ - spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset, + spa->spa_deferred_bplist_obj = bplist_create(spa->spa_meta_objset, 1 << 14, tx); - dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj, - ZIO_COMPRESS_OFF, tx); + dmu_object_set_compress(spa->spa_meta_objset, + spa->spa_deferred_bplist_obj, ZIO_COMPRESS_OFF, tx); if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, - sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) { + sizeof (uint64_t), 1, &spa->spa_deferred_bplist_obj, tx) != 0) { cmn_err(CE_PANIC, "failed to add bplist"); } @@ -2055,8 +2851,12 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); - if (props) + spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); + + if (props != NULL) { + spa_configfile_set(spa, props, B_FALSE); spa_sync_props(spa, props, CRED(), tx); + } dmu_tx_commit(tx); @@ -2073,403 +2873,410 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL) (void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE); - - mutex_exit(&spa_namespace_lock); + spa_history_log_version(spa, LOG_POOL_CREATE); spa->spa_minref = refcount_count(&spa->spa_refcount); + mutex_exit(&spa_namespace_lock); + return (0); } +#ifdef _KERNEL /* - * Import the given pool into the system. We set up the necessary spa_t and - * then call spa_load() to do the dirty work. + * Get the root pool information from the root disk, then import the root pool + * during the system boot up time. */ -static int -spa_import_common(const char *pool, nvlist_t *config, nvlist_t *props, - boolean_t isroot, boolean_t allowfaulted) +extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); + +static nvlist_t * +spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid) { - spa_t *spa; - char *altroot = NULL; - int error, loaderr; - nvlist_t *nvroot; - nvlist_t **spares, **l2cache; - uint_t nspares, nl2cache; + nvlist_t *config; + nvlist_t *nvtop, *nvroot; + uint64_t pgid; - /* - * If a pool with this name exists, return failure. - */ - mutex_enter(&spa_namespace_lock); - if ((spa = spa_lookup(pool)) != NULL) { - if (isroot) { - /* - * Remove the existing root pool from the - * namespace so that we can replace it with - * the correct config we just read in. - */ - ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); - spa_remove(spa); - } else { - mutex_exit(&spa_namespace_lock); - return (EEXIST); - } - } + if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0) + return (NULL); /* - * Create and initialize the spa structure. + * Add this top-level vdev to the child array. */ - (void) nvlist_lookup_string(props, - zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); - spa = spa_add(pool, altroot); - spa_activate(spa); - - if (allowfaulted) - spa->spa_import_faulted = B_TRUE; - spa->spa_is_root = isroot; + VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvtop) == 0); + VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, + &pgid) == 0); + VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0); /* - * Pass off the heavy lifting to spa_load(). 
- * Pass TRUE for mosconfig (unless this is a root pool) because - * the user-supplied config is actually the one to trust when - * doing an import. + * Put this pool's top-level vdevs into a root vdev. */ - loaderr = error = spa_load(spa, config, SPA_LOAD_IMPORT, !isroot); + VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, + VDEV_TYPE_ROOT) == 0); + VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); + VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); + VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, + &nvtop, 1) == 0); - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); /* - * Toss any existing sparelist, as it doesn't have any validity anymore, - * and conflicts with spa_has_spare(). + * Replace the existing vdev_tree with the new root vdev in + * this pool's configuration (remove the old, add the new). */ - if (!isroot && spa->spa_spares.sav_config) { - nvlist_free(spa->spa_spares.sav_config); - spa->spa_spares.sav_config = NULL; - spa_load_spares(spa); - } - if (!isroot && spa->spa_l2cache.sav_config) { - nvlist_free(spa->spa_l2cache.sav_config); - spa->spa_l2cache.sav_config = NULL; - spa_load_l2cache(spa); - } + VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); + nvlist_free(nvroot); + return (config); +} - VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, - &nvroot) == 0); - if (error == 0) - error = spa_validate_aux(spa, nvroot, -1ULL, VDEV_ALLOC_SPARE); - if (error == 0) - error = spa_validate_aux(spa, nvroot, -1ULL, - VDEV_ALLOC_L2CACHE); - spa_config_exit(spa, SCL_ALL, FTAG); +/* + * Walk the vdev tree and see if we can find a device with "better" + * configuration. A configuration is "better" if the label on that + * device has a more recent txg. + */ +static void +spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg) +{ + for (int c = 0; c < vd->vdev_children; c++) + spa_alt_rootvdev(vd->vdev_child[c], avd, txg); - if (error != 0 || (props && (error = spa_prop_set(spa, props)))) { - if (loaderr != 0 && loaderr != EINVAL && allowfaulted) { - /* - * If we failed to load the pool, but 'allowfaulted' is - * set, then manually set the config as if the config - * passed in was specified in the cache file. - */ - error = 0; - spa->spa_import_faulted = B_FALSE; - if (spa->spa_config == NULL) - spa->spa_config = spa_config_generate(spa, - NULL, -1ULL, B_TRUE); - spa_unload(spa); - spa_deactivate(spa); - spa_config_sync(spa, B_FALSE, B_TRUE); - } else { - spa_unload(spa); - spa_deactivate(spa); - spa_remove(spa); + if (vd->vdev_ops->vdev_op_leaf) { + nvlist_t *label; + uint64_t label_txg; + + if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid, + &label) != 0) + return; + + VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, + &label_txg) == 0); + + /* + * Do we have a better boot device? + */ + if (label_txg > *txg) { + *txg = label_txg; + *avd = vd; } - mutex_exit(&spa_namespace_lock); - return (error); + nvlist_free(label); } +} + +/* + * Import a root pool. + * + * For x86. devpath_list will consist of devid and/or physpath name of + * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a"). + * The GRUB "findroot" command will return the vdev we should boot. + * + * For Sparc, devpath_list consists the physpath name of the booting device + * no matter the rootpool is a single device pool or a mirrored pool. + * e.g. 
+ * "/pci@1f,0/ide@d/disk@0,0:a" + */ +int +spa_import_rootpool(char *devpath, char *devid) +{ + spa_t *spa; + vdev_t *rvd, *bvd, *avd = NULL; + nvlist_t *config, *nvtop; + uint64_t guid, txg; + char *pname; + int error; /* - * Override any spares and level 2 cache devices as specified by - * the user, as these may have correct device names/devids, etc. + * Read the label from the boot device and generate a configuration. */ - if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, - &spares, &nspares) == 0) { - if (spa->spa_spares.sav_config) - VERIFY(nvlist_remove(spa->spa_spares.sav_config, - ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); - else - VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, - NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, - ZPOOL_CONFIG_SPARES, spares, nspares) == 0); - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - spa_load_spares(spa); - spa_config_exit(spa, SCL_ALL, FTAG); - spa->spa_spares.sav_sync = B_TRUE; + config = spa_generate_rootconf(devpath, devid, &guid); +#if defined(_OBP) && defined(_KERNEL) + if (config == NULL) { + if (strstr(devpath, "/iscsi/ssd") != NULL) { + /* iscsi boot */ + get_iscsi_bootpath_phy(devpath); + config = spa_generate_rootconf(devpath, devid, &guid); + } } - if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, - &l2cache, &nl2cache) == 0) { - if (spa->spa_l2cache.sav_config) - VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, - ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); - else - VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, - NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, - ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - spa_load_l2cache(spa); - spa_config_exit(spa, SCL_ALL, FTAG); - spa->spa_l2cache.sav_sync = B_TRUE; +#endif + if (config == NULL) { + cmn_err(CE_NOTE, "Can not read the pool label from '%s'", + devpath); + return (EIO); } - if (spa_mode & FWRITE) { + VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, + &pname) == 0); + VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); + + mutex_enter(&spa_namespace_lock); + if ((spa = spa_lookup(pname)) != NULL) { /* - * Update the config cache to include the newly-imported pool. + * Remove the existing root pool from the namespace so that we + * can replace it with the correct config we just read in. */ - spa_config_update_common(spa, SPA_CONFIG_UPDATE_POOL, isroot); + spa_remove(spa); } - spa->spa_import_faulted = B_FALSE; - mutex_exit(&spa_namespace_lock); - - return (0); -} + spa = spa_add(pname, config, NULL); + spa->spa_is_root = B_TRUE; + spa->spa_load_verbatim = B_TRUE; -#ifdef _KERNEL -/* - * Build a "root" vdev for a top level vdev read in from a rootpool - * device label. - */ -static void -spa_build_rootpool_config(nvlist_t *config) -{ - nvlist_t *nvtop, *nvroot; - uint64_t pgid; + /* + * Build up a vdev tree based on the boot device's label config. + */ + VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvtop) == 0); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, + VDEV_ALLOC_ROOTPOOL); + spa_config_exit(spa, SCL_ALL, FTAG); + if (error) { + mutex_exit(&spa_namespace_lock); + nvlist_free(config); + cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", + pname); + return (error); + } /* - * Add this top-level vdev to the child array. + * Get the boot vdev. 
*/ - VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtop) - == 0); - VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pgid) - == 0); + if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) { + cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu", + (u_longlong_t)guid); + error = ENOENT; + goto out; + } /* - * Put this pool's top-level vdevs into a root vdev. + * Determine if there is a better boot device. */ - VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) - == 0); - VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); - VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); - VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, - &nvtop, 1) == 0); + avd = bvd; + spa_alt_rootvdev(rvd, &avd, &txg); + if (avd != bvd) { + cmn_err(CE_NOTE, "The boot device is 'degraded'. Please " + "try booting from '%s'", avd->vdev_path); + error = EINVAL; + goto out; + } /* - * Replace the existing vdev_tree with the new root vdev in - * this pool's configuration (remove the old, add the new). + * If the boot device is part of a spare vdev then ensure that + * we're booting off the active spare. */ - VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); - nvlist_free(nvroot); + if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops && + !bvd->vdev_isspare) { + cmn_err(CE_NOTE, "The boot device is currently spared. Please " + "try booting from '%s'", + bvd->vdev_parent->vdev_child[1]->vdev_path); + error = EINVAL; + goto out; + } + + error = 0; + spa_history_log_version(spa, LOG_POOL_IMPORT); +out: + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + vdev_free(rvd); + spa_config_exit(spa, SCL_ALL, FTAG); + mutex_exit(&spa_namespace_lock); + + nvlist_free(config); + return (error); } +#endif + /* - * Get the root pool information from the root disk, then import the root pool - * during the system boot up time. + * Take a pool and insert it into the namespace as if it had been loaded at + * boot. 
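/*
 * Illustrative sketch only, not part of the patch: the "better boot
 * device" selection above boils down to comparing ZPOOL_CONFIG_POOL_TXG
 * across the labels read from each leaf vdev.  label_newer() is a
 * hypothetical helper shown purely to make that comparison explicit.
 */
static boolean_t
label_newer(nvlist_t *label, uint64_t best_txg)
{
	uint64_t label_txg = 0;

	/* A label that was never written (or was detached) reports txg 0. */
	(void) nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, &label_txg);

	return (label_txg > best_txg);
}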
*/ -extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); - int -spa_check_rootconf(char *devpath, char *devid, nvlist_t **bestconf, - uint64_t *besttxg) +spa_import_verbatim(const char *pool, nvlist_t *config, nvlist_t *props) { - nvlist_t *config; - uint64_t txg; - int error; + spa_t *spa; + char *altroot = NULL; - if (error = vdev_disk_read_rootlabel(devpath, devid, &config)) - return (error); + mutex_enter(&spa_namespace_lock); + if (spa_lookup(pool) != NULL) { + mutex_exit(&spa_namespace_lock); + return (EEXIST); + } - VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); + (void) nvlist_lookup_string(props, + zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); + spa = spa_add(pool, config, altroot); - if (bestconf != NULL) - *bestconf = config; - else - nvlist_free(config); - *besttxg = txg; - return (0); -} + spa->spa_load_verbatim = B_TRUE; -boolean_t -spa_rootdev_validate(nvlist_t *nv) -{ - uint64_t ival; + if (props != NULL) + spa_configfile_set(spa, props, B_FALSE); - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &ival) == 0 || - nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, &ival) == 0 || - nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, &ival) == 0) - return (B_FALSE); + spa_config_sync(spa, B_FALSE, B_TRUE); - return (B_TRUE); -} + mutex_exit(&spa_namespace_lock); + spa_history_log_version(spa, LOG_POOL_IMPORT); + return (0); +} /* - * Given the boot device's physical path or devid, check if the device - * is in a valid state. If so, return the configuration from the vdev - * label. + * Import a non-root pool into the system. */ int -spa_get_rootconf(char *devpath, char *devid, nvlist_t **bestconf) +spa_import(const char *pool, nvlist_t *config, nvlist_t *props) { - nvlist_t *conf = NULL; - uint64_t txg = 0; - nvlist_t *nvtop, **child; - char *type; - char *bootpath = NULL; - uint_t children, c; - char *tmp; + spa_t *spa; + char *altroot = NULL; + spa_load_state_t state = SPA_LOAD_IMPORT; + zpool_rewind_policy_t policy; int error; + nvlist_t *nvroot; + nvlist_t **spares, **l2cache; + uint_t nspares, nl2cache; - if (devpath && ((tmp = strchr(devpath, ' ')) != NULL)) - *tmp = '\0'; - if (error = spa_check_rootconf(devpath, devid, &conf, &txg)) { - cmn_err(CE_NOTE, "error reading device label"); - return (error); - } - if (txg == 0) { - cmn_err(CE_NOTE, "this device is detached"); - nvlist_free(conf); - return (EINVAL); + /* + * If a pool with this name exists, return failure. + */ + mutex_enter(&spa_namespace_lock); + if (spa_lookup(pool) != NULL) { + mutex_exit(&spa_namespace_lock); + return (EEXIST); } - VERIFY(nvlist_lookup_nvlist(conf, ZPOOL_CONFIG_VDEV_TREE, - &nvtop) == 0); - VERIFY(nvlist_lookup_string(nvtop, ZPOOL_CONFIG_TYPE, &type) == 0); + zpool_get_rewind_policy(config, &policy); + if (policy.zrp_request & ZPOOL_DO_REWIND) + state = SPA_LOAD_RECOVER; - if (strcmp(type, VDEV_TYPE_DISK) == 0) { - if (spa_rootdev_validate(nvtop)) { - goto out; - } else { - nvlist_free(conf); - return (EINVAL); - } - } + /* + * Create and initialize the spa structure. + */ + (void) nvlist_lookup_string(props, + zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); + spa = spa_add(pool, config, altroot); + spa_activate(spa, spa_mode_global); - ASSERT(strcmp(type, VDEV_TYPE_MIRROR) == 0); + /* + * Don't start async tasks until we know everything is healthy. + */ + spa_async_suspend(spa); - VERIFY(nvlist_lookup_nvlist_array(nvtop, ZPOOL_CONFIG_CHILDREN, - &child, &children) == 0); + /* + * Pass off the heavy lifting to spa_load(). 
Pass TRUE for mosconfig + * because the user-supplied config is actually the one to trust when + * doing an import. + */ + if (state != SPA_LOAD_RECOVER) + spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; + error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg, + policy.zrp_request); /* - * Go thru vdevs in the mirror to see if the given device - * has the most recent txg. Only the device with the most - * recent txg has valid information and should be booted. + * Propagate anything learned about failing or best txgs + * back to caller */ - for (c = 0; c < children; c++) { - char *cdevid, *cpath; - uint64_t tmptxg; + spa_rewind_data_to_nvlist(spa, config); - if (nvlist_lookup_string(child[c], ZPOOL_CONFIG_PHYS_PATH, - &cpath) != 0) - return (EINVAL); - if (nvlist_lookup_string(child[c], ZPOOL_CONFIG_DEVID, - &cdevid) != 0) - return (EINVAL); - if ((spa_check_rootconf(cpath, cdevid, NULL, - &tmptxg) == 0) && (tmptxg > txg)) { - txg = tmptxg; - VERIFY(nvlist_lookup_string(child[c], - ZPOOL_CONFIG_PATH, &bootpath) == 0); - } + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + /* + * Toss any existing sparelist, as it doesn't have any validity + * anymore, and conflicts with spa_has_spare(). + */ + if (spa->spa_spares.sav_config) { + nvlist_free(spa->spa_spares.sav_config); + spa->spa_spares.sav_config = NULL; + spa_load_spares(spa); + } + if (spa->spa_l2cache.sav_config) { + nvlist_free(spa->spa_l2cache.sav_config); + spa->spa_l2cache.sav_config = NULL; + spa_load_l2cache(spa); } - /* Does the best device match the one we've booted from? */ - if (bootpath) { - cmn_err(CE_NOTE, "try booting from '%s'", bootpath); - return (EINVAL); + VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0); + if (error == 0) + error = spa_validate_aux(spa, nvroot, -1ULL, + VDEV_ALLOC_SPARE); + if (error == 0) + error = spa_validate_aux(spa, nvroot, -1ULL, + VDEV_ALLOC_L2CACHE); + spa_config_exit(spa, SCL_ALL, FTAG); + + if (props != NULL) + spa_configfile_set(spa, props, B_FALSE); + + if (error != 0 || (props && spa_writeable(spa) && + (error = spa_prop_set(spa, props)))) { + spa_unload(spa); + spa_deactivate(spa); + spa_remove(spa); + mutex_exit(&spa_namespace_lock); + return (error); } -out: - *bestconf = conf; - return (0); -} -/* - * Import a root pool. - * - * For x86. devpath_list will consist of devid and/or physpath name of - * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a"). - * The GRUB "findroot" command will return the vdev we should boot. - * - * For Sparc, devpath_list consists the physpath name of the booting device - * no matter the rootpool is a single device pool or a mirrored pool. - * e.g. - * "/pci@1f,0/ide@d/disk@0,0:a" - */ -int -spa_import_rootpool(char *devpath, char *devid) -{ - nvlist_t *conf = NULL; - char *pname; - int error; + spa_async_resume(spa); /* - * Get the vdev pathname and configuation from the most - * recently updated vdev (highest txg). + * Override any spares and level 2 cache devices as specified by + * the user, as these may have correct device names/devids, etc. 
*/ - if (error = spa_get_rootconf(devpath, devid, &conf)) - goto msg_out; + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, + &spares, &nspares) == 0) { + if (spa->spa_spares.sav_config) + VERIFY(nvlist_remove(spa->spa_spares.sav_config, + ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); + else + VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, + NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, + ZPOOL_CONFIG_SPARES, spares, nspares) == 0); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + spa_load_spares(spa); + spa_config_exit(spa, SCL_ALL, FTAG); + spa->spa_spares.sav_sync = B_TRUE; + } + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, + &l2cache, &nl2cache) == 0) { + if (spa->spa_l2cache.sav_config) + VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, + ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); + else + VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, + NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, + ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + spa_load_l2cache(spa); + spa_config_exit(spa, SCL_ALL, FTAG); + spa->spa_l2cache.sav_sync = B_TRUE; + } /* - * Add type "root" vdev to the config. + * Check for any removed devices. */ - spa_build_rootpool_config(conf); + if (spa->spa_autoreplace) { + spa_aux_check_removed(&spa->spa_spares); + spa_aux_check_removed(&spa->spa_l2cache); + } - VERIFY(nvlist_lookup_string(conf, ZPOOL_CONFIG_POOL_NAME, &pname) == 0); + if (spa_writeable(spa)) { + /* + * Update the config cache to include the newly-imported pool. + */ + spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); + } /* - * We specify 'allowfaulted' for this to be treated like spa_open() - * instead of spa_import(). This prevents us from marking vdevs as - * persistently unavailable, and generates FMA ereports as if it were a - * pool open, not import. + * It's possible that the pool was expanded while it was exported. + * We kick off an async task to handle this for us. */ - error = spa_import_common(pname, conf, NULL, B_TRUE, B_TRUE); - ASSERT(error != EEXIST); + spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); - nvlist_free(conf); - return (error); - -msg_out: - cmn_err(CE_NOTE, "\n" - " *************************************************** \n" - " * This device is not bootable! * \n" - " * It is either offlined or detached or faulted. * \n" - " * Please try to boot from a different device. * \n" - " *************************************************** "); - - return (error); -} -#endif - -/* - * Import a non-root pool into the system. - */ -int -spa_import(const char *pool, nvlist_t *config, nvlist_t *props) -{ - return (spa_import_common(pool, config, props, B_FALSE, B_FALSE)); -} + mutex_exit(&spa_namespace_lock); + spa_history_log_version(spa, LOG_POOL_IMPORT); -int -spa_import_faulted(const char *pool, nvlist_t *config, nvlist_t *props) -{ - return (spa_import_common(pool, config, props, B_FALSE, B_TRUE)); + return (0); } - -/* - * This (illegal) pool name is used when temporarily importing a spa_t in order - * to get the vdev stats associated with the imported devices. 
- */ -#define TRYIMPORT_NAME "$import" - nvlist_t * spa_tryimport(nvlist_t *tryconfig) { @@ -2477,6 +3284,7 @@ spa_tryimport(nvlist_t *tryconfig) char *poolname; spa_t *spa; uint64_t state; + int error; if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) return (NULL); @@ -2488,15 +3296,15 @@ spa_tryimport(nvlist_t *tryconfig) * Create and initialize the spa structure. */ mutex_enter(&spa_namespace_lock); - spa = spa_add(TRYIMPORT_NAME, NULL); - spa_activate(spa); + spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); + spa_activate(spa, FREAD); /* * Pass off the heavy lifting to spa_load(). * Pass TRUE for mosconfig because the user-supplied config * is actually the one to trust when doing an import. */ - (void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE); + error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE); /* * If 'tryconfig' was at least parsable, return the current config. @@ -2515,7 +3323,7 @@ spa_tryimport(nvlist_t *tryconfig) * copy it out so that external consumers can tell which * pools are bootable. */ - if (spa->spa_bootfs) { + if ((!error || error == EEXIST) && spa->spa_bootfs) { char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); /* @@ -2545,8 +3353,10 @@ spa_tryimport(nvlist_t *tryconfig) /* * Add the list of hot spares and level 2 cache devices. */ + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); spa_add_spares(spa, config); spa_add_l2cache(spa, config); + spa_config_exit(spa, SCL_CONFIG, FTAG); } spa_unload(spa); @@ -2563,18 +3373,19 @@ spa_tryimport(nvlist_t *tryconfig) * The act of destroying or exporting a pool is very simple. We make sure there * is no more pending I/O and any references to the pool are gone. Then, we * update the pool state and sync all the labels to disk, removing the - * configuration from the cache afterwards. + * configuration from the cache afterwards. If the 'hardforce' flag is set, then + * we don't sync the labels or remove the configuration cache. */ static int spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, - boolean_t force) + boolean_t force, boolean_t hardforce) { spa_t *spa; if (oldconfig) *oldconfig = NULL; - if (!(spa_mode & FWRITE)) + if (!(spa_mode_global & FWRITE)) return (EROFS); mutex_enter(&spa_namespace_lock); @@ -2635,7 +3446,7 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, * so mark them all dirty. spa_unload() will do the * final sync that pushes these changes out. */ - if (new_state != POOL_STATE_UNINITIALIZED) { + if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa->spa_state = new_state; spa->spa_final_txg = spa_last_synced_txg(spa) + 1; @@ -2655,7 +3466,8 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); if (new_state != POOL_STATE_UNINITIALIZED) { - spa_config_sync(spa, B_TRUE, B_TRUE); + if (!hardforce) + spa_config_sync(spa, B_TRUE, B_TRUE); spa_remove(spa); } mutex_exit(&spa_namespace_lock); @@ -2669,16 +3481,19 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, int spa_destroy(char *pool) { - return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, B_FALSE)); + return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, + B_FALSE, B_FALSE)); } /* * Export a storage pool. 
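/*
 * Illustrative sketch only, not part of the patch: the new 'hardforce'
 * argument threaded through spa_export_common() lets a caller drop a
 * pool from the namespace without rewriting its labels or touching the
 * cachefile.  The wrapper below is hypothetical; it only shows how the
 * two force flags map onto spa_export().
 */
static int
example_hard_export(char *pool)
{
	nvlist_t *oldconfig = NULL;
	int error;

	/* force = B_TRUE, hardforce = B_TRUE: skip label sync and cache update. */
	error = spa_export(pool, &oldconfig, B_TRUE, B_TRUE);

	if (oldconfig != NULL)
		nvlist_free(oldconfig);
	return (error);
}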
*/ int -spa_export(char *pool, nvlist_t **oldconfig, boolean_t force) +spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, + boolean_t hardforce) { - return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, force)); + return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, + force, hardforce)); } /* @@ -2689,7 +3504,7 @@ int spa_reset(char *pool) { return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, - B_FALSE)); + B_FALSE, B_FALSE)); } /* @@ -2704,8 +3519,8 @@ spa_reset(char *pool) int spa_vdev_add(spa_t *spa, nvlist_t *nvroot) { - uint64_t txg; - int c, error; + uint64_t txg, id; + int error; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd, *tvd; nvlist_t **spares, **l2cache; @@ -2744,10 +3559,20 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) /* * Transfer each new top-level vdev from vd to rvd. */ - for (c = 0; c < vd->vdev_children; c++) { + for (int c = 0; c < vd->vdev_children; c++) { + + /* + * Set the vdev id to the first hole, if one exists. + */ + for (id = 0; id < rvd->vdev_children; id++) { + if (rvd->vdev_child[id]->vdev_ishole) { + vdev_free(rvd->vdev_child[id]); + break; + } + } tvd = vd->vdev_child[c]; vdev_remove_child(vd, tvd); - tvd->vdev_id = rvd->vdev_children; + tvd->vdev_id = id; vdev_add_child(rvd, tvd); vdev_config_dirty(tvd); } @@ -2808,7 +3633,6 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) vdev_t *rvd = spa->spa_root_vdev; vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; vdev_ops_t *pvops; - dmu_tx_t *tx; char *oldvdpath, *newvdpath; int newvd_isspare; int error; @@ -2887,10 +3711,9 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) } /* - * Compare the new device size with the replaceable/attachable - * device size. + * Make sure the new device is big enough. */ - if (newvd->vdev_psize < vdev_get_rsize(oldvd)) + if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); /* @@ -2932,14 +3755,9 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) */ vdev_remove_child(newrootvd, newvd); newvd->vdev_id = pvd->vdev_children; + newvd->vdev_crtxg = oldvd->vdev_crtxg; vdev_add_child(pvd, newvd); - /* - * If newvd is smaller than oldvd, but larger than its rsize, - * the addition of newvd may have decreased our parent's asize. - */ - pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize); - tvd = newvd->vdev_top; ASSERT(pvd->vdev_top == tvd); ASSERT(tvd->vdev_parent == rvd); @@ -2952,13 +3770,14 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) */ open_txg = txg + TXG_CONCURRENT_STATES - 1; - mutex_enter(&newvd->vdev_dtl_lock); - space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL, - open_txg - TXG_INITIAL + 1); - mutex_exit(&newvd->vdev_dtl_lock); + vdev_dtl_dirty(newvd, DTL_MISSING, + TXG_INITIAL, open_txg - TXG_INITIAL + 1); - if (newvd->vdev_isspare) + if (newvd->vdev_isspare) { spa_spare_activate(newvd); + spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE); + } + oldvdpath = spa_strdup(oldvd->vdev_path); newvdpath = spa_strdup(newvd->vdev_path); newvd_isspare = newvd->vdev_isspare; @@ -2970,17 +3789,11 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); - tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); - if (dmu_tx_assign(tx, TXG_WAIT) == 0) { - spa_history_internal_log(LOG_POOL_VDEV_ATTACH, spa, tx, - CRED(), "%s vdev=%s %s vdev=%s", - replacing && newvd_isspare ? "spare in" : - replacing ? 
"replace" : "attach", newvdpath, - replacing ? "for" : "to", oldvdpath); - dmu_tx_commit(tx); - } else { - dmu_tx_abort(tx); - } + spa_history_internal_log(LOG_POOL_VDEV_ATTACH, spa, NULL, + CRED(), "%s vdev=%s %s vdev=%s", + replacing && newvd_isspare ? "spare in" : + replacing ? "replace" : "attach", newvdpath, + replacing ? "for" : "to", oldvdpath); spa_strfree(oldvdpath); spa_strfree(newvdpath); @@ -2999,15 +3812,16 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) * is a replacing vdev. */ int -spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) +spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) { uint64_t txg; - int c, t, error; + int error; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd, *pvd, *cvd, *tvd; boolean_t unspare = B_FALSE; uint64_t unspare_guid; size_t len; + char *vdpath; txg = spa_vdev_enter(spa); @@ -3021,6 +3835,22 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) pvd = vd->vdev_parent; + /* + * If the parent/child relationship is not as expected, don't do it. + * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing + * vdev that's replacing B with C. The user's intent in replacing + * is to go from M(A,B) to M(A,C). If the user decides to cancel + * the replace by detaching C, the expected behavior is to end up + * M(A,B). But suppose that right after deciding to detach C, + * the replacement of B completes. We would have M(A,C), and then + * ask to detach C, which would leave us with just A -- not what + * the user wanted. To prevent this, we make sure that the + * parent/child relationship hasn't changed -- in this example, + * that C's parent is still the replacing vdev R. + */ + if (pvd->vdev_guid != pguid && pguid != 0) + return (spa_vdev_exit(spa, NULL, txg, EBUSY)); + /* * If replace_done is specified, only remove this device if it's * the first child of a replacing vdev. For the 'spare' vdev, either @@ -3047,36 +3877,13 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); /* - * If there's only one replica, you can't detach it. + * If this device has the only valid copy of some data, + * we cannot safely detach it. */ - if (pvd->vdev_children <= 1) + if (vdev_dtl_required(vd)) return (spa_vdev_exit(spa, NULL, txg, EBUSY)); - /* - * If all siblings have non-empty DTLs, this device may have the only - * valid copy of the data, which means we cannot safely detach it. - * - * XXX -- as in the vdev_offline() case, we really want a more - * precise DTL check. - */ - for (c = 0; c < pvd->vdev_children; c++) { - uint64_t dirty; - - cvd = pvd->vdev_child[c]; - if (cvd == vd) - continue; - if (vdev_is_dead(cvd)) - continue; - mutex_enter(&cvd->vdev_dtl_lock); - dirty = cvd->vdev_dtl_map.sm_space | - cvd->vdev_dtl_scrub.sm_space; - mutex_exit(&cvd->vdev_dtl_lock); - if (!dirty) - break; - } - - if (c == pvd->vdev_children) - return (spa_vdev_exit(spa, NULL, txg, EBUSY)); + ASSERT(pvd->vdev_children >= 2); /* * If we are detaching the second disk from a replacing vdev, then @@ -3102,7 +3909,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) * active spare list for the pool. 
*/ if (pvd->vdev_ops == &vdev_spare_ops && - vd->vdev_id == 0) + vd->vdev_id == 0 && pvd->vdev_child[1]->vdev_isspare) unspare = B_TRUE; /* @@ -3128,80 +3935,369 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) /* * If we need to remove the remaining child from the list of hot spares, - * do it now, marking the vdev as no longer a spare in the process. We - * must do this before vdev_remove_parent(), because that can change the - * GUID if it creates a new toplevel GUID. + * do it now, marking the vdev as no longer a spare in the process. + * We must do this before vdev_remove_parent(), because that can + * change the GUID if it creates a new toplevel GUID. For a similar + * reason, we must remove the spare now, in the same txg as the detach; + * otherwise someone could attach a new sibling, change the GUID, and + * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. */ if (unspare) { ASSERT(cvd->vdev_isspare); spa_spare_remove(cvd); unspare_guid = cvd->vdev_guid; + (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); + } + + /* + * If the parent mirror/replacing vdev only has one child, + * the parent is no longer needed. Remove it from the tree. + */ + if (pvd->vdev_children == 1) + vdev_remove_parent(cvd); + + /* + * We don't set tvd until now because the parent we just removed + * may have been the previous top-level vdev. + */ + tvd = cvd->vdev_top; + ASSERT(tvd->vdev_parent == rvd); + + /* + * Reevaluate the parent vdev state. + */ + vdev_propagate_state(cvd); + + /* + * If the 'autoexpand' property is set on the pool then automatically + * try to expand the size of the pool. For example if the device we + * just detached was smaller than the others, it may be possible to + * add metaslabs (i.e. grow the pool). We need to reopen the vdev + * first so that we can obtain the updated sizes of the leaf vdevs. + */ + if (spa->spa_autoexpand) { + vdev_reopen(tvd); + vdev_expand(tvd, txg); + } + + vdev_config_dirty(tvd); + + /* + * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that + * vd->vdev_detached is set and free vd's DTL object in syncing context. + * But first make sure we're not on any *other* txg's DTL list, to + * prevent vd from being accessed after it's freed. + */ + vdpath = spa_strdup(vd->vdev_path); + for (int t = 0; t < TXG_SIZE; t++) + (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); + vd->vdev_detached = B_TRUE; + vdev_dirty(tvd, VDD_DTL, vd, txg); + + spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); + + error = spa_vdev_exit(spa, vd, txg, 0); + + spa_history_internal_log(LOG_POOL_VDEV_DETACH, spa, NULL, CRED(), + "vdev=%s", vdpath); + spa_strfree(vdpath); + + /* + * If this was the removal of the original device in a hot spare vdev, + * then we want to go through and remove the device from the hot spare + * list of every other pool. + */ + if (unspare) { + spa_t *myspa = spa; + spa = NULL; + mutex_enter(&spa_namespace_lock); + while ((spa = spa_next(spa)) != NULL) { + if (spa->spa_state != POOL_STATE_ACTIVE) + continue; + if (spa == myspa) + continue; + spa_open_ref(spa, FTAG); + mutex_exit(&spa_namespace_lock); + (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); + mutex_enter(&spa_namespace_lock); + spa_close(spa, FTAG); + } + mutex_exit(&spa_namespace_lock); + } + + return (error); +} + +/* + * Split a set of devices from their mirrors, and create a new pool from them. 
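/*
 * Illustrative sketch only, not part of the patch: spa_vdev_split_mirror()
 * below expects the caller-supplied 'config' to carry a vdev tree whose
 * ZPOOL_CONFIG_CHILDREN entries name, by guid, the one leaf to take from
 * each top-level mirror.  example_split_config() is a hypothetical helper
 * showing that nvlist shape; the guids[] array would come from the
 * existing pool's configuration.
 */
static nvlist_t *
example_split_config(uint64_t *guids, uint_t nchildren)
{
	nvlist_t *config, *tree, **child;
	uint_t c;

	child = kmem_zalloc(nchildren * sizeof (nvlist_t *), KM_SLEEP);
	for (c = 0; c < nchildren; c++) {
		VERIFY(nvlist_alloc(&child[c], NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_GUID,
		    guids[c]) == 0);
	}

	VERIFY(nvlist_alloc(&tree, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN,
	    child, nchildren) == 0);

	VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, tree) == 0);

	/* nvlist_add_* copies its inputs, so the temporaries can be freed. */
	for (c = 0; c < nchildren; c++)
		nvlist_free(child[c]);
	nvlist_free(tree);
	kmem_free(child, nchildren * sizeof (nvlist_t *));

	return (config);
}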
+ */ +int +spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, + nvlist_t *props, boolean_t exp) +{ + int error = 0; + uint64_t txg, *glist; + spa_t *newspa; + uint_t c, children, lastlog; + nvlist_t **child, *nvl, *tmp; + dmu_tx_t *tx; + char *altroot = NULL; + vdev_t *rvd, **vml = NULL; /* vdev modify list */ + boolean_t activate_slog; + + if (!spa_writeable(spa)) + return (EROFS); + + txg = spa_vdev_enter(spa); + + /* clear the log and flush everything up to now */ + activate_slog = spa_passivate_log(spa); + (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); + error = spa_offline_log(spa); + txg = spa_vdev_config_enter(spa); + + if (activate_slog) + spa_activate_log(spa); + + if (error != 0) + return (spa_vdev_exit(spa, NULL, txg, error)); + + /* check new spa name before going any further */ + if (spa_lookup(newname) != NULL) + return (spa_vdev_exit(spa, NULL, txg, EEXIST)); + + /* + * scan through all the children to ensure they're all mirrors + */ + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || + nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, + &children) != 0) + return (spa_vdev_exit(spa, NULL, txg, EINVAL)); + + /* first, check to ensure we've got the right child count */ + rvd = spa->spa_root_vdev; + lastlog = 0; + for (c = 0; c < rvd->vdev_children; c++) { + vdev_t *vd = rvd->vdev_child[c]; + + /* don't count the holes & logs as children */ + if (vd->vdev_islog || vd->vdev_ishole) { + if (lastlog == 0) + lastlog = c; + continue; + } + + lastlog = 0; + } + if (children != (lastlog != 0 ? lastlog : rvd->vdev_children)) + return (spa_vdev_exit(spa, NULL, txg, EINVAL)); + + /* next, ensure no spare or cache devices are part of the split */ + if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || + nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) + return (spa_vdev_exit(spa, NULL, txg, EINVAL)); + + vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); + glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); + + /* then, loop over each vdev and validate it */ + for (c = 0; c < children; c++) { + uint64_t is_hole = 0; + + (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, + &is_hole); + + if (is_hole != 0) { + if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || + spa->spa_root_vdev->vdev_child[c]->vdev_islog) { + continue; + } else { + error = EINVAL; + break; + } + } + + /* which disk is going to be split? 
*/ + if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, + &glist[c]) != 0) { + error = EINVAL; + break; + } + + /* look it up in the spa */ + vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); + if (vml[c] == NULL) { + error = ENODEV; + break; + } + + /* make sure there's nothing stopping the split */ + if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || + vml[c]->vdev_islog || + vml[c]->vdev_ishole || + vml[c]->vdev_isspare || + vml[c]->vdev_isl2cache || + !vdev_writeable(vml[c]) || + vml[c]->vdev_children != 0 || + vml[c]->vdev_state != VDEV_STATE_HEALTHY || + c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { + error = EINVAL; + break; + } + + if (vdev_dtl_required(vml[c])) { + error = EBUSY; + break; + } + + /* we need certain info from the top level */ + VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, + vml[c]->vdev_top->vdev_ms_array) == 0); + VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, + vml[c]->vdev_top->vdev_ms_shift) == 0); + VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, + vml[c]->vdev_top->vdev_asize) == 0); + VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, + vml[c]->vdev_top->vdev_ashift) == 0); + } + + if (error != 0) { + kmem_free(vml, children * sizeof (vdev_t *)); + kmem_free(glist, children * sizeof (uint64_t)); + return (spa_vdev_exit(spa, NULL, txg, error)); + } + + /* stop writers from using the disks */ + for (c = 0; c < children; c++) { + if (vml[c] != NULL) + vml[c]->vdev_offline = B_TRUE; } + vdev_reopen(spa->spa_root_vdev); /* - * If the parent mirror/replacing vdev only has one child, - * the parent is no longer needed. Remove it from the tree. + * Temporarily record the splitting vdevs in the spa config. This + * will disappear once the config is regenerated. */ - if (pvd->vdev_children == 1) - vdev_remove_parent(cvd); + VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, + glist, children) == 0); + kmem_free(glist, children * sizeof (uint64_t)); + + VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, + nvl) == 0); + spa->spa_config_splitting = nvl; + vdev_config_dirty(spa->spa_root_vdev); - /* - * We don't set tvd until now because the parent we just removed - * may have been the previous top-level vdev. - */ - tvd = cvd->vdev_top; - ASSERT(tvd->vdev_parent == rvd); + /* configure and create the new pool */ + VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0); + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, + exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0); + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, + spa_version(spa)) == 0); + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, + spa->spa_config_txg) == 0); + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, + spa_generate_guid(NULL)) == 0); + (void) nvlist_lookup_string(props, + zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); - /* - * Reevaluate the parent vdev state. - */ - vdev_propagate_state(cvd); + /* add the new pool to the namespace */ + newspa = spa_add(newname, config, altroot); + newspa->spa_config_txg = spa->spa_config_txg; + spa_set_log_state(newspa, SPA_LOG_CLEAR); - /* - * If the device we just detached was smaller than the others, it may be - * possible to add metaslabs (i.e. grow the pool). vdev_metaslab_init() - * can't fail because the existing metaslabs are already in core, so - * there's nothing to read from disk. 
- */ - VERIFY(vdev_metaslab_init(tvd, txg) == 0); + /* release the spa config lock, retaining the namespace lock */ + spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); - vdev_config_dirty(tvd); + if (zio_injection_enabled) + zio_handle_panic_injection(spa, FTAG, 1); - /* - * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that - * vd->vdev_detached is set and free vd's DTL object in syncing context. - * But first make sure we're not on any *other* txg's DTL list, to - * prevent vd from being accessed after it's freed. - */ - for (t = 0; t < TXG_SIZE; t++) - (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); - vd->vdev_detached = B_TRUE; - vdev_dirty(tvd, VDD_DTL, vd, txg); + spa_activate(newspa, spa_mode_global); + spa_async_suspend(newspa); - spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); + /* create the new pool from the disks of the original pool */ + error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE); + if (error) + goto out; - error = spa_vdev_exit(spa, vd, txg, 0); + /* if that worked, generate a real config for the new pool */ + if (newspa->spa_root_vdev != NULL) { + VERIFY(nvlist_alloc(&newspa->spa_config_splitting, + NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_uint64(newspa->spa_config_splitting, + ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0); + spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, + B_TRUE)); + } - /* - * If this was the removal of the original device in a hot spare vdev, - * then we want to go through and remove the device from the hot spare - * list of every other pool. - */ - if (unspare) { - spa = NULL; - mutex_enter(&spa_namespace_lock); - while ((spa = spa_next(spa)) != NULL) { - if (spa->spa_state != POOL_STATE_ACTIVE) - continue; - spa_open_ref(spa, FTAG); - mutex_exit(&spa_namespace_lock); - (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); - mutex_enter(&spa_namespace_lock); - spa_close(spa, FTAG); + /* set the props */ + if (props != NULL) { + spa_configfile_set(newspa, props, B_FALSE); + error = spa_prop_set(newspa, props); + if (error) + goto out; + } + + /* flush everything */ + txg = spa_vdev_config_enter(newspa); + vdev_config_dirty(newspa->spa_root_vdev); + (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); + + if (zio_injection_enabled) + zio_handle_panic_injection(spa, FTAG, 2); + + spa_async_resume(newspa); + + /* finally, update the original pool's config */ + txg = spa_vdev_config_enter(spa); + tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error != 0) + dmu_tx_abort(tx); + for (c = 0; c < children; c++) { + if (vml[c] != NULL) { + vdev_split(vml[c]); + if (error == 0) + spa_history_internal_log(LOG_POOL_VDEV_DETACH, + spa, tx, CRED(), "vdev=%s", + vml[c]->vdev_path); + vdev_free(vml[c]); } - mutex_exit(&spa_namespace_lock); } + vdev_config_dirty(spa->spa_root_vdev); + spa->spa_config_splitting = NULL; + nvlist_free(nvl); + if (error == 0) + dmu_tx_commit(tx); + (void) spa_vdev_exit(spa, NULL, txg, 0); + + if (zio_injection_enabled) + zio_handle_panic_injection(spa, FTAG, 3); + + /* split is complete; log a history record */ + spa_history_internal_log(LOG_POOL_SPLIT, newspa, NULL, CRED(), + "split new pool %s from pool %s", newname, spa_name(spa)); + + kmem_free(vml, children * sizeof (vdev_t *)); + + /* if we're not going to mount the filesystems in userland, export */ + if (exp) + error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, + B_FALSE, B_FALSE); + + return (error); +out: + spa_unload(newspa); + 
spa_deactivate(newspa); + spa_remove(newspa); + + txg = spa_vdev_config_enter(spa); + nvlist_free(spa->spa_config_splitting); + spa->spa_config_splitting = NULL; + (void) spa_vdev_exit(spa, NULL, txg, error); + + kmem_free(vml, children * sizeof (vdev_t *)); return (error); } @@ -3246,20 +4342,112 @@ spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, kmem_free(newdev, (count - 1) * sizeof (void *)); } +/* + * Removing a device from the vdev namespace requires several steps + * and can take a significant amount of time. As a result we use + * the spa_vdev_config_[enter/exit] functions which allow us to + * grab and release the spa_config_lock while still holding the namespace + * lock. During each step the configuration is synced out. + */ + +/* + * Evacuate the device. + */ +int +spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd) +{ + int error = 0; + uint64_t txg; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); + ASSERT(vd == vd->vdev_top); + + /* + * Evacuate the device. We don't hold the config lock as writer + * since we need to do I/O but we do keep the + * spa_namespace_lock held. Once this completes the device + * should no longer have any blocks allocated on it. + */ + if (vd->vdev_islog) { + error = dmu_objset_find(spa_name(spa), zil_vdev_offline, + NULL, DS_FIND_CHILDREN); + } else { + error = ENOTSUP; /* until we have bp rewrite */ + } + + txg_wait_synced(spa_get_dsl(spa), 0); + + if (error) + return (error); + + /* + * The evacuation succeeded. Remove any remaining MOS metadata + * associated with this vdev, and wait for these changes to sync. + */ + txg = spa_vdev_config_enter(spa); + vd->vdev_removing = B_TRUE; + vdev_dirty(vd, 0, NULL, txg); + vdev_config_dirty(vd); + spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); + + return (0); +} + +/* + * Complete the removal by cleaning up the namespace. + */ +void +spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd) +{ + vdev_t *rvd = spa->spa_root_vdev; + uint64_t id = vd->vdev_id; + boolean_t last_vdev = (id == (rvd->vdev_children - 1)); + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); + ASSERT(vd == vd->vdev_top); + + (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); + + if (list_link_active(&vd->vdev_state_dirty_node)) + vdev_state_clean(vd); + if (list_link_active(&vd->vdev_config_dirty_node)) + vdev_config_clean(vd); + + vdev_free(vd); + + if (last_vdev) { + vdev_compact_children(rvd); + } else { + vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); + vdev_add_child(rvd, vd); + } + vdev_config_dirty(rvd); + + /* + * Reassess the health of our root vdev. + */ + vdev_reopen(rvd); +} + /* * Remove a device from the pool. Currently, this supports removing only hot - * spares and level 2 ARC devices. + * spares, slogs, and level 2 ARC devices. 
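/*
 * Illustrative sketch only, not part of the patch: spa_vdev_remove() below
 * can now be reached both from the ioctl path (no locks held) and from
 * internal callers, such as the spare-detach path, that already hold
 * spa_namespace_lock.  This hypothetical helper isolates the conditional
 * locking idiom it uses to support both callers.
 */
static int
example_conditional_vdev_op(spa_t *spa)
{
	boolean_t locked = MUTEX_HELD(&spa_namespace_lock);
	uint64_t txg = 0;
	int error = 0;

	if (!locked)
		txg = spa_vdev_enter(spa);

	/* ... work that requires the namespace lock goes here ... */

	if (!locked)
		return (spa_vdev_exit(spa, NULL, txg, error));
	return (error);
}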
*/ int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) { vdev_t *vd; + metaslab_group_t *mg; nvlist_t **spares, **l2cache, *nv; + uint64_t txg = 0; uint_t nspares, nl2cache; - uint64_t txg; int error = 0; + boolean_t locked = MUTEX_HELD(&spa_namespace_lock); - txg = spa_vdev_enter(spa); + if (!locked) + txg = spa_vdev_enter(spa); vd = spa_lookup_by_guid(spa, guid, B_FALSE); @@ -3290,6 +4478,49 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); spa_load_l2cache(spa); spa->spa_l2cache.sav_sync = B_TRUE; + } else if (vd != NULL && vd->vdev_islog) { + ASSERT(!locked); + ASSERT(vd == vd->vdev_top); + + /* + * XXX - Once we have bp-rewrite this should + * become the common case. + */ + + mg = vd->vdev_mg; + + /* + * Stop allocating from this vdev. + */ + metaslab_group_passivate(mg); + + /* + * Wait for the youngest allocations and frees to sync, + * and then wait for the deferral of those frees to finish. + */ + spa_vdev_config_exit(spa, NULL, + txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); + + /* + * Attempt to evacuate the vdev. + */ + error = spa_vdev_remove_evacuate(spa, vd); + + txg = spa_vdev_config_enter(spa); + + /* + * If we couldn't evacuate the vdev, unwind. + */ + if (error) { + metaslab_group_activate(mg); + return (spa_vdev_exit(spa, NULL, txg, error)); + } + + /* + * Clean up the vdev namespace. + */ + spa_vdev_remove_from_namespace(spa, vd); + } else if (vd != NULL) { /* * Normal vdevs cannot be removed (yet). @@ -3302,7 +4533,10 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) error = ENOENT; } - return (spa_vdev_exit(spa, NULL, txg, error)); + if (!locked) + return (spa_vdev_exit(spa, NULL, txg, error)); + + return (error); } /* @@ -3313,9 +4547,8 @@ static vdev_t * spa_vdev_resilver_done_hunt(vdev_t *vd) { vdev_t *newvd, *oldvd; - int c; - for (c = 0; c < vd->vdev_children; c++) { + for (int c = 0; c < vd->vdev_children; c++) { oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); if (oldvd != NULL) return (oldvd); @@ -3328,13 +4561,9 @@ spa_vdev_resilver_done_hunt(vdev_t *vd) oldvd = vd->vdev_child[0]; newvd = vd->vdev_child[1]; - mutex_enter(&newvd->vdev_dtl_lock); - if (newvd->vdev_dtl_map.sm_space == 0 && - newvd->vdev_dtl_scrub.sm_space == 0) { - mutex_exit(&newvd->vdev_dtl_lock); + if (vdev_dtl_empty(newvd, DTL_MISSING) && + !vdev_dtl_required(oldvd)) return (oldvd); - } - mutex_exit(&newvd->vdev_dtl_lock); } /* @@ -3344,15 +4573,12 @@ spa_vdev_resilver_done_hunt(vdev_t *vd) newvd = vd->vdev_child[0]; oldvd = vd->vdev_child[1]; - mutex_enter(&newvd->vdev_dtl_lock); if (newvd->vdev_unspare && - newvd->vdev_dtl_map.sm_space == 0 && - newvd->vdev_dtl_scrub.sm_space == 0) { + vdev_dtl_empty(newvd, DTL_MISSING) && + !vdev_dtl_required(oldvd)) { newvd->vdev_unspare = 0; - mutex_exit(&newvd->vdev_dtl_lock); return (oldvd); } - mutex_exit(&newvd->vdev_dtl_lock); } return (NULL); @@ -3361,90 +4587,78 @@ spa_vdev_resilver_done_hunt(vdev_t *vd) static void spa_vdev_resilver_done(spa_t *spa) { - vdev_t *vd; - vdev_t *pvd; - uint64_t guid; - uint64_t pguid = 0; + vdev_t *vd, *pvd, *ppvd; + uint64_t guid, sguid, pguid, ppguid; - spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { + pvd = vd->vdev_parent; + ppvd = pvd->vdev_parent; guid = vd->vdev_guid; + pguid = pvd->vdev_guid; + ppguid = ppvd->vdev_guid; + sguid = 0; /* * If we have just finished replacing a 
hot spared device, then * we need to detach the parent's first child (the original hot * spare) as well. */ - pvd = vd->vdev_parent; - if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops && - pvd->vdev_id == 0) { + if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0) { ASSERT(pvd->vdev_ops == &vdev_replacing_ops); - ASSERT(pvd->vdev_parent->vdev_children == 2); - pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid; + ASSERT(ppvd->vdev_children == 2); + sguid = ppvd->vdev_child[1]->vdev_guid; } - spa_config_exit(spa, SCL_CONFIG, FTAG); - if (spa_vdev_detach(spa, guid, B_TRUE) != 0) + spa_config_exit(spa, SCL_ALL, FTAG); + if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) return; - if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0) + if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) return; - spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); } - spa_config_exit(spa, SCL_CONFIG, FTAG); + spa_config_exit(spa, SCL_ALL, FTAG); } /* - * Update the stored path for this vdev. Dirty the vdev configuration, relying - * on spa_vdev_enter/exit() to synchronize the labels and cache. + * Update the stored path or FRU for this vdev. */ int -spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) +spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, + boolean_t ispath) { vdev_t *vd; - uint64_t txg; - txg = spa_vdev_enter(spa); - - if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) { - /* - * Determine if this is a reference to a hot spare device. If - * it is, update the path manually as there is no associated - * vdev_t that can be synced to disk. - */ - nvlist_t **spares; - uint_t i, nspares; - - if (spa->spa_spares.sav_config != NULL) { - VERIFY(nvlist_lookup_nvlist_array( - spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, - &spares, &nspares) == 0); - for (i = 0; i < nspares; i++) { - uint64_t theguid; - VERIFY(nvlist_lookup_uint64(spares[i], - ZPOOL_CONFIG_GUID, &theguid) == 0); - if (theguid == guid) { - VERIFY(nvlist_add_string(spares[i], - ZPOOL_CONFIG_PATH, newpath) == 0); - spa_load_spares(spa); - spa->spa_spares.sav_sync = B_TRUE; - return (spa_vdev_exit(spa, NULL, txg, - 0)); - } - } - } + spa_vdev_state_enter(spa, SCL_ALL); - return (spa_vdev_exit(spa, NULL, txg, ENOENT)); - } + if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) + return (spa_vdev_state_exit(spa, NULL, ENOENT)); if (!vd->vdev_ops->vdev_op_leaf) - return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); + return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); + + if (ispath) { + spa_strfree(vd->vdev_path); + vd->vdev_path = spa_strdup(value); + } else { + if (vd->vdev_fru != NULL) + spa_strfree(vd->vdev_fru); + vd->vdev_fru = spa_strdup(value); + } - spa_strfree(vd->vdev_path); - vd->vdev_path = spa_strdup(newpath); + return (spa_vdev_state_exit(spa, vd, 0)); +} - vdev_config_dirty(vd->vdev_top); +int +spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) +{ + return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); +} - return (spa_vdev_exit(spa, NULL, txg, 0)); +int +spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) +{ + return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); } /* @@ -3497,7 +4711,17 @@ spa_async_remove(spa_t *spa, vdev_t *vd) if (vd->vdev_remove_wanted) { vd->vdev_remove_wanted = 0; vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); - vdev_clear(spa, vd); + + /* + * We want to clear the stats, but we don't want to do a full + * vdev_clear() as that will cause us to 
throw away + * degraded/faulted state as well as attempt to reopen the + * device, all of which is a waste. + */ + vd->vdev_stat.vs_read_errors = 0; + vd->vdev_stat.vs_write_errors = 0; + vd->vdev_stat.vs_checksum_errors = 0; + vdev_state_dirty(vd->vdev_top); } @@ -3517,6 +4741,37 @@ spa_async_probe(spa_t *spa, vdev_t *vd) spa_async_probe(spa, vd->vdev_child[c]); } +static void +spa_async_autoexpand(spa_t *spa, vdev_t *vd) +{ + sysevent_id_t eid; + nvlist_t *attr; + char *physpath; + + if (!spa->spa_autoexpand) + return; + + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + spa_async_autoexpand(spa, cvd); + } + + if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) + return; + + physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP); + (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath); + + VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0); + + (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS, + ESC_DEV_DLE, attr, &eid, DDI_SLEEP); + + nvlist_free(attr); + kmem_free(physpath, MAXPATHLEN); +} + static void spa_async_thread(spa_t *spa) { @@ -3533,16 +4788,31 @@ spa_async_thread(spa_t *spa) * See if the config needs to be updated. */ if (tasks & SPA_ASYNC_CONFIG_UPDATE) { + uint64_t old_space, new_space; + mutex_enter(&spa_namespace_lock); + old_space = metaslab_class_get_space(spa_normal_class(spa)); spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); + new_space = metaslab_class_get_space(spa_normal_class(spa)); mutex_exit(&spa_namespace_lock); + + /* + * If the pool grew as a result of the config update, + * then log an internal history event. + */ + if (new_space != old_space) { + spa_history_internal_log(LOG_POOL_VDEV_ONLINE, + spa, NULL, CRED(), + "pool '%s' size: %llu(+%llu)", + spa_name(spa), new_space, new_space - old_space); + } } /* * See if any devices need to be marked REMOVED. */ if (tasks & SPA_ASYNC_REMOVE) { - spa_vdev_state_enter(spa); + spa_vdev_state_enter(spa, SCL_NONE); spa_async_remove(spa, spa->spa_root_vdev); for (int i = 0; i < spa->spa_l2cache.sav_count; i++) spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); @@ -3551,11 +4821,17 @@ spa_async_thread(spa_t *spa) (void) spa_vdev_state_exit(spa, NULL, 0); } + if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + spa_async_autoexpand(spa, spa->spa_root_vdev); + spa_config_exit(spa, SCL_CONFIG, FTAG); + } + /* * See if any devices need to be probed. 
 	 */
 	if (tasks & SPA_ASYNC_PROBE) {
-		spa_vdev_state_enter(spa);
+		spa_vdev_state_enter(spa, SCL_NONE);
 		spa_async_probe(spa, spa->spa_root_vdev);
 		(void) spa_vdev_state_exit(spa, NULL, 0);
 	}
@@ -3626,38 +4902,34 @@ spa_async_request(spa_t *spa, int task)
  * SPA syncing routines
  * ==========================================================================
  */
-
 static void
-spa_sync_deferred_frees(spa_t *spa, uint64_t txg)
+spa_sync_deferred_bplist(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx, uint64_t txg)
 {
-	bplist_t *bpl = &spa->spa_sync_bplist;
-	dmu_tx_t *tx;
 	blkptr_t blk;
 	uint64_t itor = 0;
-	zio_t *zio;
-	int error;
 	uint8_t c = 1;
 
-	zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
-
 	while (bplist_iterate(bpl, &itor, &blk) == 0) {
 		ASSERT(blk.blk_birth < txg);
-		zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL,
-		    ZIO_FLAG_MUSTSUCCEED));
+		zio_free(spa, txg, &blk);
 	}
 
-	error = zio_wait(zio);
-	ASSERT3U(error, ==, 0);
-
-	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
 	bplist_vacate(bpl, tx);
 
 	/*
 	 * Pre-dirty the first block so we sync to convergence faster.
 	 * (Usually only the first block is needed.)
 	 */
-	dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx);
-	dmu_tx_commit(tx);
+	dmu_write(bpl->bpl_mos, spa->spa_deferred_bplist_obj, 0, 1, &c, tx);
+}
+
+static void
+spa_sync_free(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+	zio_t *zio = arg;
+
+	zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp,
+	    zio->io_flags));
 }
 
 static void
@@ -3775,7 +5047,6 @@ spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 	zpool_prop_t prop;
 	const char *propname;
 	zprop_type_t proptype;
-	spa_config_dirent_t *dp;
 
 	mutex_enter(&spa->spa_props_lock);
 
@@ -3808,31 +5079,14 @@ spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 		case ZPOOL_PROP_CACHEFILE:
 			/*
-			 * 'cachefile' is a non-persistent property, but note
-			 * an async request that the config cache needs to be
-			 * udpated.
+			 * 'cachefile' is also a non-persistent property.
 			 */
-			VERIFY(nvpair_value_string(elem, &strval) == 0);
-
-			dp = kmem_alloc(sizeof (spa_config_dirent_t), KM_SLEEP);
-
-			if (strval[0] == '\0')
-				dp->scd_path = spa_strdup(spa_config_path);
-			else if (strcmp(strval, "none") == 0)
-				dp->scd_path = NULL;
-			else
-				dp->scd_path = spa_strdup(strval);
-
-			list_insert_head(&spa->spa_config_list, dp);
-			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
 			break;
 		default:
 			/*
 			 * Set pool property values in the poolprops
 			 * mos object.
 			 */
 			if (spa->spa_pool_props_object == 0) {
-				objset_t *mos = spa->spa_meta_objset;
-
 				VERIFY((spa->spa_pool_props_object =
 				    zap_create(mos, DMU_OT_POOL_PROPS,
 				    DMU_OT_NONE, 0, tx)) > 0);
@@ -3879,6 +5133,13 @@ spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 			case ZPOOL_PROP_FAILUREMODE:
 				spa->spa_failmode = intval;
 				break;
+			case ZPOOL_PROP_AUTOEXPAND:
+				spa->spa_autoexpand = intval;
+				spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
+				break;
+			case ZPOOL_PROP_DEDUPDITTO:
+				spa->spa_dedup_ditto = intval;
+				break;
 			default:
 				break;
 			}
@@ -3905,11 +5166,11 @@ spa_sync(spa_t *spa, uint64_t txg)
 {
 	dsl_pool_t *dp = spa->spa_dsl_pool;
 	objset_t *mos = spa->spa_meta_objset;
-	bplist_t *bpl = &spa->spa_sync_bplist;
+	bplist_t *defer_bpl = &spa->spa_deferred_bplist;
+	bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
 	vdev_t *rvd = spa->spa_root_vdev;
 	vdev_t *vd;
 	dmu_tx_t *tx;
-	int dirty_vdevs;
 	int error;
 
 	/*
@@ -3925,13 +5186,26 @@ spa_sync(spa_t *spa, uint64_t txg)
 	 * into config changes that go out with this transaction group.
*/ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); - while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { - vdev_state_clean(vd); - vdev_config_dirty(vd); + while (list_head(&spa->spa_state_dirty_list) != NULL) { + /* + * We need the write lock here because, for aux vdevs, + * calling vdev_config_dirty() modifies sav_config. + * This is ugly and will become unnecessary when we + * eliminate the aux vdev wart by integrating all vdevs + * into the root vdev tree. + */ + spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); + spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); + while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { + vdev_state_clean(vd); + vdev_config_dirty(vd); + } + spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); + spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); } spa_config_exit(spa, SCL_STATE, FTAG); - VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj)); + VERIFY(0 == bplist_open(defer_bpl, mos, spa->spa_deferred_bplist_obj)); tx = dmu_tx_create_assigned(dp, txg); @@ -3977,13 +5251,13 @@ spa_sync(spa_t *spa, uint64_t txg) if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || !txg_list_empty(&dp->dp_dirty_dirs, txg) || !txg_list_empty(&dp->dp_sync_tasks, txg)) - spa_sync_deferred_frees(spa, txg); + spa_sync_deferred_bplist(spa, defer_bpl, tx, txg); /* * Iterate to convergence. */ do { - spa->spa_sync_pass++; + int pass = ++spa->spa_sync_pass; spa_sync_config_object(spa, tx); spa_sync_aux_dev(spa, &spa->spa_spares, tx, @@ -3993,18 +5267,29 @@ spa_sync(spa_t *spa, uint64_t txg) spa_errlog_sync(spa, txg); dsl_pool_sync(dp, txg); - dirty_vdevs = 0; - while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) { - vdev_sync(vd, txg); - dirty_vdevs++; + if (pass <= SYNC_PASS_DEFERRED_FREE) { + zio_t *zio = zio_root(spa, NULL, NULL, 0); + bplist_sync(free_bpl, spa_sync_free, zio, tx); + VERIFY(zio_wait(zio) == 0); + } else { + bplist_sync(free_bpl, bplist_enqueue_cb, defer_bpl, tx); } - bplist_sync(bpl, tx); - } while (dirty_vdevs); + ddt_sync(spa, txg); + + mutex_enter(&spa->spa_scrub_lock); + while (spa->spa_scrub_inflight > 0) + cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); + mutex_exit(&spa->spa_scrub_lock); + + while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) + vdev_sync(vd, txg); + + } while (dmu_objset_is_dirty(mos, txg)); - bplist_close(bpl); + ASSERT(free_bpl->bpl_queue == NULL); - dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass); + bplist_close(defer_bpl); /* * Rewrite the vdev configuration (which includes the uberblock) @@ -4027,9 +5312,8 @@ spa_sync(spa_t *spa, uint64_t txg) int svdcount = 0; int children = rvd->vdev_children; int c0 = spa_get_random(children); - int c; - for (c = 0; c < children; c++) { + for (int c = 0; c < children; c++) { vd = rvd->vdev_child[(c0 + c) % children]; if (vd->vdev_ms_array == 0 || vd->vdev_islog) continue; @@ -4037,10 +5321,16 @@ spa_sync(spa_t *spa, uint64_t txg) if (svdcount == SPA_DVAS_PER_BP) break; } - error = vdev_config_sync(svd, svdcount, txg); + error = vdev_config_sync(svd, svdcount, txg, B_FALSE); + if (error != 0) + error = vdev_config_sync(svd, svdcount, txg, + B_TRUE); } else { error = vdev_config_sync(rvd->vdev_child, - rvd->vdev_children, txg); + rvd->vdev_children, txg, B_FALSE); + if (error != 0) + error = vdev_config_sync(rvd->vdev_child, + rvd->vdev_children, txg, B_TRUE); } spa_config_exit(spa, SCL_STATE, FTAG); @@ -4070,10 +5360,7 @@ spa_sync(spa_t *spa, uint64_t txg) spa->spa_ubsync = spa->spa_uberblock; - /* - * Clean up the 
ZIL records for the synced txg. - */ - dsl_pool_zil_clean(dp); + dsl_pool_sync_done(dp, txg); /* * Update usable space statistics. @@ -4081,6 +5368,8 @@ spa_sync(spa_t *spa, uint64_t txg) while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) vdev_sync_done(vd, txg); + spa_update_dspace(spa); + /* * It had better be the case that we didn't dirty anything * since vdev_config_sync(). @@ -4088,10 +5377,15 @@ spa_sync(spa_t *spa, uint64_t txg) ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); - ASSERT(bpl->bpl_queue == NULL); + ASSERT(defer_bpl->bpl_queue == NULL); + ASSERT(free_bpl->bpl_queue == NULL); + + spa->spa_sync_pass = 0; spa_config_exit(spa, SCL_CONFIG, FTAG); + spa_handle_ignored_writes(spa); + /* * If any async tasks have been requested, kick them off. */ @@ -4161,7 +5455,7 @@ spa_evict_all(void) } vdev_t * -spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t l2cache) +spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) { vdev_t *vd; int i; @@ -4169,12 +5463,18 @@ spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t l2cache) if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) return (vd); - if (l2cache) { + if (aux) { for (i = 0; i < spa->spa_l2cache.sav_count; i++) { vd = spa->spa_l2cache.sav_vdevs[i]; if (vd->vdev_guid == guid) return (vd); } + + for (i = 0; i < spa->spa_spares.sav_count; i++) { + vd = spa->spa_spares.sav_vdevs[i]; + if (vd->vdev_guid == guid) + return (vd); + } } return (NULL); diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/spa_config.c b/external/cddl/osnet/dist/uts/common/fs/zfs/spa_config.c index ee425a91694f2..68a40bec89bec 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/spa_config.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/spa_config.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -36,6 +36,7 @@ #include #ifdef _KERNEL #include +#include #endif /* @@ -74,7 +75,6 @@ spa_config_load(void) void *buf = NULL; nvlist_t *nvlist, *child; nvpair_t *nvpair; - spa_t *spa; char *pathname; struct _buf *file; uint64_t fsize; @@ -118,7 +118,6 @@ spa_config_load(void) mutex_enter(&spa_namespace_lock); nvpair = NULL; while ((nvpair = nvlist_next_nvpair(nvlist, nvpair)) != NULL) { - if (nvpair_type(nvpair) != DATA_TYPE_NVLIST) continue; @@ -126,13 +125,7 @@ spa_config_load(void) if (spa_lookup(nvpair_name(nvpair)) != NULL) continue; - spa = spa_add(nvpair_name(nvpair), NULL); - - /* - * We blindly duplicate the configuration here. If it's - * invalid, we will catch it when the pool is first opened. - */ - VERIFY(nvlist_dup(child, &spa->spa_config, 0) == 0); + (void) spa_add(nvpair_name(nvpair), child, NULL); } mutex_exit(&spa_namespace_lock); @@ -208,6 +201,9 @@ spa_config_sync(spa_t *target, boolean_t removing, boolean_t postsysevent) ASSERT(MUTEX_HELD(&spa_namespace_lock)); + if (rootdir == NULL || !(spa_mode_global & FWRITE)) + return; + /* * Iterate over all cachefiles for the pool, past or present. 
When the * cachefile is changed, the new one is pushed onto this list, allowing @@ -309,6 +305,24 @@ spa_config_set(spa_t *spa, nvlist_t *config) mutex_exit(&spa->spa_props_lock); } +/* Add discovered rewind info, if any to the provided nvlist */ +void +spa_rewind_data_to_nvlist(spa_t *spa, nvlist_t *tonvl) +{ + int64_t loss = 0; + + if (tonvl == NULL || spa->spa_load_txg == 0) + return; + + VERIFY(nvlist_add_uint64(tonvl, ZPOOL_CONFIG_LOAD_TIME, + spa->spa_load_txg_ts) == 0); + if (spa->spa_last_ubsync_txg) + loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts; + VERIFY(nvlist_add_int64(tonvl, ZPOOL_CONFIG_REWIND_TIME, loss) == 0); + VERIFY(nvlist_add_uint64(tonvl, ZPOOL_CONFIG_LOAD_DATA_ERRORS, + spa->spa_load_data_errors) == 0); +} + /* * Generate the pool's configuration based on the current in-core state. * We infer whether to generate a complete config or just one top-level config @@ -321,6 +335,7 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) vdev_t *rvd = spa->spa_root_vdev; unsigned long hostid = 0; boolean_t locked = B_FALSE; + uint64_t split_guid; if (vd == NULL) { vd = rvd; @@ -349,7 +364,15 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) txg) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, spa_guid(spa)) == 0); +#ifdef _KERNEL + hostid = zone_get_hostid(NULL); +#else /* _KERNEL */ + /* + * We're emulating the system's hostid in userland, so we can't use + * zone_get_hostid(). + */ (void) ddi_strtoul(hw_serial, NULL, 10, &hostid); +#endif /* _KERNEL */ if (hostid != 0) { VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID, hostid) == 0); @@ -369,36 +392,79 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_IS_LOG, 1ULL) == 0); vd = vd->vdev_top; /* label contains top config */ + } else { + /* + * Only add the (potentially large) split information + * in the mos config, and not in the vdev labels + */ + if (spa->spa_config_splitting != NULL) + VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_SPLIT, + spa->spa_config_splitting) == 0); + } + + /* + * Add the top-level config. We even add this on pools which + * don't support holes in the namespace as older pools will + * just ignore it. + */ + vdev_top_config_generate(spa, config); + + /* + * If we're splitting, record the original pool's guid. 
+ */ + if (spa->spa_config_splitting != NULL && + nvlist_lookup_uint64(spa->spa_config_splitting, + ZPOOL_CONFIG_SPLIT_GUID, &split_guid) == 0) { + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_SPLIT_GUID, + split_guid) == 0); } nvroot = vdev_config_generate(spa, vd, getstats, B_FALSE, B_FALSE); VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); nvlist_free(nvroot); + if (getstats && spa_load_state(spa) == SPA_LOAD_NONE) { + ddt_histogram_t *ddh; + ddt_stat_t *dds; + ddt_object_t *ddo; + + ddh = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP); + ddt_get_dedup_histogram(spa, ddh); + VERIFY(nvlist_add_uint64_array(config, + ZPOOL_CONFIG_DDT_HISTOGRAM, + (uint64_t *)ddh, sizeof (*ddh) / sizeof (uint64_t)) == 0); + kmem_free(ddh, sizeof (ddt_histogram_t)); + + ddo = kmem_zalloc(sizeof (ddt_object_t), KM_SLEEP); + ddt_get_dedup_object_stats(spa, ddo); + VERIFY(nvlist_add_uint64_array(config, + ZPOOL_CONFIG_DDT_OBJ_STATS, + (uint64_t *)ddo, sizeof (*ddo) / sizeof (uint64_t)) == 0); + kmem_free(ddo, sizeof (ddt_object_t)); + + dds = kmem_zalloc(sizeof (ddt_stat_t), KM_SLEEP); + ddt_get_dedup_stats(spa, dds); + VERIFY(nvlist_add_uint64_array(config, + ZPOOL_CONFIG_DDT_STATS, + (uint64_t *)dds, sizeof (*dds) / sizeof (uint64_t)) == 0); + kmem_free(dds, sizeof (ddt_stat_t)); + } + + spa_rewind_data_to_nvlist(spa, config); + if (locked) spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (config); } -/* - * For a pool that's not currently a booting rootpool, update all disk labels, - * generate a fresh config based on the current in-core state, and sync the - * global config cache. - */ -void -spa_config_update(spa_t *spa, int what) -{ - spa_config_update_common(spa, what, FALSE); -} - /* * Update all disk labels, generate a fresh config based on the current * in-core state, and sync the global config cache (do not sync the config * cache if this is a booting rootpool). */ void -spa_config_update_common(spa_t *spa, int what, boolean_t isroot) +spa_config_update(spa_t *spa, int what) { vdev_t *rvd = spa->spa_root_vdev; uint64_t txg; @@ -420,10 +486,9 @@ spa_config_update_common(spa_t *spa, int what, boolean_t isroot) */ for (c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; - if (tvd->vdev_ms_array == 0) { - vdev_init(tvd, txg); - vdev_config_dirty(tvd); - } + if (tvd->vdev_ms_array == 0) + vdev_metaslab_set_size(tvd); + vdev_expand(tvd, txg); } } spa_config_exit(spa, SCL_ALL, FTAG); @@ -436,9 +501,9 @@ spa_config_update_common(spa_t *spa, int what, boolean_t isroot) /* * Update the global config cache to reflect the new mosconfig. */ - if (!isroot) + if (!spa->spa_is_root) spa_config_sync(spa, B_FALSE, what != SPA_CONFIG_UPDATE_POOL); if (what == SPA_CONFIG_UPDATE_POOL) - spa_config_update_common(spa, SPA_CONFIG_UPDATE_VDEVS, isroot); + spa_config_update(spa, SPA_CONFIG_UPDATE_VDEVS); } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/spa_errlog.c b/external/cddl/osnet/dist/uts/common/fs/zfs/spa_errlog.c index c642bd768b497..4c834e2d4e0a0 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/spa_errlog.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/spa_errlog.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Routines to manage the on-disk persistent error log. 
* @@ -60,9 +58,8 @@ * This is a stripped-down version of strtoull, suitable only for converting * lowercase hexidecimal numbers that don't overflow. */ -#ifdef _KERNEL -static uint64_t -strtonum(char *str, char **nptr) +uint64_t +strtonum(const char *str, char **nptr) { uint64_t val = 0; char c; @@ -82,11 +79,11 @@ strtonum(char *str, char **nptr) str++; } - *nptr = str; + if (nptr) + *nptr = (char *)str; return (val); } -#endif /* * Convert a bookmark to a string. @@ -135,7 +132,7 @@ spa_log_error(spa_t *spa, zio_t *zio) * If we are trying to import a pool, ignore any errors, as we won't be * writing to the pool any time soon. */ - if (spa->spa_load_state == SPA_LOAD_TRYIMPORT) + if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT) return; mutex_enter(&spa->spa_errlist_lock); diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/spa_history.c b/external/cddl/osnet/dist/uts/common/fs/zfs/spa_history.c index c997240c148f2..b713d66ee9040 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/spa_history.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/spa_history.c @@ -20,12 +20,10 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -105,7 +103,8 @@ spa_history_create_obj(spa_t *spa, dmu_tx_t *tx) * Figure out maximum size of history log. We set it at * 1% of pool size, with a max of 32MB and min of 128KB. */ - shpp->sh_phys_max_off = spa_get_dspace(spa) / 100; + shpp->sh_phys_max_off = + metaslab_class_get_dspace(spa_normal_class(spa)) / 100; shpp->sh_phys_max_off = MIN(shpp->sh_phys_max_off, 32<<20); shpp->sh_phys_max_off = MAX(shpp->sh_phys_max_off, 128<<10); @@ -127,12 +126,12 @@ spa_history_advance_bof(spa_t *spa, spa_history_phys_t *shpp) firstread = MIN(sizeof (reclen), shpp->sh_phys_max_off - phys_bof); if ((err = dmu_read(mos, spa->spa_history, phys_bof, firstread, - buf)) != 0) + buf, DMU_READ_PREFETCH)) != 0) return (err); if (firstread != sizeof (reclen)) { if ((err = dmu_read(mos, spa->spa_history, shpp->sh_pool_create_len, sizeof (reclen) - firstread, - buf + firstread)) != 0) + buf + firstread, DMU_READ_PREFETCH)) != 0) return (err); } @@ -380,10 +379,11 @@ spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf) return (0); } - err = dmu_read(mos, spa->spa_history, phys_read_off, read_len, buf); + err = dmu_read(mos, spa->spa_history, phys_read_off, read_len, buf, + DMU_READ_PREFETCH); if (leftover && err == 0) { err = dmu_read(mos, spa->spa_history, shpp->sh_pool_create_len, - leftover, buf + read_len); + leftover, buf + read_len, DMU_READ_PREFETCH); } mutex_exit(&spa->spa_history_lock); @@ -391,13 +391,12 @@ spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf) return (err); } -void -spa_history_internal_log(history_internal_events_t event, spa_t *spa, - dmu_tx_t *tx, cred_t *cr, const char *fmt, ...) 
+static void +log_internal(history_internal_events_t event, spa_t *spa, + dmu_tx_t *tx, cred_t *cr, const char *fmt, va_list adx) { history_arg_t *hap; char *str; - va_list adx; /* * If this is part of creating a pool, not everything is @@ -409,9 +408,7 @@ spa_history_internal_log(history_internal_events_t event, spa_t *spa, hap = kmem_alloc(sizeof (history_arg_t), KM_SLEEP); str = kmem_alloc(HIS_MAX_RECORD_LEN, KM_SLEEP); - va_start(adx, fmt); (void) vsnprintf(str, HIS_MAX_RECORD_LEN, fmt, adx); - va_end(adx); hap->ha_log_type = LOG_INTERNAL; hap->ha_history_str = str; @@ -426,3 +423,48 @@ spa_history_internal_log(history_internal_events_t event, spa_t *spa, } /* spa_history_log_sync() will free hap and str */ } + +void +spa_history_internal_log(history_internal_events_t event, spa_t *spa, + dmu_tx_t *tx, cred_t *cr, const char *fmt, ...) +{ + dmu_tx_t *htx = tx; + va_list adx; + + /* create a tx if we didn't get one */ + if (tx == NULL) { + htx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + if (dmu_tx_assign(htx, TXG_WAIT) != 0) { + dmu_tx_abort(htx); + return; + } + } + + va_start(adx, fmt); + log_internal(event, spa, htx, cr, fmt, adx); + va_end(adx); + + /* if we didn't get a tx from the caller, commit the one we made */ + if (tx == NULL) + dmu_tx_commit(htx); +} + +void +spa_history_log_version(spa_t *spa, history_internal_events_t event) +{ +#ifdef _KERNEL + uint64_t current_vers = spa_version(spa); + + if (current_vers >= SPA_VERSION_ZPOOL_HISTORY) { + spa_history_internal_log(event, spa, NULL, CRED(), + "pool spa %llu; zfs spa %llu; zpl %d; uts %s %s %s %s", + (u_longlong_t)current_vers, SPA_VERSION, ZPL_VERSION, + utsname.nodename, utsname.release, utsname.version, + utsname.machine); + } + cmn_err(CE_CONT, "!%s version %llu pool %s using %llu", + event == LOG_POOL_IMPORT ? "imported" : + event == LOG_POOL_CREATE ? "created" : "accessed", + (u_longlong_t)current_vers, spa_name(spa), SPA_VERSION); +#endif +} diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/spa_misc.c b/external/cddl/osnet/dist/uts/common/fs/zfs/spa_misc.c index 36046e6df1c04..5a48dc6093a7a 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/spa_misc.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/spa_misc.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -43,8 +43,8 @@ #include #include #include -#include #include +#include #include "zfs_prop.h" /* @@ -186,7 +186,7 @@ * * SCL_VDEV * Held as reader to prevent changes to the vdev tree during trivial - * inquiries such as bp_get_dasize(). SCL_VDEV is distinct from the + * inquiries such as bp_get_dsize(). SCL_VDEV is distinct from the * other locks, and lower than all of them, to ensure that it's safe * to acquire regardless of caller context. 
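As a concrete illustration of the SCL_VDEV rule described above, a minimal sketch of a trivial vdev-tree inquiry (the helper name is hypothetical; bp_get_dsize(), added later in this patch, follows the same pattern):

static uint64_t
example_dva_top_ashift(spa_t *spa, const dva_t *dva)
{
	uint64_t ashift;

	/* Reader is sufficient, and safe to take in any caller context. */
	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
	ashift = vdev_lookup_top(spa, DVA_GET_VDEV(dva))->vdev_ashift;
	spa_config_exit(spa, SCL_VDEV, FTAG);

	return (ashift);
}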
* @@ -230,7 +230,7 @@ static kmutex_t spa_l2cache_lock; static avl_tree_t spa_l2cache_avl; kmem_cache_t *spa_buffer_pool; -int spa_mode; +int spa_mode_global; #ifdef ZFS_DEBUG /* Everything except dprintf is on by default in debug builds */ @@ -310,8 +310,12 @@ spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw) void spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw) { + int wlocks_held = 0; + for (int i = 0; i < SCL_LOCKS; i++) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; + if (scl->scl_writer == curthread) + wlocks_held |= (1 << i); if (!(locks & (1 << i))) continue; mutex_enter(&scl->scl_lock); @@ -331,6 +335,7 @@ spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw) (void) refcount_add(&scl->scl_count, tag); mutex_exit(&scl->scl_lock); } + ASSERT(wlocks_held <= locks); } void @@ -415,7 +420,7 @@ spa_lookup(const char *name) * exist by calling spa_lookup() first. */ spa_t * -spa_add(const char *name, const char *altroot) +spa_add(const char *name, nvlist_t *config, const char *altroot) { spa_t *spa; spa_config_dirent_t *dp; @@ -425,31 +430,37 @@ spa_add(const char *name, const char *altroot) spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP); mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa->spa_async_root_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa->spa_sync_bplist.bpl_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_proc_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL); - cv_init(&spa->spa_async_root_cv, NULL, CV_DEFAULT, NULL); + cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL); + for (int t = 0; t < TXG_SIZE; t++) + bplist_init(&spa->spa_free_bplist[t]); + bplist_init(&spa->spa_deferred_bplist); + (void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name)); spa->spa_state = POOL_STATE_UNINITIALIZED; spa->spa_freeze_txg = UINT64_MAX; spa->spa_final_txg = UINT64_MAX; + spa->spa_load_max_txg = UINT64_MAX; + spa->spa_proc = &p0; + spa->spa_proc_state = SPA_PROC_NONE; refcount_create(&spa->spa_refcount); spa_config_lock_init(spa); avl_add(&spa_namespace_avl, spa); - mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL); - /* * Set the alternate root, if there is one. 
*/ @@ -468,6 +479,9 @@ spa_add(const char *name, const char *altroot) dp->scd_path = spa_strdup(spa_config_path); list_insert_head(&spa->spa_config_list, dp); + if (config != NULL) + VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0); + return (spa); } @@ -484,6 +498,8 @@ spa_remove(spa_t *spa) ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); + nvlist_free(spa->spa_config_splitting); + avl_remove(&spa_namespace_avl, spa); cv_broadcast(&spa_namespace_cv); @@ -507,20 +523,24 @@ spa_remove(spa_t *spa) spa_config_lock_destroy(spa); + for (int t = 0; t < TXG_SIZE; t++) + bplist_fini(&spa->spa_free_bplist[t]); + bplist_fini(&spa->spa_deferred_bplist); + cv_destroy(&spa->spa_async_cv); - cv_destroy(&spa->spa_async_root_cv); + cv_destroy(&spa->spa_proc_cv); cv_destroy(&spa->spa_scrub_io_cv); cv_destroy(&spa->spa_suspend_cv); mutex_destroy(&spa->spa_async_lock); - mutex_destroy(&spa->spa_async_root_lock); - mutex_destroy(&spa->spa_scrub_lock); - mutex_destroy(&spa->spa_errlog_lock); mutex_destroy(&spa->spa_errlist_lock); - mutex_destroy(&spa->spa_sync_bplist.bpl_lock); + mutex_destroy(&spa->spa_errlog_lock); mutex_destroy(&spa->spa_history_lock); + mutex_destroy(&spa->spa_proc_lock); mutex_destroy(&spa->spa_props_lock); + mutex_destroy(&spa->spa_scrub_lock); mutex_destroy(&spa->spa_suspend_lock); + mutex_destroy(&spa->spa_vdev_top_lock); kmem_free(spa, sizeof (spa_t)); } @@ -814,12 +834,6 @@ spa_l2cache_activate(vdev_t *vd) mutex_exit(&spa_l2cache_lock); } -void -spa_l2cache_space_update(vdev_t *vd, int64_t space, int64_t alloc) -{ - vdev_space_update(vd, space, alloc, B_FALSE); -} - /* * ========================================================================== * SPA vdev locking @@ -834,7 +848,20 @@ spa_l2cache_space_update(vdev_t *vd, int64_t space, int64_t alloc) uint64_t spa_vdev_enter(spa_t *spa) { + mutex_enter(&spa->spa_vdev_top_lock); mutex_enter(&spa_namespace_lock); + return (spa_vdev_config_enter(spa)); +} + +/* + * Internal implementation for spa_vdev_enter(). Used when a vdev + * operation requires multiple syncs (i.e. removing a device) while + * keeping the spa_namespace_lock held. + */ +uint64_t +spa_vdev_config_enter(spa_t *spa) +{ + ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); @@ -842,14 +869,14 @@ spa_vdev_enter(spa_t *spa) } /* - * Unlock the spa_t after adding or removing a vdev. Besides undoing the - * locking of spa_vdev_enter(), we also want make sure the transactions have - * synced to disk, and then update the global configuration cache with the new - * information. + * Used in combination with spa_vdev_config_enter() to allow the syncing + * of multiple transactions without releasing the spa_namespace_lock. */ -int -spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error) +void +spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag) { + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + int config_changed = B_FALSE; ASSERT(txg > spa_last_synced_txg(spa)); @@ -867,10 +894,25 @@ spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error) if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) { dsl_pool_scrub_restart(spa->spa_dsl_pool); config_changed = B_TRUE; + spa->spa_config_generation++; } + /* + * Verify the metaslab classes. 
+ */ + ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0); + ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0); + spa_config_exit(spa, SCL_ALL, spa); + /* + * Panic the system if the specified tag requires it. This + * is useful for ensuring that configurations are updated + * transactionally. + */ + if (zio_injection_enabled) + zio_handle_panic_injection(spa, tag, 0); + /* * Note: this txg_wait_synced() is important because it ensures * that there won't be more than one config change per txg. @@ -880,8 +922,10 @@ spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error) txg_wait_synced(spa->spa_dsl_pool, txg); if (vd != NULL) { - ASSERT(!vd->vdev_detached || vd->vdev_dtl.smo_object == 0); + ASSERT(!vd->vdev_detached || vd->vdev_dtl_smo.smo_object == 0); + spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); vdev_free(vd); + spa_config_exit(spa, SCL_ALL, spa); } /* @@ -889,8 +933,20 @@ spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error) */ if (config_changed) spa_config_sync(spa, B_FALSE, B_TRUE); +} +/* + * Unlock the spa_t after adding or removing a vdev. Besides undoing the + * locking of spa_vdev_enter(), we also want make sure the transactions have + * synced to disk, and then update the global configuration cache with the new + * information. + */ +int +spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error) +{ + spa_vdev_config_exit(spa, vd, txg, error, FTAG); mutex_exit(&spa_namespace_lock); + mutex_exit(&spa->spa_vdev_top_lock); return (error); } @@ -899,18 +955,37 @@ spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error) * Lock the given spa_t for the purpose of changing vdev state. */ void -spa_vdev_state_enter(spa_t *spa) +spa_vdev_state_enter(spa_t *spa, int oplocks) { - spa_config_enter(spa, SCL_STATE_ALL, spa, RW_WRITER); + int locks = SCL_STATE_ALL | oplocks; + + spa_config_enter(spa, locks, spa, RW_WRITER); + spa->spa_vdev_locks = locks; } int spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error) { - if (vd != NULL) + if (vd != NULL || error == 0) + vdev_dtl_reassess(vd ? vd->vdev_top : spa->spa_root_vdev, + 0, 0, B_FALSE); + + if (vd != NULL) { vdev_state_dirty(vd->vdev_top); + spa->spa_config_generation++; + } + + ASSERT3U(spa->spa_vdev_locks, >=, SCL_STATE_ALL); + spa_config_exit(spa, spa->spa_vdev_locks, spa); - spa_config_exit(spa, SCL_STATE_ALL, spa); + /* + * If anything changed, wait for it to sync. This ensures that, + * from the system administrator's perspective, zpool(1M) commands + * are synchronous. This is important for things like zpool offline: + * when the command completes, you expect no further I/O from ZFS. 
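A sketch of the enter/exit pairing this comment refers to, as a hypothetical administrative operation might use it (SCL_NONE requests no locks beyond SCL_STATE_ALL; the actual state change is elided):

static int
example_vdev_state_op(spa_t *spa, uint64_t guid)
{
	vdev_t *vd;

	spa_vdev_state_enter(spa, SCL_NONE);

	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
		return (spa_vdev_state_exit(spa, NULL, ENODEV));

	/* ... change the per-vdev state here ... */

	/* Dirties the config and waits for the txg, so the op is synchronous. */
	return (spa_vdev_state_exit(spa, vd, 0));
}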
+ */ + if (vd != NULL) + txg_wait_synced(spa->spa_dsl_pool, 0); return (error); } @@ -1044,48 +1119,30 @@ spa_get_random(uint64_t range) return (r % range); } -void -sprintf_blkptr(char *buf, int len, const blkptr_t *bp) +uint64_t +spa_generate_guid(spa_t *spa) { - int d; + uint64_t guid = spa_get_random(-1ULL); - if (bp == NULL) { - (void) snprintf(buf, len, ""); - return; + if (spa != NULL) { + while (guid == 0 || spa_guid_exists(spa_guid(spa), guid)) + guid = spa_get_random(-1ULL); + } else { + while (guid == 0 || spa_guid_exists(guid, 0)) + guid = spa_get_random(-1ULL); } - if (BP_IS_HOLE(bp)) { - (void) snprintf(buf, len, ""); - return; - } + return (guid); +} - (void) snprintf(buf, len, "[L%llu %s] %llxL/%llxP ", - (u_longlong_t)BP_GET_LEVEL(bp), - dmu_ot[BP_GET_TYPE(bp)].ot_name, - (u_longlong_t)BP_GET_LSIZE(bp), - (u_longlong_t)BP_GET_PSIZE(bp)); - - for (d = 0; d < BP_GET_NDVAS(bp); d++) { - const dva_t *dva = &bp->blk_dva[d]; - (void) snprintf(buf + strlen(buf), len - strlen(buf), - "DVA[%d]=<%llu:%llx:%llx> ", d, - (u_longlong_t)DVA_GET_VDEV(dva), - (u_longlong_t)DVA_GET_OFFSET(dva), - (u_longlong_t)DVA_GET_ASIZE(dva)); - } +void +sprintf_blkptr(char *buf, const blkptr_t *bp) +{ + char *type = dmu_ot[BP_GET_TYPE(bp)].ot_name; + char *checksum = zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name; + char *compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name; - (void) snprintf(buf + strlen(buf), len - strlen(buf), - "%s %s %s %s birth=%llu fill=%llu cksum=%llx:%llx:%llx:%llx", - zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name, - zio_compress_table[BP_GET_COMPRESS(bp)].ci_name, - BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE", - BP_IS_GANG(bp) ? "gang" : "contiguous", - (u_longlong_t)bp->blk_birth, - (u_longlong_t)bp->blk_fill, - (u_longlong_t)bp->blk_cksum.zc_word[0], - (u_longlong_t)bp->blk_cksum.zc_word[1], - (u_longlong_t)bp->blk_cksum.zc_word[2], - (u_longlong_t)bp->blk_cksum.zc_word[3]); + SPRINTF_BLKPTR(snprintf, ' ', buf, bp, type, checksum, compress); } void @@ -1191,59 +1248,55 @@ spa_first_txg(spa_t *spa) return (spa->spa_first_txg); } +uint64_t +spa_syncing_txg(spa_t *spa) +{ + return (spa->spa_syncing_txg); +} + pool_state_t spa_state(spa_t *spa) { return (spa->spa_state); } -uint64_t -spa_freeze_txg(spa_t *spa) +spa_load_state_t +spa_load_state(spa_t *spa) { - return (spa->spa_freeze_txg); + return (spa->spa_load_state); } -/* - * Return how much space is allocated in the pool (ie. sum of all asize) - */ uint64_t -spa_get_alloc(spa_t *spa) +spa_freeze_txg(spa_t *spa) { - return (spa->spa_root_vdev->vdev_stat.vs_alloc); + return (spa->spa_freeze_txg); } -/* - * Return how much (raid-z inflated) space there is in the pool. - */ +/* ARGSUSED */ uint64_t -spa_get_space(spa_t *spa) +spa_get_asize(spa_t *spa, uint64_t lsize) { - return (spa->spa_root_vdev->vdev_stat.vs_space); + /* + * The worst case is single-sector max-parity RAID-Z blocks, in which + * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1) + * times the size; so just assume that. Add to this the fact that + * we can have up to 3 DVAs per bp, and one more factor of 2 because + * the block may be dittoed with up to 3 DVAs by ddt_sync(). + */ + return (lsize * (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2); } -/* - * Return the amount of raid-z-deflated space in the pool. 
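For the spa_get_asize() comment above, the worst-case factor works out as follows (assuming VDEV_RAIDZ_MAXPARITY == 3 and SPA_DVAS_PER_BP == 3, as defined elsewhere in this tree):

/*
 *	(VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2
 *	    = 4 * 3 * 2
 *	    = 24
 *
 * so a logically 128K write may reserve up to 24 * 128K = 3M of asize.
 */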
- */ uint64_t spa_get_dspace(spa_t *spa) { - if (spa->spa_deflate) - return (spa->spa_root_vdev->vdev_stat.vs_dspace); - else - return (spa->spa_root_vdev->vdev_stat.vs_space); + return (spa->spa_dspace); } -/* ARGSUSED */ -uint64_t -spa_get_asize(spa_t *spa, uint64_t lsize) +void +spa_update_dspace(spa_t *spa) { - /* - * For now, the worst case is 512-byte RAID-Z blocks, in which - * case the space requirement is exactly 2x; so just assume that. - * Add to this the fact that we can have up to 3 DVAs per bp, and - * we have to multiply by a total of 6x. - */ - return (lsize * 6); + spa->spa_dspace = metaslab_class_get_dspace(spa_normal_class(spa)) + + ddt_get_dedup_dspace(spa); } /* @@ -1268,6 +1321,24 @@ spa_version(spa_t *spa) return (spa->spa_ubsync.ub_version); } +boolean_t +spa_deflate(spa_t *spa) +{ + return (spa->spa_deflate); +} + +metaslab_class_t * +spa_normal_class(spa_t *spa) +{ + return (spa->spa_normal_class); +} + +metaslab_class_t * +spa_log_class(spa_t *spa) +{ + return (spa->spa_log_class); +} + int spa_max_replication(spa_t *spa) { @@ -1282,23 +1353,45 @@ spa_max_replication(spa_t *spa) } uint64_t -bp_get_dasize(spa_t *spa, const blkptr_t *bp) +dva_get_dsize_sync(spa_t *spa, const dva_t *dva) { - int sz = 0, i; + uint64_t asize = DVA_GET_ASIZE(dva); + uint64_t dsize = asize; - if (!spa->spa_deflate) - return (BP_GET_ASIZE(bp)); + ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); - spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); - for (i = 0; i < SPA_DVAS_PER_BP; i++) { - vdev_t *vd = - vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[i])); - if (vd) - sz += (DVA_GET_ASIZE(&bp->blk_dva[i]) >> - SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio; + if (asize != 0 && spa->spa_deflate) { + vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); + dsize = (asize >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio; } + + return (dsize); +} + +uint64_t +bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp) +{ + uint64_t dsize = 0; + + for (int d = 0; d < SPA_DVAS_PER_BP; d++) + dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]); + + return (dsize); +} + +uint64_t +bp_get_dsize(spa_t *spa, const blkptr_t *bp) +{ + uint64_t dsize = 0; + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + + for (int d = 0; d < SPA_DVAS_PER_BP; d++) + dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]); + spa_config_exit(spa, SCL_VDEV, FTAG); - return (sz); + + return (dsize); } /* @@ -1351,7 +1444,7 @@ spa_init(int mode) avl_create(&spa_l2cache_avl, spa_l2cache_compare, sizeof (spa_aux_t), offsetof(spa_aux_t, aux_avl)); - spa_mode = mode; + spa_mode_global = mode; refcount_init(); unique_init(); @@ -1400,11 +1493,56 @@ spa_has_slogs(spa_t *spa) return (spa->spa_log_class->mc_rotor != NULL); } -/* - * Return whether this pool is the root pool. 
- */ +spa_log_state_t +spa_get_log_state(spa_t *spa) +{ + return (spa->spa_log_state); +} + +void +spa_set_log_state(spa_t *spa, spa_log_state_t state) +{ + spa->spa_log_state = state; +} + boolean_t spa_is_root(spa_t *spa) { return (spa->spa_is_root); } + +boolean_t +spa_writeable(spa_t *spa) +{ + return (!!(spa->spa_mode & FWRITE)); +} + +int +spa_mode(spa_t *spa) +{ + return (spa->spa_mode); +} + +uint64_t +spa_bootfs(spa_t *spa) +{ + return (spa->spa_bootfs); +} + +uint64_t +spa_delegation(spa_t *spa) +{ + return (spa->spa_delegation); +} + +objset_t * +spa_meta_objset(spa_t *spa) +{ + return (spa->spa_meta_objset); +} + +enum zio_checksum +spa_dedup_checksum(spa_t *spa) +{ + return (spa->spa_dedup_checksum); +} diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/space_map.c b/external/cddl/osnet/dist/uts/common/fs/zfs/space_map.c index 0a1fd59eaba67..1ce7b2a3d4660 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/space_map.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/space_map.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -60,6 +58,8 @@ space_map_create(space_map_t *sm, uint64_t start, uint64_t size, uint8_t shift, { bzero(sm, sizeof (*sm)); + cv_init(&sm->sm_load_cv, NULL, CV_DEFAULT, NULL); + avl_create(&sm->sm_root, space_map_seg_compare, sizeof (space_seg_t), offsetof(struct space_seg, ss_node)); @@ -75,6 +75,7 @@ space_map_destroy(space_map_t *sm) ASSERT(!sm->sm_loaded && !sm->sm_loading); VERIFY3U(sm->sm_space, ==, 0); avl_destroy(&sm->sm_root); + cv_destroy(&sm->sm_load_cv); } void @@ -115,12 +116,23 @@ space_map_add(space_map_t *sm, uint64_t start, uint64_t size) if (merge_before && merge_after) { avl_remove(&sm->sm_root, ss_before); + if (sm->sm_pp_root) { + avl_remove(sm->sm_pp_root, ss_before); + avl_remove(sm->sm_pp_root, ss_after); + } ss_after->ss_start = ss_before->ss_start; kmem_free(ss_before, sizeof (*ss_before)); + ss = ss_after; } else if (merge_before) { ss_before->ss_end = end; + if (sm->sm_pp_root) + avl_remove(sm->sm_pp_root, ss_before); + ss = ss_before; } else if (merge_after) { ss_after->ss_start = start; + if (sm->sm_pp_root) + avl_remove(sm->sm_pp_root, ss_after); + ss = ss_after; } else { ss = kmem_alloc(sizeof (*ss), KM_SLEEP); ss->ss_start = start; @@ -128,6 +140,9 @@ space_map_add(space_map_t *sm, uint64_t start, uint64_t size) avl_insert(&sm->sm_root, ss, where); } + if (sm->sm_pp_root) + avl_add(sm->sm_pp_root, ss); + sm->sm_space += size; } @@ -162,12 +177,17 @@ space_map_remove(space_map_t *sm, uint64_t start, uint64_t size) left_over = (ss->ss_start != start); right_over = (ss->ss_end != end); + if (sm->sm_pp_root) + avl_remove(sm->sm_pp_root, ss); + if (left_over && right_over) { newseg = kmem_alloc(sizeof (*newseg), KM_SLEEP); newseg->ss_start = end; newseg->ss_end = ss->ss_end; ss->ss_end = start; avl_insert_here(&sm->sm_root, newseg, ss, AVL_AFTER); + if (sm->sm_pp_root) + avl_add(sm->sm_pp_root, newseg); } else if (left_over) { ss->ss_end = start; } else if (right_over) { @@ -175,12 +195,16 @@ space_map_remove(space_map_t *sm, uint64_t start, uint64_t size) } else { avl_remove(&sm->sm_root, ss); kmem_free(ss, sizeof (*ss)); + ss = NULL; } + if (sm->sm_pp_root && ss != NULL) + avl_add(sm->sm_pp_root, ss); + sm->sm_space -= size; } -int +boolean_t space_map_contains(space_map_t *sm, uint64_t start, 
uint64_t size) { avl_index_t where; @@ -220,59 +244,10 @@ space_map_walk(space_map_t *sm, space_map_func_t *func, space_map_t *mdest) { space_seg_t *ss; - for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss)) - func(mdest, ss->ss_start, ss->ss_end - ss->ss_start); -} - -void -space_map_excise(space_map_t *sm, uint64_t start, uint64_t size) -{ - avl_tree_t *t = &sm->sm_root; - avl_index_t where; - space_seg_t *ss, search; - uint64_t end = start + size; - uint64_t rm_start, rm_end; - ASSERT(MUTEX_HELD(sm->sm_lock)); - search.ss_start = start; - search.ss_end = start; - - for (;;) { - ss = avl_find(t, &search, &where); - - if (ss == NULL) - ss = avl_nearest(t, where, AVL_AFTER); - - if (ss == NULL || ss->ss_start >= end) - break; - - rm_start = MAX(ss->ss_start, start); - rm_end = MIN(ss->ss_end, end); - - space_map_remove(sm, rm_start, rm_end - rm_start); - } -} - -/* - * Replace smd with the union of smd and sms. - */ -void -space_map_union(space_map_t *smd, space_map_t *sms) -{ - avl_tree_t *t = &sms->sm_root; - space_seg_t *ss; - - ASSERT(MUTEX_HELD(smd->sm_lock)); - - /* - * For each source segment, remove any intersections with the - * destination, then add the source segment to the destination. - */ - for (ss = avl_first(t); ss != NULL; ss = AVL_NEXT(t, ss)) { - space_map_excise(smd, ss->ss_start, ss->ss_end - ss->ss_start); - space_map_add(smd, ss->ss_start, ss->ss_end - ss->ss_start); - } + for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss)) + func(mdest, ss->ss_start, ss->ss_end - ss->ss_start); } /* @@ -283,8 +258,10 @@ space_map_load_wait(space_map_t *sm) { ASSERT(MUTEX_HELD(sm->sm_lock)); - while (sm->sm_loading) + while (sm->sm_loading) { + ASSERT(!sm->sm_loaded); cv_wait(&sm->sm_load_cv, sm->sm_lock); + } } /* @@ -301,11 +278,8 @@ space_map_load(space_map_t *sm, space_map_ops_t *ops, uint8_t maptype, int error = 0; ASSERT(MUTEX_HELD(sm->sm_lock)); - - space_map_load_wait(sm); - - if (sm->sm_loaded) - return (0); + ASSERT(!sm->sm_loaded); + ASSERT(!sm->sm_loading); sm->sm_loading = B_TRUE; end = smo->smo_objsize; @@ -336,7 +310,8 @@ space_map_load(space_map_t *sm, space_map_ops_t *ops, uint8_t maptype, smo->smo_object, offset, size); mutex_exit(sm->sm_lock); - error = dmu_read(os, smo->smo_object, offset, size, entry_map); + error = dmu_read(os, smo->smo_object, offset, size, entry_map, + DMU_READ_PREFETCH); mutex_enter(sm->sm_lock); if (error != 0) break; @@ -389,6 +364,13 @@ space_map_unload(space_map_t *sm) space_map_vacate(sm, NULL, NULL); } +uint64_t +space_map_maxsize(space_map_t *sm) +{ + ASSERT(sm->sm_ops != NULL); + return (sm->sm_ops->smop_max(sm)); +} + uint64_t space_map_alloc(space_map_t *sm, uint64_t size) { @@ -504,3 +486,131 @@ space_map_truncate(space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx) smo->smo_objsize = 0; smo->smo_alloc = 0; } + +/* + * Space map reference trees. + * + * A space map is a collection of integers. Every integer is either + * in the map, or it's not. A space map reference tree generalizes + * the idea: it allows its members to have arbitrary reference counts, + * as opposed to the implicit reference count of 0 or 1 in a space map. + * This representation comes in handy when computing the union or + * intersection of multiple space maps. For example, the union of + * N space maps is the subset of the reference tree with refcnt >= 1. + * The intersection of N space maps is the subset with refcnt >= N. + * + * [It's very much like a Fourier transform. 
Unions and intersections + * are hard to perform in the 'space map domain', so we convert the maps + * into the 'reference count domain', where it's trivial, then invert.] + * + * vdev_dtl_reassess() uses computations of this form to determine + * DTL_MISSING and DTL_OUTAGE for interior vdevs -- e.g. a RAID-Z vdev + * has an outage wherever refcnt >= vdev_nparity + 1, and a mirror vdev + * has an outage wherever refcnt >= vdev_children. + */ +static int +space_map_ref_compare(const void *x1, const void *x2) +{ + const space_ref_t *sr1 = x1; + const space_ref_t *sr2 = x2; + + if (sr1->sr_offset < sr2->sr_offset) + return (-1); + if (sr1->sr_offset > sr2->sr_offset) + return (1); + + if (sr1 < sr2) + return (-1); + if (sr1 > sr2) + return (1); + + return (0); +} + +void +space_map_ref_create(avl_tree_t *t) +{ + avl_create(t, space_map_ref_compare, + sizeof (space_ref_t), offsetof(space_ref_t, sr_node)); +} + +void +space_map_ref_destroy(avl_tree_t *t) +{ + space_ref_t *sr; + void *cookie = NULL; + + while ((sr = avl_destroy_nodes(t, &cookie)) != NULL) + kmem_free(sr, sizeof (*sr)); + + avl_destroy(t); +} + +static void +space_map_ref_add_node(avl_tree_t *t, uint64_t offset, int64_t refcnt) +{ + space_ref_t *sr; + + sr = kmem_alloc(sizeof (*sr), KM_SLEEP); + sr->sr_offset = offset; + sr->sr_refcnt = refcnt; + + avl_add(t, sr); +} + +void +space_map_ref_add_seg(avl_tree_t *t, uint64_t start, uint64_t end, + int64_t refcnt) +{ + space_map_ref_add_node(t, start, refcnt); + space_map_ref_add_node(t, end, -refcnt); +} + +/* + * Convert (or add) a space map into a reference tree. + */ +void +space_map_ref_add_map(avl_tree_t *t, space_map_t *sm, int64_t refcnt) +{ + space_seg_t *ss; + + ASSERT(MUTEX_HELD(sm->sm_lock)); + + for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss)) + space_map_ref_add_seg(t, ss->ss_start, ss->ss_end, refcnt); +} + +/* + * Convert a reference tree into a space map. The space map will contain + * all members of the reference tree for which refcnt >= minref. + */ +void +space_map_ref_generate_map(avl_tree_t *t, space_map_t *sm, int64_t minref) +{ + uint64_t start = -1ULL; + int64_t refcnt = 0; + space_ref_t *sr; + + ASSERT(MUTEX_HELD(sm->sm_lock)); + + space_map_vacate(sm, NULL, NULL); + + for (sr = avl_first(t); sr != NULL; sr = AVL_NEXT(t, sr)) { + refcnt += sr->sr_refcnt; + if (refcnt >= minref) { + if (start == -1ULL) { + start = sr->sr_offset; + } + } else { + if (start != -1ULL) { + uint64_t end = sr->sr_offset; + ASSERT(start <= end); + if (end > start) + space_map_add(sm, start, end - start); + start = -1ULL; + } + } + } + ASSERT(refcnt == 0); + ASSERT(start == -1ULL); +} diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/arc.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/arc.h index 749bf53e5b5e8..c528fac1a6466 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/arc.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/arc.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -68,12 +68,26 @@ typedef enum arc_buf_contents { #define ARC_CACHED (1 << 4) /* I/O was already in cache */ #define ARC_L2CACHE (1 << 5) /* cache in L2ARC */ -void arc_space_consume(uint64_t space); -void arc_space_return(uint64_t space); +/* + * The following breakdows of arc_size exist for kstat only. 
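Returning to the space-map reference trees added in space_map.c above, a minimal sketch of the union computation the comment describes (hypothetical helper; the caller must hold the maps' sm_lock):

static void
example_space_map_union(space_map_t *sm1, space_map_t *sm2, space_map_t *smu)
{
	avl_tree_t reftree;

	space_map_ref_create(&reftree);
	space_map_ref_add_map(&reftree, sm1, 1);
	space_map_ref_add_map(&reftree, sm2, 1);
	/* Union: keep every offset with refcnt >= 1. */
	space_map_ref_generate_map(&reftree, smu, 1);
	space_map_ref_destroy(&reftree);
}

The intersection of N maps is the same walk with minref set to N, which is the form vdev_dtl_reassess() uses as described above.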
+ */ +typedef enum arc_space_type { + ARC_SPACE_DATA, + ARC_SPACE_HDRS, + ARC_SPACE_L2HDRS, + ARC_SPACE_OTHER, + ARC_SPACE_NUMTYPES +} arc_space_type_t; + +void arc_space_consume(uint64_t space, arc_space_type_t type); +void arc_space_return(uint64_t space, arc_space_type_t type); void *arc_data_buf_alloc(uint64_t space); void arc_data_buf_free(void *buf, uint64_t space); arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type); +arc_buf_t *arc_loan_buf(spa_t *spa, int size); +void arc_return_buf(arc_buf_t *buf, void *tag); +void arc_loan_inuse_buf(arc_buf_t *buf, void *tag); void arc_buf_add_ref(arc_buf_t *buf, void *tag); int arc_buf_remove_ref(arc_buf_t *buf, void *tag); int arc_buf_size(arc_buf_t *buf); @@ -86,28 +100,17 @@ void arc_buf_thaw(arc_buf_t *buf); int arc_referenced(arc_buf_t *buf); #endif -typedef struct writeprops { - dmu_object_type_t wp_type; - uint8_t wp_level; - uint8_t wp_copies; - uint8_t wp_dncompress, wp_oscompress; - uint8_t wp_dnchecksum, wp_oschecksum; -} writeprops_t; - -void write_policy(spa_t *spa, const writeprops_t *wp, zio_prop_t *zp); -int arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf, +int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_buf_t *pbuf, arc_done_func_t *done, void *private, int priority, int zio_flags, uint32_t *arc_flags, const zbookmark_t *zb); -int arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp, +int arc_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, void *private, int priority, int flags, uint32_t *arc_flags, const zbookmark_t *zb); -zio_t *arc_write(zio_t *pio, spa_t *spa, const writeprops_t *wp, - boolean_t l2arc, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, - arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority, - int zio_flags, const zbookmark_t *zb); -int arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - zio_done_func_t *done, void *private, uint32_t arc_flags); -int arc_tryread(spa_t *spa, blkptr_t *bp, void *data); +zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg, + blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp, + arc_done_func_t *ready, arc_done_func_t *done, void *private, + int priority, int zio_flags, const zbookmark_t *zb); +void arc_free(spa_t *spa, const blkptr_t *bp); void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private); int arc_buf_evict(arc_buf_t *buf); @@ -123,7 +126,7 @@ void arc_fini(void); * Level 2 ARC */ -void l2arc_add_vdev(spa_t *spa, vdev_t *vd, uint64_t start, uint64_t end); +void l2arc_add_vdev(spa_t *spa, vdev_t *vd); void l2arc_remove_vdev(vdev_t *vd); boolean_t l2arc_vdev_present(vdev_t *vd); void l2arc_init(void); diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/bplist.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/bplist.h index cdb93a6c35a31..94143bccbc56a 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/bplist.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/bplist.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -29,6 +29,7 @@ #include #include #include +#include #include #ifdef __cplusplus @@ -67,6 +68,10 @@ typedef struct bplist { dmu_buf_t *bpl_cached_dbuf; } bplist_t; +typedef void bplist_sync_cb_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx); + +extern void bplist_init(bplist_t *bpl); +extern void bplist_fini(bplist_t *bpl); extern uint64_t bplist_create(objset_t *mos, int blocksize, dmu_tx_t *tx); extern void bplist_destroy(objset_t *mos, uint64_t object, dmu_tx_t *tx); extern int bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object); @@ -74,13 +79,15 @@ extern void bplist_close(bplist_t *bpl); extern boolean_t bplist_empty(bplist_t *bpl); extern int bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp); extern int bplist_enqueue(bplist_t *bpl, const blkptr_t *bp, dmu_tx_t *tx); +extern void bplist_enqueue_cb(void *bpl, const blkptr_t *bp, dmu_tx_t *tx); extern void bplist_enqueue_deferred(bplist_t *bpl, const blkptr_t *bp); -extern void bplist_sync(bplist_t *bpl, dmu_tx_t *tx); +extern void bplist_sync(bplist_t *bpl, bplist_sync_cb_t *func, + void *arg, dmu_tx_t *tx); extern void bplist_vacate(bplist_t *bpl, dmu_tx_t *tx); extern int bplist_space(bplist_t *bpl, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); extern int bplist_space_birthrange(bplist_t *bpl, - uint64_t mintxg, uint64_t maxtxg, uint64_t *dasizep); + uint64_t mintxg, uint64_t maxtxg, uint64_t *dsizep); #ifdef __cplusplus } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dbuf.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dbuf.h index 75ce27264e3ce..d99ade07f8c67 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dbuf.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dbuf.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -75,7 +75,6 @@ typedef enum dbuf_states { DB_EVICTING } dbuf_states_t; -struct objset_impl; struct dnode; struct dmu_tx; @@ -134,6 +133,7 @@ typedef struct dbuf_dirty_record { arc_buf_t *dr_data; blkptr_t dr_overridden_by; override_states_t dr_override_state; + uint8_t dr_copies; } dl; } dt; } dbuf_dirty_record_t; @@ -148,7 +148,7 @@ typedef struct dmu_buf_impl { dmu_buf_t db; /* the objset we belong to */ - struct objset_impl *db_objset; + struct objset *db_objset; /* * the dnode we belong to (NULL when evicted) @@ -255,6 +255,7 @@ void dbuf_add_ref(dmu_buf_impl_t *db, void *tag); uint64_t dbuf_refcount(dmu_buf_impl_t *db); void dbuf_rele(dmu_buf_impl_t *db, void *tag); +void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag); dmu_buf_impl_t *dbuf_find(struct dnode *dn, uint8_t level, uint64_t blkid); @@ -264,7 +265,9 @@ void dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx); void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx); +void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx); dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx); +arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db); void dbuf_clear(dmu_buf_impl_t *db); void dbuf_evict(dmu_buf_impl_t *db); @@ -323,7 +326,7 @@ _NOTE(CONSTCOND) } while (0) #define dprintf_dbuf_bp(db, bp, fmt, ...) 
do { \ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \ - sprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, bp); \ + sprintf_blkptr(__blkbuf, bp); \ dprintf_dbuf(db, fmt " %s\n", __VA_ARGS__, __blkbuf); \ kmem_free(__blkbuf, BP_SPRINTF_LEN); \ } \ diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/ddt.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/ddt.h new file mode 100644 index 0000000000000..26bcbea5039bb --- /dev/null +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/ddt.h @@ -0,0 +1,240 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_DDT_H +#define _SYS_DDT_H + +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * On-disk DDT formats, in the desired search order (newest version first). + */ +enum ddt_type { + DDT_TYPE_ZAP = 0, + DDT_TYPES +}; + +/* + * DDT classes, in the desired search order (highest replication level first). + */ +enum ddt_class { + DDT_CLASS_DITTO = 0, + DDT_CLASS_DUPLICATE, + DDT_CLASS_UNIQUE, + DDT_CLASSES +}; + +#define DDT_TYPE_CURRENT 0 + +#define DDT_COMPRESS_BYTEORDER_MASK 0x80 +#define DDT_COMPRESS_FUNCTION_MASK 0x7f + +/* + * On-disk ddt entry: key (name) and physical storage (value). 
+ */ +typedef struct ddt_key { + zio_cksum_t ddk_cksum; /* 256-bit block checksum */ + uint64_t ddk_prop; /* LSIZE, PSIZE, compression */ +} ddt_key_t; + +/* + * ddk_prop layout: + * + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * | 0 | 0 | 0 | comp | PSIZE | LSIZE | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + */ +#define DDK_GET_LSIZE(ddk) \ + BF64_GET_SB((ddk)->ddk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1) +#define DDK_SET_LSIZE(ddk, x) \ + BF64_SET_SB((ddk)->ddk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1, x) + +#define DDK_GET_PSIZE(ddk) \ + BF64_GET_SB((ddk)->ddk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1) +#define DDK_SET_PSIZE(ddk, x) \ + BF64_SET_SB((ddk)->ddk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1, x) + +#define DDK_GET_COMPRESS(ddk) BF64_GET((ddk)->ddk_prop, 32, 8) +#define DDK_SET_COMPRESS(ddk, x) BF64_SET((ddk)->ddk_prop, 32, 8, x) + +#define DDT_KEY_WORDS (sizeof (ddt_key_t) / sizeof (uint64_t)) + +typedef struct ddt_phys { + dva_t ddp_dva[SPA_DVAS_PER_BP]; + uint64_t ddp_refcnt; + uint64_t ddp_phys_birth; +} ddt_phys_t; + +enum ddt_phys_type { + DDT_PHYS_DITTO = 0, + DDT_PHYS_SINGLE = 1, + DDT_PHYS_DOUBLE = 2, + DDT_PHYS_TRIPLE = 3, + DDT_PHYS_TYPES +} ddt_phys_type_t; + +/* + * In-core ddt entry + */ +struct ddt_entry { + ddt_key_t dde_key; + ddt_phys_t dde_phys[DDT_PHYS_TYPES]; + zio_t *dde_lead_zio[DDT_PHYS_TYPES]; + void *dde_repair_data; + enum ddt_type dde_type; + enum ddt_class dde_class; + uint8_t dde_loading; + uint8_t dde_loaded; + kcondvar_t dde_cv; + avl_node_t dde_node; +}; + +/* + * In-core ddt + */ +struct ddt { + kmutex_t ddt_lock; + avl_tree_t ddt_tree; + avl_tree_t ddt_repair_tree; + enum zio_checksum ddt_checksum; + spa_t *ddt_spa; + objset_t *ddt_os; + uint64_t ddt_stat_object; + uint64_t ddt_object[DDT_TYPES][DDT_CLASSES]; + ddt_histogram_t ddt_histogram[DDT_TYPES][DDT_CLASSES]; + avl_node_t ddt_node; +}; + +/* + * In-core and on-disk bookmark for DDT walks + */ +typedef struct ddt_bookmark { + uint64_t ddb_class; + uint64_t ddb_type; + uint64_t ddb_checksum; + uint64_t ddb_cursor; +} ddt_bookmark_t; + +/* + * Ops vector to access a specific DDT object type. 
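To make the ddk_prop layout near the top of this header concrete, a small sketch of packing a 128K/4K LZJB block key (assumes the BF64_*_SB helpers store (size >> SPA_MINBLOCKSHIFT) - 1, as in sys/spa.h):

static void
example_ddk_prop_pack(void)
{
	ddt_key_t ddk;

	bzero(&ddk, sizeof (ddt_key_t));

	DDK_SET_LSIZE(&ddk, 128 << 10);			/* LSIZE field <- 255 */
	DDK_SET_PSIZE(&ddk, 4 << 10);			/* PSIZE field <- 7 */
	DDK_SET_COMPRESS(&ddk, ZIO_COMPRESS_LZJB);	/* comp byte */

	/* DDK_GET_LSIZE(&ddk) yields (255 + 1) << SPA_MINBLOCKSHIFT == 128K. */
}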
+ */ +typedef struct ddt_ops { + char ddt_op_name[32]; + int (*ddt_op_create)(objset_t *os, uint64_t *object, dmu_tx_t *tx, + boolean_t prehash); + int (*ddt_op_destroy)(objset_t *os, uint64_t object, dmu_tx_t *tx); + int (*ddt_op_lookup)(objset_t *os, uint64_t object, ddt_entry_t *dde); + int (*ddt_op_update)(objset_t *os, uint64_t object, ddt_entry_t *dde, + dmu_tx_t *tx); + int (*ddt_op_remove)(objset_t *os, uint64_t object, ddt_entry_t *dde, + dmu_tx_t *tx); + int (*ddt_op_walk)(objset_t *os, uint64_t object, ddt_entry_t *dde, + uint64_t *walk); + uint64_t (*ddt_op_count)(objset_t *os, uint64_t object); +} ddt_ops_t; + +#define DDT_NAMELEN 80 + +extern void ddt_object_name(ddt_t *ddt, enum ddt_type type, + enum ddt_class class, char *name); +extern int ddt_object_walk(ddt_t *ddt, enum ddt_type type, + enum ddt_class class, uint64_t *walk, ddt_entry_t *dde); +extern uint64_t ddt_object_count(ddt_t *ddt, enum ddt_type type, + enum ddt_class class); +extern int ddt_object_info(ddt_t *ddt, enum ddt_type type, + enum ddt_class class, dmu_object_info_t *); +extern boolean_t ddt_object_exists(ddt_t *ddt, enum ddt_type type, + enum ddt_class class); + +extern void ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, + uint64_t txg); +extern void ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk, + const ddt_phys_t *ddp, blkptr_t *bp); + +extern void ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp); + +extern void ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp); +extern void ddt_phys_clear(ddt_phys_t *ddp); +extern void ddt_phys_addref(ddt_phys_t *ddp); +extern void ddt_phys_decref(ddt_phys_t *ddp); +extern void ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, + uint64_t txg); +extern ddt_phys_t *ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp); +extern uint64_t ddt_phys_total_refcnt(const ddt_entry_t *dde); + +extern void ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg); + +extern void ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src); +extern void ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh); +extern boolean_t ddt_histogram_empty(const ddt_histogram_t *ddh); +extern void ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo); +extern void ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh); +extern void ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total); + +extern uint64_t ddt_get_dedup_dspace(spa_t *spa); +extern uint64_t ddt_get_pool_dedup_ratio(spa_t *spa); + +extern int ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde, + ddt_phys_t *ddp_willref); +extern int ddt_ditto_copies_present(ddt_entry_t *dde); + +extern size_t ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len); +extern void ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len); + +extern ddt_t *ddt_select(spa_t *spa, const blkptr_t *bp); +extern void ddt_enter(ddt_t *ddt); +extern void ddt_exit(ddt_t *ddt); +extern ddt_entry_t *ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add); +extern void ddt_remove(ddt_t *ddt, ddt_entry_t *dde); + +extern boolean_t ddt_class_contains(spa_t *spa, enum ddt_class max_class, + const blkptr_t *bp); + +extern ddt_entry_t *ddt_repair_start(ddt_t *ddt, const blkptr_t *bp); +extern void ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde); + +extern int ddt_entry_compare(const void *x1, const void *x2); + +extern void ddt_create(spa_t *spa); +extern int ddt_load(spa_t *spa); +extern void ddt_unload(spa_t *spa); +extern void ddt_sync(spa_t *spa, uint64_t txg); +extern int 
ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde); + +extern const ddt_ops_t ddt_zap_ops; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DDT_H */ diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu.h index 3b1e5c8fbc1fd..b41bc96c38f1b 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -38,12 +38,14 @@ #include #include #include +#include #ifdef __cplusplus extern "C" { #endif struct uio; +struct xuio; struct page; struct vnode; struct spa; @@ -59,7 +61,8 @@ struct drr_end; struct zbookmark; struct spa; struct nvlist; -struct objset_impl; +struct arc_buf; +struct zio_prop; typedef struct objset objset_t; typedef struct dmu_tx dmu_tx_t; @@ -114,6 +117,11 @@ typedef enum dmu_object_type { DMU_OT_FUID_SIZE, /* FUID table size UINT64 */ DMU_OT_NEXT_CLONES, /* ZAP */ DMU_OT_SCRUB_QUEUE, /* ZAP */ + DMU_OT_USERGROUP_USED, /* ZAP */ + DMU_OT_USERGROUP_QUOTA, /* ZAP */ + DMU_OT_USERREFS, /* ZAP */ + DMU_OT_DDT_ZAP, /* ZAP */ + DMU_OT_DDT_STATS, /* ZAP */ DMU_OT_NUMTYPES } dmu_object_type_t; @@ -136,16 +144,6 @@ void zfs_oldacl_byteswap(void *buf, size_t size); void zfs_acl_byteswap(void *buf, size_t size); void zfs_znode_byteswap(void *buf, size_t size); -#define DS_MODE_NOHOLD 0 /* internal use only */ -#define DS_MODE_USER 1 /* simple access, no special needs */ -#define DS_MODE_OWNER 2 /* the "main" access, e.g. a mount */ -#define DS_MODE_TYPE_MASK 0x3 -#define DS_MODE_TYPE(x) ((x) & DS_MODE_TYPE_MASK) -#define DS_MODE_READONLY 0x8 -#define DS_MODE_IS_READONLY(x) ((x) & DS_MODE_READONLY) -#define DS_MODE_INCONSISTENT 0x10 -#define DS_MODE_IS_INCONSISTENT(x) ((x) & DS_MODE_INCONSISTENT) - #define DS_FIND_SNAPSHOTS (1<<0) #define DS_FIND_CHILDREN (1<<1) @@ -156,25 +154,32 @@ void zfs_znode_byteswap(void *buf, size_t size); #define DMU_MAX_ACCESS (10<<20) /* 10MB */ #define DMU_MAX_DELETEBLKCNT (20480) /* ~5MB of indirect blocks */ +#define DMU_USERUSED_OBJECT (-1ULL) +#define DMU_GROUPUSED_OBJECT (-2ULL) +#define DMU_DEADLIST_OBJECT (-3ULL) + /* * Public routines to create, destroy, open, and close objsets. 
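A usage sketch for the hold/rele pairing declared just below (the dataset name and helper are hypothetical); long-lived consumers such as a mount would use dmu_objset_own()/dmu_objset_disown() instead:

static int
example_objset_peek(const char *name)
{
	objset_t *os;
	int error;

	if ((error = dmu_objset_hold(name, FTAG, &os)) != 0)
		return (error);

	/* ... short-lived, read-only inquiries against os ... */

	dmu_objset_rele(os, FTAG);
	return (0);
}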
*/ -int dmu_objset_open(const char *name, dmu_objset_type_t type, int mode, - objset_t **osp); -int dmu_objset_open_ds(struct dsl_dataset *ds, dmu_objset_type_t type, - objset_t **osp); -void dmu_objset_close(objset_t *os); +int dmu_objset_hold(const char *name, void *tag, objset_t **osp); +int dmu_objset_own(const char *name, dmu_objset_type_t type, + boolean_t readonly, void *tag, objset_t **osp); +void dmu_objset_rele(objset_t *os, void *tag); +void dmu_objset_disown(objset_t *os, void *tag); +int dmu_objset_open_ds(struct dsl_dataset *ds, objset_t **osp); + int dmu_objset_evict_dbufs(objset_t *os); -int dmu_objset_create(const char *name, dmu_objset_type_t type, - objset_t *clone_parent, uint64_t flags, +int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg); -int dmu_objset_destroy(const char *name); -int dmu_snapshots_destroy(char *fsname, char *snapname); -int dmu_objset_rollback(objset_t *os); -int dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive); +int dmu_objset_clone(const char *name, struct dsl_dataset *clone_origin, + uint64_t flags); +int dmu_objset_destroy(const char *name, boolean_t defer); +int dmu_snapshots_destroy(char *fsname, char *snapname, boolean_t defer); +int dmu_objset_snapshot(char *fsname, char *snapname, struct nvlist *props, + boolean_t recursive); int dmu_objset_rename(const char *name, const char *newname, boolean_t recursive); -int dmu_objset_find(char *name, int func(char *, void *), void *arg, +int dmu_objset_find(char *name, int func(const char *, void *), void *arg, int flags); void dmu_objset_byteswap(void *buf, size_t size); @@ -201,9 +206,16 @@ typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr); #define DMU_POOL_HISTORY "history" #define DMU_POOL_PROPS "pool_props" #define DMU_POOL_L2CACHE "l2cache" +#define DMU_POOL_TMP_USERREFS "tmp_userrefs" +#define DMU_POOL_DDT "DDT-%s-%s-%s" +#define DMU_POOL_DDT_STATS "DDT-statistics" /* 4x8 zbookmark_t */ #define DMU_POOL_SCRUB_BOOKMARK "scrub_bookmark" +/* 4x8 ddt_bookmark_t */ +#define DMU_POOL_SCRUB_DDT_BOOKMARK "scrub_ddt_bookmark" +/* 1x8 max_class */ +#define DMU_POOL_SCRUB_DDT_CLASS_MAX "scrub_ddt_class_max" /* 1x8 zap obj DMU_OT_SCRUB_QUEUE */ #define DMU_POOL_SCRUB_QUEUE "scrub_queue" /* 1x8 txg */ @@ -235,7 +247,7 @@ uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx); int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot, - int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); + int blocksize, dmu_object_type_t bonustype, int bonuslen); /* * Free an object from this objset. @@ -298,11 +310,13 @@ void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, dmu_tx_t *tx); /* - * Decide how many copies of a given block we should make. Can be from - * 1 to SPA_DVAS_PER_BP. + * Decide how to write a block: checksum, compression, number of copies, etc. */ -int dmu_get_replication_level(struct objset_impl *, struct zbookmark *zb, - dmu_object_type_t ot); +#define WP_NOFILL 0x1 +#define WP_DMU_SYNC 0x2 + +void dmu_write_policy(objset_t *os, struct dnode *dn, int level, int wp, + struct zio_prop *zp); /* * The bonus data is accessed more or less like a regular buffer. 
* You must dmu_bonus_hold() to get the buffer, which will give you a @@ -397,6 +411,11 @@ void *dmu_buf_get_user(dmu_buf_t *db); */ void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx); +/* + * Tells if the given dbuf is freeable. + */ +boolean_t dmu_buf_freeable(dmu_buf_t *); + /* * You must create a transaction, then hold the objects which you will * (or might) modify as part of this transaction. Then you must assign @@ -422,13 +441,33 @@ dmu_tx_t *dmu_tx_create(objset_t *os); void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len); void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len); -void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name); +void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name); void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object); void dmu_tx_abort(dmu_tx_t *tx); int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how); void dmu_tx_wait(dmu_tx_t *tx); void dmu_tx_commit(dmu_tx_t *tx); +/* + * To register a commit callback, dmu_tx_callback_register() must be called. + * + * dcb_data is a pointer to caller private data that is passed on as a + * callback parameter. The caller is responsible for properly allocating and + * freeing it. + * + * When registering a callback, the transaction must be already created, but + * it cannot be committed or aborted. It can be assigned to a txg or not. + * + * The callback will be called after the transaction has been safely written + * to stable storage and will also be called if the dmu_tx is aborted. + * If there is any error which prevents the transaction from being committed to + * disk, the callback will be called with a value of error != 0. + */ +typedef void dmu_tx_callback_func_t(void *dcb_data, int error); + +void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func, + void *dcb_data); + /* * Free up the data blocks for a defined range of a file. If size is * zero, the range from offset to end-of-file is freed. @@ -445,8 +484,10 @@ int dmu_free_object(objset_t *os, uint64_t object); * Canfail routines will return 0 on success, or an errno if there is a * nonrecoverable I/O error. 
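/*
 * Illustrative use of the commit-callback interface described above; the
 * callback name, helper name, and argument handling are hypothetical.
 */
static void
example_commit_cb(void *dcb_data, int error)
{
	/* runs once the txg is on stable storage, or with error != 0 on abort */
	kmem_free(dcb_data, sizeof (uint64_t));
}

static void
example_register(objset_t *os, uint64_t object, void *dcb_data)
{
	dmu_tx_t *tx = dmu_tx_create(os);

	dmu_tx_hold_bonus(tx, object);
	if (dmu_tx_assign(tx, TXG_WAIT) != 0) {
		dmu_tx_abort(tx);	/* no callback registered yet */
		return;
	}
	dmu_tx_callback_register(tx, example_commit_cb, dcb_data);
	dmu_tx_commit(tx);
}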
*/ +#define DMU_READ_PREFETCH 0 /* prefetch */ +#define DMU_READ_NO_PREFETCH 1 /* don't prefetch */ int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - void *buf); + void *buf, uint32_t flags); void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx); void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, @@ -456,6 +497,19 @@ int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size, dmu_tx_t *tx); int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, struct page *pp, dmu_tx_t *tx); +struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size); +void dmu_return_arcbuf(struct arc_buf *buf); +void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, struct arc_buf *buf, + dmu_tx_t *tx); +int dmu_xuio_init(struct xuio *uio, int niov); +void dmu_xuio_fini(struct xuio *uio); +int dmu_xuio_add(struct xuio *uio, struct arc_buf *abuf, offset_t off, + size_t n); +int dmu_xuio_cnt(struct xuio *uio); +struct arc_buf *dmu_xuio_arcbuf(struct xuio *uio, int i); +void dmu_xuio_clear(struct xuio *uio, int i); +void xuio_stat_wbuf_copied(); +void xuio_stat_wbuf_nocopy(); extern int zfs_prefetch_disable; @@ -466,19 +520,19 @@ void dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len); typedef struct dmu_object_info { - /* All sizes are in bytes. */ + /* All sizes are in bytes unless otherwise indicated. */ uint32_t doi_data_block_size; uint32_t doi_metadata_block_size; - uint64_t doi_bonus_size; dmu_object_type_t doi_type; dmu_object_type_t doi_bonus_type; + uint64_t doi_bonus_size; uint8_t doi_indirection; /* 2 = dnode->indirect->data */ uint8_t doi_checksum; uint8_t doi_compress; uint8_t doi_pad[5]; - /* Values below are number of 512-byte blocks. 
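/*
 * Sketch of the updated dmu_read() signature with its new flags argument;
 * the wrapper and its sequential/random distinction are only an example.
 */
static int
example_read(objset_t *os, uint64_t object, uint64_t offset,
    uint64_t size, void *buf, boolean_t sequential)
{
	return (dmu_read(os, object, offset, size, buf,
	    sequential ? DMU_READ_PREFETCH : DMU_READ_NO_PREFETCH));
}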
*/ - uint64_t doi_physical_blks; /* data + metadata */ - uint64_t doi_max_block_offset; + uint64_t doi_physical_blocks_512; /* data + metadata, 512b blks */ + uint64_t doi_max_offset; + uint64_t doi_fill_count; /* number of non-empty blocks */ } dmu_object_info_t; typedef void arc_byteswap_func_t(void *buf, size_t size); @@ -547,6 +601,11 @@ void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, */ uint64_t dmu_objset_fsid_guid(objset_t *os); +/* + * Get the [cm]time for an objset's snapshot dir + */ +timestruc_t dmu_objset_snap_cmtime(objset_t *os); + int dmu_objset_is_snapshot(objset_t *os); extern struct spa *dmu_objset_spa(objset_t *os); @@ -556,12 +615,18 @@ extern struct dsl_dataset *dmu_objset_ds(objset_t *os); extern void dmu_objset_name(objset_t *os, char *buf); extern dmu_objset_type_t dmu_objset_type(objset_t *os); extern uint64_t dmu_objset_id(objset_t *os); +extern uint64_t dmu_objset_logbias(objset_t *os); extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name, uint64_t *id, uint64_t *offp, boolean_t *case_conflict); extern int dmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen, boolean_t *conflict); extern int dmu_dir_list_next(objset_t *os, int namelen, char *name, uint64_t *idp, uint64_t *offp); + +typedef int objset_used_cb_t(dmu_object_type_t bonustype, + void *bonus, uint64_t *userp, uint64_t *groupp); +extern void dmu_objset_register_type(dmu_objset_type_t ost, + objset_used_cb_t *cb); extern void dmu_objset_set_user(objset_t *os, void *user_ptr); extern void *dmu_objset_get_user(objset_t *os); @@ -580,9 +645,20 @@ uint64_t dmu_tx_get_txg(dmu_tx_t *tx); * storage when the write completes this new data does not become a * permanent part of the file until the associated transaction commits. */ -typedef void dmu_sync_cb_t(dmu_buf_t *db, void *arg); -int dmu_sync(struct zio *zio, dmu_buf_t *db, - struct blkptr *bp, uint64_t txg, dmu_sync_cb_t *done, void *arg); + +/* + * {zfs,zvol,ztest}_get_done() args + */ +typedef struct zgd { + struct zilog *zgd_zilog; + struct blkptr *zgd_bp; + dmu_buf_t *zgd_db; + struct rl *zgd_rl; + void *zgd_private; +} zgd_t; + +typedef void dmu_sync_cb_t(zgd_t *arg, int error); +int dmu_sync(struct zio *zio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd); /* * Find the next hole or data block in file starting at *off @@ -617,15 +693,15 @@ typedef struct dmu_recv_cookie { struct dsl_dataset *drc_real_ds; struct drr_begin *drc_drrb; char *drc_tosnap; + char *drc_top_ds; boolean_t drc_newfs; boolean_t drc_force; } dmu_recv_cookie_t; -int dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *, - boolean_t force, objset_t *origin, boolean_t online, dmu_recv_cookie_t *); +int dmu_recv_begin(char *tofs, char *tosnap, char *topds, struct drr_begin *, + boolean_t force, objset_t *origin, dmu_recv_cookie_t *); int dmu_recv_stream(dmu_recv_cookie_t *drc, struct vnode *vp, offset_t *voffp); int dmu_recv_end(dmu_recv_cookie_t *drc); -void dmu_recv_abort_cleanup(dmu_recv_cookie_t *drc); /* CRC64 table */ #define ZFS_CRC64_POLY 0xC96C5795D7870F42ULL /* ECMA-182, reflected form */ diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_impl.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_impl.h index 96ce688e1551a..22f9f5f8c88c4 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_impl.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_impl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -210,11 +210,11 @@ extern "C" { * * ds_lock * protects: - * ds_user_ptr - * ds_user_evice_func + * ds_objset * ds_open_refcount * ds_snapname * ds_phys accounting + * ds_phys userrefs zapobj * ds_reserved * held from: * dsl_dataset_* @@ -232,6 +232,39 @@ extern "C" { struct objset; struct dmu_pool; +typedef struct dmu_xuio { + int next; + int cnt; + struct arc_buf **bufs; + iovec_t *iovp; +} dmu_xuio_t; + +typedef struct xuio_stats { + /* loaned yet not returned arc_buf */ + kstat_named_t xuiostat_onloan_rbuf; + kstat_named_t xuiostat_onloan_wbuf; + /* whether a copy is made when loaning out a read buffer */ + kstat_named_t xuiostat_rbuf_copied; + kstat_named_t xuiostat_rbuf_nocopy; + /* whether a copy is made when assigning a write buffer */ + kstat_named_t xuiostat_wbuf_copied; + kstat_named_t xuiostat_wbuf_nocopy; +} xuio_stats_t; + +static xuio_stats_t xuio_stats = { + { "onloan_read_buf", KSTAT_DATA_UINT64 }, + { "onloan_write_buf", KSTAT_DATA_UINT64 }, + { "read_buf_copied", KSTAT_DATA_UINT64 }, + { "read_buf_nocopy", KSTAT_DATA_UINT64 }, + { "write_buf_copied", KSTAT_DATA_UINT64 }, + { "write_buf_nocopy", KSTAT_DATA_UINT64 } +}; + +#define XUIOSTAT_INCR(stat, val) \ + atomic_add_64(&xuio_stats.stat.value.ui64, (val)) +#define XUIOSTAT_BUMP(stat) XUIOSTAT_INCR(stat, 1) + + #ifdef __cplusplus } #endif diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_objset.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_objset.h index 1d65727808c32..a153602021595 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_objset.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_objset.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -40,40 +40,50 @@ extern "C" { struct dsl_dataset; struct dmu_tx; -struct objset_impl; + +#define OBJSET_PHYS_SIZE 2048 +#define OBJSET_OLD_PHYS_SIZE 1024 + +#define OBJSET_FLAG_USERACCOUNTING_COMPLETE (1ULL<<0) typedef struct objset_phys { dnode_phys_t os_meta_dnode; zil_header_t os_zil_header; uint64_t os_type; - char os_pad[1024 - sizeof (dnode_phys_t) - sizeof (zil_header_t) - - sizeof (uint64_t)]; + uint64_t os_flags; + char os_pad[OBJSET_PHYS_SIZE - sizeof (dnode_phys_t)*3 - + sizeof (zil_header_t) - sizeof (uint64_t)*2]; + dnode_phys_t os_userused_dnode; + dnode_phys_t os_groupused_dnode; } objset_phys_t; struct objset { - struct objset_impl *os; - int os_mode; -}; - -typedef struct objset_impl { /* Immutable: */ struct dsl_dataset *os_dsl_dataset; spa_t *os_spa; arc_buf_t *os_phys_buf; objset_phys_t *os_phys; dnode_t *os_meta_dnode; + dnode_t *os_userused_dnode; + dnode_t *os_groupused_dnode; zilog_t *os_zil; - objset_t os; - uint8_t os_checksum; /* can change, under dsl_dir's locks */ - uint8_t os_compress; /* can change, under dsl_dir's locks */ - uint8_t os_copies; /* can change, under dsl_dir's locks */ - uint8_t os_primary_cache; /* can change, under dsl_dir's locks */ - uint8_t os_secondary_cache; /* can change, under dsl_dir's locks */ + + /* can change, under dsl_dir's locks: */ + uint8_t os_checksum; + uint8_t os_compress; + uint8_t os_copies; + uint8_t os_dedup_checksum; + uint8_t os_dedup_verify; + uint8_t os_logbias; + uint8_t os_primary_cache; + uint8_t os_secondary_cache; /* no lock needed: */ struct dmu_tx *os_synctx; /* XXX sketchy */ blkptr_t *os_rootbp; zil_header_t os_zil_header; + list_t os_synced_dnodes; + uint64_t os_flags; /* Protected by os_obj_lock */ kmutex_t os_obj_lock; @@ -89,44 +99,57 @@ typedef struct objset_impl { /* stuff we store for the user */ kmutex_t os_user_ptr_lock; void *os_user_ptr; -} objset_impl_t; +}; +#define DMU_META_OBJSET 0 #define DMU_META_DNODE_OBJECT 0 +#define DMU_OBJECT_IS_SPECIAL(obj) ((int64_t)(obj) <= 0) #define DMU_OS_IS_L2CACHEABLE(os) \ ((os)->os_secondary_cache == ZFS_CACHE_ALL || \ (os)->os_secondary_cache == ZFS_CACHE_METADATA) /* called from zpl */ -int dmu_objset_open(const char *name, dmu_objset_type_t type, int mode, - objset_t **osp); -void dmu_objset_close(objset_t *os); -int dmu_objset_create(const char *name, dmu_objset_type_t type, - objset_t *clone_parent, uint64_t flags, +int dmu_objset_hold(const char *name, void *tag, objset_t **osp); +int dmu_objset_own(const char *name, dmu_objset_type_t type, + boolean_t readonly, void *tag, objset_t **osp); +void dmu_objset_rele(objset_t *os, void *tag); +void dmu_objset_disown(objset_t *os, void *tag); +int dmu_objset_from_ds(struct dsl_dataset *ds, objset_t **osp); + +int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg); -int dmu_objset_destroy(const char *name); -int dmu_objset_rollback(objset_t *os); -int dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive); +int dmu_objset_clone(const char *name, struct dsl_dataset *clone_origin, + uint64_t flags); +int dmu_objset_destroy(const char *name, boolean_t defer); +int dmu_objset_snapshot(char *fsname, char *snapname, nvlist_t *props, + boolean_t recursive); void dmu_objset_stats(objset_t *os, nvlist_t *nv); void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat); void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, uint64_t *usedobjsp, uint64_t *availobjsp); 
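/*
 * The user/group accounting objects use "negative" object numbers; a small
 * illustration of how DMU_OBJECT_IS_SPECIAL() separates them from ordinary
 * objects (this helper exists only as an example).
 */
static boolean_t
example_is_accounting_object(uint64_t object)
{
	if (!DMU_OBJECT_IS_SPECIAL(object))
		return (B_FALSE);	/* ordinary objects are > 0 */
	return (object == DMU_USERUSED_OBJECT ||
	    object == DMU_GROUPUSED_OBJECT);
}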
uint64_t dmu_objset_fsid_guid(objset_t *os); -int dmu_objset_find(char *name, int func(char *, void *), void *arg, +int dmu_objset_find(char *name, int func(const char *, void *), void *arg, int flags); int dmu_objset_find_spa(spa_t *spa, const char *name, int func(spa_t *, uint64_t, const char *, void *), void *arg, int flags); -int dmu_objset_prefetch(char *name, void *arg); +int dmu_objset_prefetch(const char *name, void *arg); void dmu_objset_byteswap(void *buf, size_t size); int dmu_objset_evict_dbufs(objset_t *os); +timestruc_t dmu_objset_snap_cmtime(objset_t *os); /* called from dsl */ -void dmu_objset_sync(objset_impl_t *os, zio_t *zio, dmu_tx_t *tx); -objset_impl_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds, +void dmu_objset_sync(objset_t *os, zio_t *zio, dmu_tx_t *tx); +boolean_t dmu_objset_is_dirty(objset_t *os, uint64_t txg); +objset_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp, dmu_objset_type_t type, dmu_tx_t *tx); int dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp, - objset_impl_t **osip); -void dmu_objset_evict(struct dsl_dataset *ds, void *arg); + objset_t **osp); +void dmu_objset_evict(objset_t *os); +void dmu_objset_do_userquota_callbacks(objset_t *os, dmu_tx_t *tx); +boolean_t dmu_objset_userused_enabled(objset_t *os); +int dmu_objset_userspace_upgrade(objset_t *os); +boolean_t dmu_objset_userspace_present(objset_t *os); #ifdef __cplusplus } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_traverse.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_traverse.h index 3e026891153c3..5b0821253dd78 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_traverse.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_traverse.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -36,8 +36,9 @@ extern "C" { struct dnode_phys; struct dsl_dataset; +struct zilog; -typedef int (blkptr_cb_t)(spa_t *spa, blkptr_t *bp, +typedef int (blkptr_cb_t)(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_t *zb, const struct dnode_phys *dnp, void *arg); #define TRAVERSE_PRE (1<<0) @@ -45,10 +46,12 @@ typedef int (blkptr_cb_t)(spa_t *spa, blkptr_t *bp, #define TRAVERSE_PREFETCH_METADATA (1<<2) #define TRAVERSE_PREFETCH_DATA (1<<3) #define TRAVERSE_PREFETCH (TRAVERSE_PREFETCH_METADATA | TRAVERSE_PREFETCH_DATA) +#define TRAVERSE_HARD (1<<4) -int traverse_dataset(struct dsl_dataset *ds, uint64_t txg_start, - int flags, blkptr_cb_t func, void *arg); -int traverse_pool(spa_t *spa, blkptr_cb_t func, void *arg); +int traverse_dataset(struct dsl_dataset *ds, + uint64_t txg_start, int flags, blkptr_cb_t func, void *arg); +int traverse_pool(spa_t *spa, + uint64_t txg_start, int flags, blkptr_cb_t func, void *arg); #ifdef __cplusplus } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_tx.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_tx.h index 2727daaaa76b1..ed01cdf38210f 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_tx.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_tx.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
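/*
 * Sketch of a callback matching the new blkptr_cb_t signature (which now
 * receives the zilog and a const blkptr_t); counting allocated blocks is
 * only an illustration, and the callback name is made up.
 */
/* ARGSUSED */
static int
example_count_blocks_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_t *zb, const struct dnode_phys *dnp, void *arg)
{
	uint64_t *countp = arg;

	if (bp != NULL && !BP_IS_HOLE(bp))
		(*countp)++;
	return (0);
}
/* e.g. traverse_dataset(ds, 0, TRAVERSE_PRE, example_count_blocks_cb, &count) */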
*/ #ifndef _SYS_DMU_TX_H #define _SYS_DMU_TX_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -59,6 +57,7 @@ struct dmu_tx { txg_handle_t tx_txgh; void *tx_tempreserve_cookie; struct dmu_tx_hold *tx_needassign_txh; + list_t tx_callbacks; /* list of dmu_tx_callback_t on this dmu_tx */ uint8_t tx_anyobj; int tx_err; #ifdef ZFS_DEBUG @@ -98,6 +97,11 @@ typedef struct dmu_tx_hold { #endif } dmu_tx_hold_t; +typedef struct dmu_tx_callback { + list_node_t dcb_node; /* linked to tx_callbacks list */ + dmu_tx_callback_func_t *dcb_func; /* caller function pointer */ + void *dcb_data; /* caller private data */ +} dmu_tx_callback_t; /* * These routines are defined in dmu.h, and are called by the user. @@ -109,6 +113,10 @@ void dmu_tx_abort(dmu_tx_t *tx); uint64_t dmu_tx_get_txg(dmu_tx_t *tx); void dmu_tx_wait(dmu_tx_t *tx); +void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func, + void *dcb_data); +void dmu_tx_do_callbacks(list_t *cb_list, int error); + /* * These routines are defined in dmu_spa.h, and are called by the SPA. */ diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_zfetch.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_zfetch.h index c94bced933aff..78cadd2b1ee1b 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_zfetch.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_zfetch.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _DFETCH_H #define _DFETCH_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #ifdef __cplusplus @@ -63,6 +61,9 @@ typedef struct zfetch { uint64_t zf_alloc_fail; /* # of failed attempts to alloc strm */ } zfetch_t; +void zfetch_init(void); +void zfetch_fini(void); + void dmu_zfetch_init(zfetch_t *, struct dnode *); void dmu_zfetch_rele(zfetch_t *); void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, int); diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dnode.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dnode.h index c79ff48a60c56..58e62d93c1460 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dnode.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dnode.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -74,6 +74,7 @@ extern "C" { #define DNODES_PER_BLOCK_SHIFT (DNODE_BLOCK_SHIFT - DNODE_SHIFT) #define DNODES_PER_BLOCK (1ULL << DNODES_PER_BLOCK_SHIFT) #define DNODES_PER_LEVEL_SHIFT (DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT) +#define DNODES_PER_LEVEL (1ULL << DNODES_PER_LEVEL_SHIFT) /* The +2 here is a cheesy way to round up */ #define DN_MAX_LEVELS (2 + ((DN_MAX_OFFSET_SHIFT - SPA_MINBLOCKSHIFT) / \ @@ -88,7 +89,7 @@ extern "C" { #define EPB(blkshift, typeshift) (1 << (blkshift - typeshift)) struct dmu_buf_impl; -struct objset_impl; +struct objset; struct zio; enum dnode_dirtycontext { @@ -98,7 +99,8 @@ enum dnode_dirtycontext { }; /* Is dn_used in bytes? if not, it's in multiples of SPA_MINBLOCKSIZE */ -#define DNODE_FLAG_USED_BYTES (1<<0) +#define DNODE_FLAG_USED_BYTES (1<<0) +#define DNODE_FLAG_USERUSED_ACCOUNTED (1<<1) typedef struct dnode_phys { uint8_t dn_type; /* dmu_object_type_t */ @@ -131,14 +133,11 @@ typedef struct dnode { */ krwlock_t dn_struct_rwlock; - /* - * Our link on dataset's dd_dnodes list. 
- * Protected by dd_accounting_mtx. - */ + /* Our link on dn_objset->os_dnodes list; protected by os_lock. */ list_node_t dn_link; /* immutable: */ - struct objset_impl *dn_objset; + struct objset *dn_objset; uint64_t dn_object; struct dmu_buf_impl *dn_dbuf; dnode_phys_t *dn_phys; /* pointer into dn->dn_dbuf->db.db_data */ @@ -160,6 +159,7 @@ typedef struct dnode { uint16_t dn_datablkszsec; /* in 512b sectors */ uint32_t dn_datablksz; /* in bytes */ uint64_t dn_maxblkid; + uint8_t dn_next_nblkptr[TXG_SIZE]; uint8_t dn_next_nlevels[TXG_SIZE]; uint8_t dn_next_indblkshift[TXG_SIZE]; uint16_t dn_next_bonuslen[TXG_SIZE]; @@ -190,6 +190,9 @@ typedef struct dnode { /* parent IO for current sync write */ zio_t *dn_zio; + /* used in syncing context */ + dnode_phys_t *dn_oldphys; + /* holds prefetch structure */ struct zfetch dn_zfetch; } dnode_t; @@ -200,14 +203,14 @@ typedef struct free_range { uint64_t fr_nblks; } free_range_t; -dnode_t *dnode_special_open(struct objset_impl *dd, dnode_phys_t *dnp, +dnode_t *dnode_special_open(struct objset *dd, dnode_phys_t *dnp, uint64_t object); void dnode_special_close(dnode_t *dn); void dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx); -int dnode_hold(struct objset_impl *dd, uint64_t object, +int dnode_hold(struct objset *dd, uint64_t object, void *ref, dnode_t **dnp); -int dnode_hold_impl(struct objset_impl *dd, uint64_t object, int flag, +int dnode_hold_impl(struct objset *dd, uint64_t object, int flag, void *ref, dnode_t **dnp); boolean_t dnode_add_ref(dnode_t *dn, void *ref); void dnode_rele(dnode_t *dn, void *ref); diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_dataset.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_dataset.h index 8665aec2dda87..6eb7505ea53ff 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_dataset.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_dataset.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -42,8 +42,6 @@ struct dsl_dataset; struct dsl_dir; struct dsl_pool; -typedef void dsl_dataset_evict_func_t(struct dsl_dataset *, void *); - #define DS_FLAG_INCONSISTENT (1ULL<<0) #define DS_IS_INCONSISTENT(ds) \ ((ds)->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) @@ -62,6 +60,14 @@ typedef void dsl_dataset_evict_func_t(struct dsl_dataset *, void *); */ #define DS_FLAG_UNIQUE_ACCURATE (1ULL<<2) +/* + * DS_FLAG_DEFER_DESTROY is set after 'zfs destroy -d' has been called + * on a dataset. This allows the dataset to be destroyed using 'zfs release'. + */ +#define DS_FLAG_DEFER_DESTROY (1ULL<<3) +#define DS_IS_DEFER_DESTROY(ds) \ + ((ds)->ds_phys->ds_flags & DS_FLAG_DEFER_DESTROY) + /* * DS_FLAG_CI_DATASET is set if the dataset contains a file system whose * name lookups should be performed case-insensitively. 
@@ -93,7 +99,8 @@ typedef struct dsl_dataset_phys { blkptr_t ds_bp; uint64_t ds_next_clones_obj; /* DMU_OT_DSL_CLONES */ uint64_t ds_props_obj; /* DMU_OT_DSL_PROPS for snaps */ - uint64_t ds_pad[6]; /* pad out to 320 bytes for good measure */ + uint64_t ds_userrefs_obj; /* DMU_OT_USERREFS */ + uint64_t ds_pad[5]; /* pad out to 320 bytes for good measure */ } dsl_dataset_phys_t; typedef struct dsl_dataset { @@ -111,6 +118,9 @@ typedef struct dsl_dataset { /* has internal locking: */ bplist_t ds_deadlist; + /* to protect against multiple concurrent incremental recv */ + kmutex_t ds_recvlock; + /* protected by lock on pool's dp_dirty_datasets list */ txg_node_t ds_dirty_link; list_node_t ds_synced_link; @@ -120,8 +130,8 @@ typedef struct dsl_dataset { * Protected by ds_lock: */ kmutex_t ds_lock; - void *ds_user_ptr; - dsl_dataset_evict_func_t *ds_user_evict_func; + objset_t *ds_objset; + uint64_t ds_userrefs; /* * ds_owner is protected by the ds_rwlock and the ds_lock @@ -143,6 +153,15 @@ typedef struct dsl_dataset { char ds_snapname[MAXNAMELEN]; } dsl_dataset_t; +struct dsl_ds_destroyarg { + dsl_dataset_t *ds; /* ds to destroy */ + dsl_dataset_t *rm_origin; /* also remove our origin? */ + boolean_t is_origin_rm; /* set if removing origin snap */ + boolean_t defer; /* destroy -d requested? */ + boolean_t releasing; /* destroying due to release? */ + boolean_t need_prep; /* do we need to retry due to EBUSY? */ +}; + #define dsl_dataset_is_snapshot(ds) \ ((ds)->ds_phys->ds_num_children != 0) @@ -152,36 +171,38 @@ typedef struct dsl_dataset { int dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp); int dsl_dataset_hold_obj(struct dsl_pool *dp, uint64_t dsobj, void *tag, dsl_dataset_t **); -int dsl_dataset_own(const char *name, int flags, void *owner, - dsl_dataset_t **dsp); +int dsl_dataset_own(const char *name, boolean_t inconsistentok, + void *tag, dsl_dataset_t **dsp); int dsl_dataset_own_obj(struct dsl_pool *dp, uint64_t dsobj, - int flags, void *owner, dsl_dataset_t **); + boolean_t inconsistentok, void *tag, dsl_dataset_t **dsp); void dsl_dataset_name(dsl_dataset_t *ds, char *name); void dsl_dataset_rele(dsl_dataset_t *ds, void *tag); -void dsl_dataset_disown(dsl_dataset_t *ds, void *owner); +void dsl_dataset_disown(dsl_dataset_t *ds, void *tag); void dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag); boolean_t dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, - void *owner); -void dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *owner); + void *tag); +void dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *tag); uint64_t dsl_dataset_create_sync(dsl_dir_t *pds, const char *lastname, dsl_dataset_t *origin, uint64_t flags, cred_t *, dmu_tx_t *); uint64_t dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, uint64_t flags, dmu_tx_t *tx); -int dsl_dataset_destroy(dsl_dataset_t *ds, void *tag); -int dsl_snapshots_destroy(char *fsname, char *snapname); +int dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer); +int dsl_snapshots_destroy(char *fsname, char *snapname, boolean_t defer); dsl_checkfunc_t dsl_dataset_destroy_check; dsl_syncfunc_t dsl_dataset_destroy_sync; dsl_checkfunc_t dsl_dataset_snapshot_check; dsl_syncfunc_t dsl_dataset_snapshot_sync; -int dsl_dataset_rollback(dsl_dataset_t *ds, dmu_objset_type_t ost); int dsl_dataset_rename(char *name, const char *newname, boolean_t recursive); -int dsl_dataset_promote(const char *name); +int dsl_dataset_promote(const char *name, char *conflsnap); int 
dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head, boolean_t force); - -void *dsl_dataset_set_user_ptr(dsl_dataset_t *ds, - void *p, dsl_dataset_evict_func_t func); -void *dsl_dataset_get_user_ptr(dsl_dataset_t *ds); +int dsl_dataset_user_hold(char *dsname, char *snapname, char *htag, + boolean_t recursive, boolean_t temphold); +int dsl_dataset_user_release(char *dsname, char *snapname, char *htag, + boolean_t recursive); +int dsl_dataset_user_release_tmp(struct dsl_pool *dp, uint64_t dsobj, + char *htag); +int dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp); blkptr_t *dsl_dataset_get_blkptr(dsl_dataset_t *ds); void dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx); @@ -192,10 +213,11 @@ boolean_t dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds); void dsl_dataset_sync(dsl_dataset_t *os, zio_t *zio, dmu_tx_t *tx); -void dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx); -int dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio, +void dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx); -int dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth); +int dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, + dmu_tx_t *tx, boolean_t async); +boolean_t dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth); uint64_t dsl_dataset_prev_snap_txg(dsl_dataset_t *ds); void dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx); @@ -211,13 +233,14 @@ int dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf); int dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota, uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv); -int dsl_dataset_set_quota(const char *dsname, uint64_t quota); +int dsl_dataset_set_quota(const char *dsname, zprop_source_t source, + uint64_t quota); void dsl_dataset_set_quota_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx); -int dsl_dataset_set_reservation(const char *dsname, uint64_t reservation); -void dsl_dataset_set_flags(dsl_dataset_t *ds, uint64_t flags); -int64_t dsl_dataset_new_refreservation(dsl_dataset_t *ds, uint64_t reservation, - dmu_tx_t *tx); +int dsl_dataset_set_reservation(const char *dsname, zprop_source_t source, + uint64_t reservation); + +int dsl_destroy_inconsistent(const char *dsname, void *arg); #ifdef ZFS_DEBUG #define dprintf_ds(ds, fmt, ...) do { \ diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_deleg.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_deleg.h index a29e44e67d0c5..a26a3f7058a19 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_deleg.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_deleg.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
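/*
 * Illustrative call sequence for the new user-hold interface (the machinery
 * behind 'zfs hold'/'zfs release'); the helper and its arguments are made up.
 */
static int
example_hold_and_release(char *dsname, char *snapname, char *htag)
{
	int error;

	error = dsl_dataset_user_hold(dsname, snapname, htag,
	    B_FALSE /* recursive */, B_FALSE /* temphold */);
	if (error != 0)
		return (error);
	return (dsl_dataset_user_release(dsname, snapname, htag,
	    B_FALSE /* recursive */));
}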
*/ #ifndef _SYS_DSL_DELEG_H #define _SYS_DSL_DELEG_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -51,6 +49,12 @@ extern "C" { #define ZFS_DELEG_PERM_ALLOW "allow" #define ZFS_DELEG_PERM_USERPROP "userprop" #define ZFS_DELEG_PERM_VSCAN "vscan" +#define ZFS_DELEG_PERM_USERQUOTA "userquota" +#define ZFS_DELEG_PERM_GROUPQUOTA "groupquota" +#define ZFS_DELEG_PERM_USERUSED "userused" +#define ZFS_DELEG_PERM_GROUPUSED "groupused" +#define ZFS_DELEG_PERM_HOLD "hold" +#define ZFS_DELEG_PERM_RELEASE "release" /* * Note: the names of properties that are marked delegatable are also diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_dir.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_dir.h index 86b9636ceaabb..14a64e019e0fa 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_dir.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_dir.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -89,6 +89,7 @@ struct dsl_dir { /* Protected by dd_lock */ kmutex_t dd_lock; list_t dd_prop_cbs; /* list of dsl_prop_cb_record_t's */ + timestruc_t dd_snap_cmtime; /* last time snapshot namespace changed */ /* gross estimate of space used by in-flight tx's */ uint64_t dd_tempreserved[TXG_SIZE]; @@ -107,7 +108,6 @@ int dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, const char *tail, void *tag, dsl_dir_t **); void dsl_dir_name(dsl_dir_t *dd, char *buf); int dsl_dir_namelen(dsl_dir_t *dd); -int dsl_dir_is_private(dsl_dir_t *dd); uint64_t dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name, dmu_tx_t *tx); dsl_checkfunc_t dsl_dir_destroy_check; @@ -126,14 +126,18 @@ void dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx); void dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta, dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx); -int dsl_dir_set_quota(const char *ddname, uint64_t quota); -int dsl_dir_set_reservation(const char *ddname, uint64_t reservation); +int dsl_dir_set_quota(const char *ddname, zprop_source_t source, + uint64_t quota); +int dsl_dir_set_reservation(const char *ddname, zprop_source_t source, + uint64_t reservation); int dsl_dir_rename(dsl_dir_t *dd, const char *newname); int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space); int dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx); boolean_t dsl_dir_is_clone(dsl_dir_t *dd); void dsl_dir_new_refreservation(dsl_dir_t *dd, struct dsl_dataset *ds, uint64_t reservation, cred_t *cr, dmu_tx_t *tx); +void dsl_dir_snap_cmtime_update(dsl_dir_t *dd); +timestruc_t dsl_dir_snap_cmtime(dsl_dir_t *dd); /* internal reserved dir name */ #define MOS_DIR_NAME "$MOS" diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_pool.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_pool.h index 3bb4ad4efe55f..4e49d212a3052 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_pool.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_pool.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -32,6 +32,7 @@ #include #include #include +#include #ifdef __cplusplus extern "C" { @@ -77,13 +78,15 @@ typedef struct dsl_pool { struct dsl_dir *dp_mos_dir; struct dsl_dataset *dp_origin_snap; uint64_t dp_root_dir_obj; + struct taskq *dp_vnrele_taskq; /* No lock needed - sync context only */ blkptr_t dp_meta_rootbp; list_t dp_synced_datasets; hrtime_t dp_read_overhead; - uint64_t dp_throughput; + uint64_t dp_throughput; /* bytes per millisec */ uint64_t dp_write_limit; + uint64_t dp_tmp_userrefs_obj; /* Uses dp_lock */ kmutex_t dp_lock; @@ -94,12 +97,15 @@ typedef struct dsl_pool { uint64_t dp_scrub_queue_obj; uint64_t dp_scrub_min_txg; uint64_t dp_scrub_max_txg; + uint64_t dp_scrub_start_time; + uint64_t dp_scrub_ddt_class_max; zbookmark_t dp_scrub_bookmark; + ddt_bookmark_t dp_scrub_ddt_bookmark; boolean_t dp_scrub_pausing; boolean_t dp_scrub_isresilver; - uint64_t dp_scrub_start_time; - kmutex_t dp_scrub_cancel_lock; /* protects dp_scrub_restart */ boolean_t dp_scrub_restart; + kmutex_t dp_scrub_cancel_lock; /* protects dp_scrub_restart */ + zio_t *dp_scrub_prefetch_zio_root; /* Has its own locking */ tx_state_t dp_tx; @@ -122,15 +128,15 @@ int dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp); void dsl_pool_close(dsl_pool_t *dp); dsl_pool_t *dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg); void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg); -void dsl_pool_zil_clean(dsl_pool_t *dp); +void dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg); int dsl_pool_sync_context(dsl_pool_t *dp); uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree); +uint64_t dsl_pool_adjustedfree(dsl_pool_t *dp, boolean_t netfree); int dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx); void dsl_pool_tempreserve_clear(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx); void dsl_pool_memory_pressure(dsl_pool_t *dp); void dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx); -int dsl_free(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp, - zio_done_func_t *done, void *private, uint32_t arc_flags); +void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp); void dsl_pool_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx); void dsl_pool_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx); void dsl_pool_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2, @@ -142,6 +148,16 @@ int dsl_pool_scrub_cancel(dsl_pool_t *dp); int dsl_pool_scrub_clean(dsl_pool_t *dp); void dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx); void dsl_pool_scrub_restart(dsl_pool_t *dp); +void dsl_pool_scrub_ddt_entry(dsl_pool_t *dp, enum zio_checksum checksum, + const ddt_entry_t *dde); + +taskq_t *dsl_pool_vnrele_taskq(dsl_pool_t *dp); + +extern int dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, + const char *tag, uint64_t *now, dmu_tx_t *tx); +extern int dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, + const char *tag, dmu_tx_t *tx); +extern void dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp); #ifdef __cplusplus } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_prop.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_prop.h index d66caa86cff61..d8a8ab2d64e4a 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_prop.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_prop.h @@ -19,18 +19,17 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _SYS_DSL_PROP_H #define _SYS_DSL_PROP_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include +#include #ifdef __cplusplus extern "C" { @@ -50,6 +49,25 @@ typedef struct dsl_prop_cb_record { void *cbr_arg; } dsl_prop_cb_record_t; +typedef struct dsl_props_arg { + nvlist_t *pa_props; + zprop_source_t pa_source; +} dsl_props_arg_t; + +typedef struct dsl_prop_set_arg { + const char *psa_name; + zprop_source_t psa_source; + int psa_intsz; + int psa_numints; + const void *psa_value; + + /* + * Used to handle the special requirements of the quota and reservation + * properties. + */ + uint64_t psa_effective_value; +} dsl_prop_setarg_t; + int dsl_prop_register(struct dsl_dataset *ds, const char *propname, dsl_prop_changed_cb_t *callback, void *cbarg); int dsl_prop_unregister(struct dsl_dataset *ds, const char *propname, @@ -60,17 +78,37 @@ int dsl_prop_get(const char *ddname, const char *propname, int intsz, int numints, void *buf, char *setpoint); int dsl_prop_get_integer(const char *ddname, const char *propname, uint64_t *valuep, char *setpoint); -int dsl_prop_get_all(objset_t *os, nvlist_t **nvp, boolean_t local); +int dsl_prop_get_all(objset_t *os, nvlist_t **nvp); +int dsl_prop_get_received(objset_t *os, nvlist_t **nvp); int dsl_prop_get_ds(struct dsl_dataset *ds, const char *propname, int intsz, int numints, void *buf, char *setpoint); int dsl_prop_get_dd(struct dsl_dir *dd, const char *propname, - int intsz, int numints, void *buf, char *setpoint); + int intsz, int numints, void *buf, char *setpoint, + boolean_t snapshot); +dsl_syncfunc_t dsl_props_set_sync; int dsl_prop_set(const char *ddname, const char *propname, - int intsz, int numints, const void *buf); -void dsl_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val, + zprop_source_t source, int intsz, int numints, const void *buf); +int dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *nvl); +void dsl_dir_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val, cred_t *cr, dmu_tx_t *tx); +void dsl_prop_setarg_init_uint64(dsl_prop_setarg_t *psa, const char *propname, + zprop_source_t source, uint64_t *value); +int dsl_prop_predict_sync(dsl_dir_t *dd, dsl_prop_setarg_t *psa); +#ifdef ZFS_DEBUG +void dsl_prop_check_prediction(dsl_dir_t *dd, dsl_prop_setarg_t *psa); +#define DSL_PROP_CHECK_PREDICTION(dd, psa) \ + dsl_prop_check_prediction((dd), (psa)) +#else +#define DSL_PROP_CHECK_PREDICTION(dd, psa) /* nothing */ +#endif + +/* flag first receive on or after SPA_VERSION_RECVD_PROPS */ +boolean_t dsl_prop_get_hasrecvd(objset_t *os); +void dsl_prop_set_hasrecvd(objset_t *os); +void dsl_prop_unset_hasrecvd(objset_t *os); + void dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value); void dsl_prop_nvlist_add_string(nvlist_t *nv, zfs_prop_t prop, const char *value); diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/metaslab.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/metaslab.h index 1c9d89e8fd69e..5ce6251ddbd3f 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/metaslab.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/metaslab.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
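/*
 * Sketch of setting a numeric property through the source-aware dsl_prop_set()
 * interface; the dataset name, property, and value are examples only.
 */
static int
example_set_recordsize(const char *dsname, uint64_t recordsize)
{
	return (dsl_prop_set(dsname, "recordsize", ZPROP_SRC_LOCAL,
	    sizeof (recordsize), 1, &recordsize));
}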
*/ @@ -36,14 +36,14 @@ extern "C" { #endif -typedef struct metaslab_class metaslab_class_t; -typedef struct metaslab_group metaslab_group_t; +extern space_map_ops_t *zfs_metaslab_ops; extern metaslab_t *metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo, uint64_t start, uint64_t size, uint64_t txg); extern void metaslab_fini(metaslab_t *msp); extern void metaslab_sync(metaslab_t *msp, uint64_t txg); extern void metaslab_sync_done(metaslab_t *msp, uint64_t txg); +extern void metaslab_sync_reassess(metaslab_group_t *mg); #define METASLAB_HINTBP_FAVOR 0x0 #define METASLAB_HINTBP_AVOID 0x1 @@ -55,14 +55,24 @@ extern void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now); extern int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg); -extern metaslab_class_t *metaslab_class_create(void); +extern metaslab_class_t *metaslab_class_create(spa_t *spa, + space_map_ops_t *ops); extern void metaslab_class_destroy(metaslab_class_t *mc); -extern void metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg); -extern void metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg); +extern int metaslab_class_validate(metaslab_class_t *mc); + +extern void metaslab_class_space_update(metaslab_class_t *mc, + int64_t alloc_delta, int64_t defer_delta, + int64_t space_delta, int64_t dspace_delta); +extern uint64_t metaslab_class_get_alloc(metaslab_class_t *mc); +extern uint64_t metaslab_class_get_space(metaslab_class_t *mc); +extern uint64_t metaslab_class_get_dspace(metaslab_class_t *mc); +extern uint64_t metaslab_class_get_deferred(metaslab_class_t *mc); extern metaslab_group_t *metaslab_group_create(metaslab_class_t *mc, vdev_t *vd); extern void metaslab_group_destroy(metaslab_group_t *mg); +extern void metaslab_group_activate(metaslab_group_t *mg); +extern void metaslab_group_passivate(metaslab_group_t *mg); #ifdef __cplusplus } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/metaslab_impl.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/metaslab_impl.h index 5980cbc843aca..07988dd51a738 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/metaslab_impl.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/metaslab_impl.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
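/*
 * Sketch of the class/group lifecycle around the new ops-vector based
 * constructor; the spa and vdev are assumed to be set up by the caller,
 * and the helper exists only for illustration.
 */
static metaslab_class_t *
example_make_class(spa_t *spa, vdev_t *vd)
{
	metaslab_class_t *mc = metaslab_class_create(spa, zfs_metaslab_ops);
	metaslab_group_t *mg = metaslab_group_create(mc, vd);

	metaslab_group_activate(mg);	/* make it eligible for allocations */
	return (mc);
}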
*/ #ifndef _SYS_METASLAB_IMPL_H #define _SYS_METASLAB_IMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -39,15 +37,23 @@ extern "C" { #endif struct metaslab_class { + spa_t *mc_spa; metaslab_group_t *mc_rotor; - uint64_t mc_allocated; + space_map_ops_t *mc_ops; + uint64_t mc_aliquot; + uint64_t mc_alloc; /* total allocated space */ + uint64_t mc_deferred; /* total deferred frees */ + uint64_t mc_space; /* total space (alloc + free) */ + uint64_t mc_dspace; /* total deflated space */ }; struct metaslab_group { kmutex_t mg_lock; avl_tree_t mg_metaslab_tree; uint64_t mg_aliquot; + uint64_t mg_bonus_area; int64_t mg_bias; + int64_t mg_activation_count; metaslab_class_t *mg_class; vdev_t *mg_vd; metaslab_group_t *mg_prev; @@ -67,7 +73,9 @@ struct metaslab { space_map_obj_t ms_smo_syncing; /* syncing space map object */ space_map_t ms_allocmap[TXG_SIZE]; /* allocated this txg */ space_map_t ms_freemap[TXG_SIZE]; /* freed this txg */ + space_map_t ms_defermap[TXG_DEFER_SIZE]; /* deferred frees */ space_map_t ms_map; /* in-core free space map */ + int64_t ms_deferspace; /* sum of ms_defermap[] space */ uint64_t ms_weight; /* weight vs. others in group */ metaslab_group_t *ms_group; /* metaslab group */ avl_node_t ms_group_node; /* node in metaslab group tree */ diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/spa.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/spa.h index 24b3ca4476795..868d4fc1d7a29 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/spa.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/spa.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -43,8 +43,13 @@ extern "C" { typedef struct spa spa_t; typedef struct vdev vdev_t; typedef struct metaslab metaslab_t; +typedef struct metaslab_group metaslab_group_t; +typedef struct metaslab_class metaslab_class_t; +typedef struct zio zio_t; typedef struct zilog zilog_t; typedef struct spa_aux_vdev spa_aux_vdev_t; +typedef struct ddt ddt_t; +typedef struct ddt_entry ddt_entry_t; struct dsl_pool; /* @@ -134,15 +139,15 @@ typedef struct zio_cksum { * +-------+-------+-------+-------+-------+-------+-------+-------+ * 5 |G| offset3 | * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 6 |E| lvl | type | cksum | comp | PSIZE | LSIZE | + * 6 |BDX|lvl| type | cksum | comp | PSIZE | LSIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 7 | padding | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 8 | padding | * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 9 | padding | + * 9 | physical birth txg | * +-------+-------+-------+-------+-------+-------+-------+-------+ - * a | birth txg | + * a | logical birth txg | * +-------+-------+-------+-------+-------+-------+-------+-------+ * b | fill count | * +-------+-------+-------+-------+-------+-------+-------+-------+ @@ -166,25 +171,29 @@ typedef struct zio_cksum { * cksum checksum function * comp compression function * G gang block indicator - * E endianness - * type DMU object type + * B byteorder (endianness) + * D dedup + * X unused * lvl level of indirection - * birth txg transaction group in which the block was born + * type DMU object type + * phys birth txg of block allocation; zero if same as logical birth txg + * log. 
birth transaction group in which the block was logically born * fill count number of non-zero blocks under this bp * checksum[4] 256-bit checksum of the data this bp describes */ -typedef struct blkptr { - dva_t blk_dva[3]; /* 128-bit Data Virtual Address */ - uint64_t blk_prop; /* size, compression, type, etc */ - uint64_t blk_pad[3]; /* Extra space for the future */ - uint64_t blk_birth; /* transaction group at birth */ - uint64_t blk_fill; /* fill count */ - zio_cksum_t blk_cksum; /* 256-bit checksum */ -} blkptr_t; - #define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */ #define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */ +typedef struct blkptr { + dva_t blk_dva[SPA_DVAS_PER_BP]; /* Data Virtual Addresses */ + uint64_t blk_prop; /* size, compression, type, etc */ + uint64_t blk_pad[2]; /* Extra space for the future */ + uint64_t blk_phys_birth; /* txg when block was allocated */ + uint64_t blk_birth; /* transaction group at birth */ + uint64_t blk_fill; /* fill count */ + zio_cksum_t blk_cksum; /* 256-bit checksum */ +} blkptr_t; + /* * Macros to get and set fields in a bp or DVA. */ @@ -208,8 +217,7 @@ typedef struct blkptr { #define DVA_SET_GANG(dva, x) BF64_SET((dva)->dva_word[1], 63, 1, x) #define BP_GET_LSIZE(bp) \ - (BP_IS_HOLE(bp) ? 0 : \ - BF64_GET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1)) + BF64_GET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1) #define BP_SET_LSIZE(bp, x) \ BF64_SET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1, x) @@ -218,20 +226,35 @@ typedef struct blkptr { #define BP_SET_PSIZE(bp, x) \ BF64_SET_SB((bp)->blk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1, x) -#define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 8) -#define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 8, x) +#define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 8) +#define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 8, x) -#define BP_GET_CHECKSUM(bp) BF64_GET((bp)->blk_prop, 40, 8) -#define BP_SET_CHECKSUM(bp, x) BF64_SET((bp)->blk_prop, 40, 8, x) +#define BP_GET_CHECKSUM(bp) BF64_GET((bp)->blk_prop, 40, 8) +#define BP_SET_CHECKSUM(bp, x) BF64_SET((bp)->blk_prop, 40, 8, x) -#define BP_GET_TYPE(bp) BF64_GET((bp)->blk_prop, 48, 8) -#define BP_SET_TYPE(bp, x) BF64_SET((bp)->blk_prop, 48, 8, x) +#define BP_GET_TYPE(bp) BF64_GET((bp)->blk_prop, 48, 8) +#define BP_SET_TYPE(bp, x) BF64_SET((bp)->blk_prop, 48, 8, x) -#define BP_GET_LEVEL(bp) BF64_GET((bp)->blk_prop, 56, 5) -#define BP_SET_LEVEL(bp, x) BF64_SET((bp)->blk_prop, 56, 5, x) +#define BP_GET_LEVEL(bp) BF64_GET((bp)->blk_prop, 56, 5) +#define BP_SET_LEVEL(bp, x) BF64_SET((bp)->blk_prop, 56, 5, x) -#define BP_GET_BYTEORDER(bp) (0 - BF64_GET((bp)->blk_prop, 63, 1)) -#define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x) +#define BP_GET_PROP_BIT_61(bp) BF64_GET((bp)->blk_prop, 61, 1) +#define BP_SET_PROP_BIT_61(bp, x) BF64_SET((bp)->blk_prop, 61, 1, x) + +#define BP_GET_DEDUP(bp) BF64_GET((bp)->blk_prop, 62, 1) +#define BP_SET_DEDUP(bp, x) BF64_SET((bp)->blk_prop, 62, 1, x) + +#define BP_GET_BYTEORDER(bp) (0 - BF64_GET((bp)->blk_prop, 63, 1)) +#define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x) + +#define BP_PHYSICAL_BIRTH(bp) \ + ((bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth) + +#define BP_SET_BIRTH(bp, logical, physical) \ +{ \ + (bp)->blk_birth = (logical); \ + (bp)->blk_phys_birth = ((logical) == (physical) ? 
0 : (physical)); \ +} #define BP_GET_ASIZE(bp) \ (DVA_GET_ASIZE(&(bp)->blk_dva[0]) + DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \ @@ -255,6 +278,12 @@ typedef struct blkptr { ((dva1)->dva_word[1] == (dva2)->dva_word[1] && \ (dva1)->dva_word[0] == (dva2)->dva_word[0]) +#define BP_EQUAL(bp1, bp2) \ + (BP_PHYSICAL_BIRTH(bp1) == BP_PHYSICAL_BIRTH(bp2) && \ + DVA_EQUAL(&(bp1)->blk_dva[0], &(bp2)->blk_dva[0]) && \ + DVA_EQUAL(&(bp1)->blk_dva[1], &(bp2)->blk_dva[1]) && \ + DVA_EQUAL(&(bp1)->blk_dva[2], &(bp2)->blk_dva[2])) + #define ZIO_CHECKSUM_EQUAL(zc1, zc2) \ (0 == (((zc1).zc_word[0] - (zc2).zc_word[0]) | \ ((zc1).zc_word[1] - (zc2).zc_word[1]) | \ @@ -274,7 +303,10 @@ typedef struct blkptr { #define BP_IDENTITY(bp) (&(bp)->blk_dva[0]) #define BP_IS_GANG(bp) DVA_GET_GANG(BP_IDENTITY(bp)) #define BP_IS_HOLE(bp) ((bp)->blk_birth == 0) -#define BP_IS_OLDER(bp, txg) (!BP_IS_HOLE(bp) && (bp)->blk_birth < (txg)) + +/* BP_IS_RAIDZ(bp) assumes no block compression */ +#define BP_IS_RAIDZ(bp) (DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \ + BP_GET_PSIZE(bp)) #define BP_ZERO(bp) \ { \ @@ -287,14 +319,12 @@ typedef struct blkptr { (bp)->blk_prop = 0; \ (bp)->blk_pad[0] = 0; \ (bp)->blk_pad[1] = 0; \ - (bp)->blk_pad[2] = 0; \ + (bp)->blk_phys_birth = 0; \ (bp)->blk_birth = 0; \ (bp)->blk_fill = 0; \ ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \ } -#define BLK_FILL_ALREADY_FREED (-1ULL) - /* * Note: the byteorder is either 0 or -1, both of which are palindromes. * This simplifies the endianness handling a bit. @@ -309,30 +339,92 @@ typedef struct blkptr { #define BP_SPRINTF_LEN 320 +/* + * This macro allows code sharing between zfs, libzpool, and mdb. + * 'func' is either snprintf() or mdb_snprintf(). + * 'ws' (whitespace) can be ' ' for single-line format, '\n' for multi-line. + */ +#define SPRINTF_BLKPTR(func, ws, buf, bp, type, checksum, compress) \ +{ \ + static const char *copyname[] = \ + { "zero", "single", "double", "triple" }; \ + int size = BP_SPRINTF_LEN; \ + int len = 0; \ + int copies = 0; \ + \ + if (bp == NULL) { \ + len = func(buf + len, size - len, ""); \ + } else if (BP_IS_HOLE(bp)) { \ + len = func(buf + len, size - len, ""); \ + } else { \ + for (int d = 0; d < BP_GET_NDVAS(bp); d++) { \ + const dva_t *dva = &bp->blk_dva[d]; \ + if (DVA_IS_VALID(dva)) \ + copies++; \ + len += func(buf + len, size - len, \ + "DVA[%d]=<%llu:%llx:%llx>%c", d, \ + (u_longlong_t)DVA_GET_VDEV(dva), \ + (u_longlong_t)DVA_GET_OFFSET(dva), \ + (u_longlong_t)DVA_GET_ASIZE(dva), \ + ws); \ + } \ + if (BP_IS_GANG(bp) && \ + DVA_GET_ASIZE(&bp->blk_dva[2]) <= \ + DVA_GET_ASIZE(&bp->blk_dva[1]) / 2) \ + copies--; \ + len += func(buf + len, size - len, \ + "[L%llu %s] %s %s %s %s %s %s%c" \ + "size=%llxL/%llxP birth=%lluL/%lluP fill=%llu%c" \ + "cksum=%llx:%llx:%llx:%llx", \ + (u_longlong_t)BP_GET_LEVEL(bp), \ + type, \ + checksum, \ + compress, \ + BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE", \ + BP_IS_GANG(bp) ? "gang" : "contiguous", \ + BP_GET_DEDUP(bp) ? "dedup" : "unique", \ + copyname[copies], \ + ws, \ + (u_longlong_t)BP_GET_LSIZE(bp), \ + (u_longlong_t)BP_GET_PSIZE(bp), \ + (u_longlong_t)bp->blk_birth, \ + (u_longlong_t)BP_PHYSICAL_BIRTH(bp), \ + (u_longlong_t)bp->blk_fill, \ + ws, \ + (u_longlong_t)bp->blk_cksum.zc_word[0], \ + (u_longlong_t)bp->blk_cksum.zc_word[1], \ + (u_longlong_t)bp->blk_cksum.zc_word[2], \ + (u_longlong_t)bp->blk_cksum.zc_word[3]); \ + } \ + ASSERT(len < size); \ +} + #include #define BP_GET_BUFC_TYPE(bp) \ (((BP_GET_LEVEL(bp) > 0) || (dmu_ot[BP_GET_TYPE(bp)].ot_metadata)) ? 
\ ARC_BUFC_METADATA : ARC_BUFC_DATA); -/* - * Routines found in spa.c - */ + +typedef enum spa_import_type { + SPA_IMPORT_EXISTING, + SPA_IMPORT_ASSEMBLE +} spa_import_type_t; /* state manipulation functions */ extern int spa_open(const char *pool, spa_t **, void *tag); +extern int spa_open_rewind(const char *pool, spa_t **, void *tag, + nvlist_t *policy, nvlist_t **config); extern int spa_get_stats(const char *pool, nvlist_t **config, char *altroot, size_t buflen); extern int spa_create(const char *pool, nvlist_t *config, nvlist_t *props, const char *history_str, nvlist_t *zplprops); -extern int spa_check_rootconf(char *devpath, char *devid, - nvlist_t **bestconf, uint64_t *besttxg); -extern boolean_t spa_rootdev_validate(nvlist_t *nv); extern int spa_import_rootpool(char *devpath, char *devid); extern int spa_import(const char *pool, nvlist_t *config, nvlist_t *props); -extern int spa_import_faulted(const char *, nvlist_t *, nvlist_t *); +extern int spa_import_verbatim(const char *, nvlist_t *, nvlist_t *); extern nvlist_t *spa_tryimport(nvlist_t *tryconfig); extern int spa_destroy(char *pool); -extern int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force); +extern int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, + boolean_t hardforce); extern int spa_reset(char *pool); extern void spa_async_request(spa_t *spa, int flag); extern void spa_async_unrequest(spa_t *spa, int flag); @@ -346,14 +438,19 @@ extern void spa_inject_delref(spa_t *spa); #define SPA_ASYNC_PROBE 0x04 #define SPA_ASYNC_RESILVER_DONE 0x08 #define SPA_ASYNC_RESILVER 0x10 +#define SPA_ASYNC_AUTOEXPAND 0x20 /* device manipulation */ extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot); extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing); -extern int spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done); +extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, + int replace_done); extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare); extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath); +extern int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru); +extern int spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, + nvlist_t *props, boolean_t exp); /* spare state (which is global across all pools) */ extern void spa_spare_add(vdev_t *vd); @@ -367,7 +464,6 @@ extern void spa_l2cache_remove(vdev_t *vd); extern boolean_t spa_l2cache_exists(uint64_t guid, uint64_t *pool); extern void spa_l2cache_activate(vdev_t *vd); extern void spa_l2cache_drop(spa_t *spa); -extern void spa_l2cache_space_update(vdev_t *vd, int64_t space, int64_t alloc); /* scrubbing */ extern int spa_scrub(spa_t *spa, pool_scrub_type_t type); @@ -376,6 +472,10 @@ extern int spa_scrub(spa_t *spa, pool_scrub_type_t type); extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */ extern void spa_sync_allpools(void); +#define SYNC_PASS_DEFERRED_FREE 1 /* defer frees after this pass */ +#define SYNC_PASS_DONT_COMPRESS 4 /* don't compress after this pass */ +#define SYNC_PASS_REWRITE 1 /* rewrite new bps after this pass */ + /* spa namespace global mutex */ extern kmutex_t spa_namespace_lock; @@ -393,7 +493,6 @@ extern void spa_config_set(spa_t *spa, nvlist_t *config); extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats); extern void spa_config_update(spa_t *spa, int what); -extern void spa_config_update_common(spa_t *spa, int what, boolean_t isroot); /* * Miscellaneous 
SPA routines in spa_misc.c @@ -401,7 +500,7 @@ extern void spa_config_update_common(spa_t *spa, int what, boolean_t isroot); /* Namespace manipulation */ extern spa_t *spa_lookup(const char *name); -extern spa_t *spa_add(const char *name, const char *altroot); +extern spa_t *spa_add(const char *name, nvlist_t *config, const char *altroot); extern void spa_remove(spa_t *spa); extern spa_t *spa_next(spa_t *prev); @@ -410,6 +509,7 @@ extern void spa_open_ref(spa_t *spa, void *tag); extern void spa_close(spa_t *spa, void *tag); extern boolean_t spa_refcount_zero(spa_t *spa); +#define SCL_NONE 0x00 #define SCL_CONFIG 0x01 #define SCL_STATE 0x02 #define SCL_L2ARC 0x04 /* hack until L2ARC 2.0 */ @@ -429,12 +529,30 @@ extern int spa_config_held(spa_t *spa, int locks, krw_t rw); /* Pool vdev add/remove lock */ extern uint64_t spa_vdev_enter(spa_t *spa); +extern uint64_t spa_vdev_config_enter(spa_t *spa); +extern void spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, + int error, char *tag); extern int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error); /* Pool vdev state change lock */ -extern void spa_vdev_state_enter(spa_t *spa); +extern void spa_vdev_state_enter(spa_t *spa, int oplock); extern int spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error); +/* Log state */ +typedef enum spa_log_state { + SPA_LOG_UNKNOWN = 0, /* unknown log state */ + SPA_LOG_MISSING, /* missing log(s) */ + SPA_LOG_CLEAR, /* clear the log(s) */ + SPA_LOG_GOOD, /* log(s) are good */ +} spa_log_state_t; + +extern spa_log_state_t spa_get_log_state(spa_t *spa); +extern void spa_set_log_state(spa_t *spa, spa_log_state_t state); +extern int spa_offline_log(spa_t *spa); + +/* Log claim callback */ +extern void spa_claim_notify(zio_t *zio); + /* Accessor functions */ extern boolean_t spa_shutting_down(spa_t *spa); extern struct dsl_pool *spa_get_dsl(spa_t *spa); @@ -446,18 +564,26 @@ extern char *spa_name(spa_t *spa); extern uint64_t spa_guid(spa_t *spa); extern uint64_t spa_last_synced_txg(spa_t *spa); extern uint64_t spa_first_txg(spa_t *spa); +extern uint64_t spa_syncing_txg(spa_t *spa); extern uint64_t spa_version(spa_t *spa); extern pool_state_t spa_state(spa_t *spa); +extern spa_load_state_t spa_load_state(spa_t *spa); extern uint64_t spa_freeze_txg(spa_t *spa); -extern uint64_t spa_get_alloc(spa_t *spa); -extern uint64_t spa_get_space(spa_t *spa); -extern uint64_t spa_get_dspace(spa_t *spa); extern uint64_t spa_get_asize(spa_t *spa, uint64_t lsize); +extern uint64_t spa_get_dspace(spa_t *spa); +extern void spa_update_dspace(spa_t *spa); extern uint64_t spa_version(spa_t *spa); +extern boolean_t spa_deflate(spa_t *spa); +extern metaslab_class_t *spa_normal_class(spa_t *spa); +extern metaslab_class_t *spa_log_class(spa_t *spa); extern int spa_max_replication(spa_t *spa); extern int spa_busy(void); extern uint8_t spa_get_failmode(spa_t *spa); extern boolean_t spa_suspended(spa_t *spa); +extern uint64_t spa_bootfs(spa_t *spa); +extern uint64_t spa_delegation(spa_t *spa); +extern objset_t *spa_meta_objset(spa_t *spa); +extern enum zio_checksum spa_dedup_checksum(spa_t *spa); /* Miscellaneous support routines */ extern int spa_rename(const char *oldname, const char *newname); @@ -465,16 +591,24 @@ extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid); extern char *spa_strdup(const char *); extern void spa_strfree(char *); extern uint64_t spa_get_random(uint64_t range); -extern void sprintf_blkptr(char *buf, int len, const blkptr_t *bp); +extern uint64_t spa_generate_guid(spa_t 
*spa); +extern void sprintf_blkptr(char *buf, const blkptr_t *bp); extern void spa_freeze(spa_t *spa); extern void spa_upgrade(spa_t *spa, uint64_t version); extern void spa_evict_all(void); extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t l2cache); extern boolean_t spa_has_spare(spa_t *, uint64_t guid); -extern uint64_t bp_get_dasize(spa_t *spa, const blkptr_t *bp); +extern uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva); +extern uint64_t bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp); +extern uint64_t bp_get_dsize(spa_t *spa, const blkptr_t *bp); extern boolean_t spa_has_slogs(spa_t *spa); extern boolean_t spa_is_root(spa_t *spa); +extern boolean_t spa_writeable(spa_t *spa); +extern void spa_rewind_data_to_nvlist(spa_t *spa, nvlist_t *to); + +extern int spa_mode(spa_t *spa); +extern uint64_t strtonum(const char *str, char **nptr); /* history logging */ typedef enum history_log_type { @@ -497,16 +631,17 @@ extern int spa_history_get(spa_t *spa, uint64_t *offset, uint64_t *len_read, char *his_buf); extern int spa_history_log(spa_t *spa, const char *his_buf, history_log_type_t what); -void spa_history_internal_log(history_internal_events_t event, spa_t *spa, - dmu_tx_t *tx, cred_t *cr, const char *fmt, ...); +extern void spa_history_internal_log(history_internal_events_t event, + spa_t *spa, dmu_tx_t *tx, cred_t *cr, const char *fmt, ...); +extern void spa_history_log_version(spa_t *spa, history_internal_events_t evt); /* error handling */ struct zbookmark; -struct zio; -extern void spa_log_error(spa_t *spa, struct zio *zio); +extern void spa_log_error(spa_t *spa, zio_t *zio); extern void zfs_ereport_post(const char *class, spa_t *spa, vdev_t *vd, - struct zio *zio, uint64_t stateoroffset, uint64_t length); + zio_t *zio, uint64_t stateoroffset, uint64_t length); extern void zfs_post_remove(spa_t *spa, vdev_t *vd); +extern void zfs_post_state_change(spa_t *spa, vdev_t *vd); extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd); extern uint64_t spa_get_errlog_size(spa_t *spa); extern int spa_get_errlog(spa_t *spa, void *uaddr, size_t *count); @@ -528,6 +663,7 @@ extern void spa_boot_init(); extern int spa_prop_set(spa_t *spa, nvlist_t *nvp); extern int spa_prop_get(spa_t *spa, nvlist_t **nvp); extern void spa_prop_clear_bootfs(spa_t *spa, uint64_t obj, dmu_tx_t *tx); +extern void spa_configfile_set(spa_t *, nvlist_t *, boolean_t); /* asynchronous event notification */ extern void spa_event_notify(spa_t *spa, vdev_t *vdev, const char *name); @@ -536,7 +672,7 @@ extern void spa_event_notify(spa_t *spa, vdev_t *vdev, const char *name); #define dprintf_bp(bp, fmt, ...) do { \ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \ - sprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, (bp)); \ + sprintf_blkptr(__blkbuf, (bp)); \ dprintf(fmt " %s\n", __VA_ARGS__, __blkbuf); \ kmem_free(__blkbuf, BP_SPRINTF_LEN); \ } \ @@ -545,7 +681,7 @@ _NOTE(CONSTCOND) } while (0) #define dprintf_bp(bp, fmt, ...) #endif -extern int spa_mode; /* mode, e.g. FREAD | FWRITE */ +extern int spa_mode_global; /* mode, e.g. FREAD | FWRITE */ #ifdef __cplusplus } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/spa_boot.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/spa_boot.h index b56073b97516b..1d3622f5a108b 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/spa_boot.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/spa_boot.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. 
All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_SPA_BOOT_H #define _SYS_SPA_BOOT_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #ifdef __cplusplus @@ -36,7 +34,6 @@ extern "C" { extern char *spa_get_bootprop(char *prop); extern void spa_free_bootprop(char *prop); -extern int spa_get_rootconf(char *devpath, char *devid, nvlist_t **bestconf_p); #ifdef __cplusplus } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/spa_impl.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/spa_impl.h index 8aeb414fe9de3..9daec092b4aad 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/spa_impl.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/spa_impl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -78,19 +78,33 @@ typedef struct spa_config_dirent { char *scd_path; } spa_config_dirent_t; -typedef enum spa_log_state { - SPA_LOG_UNKNOWN = 0, /* unknown log state */ - SPA_LOG_MISSING, /* missing log(s) */ - SPA_LOG_CLEAR, /* clear the log(s) */ - SPA_LOG_GOOD, /* log(s) are good */ -} spa_log_state_t; - enum zio_taskq_type { ZIO_TASKQ_ISSUE = 0, + ZIO_TASKQ_ISSUE_HIGH, ZIO_TASKQ_INTERRUPT, + ZIO_TASKQ_INTERRUPT_HIGH, ZIO_TASKQ_TYPES }; +/* + * State machine for the zpool-poolname process. The state transitions + * are done as follows: + * + * From To Routine + * PROC_NONE -> PROC_CREATED spa_activate() + * PROC_CREATED -> PROC_ACTIVE spa_thread() + * PROC_ACTIVE -> PROC_DEACTIVATE spa_deactivate() + * PROC_DEACTIVATE -> PROC_GONE spa_thread() + * PROC_GONE -> PROC_NONE spa_deactivate() + */ +typedef enum spa_proc_state { + SPA_PROC_NONE, /* spa_proc = &p0, no process created */ + SPA_PROC_CREATED, /* spa_activate() has proc, is waiting */ + SPA_PROC_ACTIVE, /* taskqs created, spa_proc set */ + SPA_PROC_DEACTIVATE, /* spa_deactivate() requests process exit */ + SPA_PROC_GONE /* spa_thread() is exiting, spa_proc = &p0 */ +} spa_proc_state_t; + struct spa { /* * Fields protected by spa_namespace_lock. @@ -99,12 +113,14 @@ struct spa { avl_node_t spa_avl; /* node in spa_namespace_avl */ nvlist_t *spa_config; /* last synced config */ nvlist_t *spa_config_syncing; /* currently syncing config */ + nvlist_t *spa_config_splitting; /* config for splitting */ uint64_t spa_config_txg; /* txg of last config change */ int spa_sync_pass; /* iterate-to-convergence */ pool_state_t spa_state; /* pool state */ int spa_inject_ref; /* injection references */ uint8_t spa_sync_on; /* sync threads are running */ spa_load_state_t spa_load_state; /* current load operation */ + boolean_t spa_load_verbatim; /* load the given config? 
*/ taskq_t *spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES]; dsl_pool_t *spa_dsl_pool; metaslab_class_t *spa_normal_class; /* normal data class */ @@ -112,6 +128,8 @@ struct spa { uint64_t spa_first_txg; /* first txg after spa_open() */ uint64_t spa_final_txg; /* txg of export/destroy */ uint64_t spa_freeze_txg; /* freeze pool at this txg */ + uint64_t spa_load_max_txg; /* best initial ub_txg */ + uint64_t spa_claim_max_txg; /* highest claimed birth txg */ objset_t *spa_meta_objset; /* copy of dp->dp_meta_objset */ txg_list_t spa_vdev_txg_list; /* per-txg dirty vdev list */ vdev_t *spa_root_vdev; /* top-level vdev container */ @@ -121,11 +139,14 @@ struct spa { spa_aux_vdev_t spa_spares; /* hot spares */ spa_aux_vdev_t spa_l2cache; /* L2ARC cache devices */ uint64_t spa_config_object; /* MOS object for pool config */ + uint64_t spa_config_generation; /* config generation number */ uint64_t spa_syncing_txg; /* txg currently syncing */ - uint64_t spa_sync_bplist_obj; /* object for deferred frees */ - bplist_t spa_sync_bplist; /* deferred-free bplist */ + uint64_t spa_deferred_bplist_obj; /* object for deferred frees */ + bplist_t spa_deferred_bplist; /* deferred-free bplist */ + bplist_t spa_free_bplist[TXG_SIZE]; /* bplist of stuff to free */ uberblock_t spa_ubsync; /* last synced uberblock */ uberblock_t spa_uberblock; /* current uberblock */ + boolean_t spa_extreme_rewind; /* rewind past deferred frees */ kmutex_t spa_scrub_lock; /* resilver/scrub lock */ uint64_t spa_scrub_inflight; /* in-flight scrub I/Os */ uint64_t spa_scrub_maxinflight; /* max in-flight scrub I/Os */ @@ -141,12 +162,16 @@ struct spa { int spa_async_suspended; /* async tasks suspended */ kcondvar_t spa_async_cv; /* wait for thread_exit() */ uint16_t spa_async_tasks; /* async task mask */ - kmutex_t spa_async_root_lock; /* protects async root count */ - uint64_t spa_async_root_count; /* number of async root zios */ - kcondvar_t spa_async_root_cv; /* notify when count == 0 */ char *spa_root; /* alternate root directory */ uint64_t spa_ena; /* spa-wide ereport ENA */ - boolean_t spa_last_open_failed; /* true if last open faled */ + int spa_last_open_failed; /* error if last open failed */ + uint64_t spa_last_ubsync_txg; /* "best" uberblock txg */ + uint64_t spa_last_ubsync_txg_ts; /* timestamp from that ub */ + uint64_t spa_load_txg; /* ub txg that loaded */ + uint64_t spa_load_txg_ts; /* timestamp from that ub */ + uint64_t spa_load_meta_errors; /* verify metadata err count */ + uint64_t spa_load_data_errors; /* verify data err count */ + uint64_t spa_verify_min_txg; /* start txg of verify scrub */ kmutex_t spa_errlog_lock; /* error log lock */ uint64_t spa_errlog_last; /* last error log object */ uint64_t spa_errlog_scrub; /* scrub error log object */ @@ -163,14 +188,31 @@ struct spa { uint64_t spa_failmode; /* failure mode for the pool */ uint64_t spa_delegation; /* delegation on/off */ list_t spa_config_list; /* previous cache file(s) */ + zio_t *spa_async_zio_root; /* root of all async I/O */ zio_t *spa_suspend_zio_root; /* root of all suspended I/O */ kmutex_t spa_suspend_lock; /* protects suspend_zio_root */ kcondvar_t spa_suspend_cv; /* notification of resume */ uint8_t spa_suspended; /* pool is suspended */ - boolean_t spa_import_faulted; /* allow faulted vdevs */ + uint8_t spa_claiming; /* pool is doing zil_claim() */ boolean_t spa_is_root; /* pool is root */ int spa_minref; /* num refs when first opened */ + int spa_mode; /* FREAD | FWRITE */ spa_log_state_t spa_log_state; /* log state */ + uint64_t spa_autoexpand; 
/* lun expansion on/off */ + ddt_t *spa_ddt[ZIO_CHECKSUM_FUNCTIONS]; /* in-core DDTs */ + uint64_t spa_ddt_stat_object; /* DDT statistics */ + uint64_t spa_dedup_ditto; /* dedup ditto threshold */ + uint64_t spa_dedup_checksum; /* default dedup checksum */ + uint64_t spa_dspace; /* dspace in normal class */ + kmutex_t spa_vdev_top_lock; /* dueling offline/remove */ + kmutex_t spa_proc_lock; /* protects spa_proc* */ + kcondvar_t spa_proc_cv; /* spa_proc_state transitions */ + spa_proc_state_t spa_proc_state; /* see definition */ + struct proc *spa_proc; /* "zpool-poolname" process */ + uint64_t spa_did; /* if procp != p0, did of t1 */ + boolean_t spa_autoreplace; /* autoreplace set in open */ + int spa_vdev_locks; /* locks grabbed */ + /* * spa_refcnt & spa_config_lock must be the last elements * because refcount_t changes size based on compilation options. @@ -183,12 +225,6 @@ struct spa { extern const char *spa_config_path; -#define BOOTFS_COMPRESS_VALID(compress) \ - ((compress) == ZIO_COMPRESS_LZJB || \ - ((compress) == ZIO_COMPRESS_ON && \ - ZIO_COMPRESS_ON_VALUE == ZIO_COMPRESS_LZJB) || \ - (compress) == ZIO_COMPRESS_OFF) - #ifdef __cplusplus } #endif diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/space_map.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/space_map.h index db9daef1f156f..6f935c9db27e5 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/space_map.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/space_map.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_SPACE_MAP_H #define _SYS_SPACE_MAP_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include @@ -48,16 +46,24 @@ typedef struct space_map { uint8_t sm_loading; /* map loading? 
*/ kcondvar_t sm_load_cv; /* map load completion */ space_map_ops_t *sm_ops; /* space map block picker ops vector */ + avl_tree_t *sm_pp_root; /* picker-private AVL tree */ void *sm_ppd; /* picker-private data */ kmutex_t *sm_lock; /* pointer to lock that protects map */ } space_map_t; typedef struct space_seg { avl_node_t ss_node; /* AVL node */ + avl_node_t ss_pp_node; /* AVL picker-private node */ uint64_t ss_start; /* starting offset of this segment */ uint64_t ss_end; /* ending offset (non-inclusive) */ } space_seg_t; +typedef struct space_ref { + avl_node_t sr_node; /* AVL node */ + uint64_t sr_offset; /* offset (start or end) */ + int64_t sr_refcnt; /* associated reference count */ +} space_ref_t; + typedef struct space_map_obj { uint64_t smo_object; /* on-disk space map object */ uint64_t smo_objsize; /* size of the object */ @@ -70,6 +76,8 @@ struct space_map_ops { uint64_t (*smop_alloc)(space_map_t *sm, uint64_t size); void (*smop_claim)(space_map_t *sm, uint64_t start, uint64_t size); void (*smop_free)(space_map_t *sm, uint64_t start, uint64_t size); + uint64_t (*smop_max)(space_map_t *sm); + boolean_t (*smop_fragmented)(space_map_t *sm); }; /* @@ -133,13 +141,12 @@ extern void space_map_create(space_map_t *sm, uint64_t start, uint64_t size, extern void space_map_destroy(space_map_t *sm); extern void space_map_add(space_map_t *sm, uint64_t start, uint64_t size); extern void space_map_remove(space_map_t *sm, uint64_t start, uint64_t size); -extern int space_map_contains(space_map_t *sm, uint64_t start, uint64_t size); +extern boolean_t space_map_contains(space_map_t *sm, + uint64_t start, uint64_t size); extern void space_map_vacate(space_map_t *sm, space_map_func_t *func, space_map_t *mdest); extern void space_map_walk(space_map_t *sm, space_map_func_t *func, space_map_t *mdest); -extern void space_map_excise(space_map_t *sm, uint64_t start, uint64_t size); -extern void space_map_union(space_map_t *smd, space_map_t *sms); extern void space_map_load_wait(space_map_t *sm); extern int space_map_load(space_map_t *sm, space_map_ops_t *ops, @@ -149,12 +156,22 @@ extern void space_map_unload(space_map_t *sm); extern uint64_t space_map_alloc(space_map_t *sm, uint64_t size); extern void space_map_claim(space_map_t *sm, uint64_t start, uint64_t size); extern void space_map_free(space_map_t *sm, uint64_t start, uint64_t size); +extern uint64_t space_map_maxsize(space_map_t *sm); extern void space_map_sync(space_map_t *sm, uint8_t maptype, space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx); extern void space_map_truncate(space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx); +extern void space_map_ref_create(avl_tree_t *t); +extern void space_map_ref_destroy(avl_tree_t *t); +extern void space_map_ref_add_seg(avl_tree_t *t, + uint64_t start, uint64_t end, int64_t refcnt); +extern void space_map_ref_add_map(avl_tree_t *t, + space_map_t *sm, int64_t refcnt); +extern void space_map_ref_generate_map(avl_tree_t *t, + space_map_t *sm, int64_t minref); + #ifdef __cplusplus } #endif diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/txg.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/txg.h index 23bdff211b4a4..6429a6bd8a499 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/txg.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/txg.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _SYS_TXG_H #define _SYS_TXG_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include @@ -41,6 +39,9 @@ extern "C" { #define TXG_INITIAL TXG_SIZE /* initial txg */ #define TXG_IDX (txg & TXG_MASK) +/* Number of txgs worth of frees we defer adding to in-core spacemaps */ +#define TXG_DEFER_SIZE 2 + #define TXG_WAIT 1ULL #define TXG_NOWAIT 2ULL @@ -71,8 +72,7 @@ extern void txg_sync_stop(struct dsl_pool *dp); extern uint64_t txg_hold_open(struct dsl_pool *dp, txg_handle_t *txghp); extern void txg_rele_to_quiesce(txg_handle_t *txghp); extern void txg_rele_to_sync(txg_handle_t *txghp); -extern void txg_suspend(struct dsl_pool *dp); -extern void txg_resume(struct dsl_pool *dp); +extern void txg_register_callbacks(txg_handle_t *txghp, list_t *tx_callbacks); /* * Delay the caller by the specified number of ticks or until diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/txg_impl.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/txg_impl.h index 7413c662b3555..7b356eac1293b 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/txg_impl.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/txg_impl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -37,13 +37,13 @@ struct tx_cpu { kmutex_t tc_lock; kcondvar_t tc_cv[TXG_SIZE]; uint64_t tc_count[TXG_SIZE]; + list_t tc_callbacks[TXG_SIZE]; /* commit cb list */ char tc_pad[16]; }; typedef struct tx_state { tx_cpu_t *tx_cpu; /* protects right to enter txg */ kmutex_t tx_sync_lock; /* protects tx_state_t */ - krwlock_t tx_suspend; uint64_t tx_open_txg; /* currently open txg id */ uint64_t tx_quiesced_txg; /* quiesced txg waiting for sync */ uint64_t tx_syncing_txg; /* currently syncing txg id */ @@ -64,6 +64,8 @@ typedef struct tx_state { kthread_t *tx_sync_thread; kthread_t *tx_quiesce_thread; + + taskq_t *tx_commit_cb_taskq; /* commit callback taskq */ } tx_state_t; #ifdef __cplusplus diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/uberblock.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/uberblock.h index 93d936ae4b18d..b5bb915731452 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/uberblock.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/uberblock.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,19 +19,16 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _SYS_UBERBLOCK_H #define _SYS_UBERBLOCK_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include -#include #ifdef __cplusplus extern "C" { diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/uberblock_impl.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/uberblock_impl.h index 55a0dd5aec0d0..c135df9b106b3 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/uberblock_impl.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/uberblock_impl.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_UBERBLOCK_IMPL_H #define _SYS_UBERBLOCK_IMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #ifdef __cplusplus diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/vdev.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/vdev.h index c070d6f3d623d..3bf5ba8042e3d 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/vdev.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/vdev.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -36,13 +36,22 @@ extern "C" { #endif +typedef enum vdev_dtl_type { + DTL_MISSING, /* 0% replication: no copies of the data */ + DTL_PARTIAL, /* less than 100% replication: some copies missing */ + DTL_SCRUB, /* unable to fully repair during scrub/resilver */ + DTL_OUTAGE, /* temporarily missing (used to attempt detach) */ + DTL_TYPES +} vdev_dtl_type_t; + extern boolean_t zfs_nocacheflush; extern int vdev_open(vdev_t *); +extern void vdev_open_children(vdev_t *); +extern boolean_t vdev_uses_zvols(vdev_t *); extern int vdev_validate(vdev_t *); extern void vdev_close(vdev_t *); extern int vdev_create(vdev_t *, uint64_t txg, boolean_t isreplace); -extern void vdev_init(vdev_t *, uint64_t txg); extern void vdev_reopen(vdev_t *); extern int vdev_validate_aux(vdev_t *vd); extern zio_t *vdev_probe(vdev_t *vd, zio_t *pio); @@ -50,33 +59,40 @@ extern zio_t *vdev_probe(vdev_t *vd, zio_t *pio); extern boolean_t vdev_is_bootable(vdev_t *vd); extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev); extern vdev_t *vdev_lookup_by_guid(vdev_t *vd, uint64_t guid); -extern void vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size); -extern int vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size); +extern void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t d, + uint64_t txg, uint64_t size); +extern boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t d, + uint64_t txg, uint64_t size); +extern boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t d); extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done); +extern boolean_t vdev_dtl_required(vdev_t *vd); extern boolean_t vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp); extern int vdev_metaslab_init(vdev_t *vd, uint64_t txg); extern void vdev_metaslab_fini(vdev_t *vd); +extern void vdev_metaslab_set_size(vdev_t *); +extern void vdev_expand(vdev_t *vd, uint64_t txg); +extern void vdev_split(vdev_t *vd); + extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs); extern void vdev_clear_stats(vdev_t *vd); extern void vdev_stat_update(zio_t *zio, uint64_t psize); extern void vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete); -extern int vdev_getspec(spa_t *spa, uint64_t vdev, 
char **vdev_spec); extern void vdev_propagate_state(vdev_t *vd); extern void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux); -extern void vdev_space_update(vdev_t *vd, int64_t space_delta, - int64_t alloc_delta, boolean_t update_root); +extern void vdev_space_update(vdev_t *vd, + int64_t alloc_delta, int64_t defer_delta, int64_t space_delta); extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize); -extern int vdev_fault(spa_t *spa, uint64_t guid); -extern int vdev_degrade(spa_t *spa, uint64_t guid); +extern int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux); +extern int vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux); extern int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *); extern int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags); @@ -101,11 +117,13 @@ extern void vdev_queue_io_done(zio_t *zio); extern void vdev_config_dirty(vdev_t *vd); extern void vdev_config_clean(vdev_t *vd); -extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg); +extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg, + boolean_t); extern void vdev_state_dirty(vdev_t *vd); extern void vdev_state_clean(vdev_t *vd); +extern void vdev_top_config_generate(spa_t *spa, nvlist_t *config); extern nvlist_t *vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, boolean_t isspare, boolean_t isl2cache); @@ -123,7 +141,8 @@ typedef enum { VDEV_LABEL_REPLACE, /* replace an existing device */ VDEV_LABEL_SPARE, /* add a new hot spare */ VDEV_LABEL_REMOVE, /* remove an existing device */ - VDEV_LABEL_L2CACHE /* add an L2ARC cache device */ + VDEV_LABEL_L2CACHE, /* add an L2ARC cache device */ + VDEV_LABEL_SPLIT /* generating new label for split-off dev */ } vdev_labeltype_t; extern int vdev_label_init(vdev_t *vd, uint64_t txg, vdev_labeltype_t reason); diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/vdev_impl.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/vdev_impl.h index 26904d089a3be..238b9610f5859 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/vdev_impl.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/vdev_impl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -112,7 +112,9 @@ struct vdev { uint64_t vdev_id; /* child number in vdev parent */ uint64_t vdev_guid; /* unique ID for this vdev */ uint64_t vdev_guid_sum; /* self guid + all child guids */ + uint64_t vdev_orig_guid; /* orig. guid prior to remove */ uint64_t vdev_asize; /* allocatable device capacity */ + uint64_t vdev_min_asize; /* min acceptable asize */ uint64_t vdev_ashift; /* block alignment shift */ uint64_t vdev_state; /* see VDEV_STATE_* #defines */ uint64_t vdev_prevstate; /* used when reopening a vdev */ @@ -123,9 +125,13 @@ struct vdev { vdev_t *vdev_parent; /* parent vdev */ vdev_t **vdev_child; /* array of children */ uint64_t vdev_children; /* number of children */ - space_map_t vdev_dtl_map; /* dirty time log in-core state */ - space_map_t vdev_dtl_scrub; /* DTL for scrub repair writes */ + space_map_t vdev_dtl[DTL_TYPES]; /* in-core dirty time logs */ vdev_stat_t vdev_stat; /* virtual device statistics */ + boolean_t vdev_expanding; /* expand the vdev? */ + boolean_t vdev_reopening; /* reopen in progress? 
*/ + int vdev_open_error; /* error on last open */ + kthread_t *vdev_open_thread; /* thread opening children */ + uint64_t vdev_crtxg; /* txg when top-level was added */ /* * Top-level vdev state. @@ -140,16 +146,18 @@ struct vdev { txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */ boolean_t vdev_remove_wanted; /* async remove wanted? */ boolean_t vdev_probe_wanted; /* async probe wanted? */ + boolean_t vdev_removing; /* device is being removed? */ list_node_t vdev_config_dirty_node; /* config dirty list */ list_node_t vdev_state_dirty_node; /* state dirty list */ uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */ uint64_t vdev_islog; /* is an intent log device */ + uint64_t vdev_ishole; /* is a hole in the namespace */ /* * Leaf vdev state. */ uint64_t vdev_psize; /* physical device capacity */ - space_map_obj_t vdev_dtl; /* dirty time log on-disk state */ + space_map_obj_t vdev_dtl_smo; /* dirty time log space map obj */ txg_node_t vdev_dtl_node; /* per-txg dirty DTL linkage */ uint64_t vdev_wholedisk; /* true if this is a whole disk */ uint64_t vdev_offline; /* persistent offline state */ @@ -160,12 +168,14 @@ struct vdev { char *vdev_path; /* vdev path (if any) */ char *vdev_devid; /* vdev devid (if any) */ char *vdev_physpath; /* vdev device path (if any) */ + char *vdev_fru; /* physical FRU location */ uint64_t vdev_not_present; /* not present during import */ uint64_t vdev_unspare; /* unspare when resilvering done */ hrtime_t vdev_last_try; /* last reopen time */ boolean_t vdev_nowritecache; /* true if flushwritecache failed */ boolean_t vdev_checkremove; /* temporary online test */ boolean_t vdev_forcefault; /* force online fault */ + boolean_t vdev_splitting; /* split or repair in progress */ uint8_t vdev_tmpoffline; /* device taken offline temporarily? */ uint8_t vdev_detached; /* device detached? 
*/ uint8_t vdev_cant_read; /* vdev is failing all reads */ @@ -176,6 +186,7 @@ struct vdev { vdev_cache_t vdev_cache; /* physical block cache */ spa_aux_vdev_t *vdev_aux; /* for l2cache vdevs */ zio_t *vdev_probe_zio; /* root of current probe */ + vdev_aux_t vdev_label_aux; /* on-disk aux state */ /* * For DTrace to work in userland (libzpool) context, these fields must @@ -189,8 +200,11 @@ struct vdev { kmutex_t vdev_probe_lock; /* protects vdev_probe_zio */ }; -#define VDEV_SKIP_SIZE (8 << 10) -#define VDEV_BOOT_HEADER_SIZE (8 << 10) +#define VDEV_RAIDZ_MAXPARITY 3 + +#define VDEV_PAD_SIZE (8 << 10) +/* 2 padding areas (vl_pad1 and vl_pad2) to skip */ +#define VDEV_SKIP_SIZE VDEV_PAD_SIZE * 2 #define VDEV_PHYS_SIZE (112 << 10) #define VDEV_UBERBLOCK_RING (128 << 10) @@ -202,26 +216,14 @@ struct vdev { offsetof(vdev_label_t, vl_uberblock[(n) << VDEV_UBERBLOCK_SHIFT(vd)]) #define VDEV_UBERBLOCK_SIZE(vd) (1ULL << VDEV_UBERBLOCK_SHIFT(vd)) -/* ZFS boot block */ -#define VDEV_BOOT_MAGIC 0x2f5b007b10cULL -#define VDEV_BOOT_VERSION 1 /* version number */ - -typedef struct vdev_boot_header { - uint64_t vb_magic; /* VDEV_BOOT_MAGIC */ - uint64_t vb_version; /* VDEV_BOOT_VERSION */ - uint64_t vb_offset; /* start offset (bytes) */ - uint64_t vb_size; /* size (bytes) */ - char vb_pad[VDEV_BOOT_HEADER_SIZE - 4 * sizeof (uint64_t)]; -} vdev_boot_header_t; - typedef struct vdev_phys { - char vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_block_tail_t)]; - zio_block_tail_t vp_zbt; + char vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_eck_t)]; + zio_eck_t vp_zbt; } vdev_phys_t; typedef struct vdev_label { - char vl_pad[VDEV_SKIP_SIZE]; /* 8K */ - vdev_boot_header_t vl_boot_header; /* 8K */ + char vl_pad1[VDEV_PAD_SIZE]; /* 8K */ + char vl_pad2[VDEV_PAD_SIZE]; /* 8K */ vdev_phys_t vl_vdev_phys; /* 112K */ char vl_uberblock[VDEV_UBERBLOCK_RING]; /* 128K */ } vdev_label_t; /* 256K total */ @@ -250,10 +252,14 @@ typedef struct vdev_label { #define VDEV_ALLOC_ADD 1 #define VDEV_ALLOC_SPARE 2 #define VDEV_ALLOC_L2CACHE 3 +#define VDEV_ALLOC_ROOTPOOL 4 +#define VDEV_ALLOC_SPLIT 5 /* * Allocate or free a vdev */ +extern vdev_t *vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, + vdev_ops_t *ops); extern int vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *config, vdev_t *parent, uint_t id, int alloctype); extern void vdev_free(vdev_t *vd); @@ -270,6 +276,7 @@ extern void vdev_remove_parent(vdev_t *cvd); /* * vdev sync load and sync */ +extern void vdev_load_log_state(vdev_t *nvd, vdev_t *ovd); extern void vdev_load(vdev_t *vd); extern void vdev_sync(vdev_t *vd, uint64_t txg); extern void vdev_sync_done(vdev_t *vd, uint64_t txg); @@ -285,13 +292,15 @@ extern vdev_ops_t vdev_raidz_ops; extern vdev_ops_t vdev_disk_ops; extern vdev_ops_t vdev_file_ops; extern vdev_ops_t vdev_missing_ops; +extern vdev_ops_t vdev_hole_ops; extern vdev_ops_t vdev_spare_ops; /* * Common size functions */ extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize); -extern uint64_t vdev_get_rsize(vdev_t *vd); +extern uint64_t vdev_get_min_asize(vdev_t *vd); +extern void vdev_set_min_asize(vdev_t *vd); /* * zdb uses this tunable, so it must be declared here to make lint happy. diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zap.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zap.h index f88cc068bd579..3b9de2a2f93a0 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zap.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zap.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. 
All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_ZAP_H #define _SYS_ZAP_H -#pragma ident "%Z%%M% %I% %E% SMI" - /* * ZAP - ZFS Attribute Processor * @@ -87,9 +85,6 @@ extern "C" { #endif -#define ZAP_MAXNAMELEN 256 -#define ZAP_MAXVALUELEN 1024 - /* * The matchtype specifies which entry will be accessed. * MT_EXACT: only find an exact match (non-normalized) @@ -106,6 +101,18 @@ typedef enum matchtype MT_FIRST } matchtype_t; +typedef enum zap_flags { + /* Use 64-bit hash value (serialized cursors will always use 64-bits) */ + ZAP_FLAG_HASH64 = 1 << 0, + /* Key is binary, not string (zap_add_uint64() can be used) */ + ZAP_FLAG_UINT64_KEY = 1 << 1, + /* + * First word of key (which must be an array of uint64) is + * already randomly distributed. + */ + ZAP_FLAG_PRE_HASHED_KEY = 1 << 2, +} zap_flags_t; + /* * Create a new zapobj with no attributes and return its object number. * MT_EXACT will cause the zap object to only support MT_EXACT lookups, @@ -123,6 +130,9 @@ uint64_t zap_create(objset_t *ds, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); uint64_t zap_create_norm(objset_t *ds, int normflags, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); +uint64_t zap_create_flags(objset_t *os, int normflags, zap_flags_t flags, + dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); /* * Create a new zapobj with no attributes from the given (unallocated) @@ -185,6 +195,12 @@ int zap_lookup_norm(objset_t *ds, uint64_t zapobj, const char *name, uint64_t integer_size, uint64_t num_integers, void *buf, matchtype_t mt, char *realname, int rn_len, boolean_t *normalization_conflictp); +int zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf); +int zap_contains(objset_t *ds, uint64_t zapobj, const char *name); + +int zap_count_write(objset_t *os, uint64_t zapobj, const char *name, + int add, uint64_t *towrite, uint64_t *tooverwrite); /* * Create an attribute with the given name and value. @@ -192,9 +208,12 @@ int zap_lookup_norm(objset_t *ds, uint64_t zapobj, const char *name, * If an attribute with the given name already exists, the call will * fail and return EEXIST. */ -int zap_add(objset_t *ds, uint64_t zapobj, const char *name, +int zap_add(objset_t *ds, uint64_t zapobj, const char *key, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); +int zap_add_uint64(objset_t *ds, uint64_t zapobj, const uint64_t *key, + int key_numints, int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx); /* * Set the attribute with the given name to the given value. 
If an @@ -206,6 +225,9 @@ int zap_add(objset_t *ds, uint64_t zapobj, const char *name, */ int zap_update(objset_t *ds, uint64_t zapobj, const char *name, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); +int zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, + int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); /* * Get the length (in integers) and the integer size of the specified @@ -216,6 +238,8 @@ int zap_update(objset_t *ds, uint64_t zapobj, const char *name, */ int zap_length(objset_t *ds, uint64_t zapobj, const char *name, uint64_t *integer_size, uint64_t *num_integers); +int zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, uint64_t *integer_size, uint64_t *num_integers); /* * Remove the specified attribute. @@ -226,6 +250,8 @@ int zap_length(objset_t *ds, uint64_t zapobj, const char *name, int zap_remove(objset_t *ds, uint64_t zapobj, const char *name, dmu_tx_t *tx); int zap_remove_norm(objset_t *ds, uint64_t zapobj, const char *name, matchtype_t mt, dmu_tx_t *tx); +int zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, dmu_tx_t *tx); /* * Returns (in *count) the number of attributes in the specified zap @@ -257,6 +283,8 @@ int zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx); int zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx); int zap_remove_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx); int zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value); +int zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta, + dmu_tx_t *tx); struct zap; struct zap_leaf; @@ -266,6 +294,7 @@ typedef struct zap_cursor { struct zap *zc_zap; struct zap_leaf *zc_leaf; uint64_t zc_zapobj; + uint64_t zc_serialized; uint64_t zc_hash; uint32_t zc_cd; } zap_cursor_t; @@ -316,6 +345,11 @@ void zap_cursor_advance(zap_cursor_t *zc); */ uint64_t zap_cursor_serialize(zap_cursor_t *zc); +/* + * Advance the cursor to the attribute having the given key. + */ +int zap_cursor_move_to_key(zap_cursor_t *zc, const char *name, matchtype_t mt); + /* * Initialize a zap cursor pointing to the position recorded by * zap_cursor_serialize (in the "serialized" argument). You can also diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zap_impl.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zap_impl.h index 0dc02ab6b0ac0..5aa0efc98d4f9 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zap_impl.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zap_impl.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_ZAP_IMPL_H #define _SYS_ZAP_IMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -42,13 +40,13 @@ extern int fzap_default_block_shift; #define FZAP_BLOCK_SHIFT(zap) ((zap)->zap_f.zap_block_shift) -#define ZAP_MAXCD (uint32_t)(-1) -#define ZAP_HASHBITS 28 #define MZAP_ENT_LEN 64 #define MZAP_NAME_LEN (MZAP_ENT_LEN - 8 - 4 - 2) #define MZAP_MAX_BLKSHIFT SPA_MAXBLOCKSHIFT #define MZAP_MAX_BLKSZ (1 << MZAP_MAX_BLKSHIFT) +#define ZAP_NEED_CD (-1U) + typedef struct mzap_ent_phys { uint64_t mze_value; uint32_t mze_cd; @@ -72,7 +70,6 @@ typedef struct mzap_ent { mzap_ent_phys_t mze_phys; } mzap_ent_t; - /* * The (fat) zap is stored in one object. 
It is an array of * 1<> (64 - (n)))) @@ -195,6 +199,8 @@ int fzap_count(zap_t *zap, uint64_t *count); int fzap_lookup(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, void *buf, char *realname, int rn_len, boolean_t *normalization_conflictp); +int fzap_count_write(zap_name_t *zn, int add, uint64_t *towrite, + uint64_t *tooverwrite); int fzap_add(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); int fzap_update(zap_name_t *zn, @@ -209,7 +215,8 @@ void zap_put_leaf(struct zap_leaf *l); int fzap_add_cd(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, const void *val, uint32_t cd, dmu_tx_t *tx); -void fzap_upgrade(zap_t *zap, dmu_tx_t *tx); +void fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags); +int fzap_cursor_move_to_key(zap_cursor_t *zc, zap_name_t *zn); #ifdef __cplusplus } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zap_leaf.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zap_leaf.h index 14144e059e540..173b6b195e19f 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zap_leaf.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zap_leaf.h @@ -19,20 +19,20 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_ZAP_LEAF_H #define _SYS_ZAP_LEAF_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif struct zap; +struct zap_name; +struct zap_stats; #define ZAP_LEAF_MAGIC 0x2AB1EAF @@ -129,12 +129,12 @@ typedef struct zap_leaf_phys { typedef union zap_leaf_chunk { struct zap_leaf_entry { uint8_t le_type; /* always ZAP_CHUNK_ENTRY */ - uint8_t le_int_size; /* size of ints */ + uint8_t le_value_intlen; /* size of value's ints */ uint16_t le_next; /* next entry in hash chain */ uint16_t le_name_chunk; /* first chunk of the name */ - uint16_t le_name_length; /* bytes in name, incl null */ + uint16_t le_name_numints; /* ints in name (incl null) */ uint16_t le_value_chunk; /* first chunk of the value */ - uint16_t le_value_length; /* value length in ints */ + uint16_t le_value_numints; /* value length in ints */ uint32_t le_cd; /* collision differentiator */ uint64_t le_hash; /* hash value of the name */ } l_entry; @@ -177,7 +177,7 @@ typedef struct zap_entry_handle { * value must equal zap_hash(name). */ extern int zap_leaf_lookup(zap_leaf_t *l, - zap_name_t *zn, zap_entry_handle_t *zeh); + struct zap_name *zn, zap_entry_handle_t *zeh); /* * Return a handle to the entry with this hash+cd, or the entry with the @@ -193,10 +193,10 @@ extern int zap_leaf_lookup_closest(zap_leaf_t *l, * num_integers in the attribute. */ extern int zap_entry_read(const zap_entry_handle_t *zeh, - uint8_t integer_size, uint64_t num_integers, void *buf); + uint8_t integer_size, uint64_t num_integers, void *buf); -extern int zap_entry_read_name(const zap_entry_handle_t *zeh, - uint16_t buflen, char *buf); +extern int zap_entry_read_name(struct zap *zap, const zap_entry_handle_t *zeh, + uint16_t buflen, char *buf); /* * Replace the value of an existing entry. @@ -204,7 +204,7 @@ extern int zap_entry_read_name(const zap_entry_handle_t *zeh, * zap_entry_update may fail if it runs out of space (ENOSPC). */ extern int zap_entry_update(zap_entry_handle_t *zeh, - uint8_t integer_size, uint64_t num_integers, const void *buf); + uint8_t integer_size, uint64_t num_integers, const void *buf); /* * Remove an entry. 
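The zap.h hunks above add ZAP objects keyed by raw arrays of uint64_t rather than by strings: zap_create_flags() together with the new zap_flags_t bits, and the zap_add_uint64()/zap_lookup_uint64() family. A minimal sketch of a caller follows, assuming an already-open objset and an assigned transaction; the object-type argument and the 12-bit leaf/indirect block shifts are illustrative placeholders, not values taken from this patch.

#include <sys/dmu.h>
#include <sys/zap.h>

static int
zap_uint64_key_example(objset_t *os, dmu_object_type_t ot, dmu_tx_t *tx)
{
	/* Binary 64-bit keys require zap_create_flags(), not zap_create(). */
	zap_flags_t flags = ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY |
	    ZAP_FLAG_PRE_HASHED_KEY;
	uint64_t key[4] = { 0 };	/* e.g. a 256-bit checksum */
	uint64_t val = 42;
	uint64_t obj;
	int err;

	obj = zap_create_flags(os, 0, flags, ot,
	    12 /* leaf blockshift */, 12 /* indirect blockshift */,
	    DMU_OT_NONE, 0, tx);

	/* Store one 8-byte integer under the four-word key ... */
	err = zap_add_uint64(os, obj, key, 4, sizeof (uint64_t), 1, &val, tx);
	if (err != 0)
		return (err);

	/* ... and read it back. */
	return (zap_lookup_uint64(os, obj, key, 4, sizeof (uint64_t), 1, &val));
}

Per the flag comment above, ZAP_FLAG_PRE_HASHED_KEY is only appropriate when the first word of the key is already uniformly distributed (a checksum, for instance); otherwise it should be omitted.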
@@ -216,17 +216,16 @@ extern void zap_entry_remove(zap_entry_handle_t *zeh); * belong in this leaf (according to its hash value). Fills in the * entry handle on success. Returns 0 on success or ENOSPC on failure. */ -extern int zap_entry_create(zap_leaf_t *l, - const char *name, uint64_t h, uint32_t cd, - uint8_t integer_size, uint64_t num_integers, const void *buf, - zap_entry_handle_t *zeh); +extern int zap_entry_create(zap_leaf_t *l, struct zap_name *zn, uint32_t cd, + uint8_t integer_size, uint64_t num_integers, const void *buf, + zap_entry_handle_t *zeh); /* * Return true if there are additional entries with the same normalized * form. */ extern boolean_t zap_entry_normalization_conflict(zap_entry_handle_t *zeh, - zap_name_t *zn, const char *name, zap_t *zap); + struct zap_name *zn, const char *name, struct zap *zap); /* * Other stuff. @@ -235,7 +234,8 @@ extern boolean_t zap_entry_normalization_conflict(zap_entry_handle_t *zeh, extern void zap_leaf_init(zap_leaf_t *l, boolean_t sort); extern void zap_leaf_byteswap(zap_leaf_phys_t *buf, int len); extern void zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort); -extern void zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs); +extern void zap_leaf_stats(struct zap *zap, zap_leaf_t *l, + struct zap_stats *zs); #ifdef __cplusplus } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_acl.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_acl.h index bd91b33d16886..3488962e216f0 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_acl.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_acl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -114,8 +114,6 @@ typedef struct zfs_acl_phys { uint8_t z_ace_data[ZFS_ACE_SPACE]; /* space for embedded ACEs */ } zfs_acl_phys_t; - - typedef struct acl_ops { uint32_t (*ace_mask_get) (void *acep); /* get access mask */ void (*ace_mask_set) (void *acep, @@ -161,12 +159,21 @@ typedef struct zfs_acl { zfs_acl_node_t *z_curr_node; /* current node iterator is handling */ list_t z_acl; /* chunks of ACE data */ acl_ops_t z_ops; /* ACL operations */ - boolean_t z_has_fuids; /* FUIDs present in ACL? */ } zfs_acl_t; #define ACL_DATA_ALLOCED 0x1 #define ZFS_ACL_SIZE(aclcnt) (sizeof (ace_t) * (aclcnt)) +struct zfs_fuid_info; + +typedef struct zfs_acl_ids { + uint64_t z_fuid; /* file owner fuid */ + uint64_t z_fgid; /* file group owner fuid */ + uint64_t z_mode; /* mode to set on create */ + zfs_acl_t *z_aclp; /* ACL to create with file */ + struct zfs_fuid_info *z_fuidp; /* for tracking fuids for log */ +} zfs_acl_ids_t; + /* * Property values for acl_mode and acl_inherit. 
* @@ -183,17 +190,20 @@ typedef struct zfs_acl { struct znode; struct zfsvfs; -struct zfs_fuid_info; #ifdef _KERNEL -void zfs_perm_init(struct znode *, struct znode *, int, vattr_t *, - dmu_tx_t *, cred_t *, zfs_acl_t *, zfs_fuid_info_t **); +int zfs_acl_ids_create(struct znode *, int, vattr_t *, + cred_t *, vsecattr_t *, zfs_acl_ids_t *); +void zfs_acl_ids_free(zfs_acl_ids_t *); +boolean_t zfs_acl_ids_overquota(struct zfsvfs *, zfs_acl_ids_t *); int zfs_getacl(struct znode *, vsecattr_t *, boolean_t, cred_t *); int zfs_setacl(struct znode *, vsecattr_t *, boolean_t, cred_t *); void zfs_acl_rele(void *); void zfs_oldace_byteswap(ace_t *, int); void zfs_ace_byteswap(void *, size_t, boolean_t); +extern boolean_t zfs_has_access(struct znode *zp, cred_t *cr); extern int zfs_zaccess(struct znode *, int, int, boolean_t, cred_t *); +int zfs_fastaccesschk_execute(struct znode *, cred_t *); extern int zfs_zaccess_rwx(struct znode *, mode_t, int, cred_t *); extern int zfs_zaccess_unix(struct znode *, mode_t, cred_t *); extern int zfs_acl_access(struct znode *, int, cred_t *); @@ -202,9 +212,9 @@ int zfs_zaccess_delete(struct znode *, struct znode *, cred_t *); int zfs_zaccess_rename(struct znode *, struct znode *, struct znode *, struct znode *, cred_t *cr); void zfs_acl_free(zfs_acl_t *); -int zfs_vsec_2_aclp(struct zfsvfs *, vtype_t, vsecattr_t *, zfs_acl_t **); -int zfs_aclset_common(struct znode *, zfs_acl_t *, cred_t *, - struct zfs_fuid_info **, dmu_tx_t *); +int zfs_vsec_2_aclp(struct zfsvfs *, vtype_t, vsecattr_t *, cred_t *, + struct zfs_fuid_info **, zfs_acl_t **); +int zfs_aclset_common(struct znode *, zfs_acl_t *, cred_t *, dmu_tx_t *); #endif diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_context.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_context.h index a5be3e1303db2..558e9e1884e37 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_context.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_context.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_ZFS_CONTEXT_H #define _SYS_ZFS_CONTEXT_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -62,7 +60,9 @@ extern "C" { #include #include #include +#include #include +#include #define CPU_SEQID (CPU->cpu_seqid) diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_ctldir.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_ctldir.h index ce29625d1e3ad..c15c946d5dc17 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_ctldir.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_ctldir.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _ZFS_CTLDIR_H #define _ZFS_CTLDIR_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -66,6 +64,7 @@ int zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp); #define ZFSCTL_INO_ROOT 0x1 #define ZFSCTL_INO_SNAPDIR 0x2 +#define ZFSCTL_INO_SHARES 0x3 #ifdef __cplusplus } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_dir.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_dir.h index ebb66e8ae4e90..f050f7f24de44 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_dir.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_dir.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_FS_ZFS_DIR_H #define _SYS_FS_ZFS_DIR_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -44,11 +42,11 @@ extern "C" { #define ZRENAMING 0x0010 /* znode is being renamed */ #define ZCILOOK 0x0020 /* case-insensitive lookup requested */ #define ZCIEXACT 0x0040 /* c-i requires c-s match (rename) */ +#define ZHAVELOCK 0x0080 /* z_name_lock is already held */ /* mknode flags */ #define IS_ROOT_NODE 0x01 /* create a root node */ #define IS_XATTR 0x02 /* create an extended attribute node */ -#define IS_REPLAY 0x04 /* we are replaying intent log */ extern int zfs_dirent_lock(zfs_dirlock_t **, znode_t *, char *, znode_t **, int, int *, pathname_t *); @@ -59,7 +57,7 @@ extern int zfs_link_destroy(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int, extern int zfs_dirlook(znode_t *, char *, vnode_t **, int, int *, pathname_t *); extern void zfs_mknode(znode_t *, vattr_t *, dmu_tx_t *, cred_t *, - uint_t, znode_t **, int, zfs_acl_t *, zfs_fuid_info_t **); + uint_t, znode_t **, int, zfs_acl_ids_t *); extern void zfs_rmnode(znode_t *); extern void zfs_dl_name_switch(zfs_dirlock_t *dl, char *new, char **old); extern boolean_t zfs_dirempty(znode_t *); diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_fuid.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_fuid.h index 810ffc81a8ccb..0feb3ce4bb7ca 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_fuid.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_fuid.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_FS_ZFS_FUID_H #define _SYS_FS_ZFS_FUID_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef _KERNEL #include #include @@ -51,11 +49,11 @@ typedef enum { * Estimate space needed for one more fuid table entry. 
* for now assume its current size + 1K */ -#define FUID_SIZE_ESTIMATE(z) (z->z_fuid_size + (SPA_MINBLOCKSIZE << 1)) +#define FUID_SIZE_ESTIMATE(z) ((z)->z_fuid_size + (SPA_MINBLOCKSIZE << 1)) -#define FUID_INDEX(x) (x >> 32) -#define FUID_RID(x) (x & 0xffffffff) -#define FUID_ENCODE(idx, rid) ((idx << 32) | rid) +#define FUID_INDEX(x) ((x) >> 32) +#define FUID_RID(x) ((x) & 0xffffffff) +#define FUID_ENCODE(idx, rid) (((uint64_t)(idx) << 32) | (rid)) /* * FUIDs cause problems for the intent log * we need to replay the creation of the FUID, @@ -102,19 +100,27 @@ typedef struct zfs_fuid_info { #ifdef _KERNEL struct znode; extern uid_t zfs_fuid_map_id(zfsvfs_t *, uint64_t, cred_t *, zfs_fuid_type_t); +extern void zfs_fuid_node_add(zfs_fuid_info_t **, const char *, uint32_t, + uint64_t, uint64_t, zfs_fuid_type_t); extern void zfs_fuid_destroy(zfsvfs_t *); extern uint64_t zfs_fuid_create_cred(zfsvfs_t *, zfs_fuid_type_t, - dmu_tx_t *, cred_t *, zfs_fuid_info_t **); + cred_t *, zfs_fuid_info_t **); extern uint64_t zfs_fuid_create(zfsvfs_t *, uint64_t, cred_t *, zfs_fuid_type_t, - dmu_tx_t *, zfs_fuid_info_t **); -extern void zfs_fuid_map_ids(struct znode *zp, cred_t *cr, uid_t *uid, - uid_t *gid); + zfs_fuid_info_t **); +extern void zfs_fuid_map_ids(struct znode *zp, cred_t *cr, + uid_t *uid, uid_t *gid); extern zfs_fuid_info_t *zfs_fuid_info_alloc(void); -extern void zfs_fuid_info_free(); +extern void zfs_fuid_info_free(zfs_fuid_info_t *); extern boolean_t zfs_groupmember(zfsvfs_t *, uint64_t, cred_t *); +void zfs_fuid_sync(zfsvfs_t *, dmu_tx_t *); +extern int zfs_fuid_find_by_domain(zfsvfs_t *, const char *domain, + char **retdomain, boolean_t addok); +extern const char *zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx); +extern void zfs_fuid_txhold(zfsvfs_t *zfsvfs, dmu_tx_t *tx); #endif char *zfs_fuid_idx_domain(avl_tree_t *, uint32_t); +void zfs_fuid_avl_tree_create(avl_tree_t *, avl_tree_t *); uint64_t zfs_fuid_table_load(objset_t *, uint64_t, avl_tree_t *, avl_tree_t *); void zfs_fuid_table_destroy(avl_tree_t *, avl_tree_t *); diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_ioctl.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_ioctl.h index 1692608bb9ce6..90eecb812f23a 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_ioctl.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_ioctl.h @@ -19,19 +19,18 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_ZFS_IOCTL_H #define _SYS_ZFS_IOCTL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include #include +#include #ifdef _KERNEL #include @@ -47,26 +46,85 @@ extern "C" { #define ZFS_SNAPDIR_HIDDEN 0 #define ZFS_SNAPDIR_VISIBLE 1 -#define DMU_BACKUP_STREAM_VERSION (1ULL) -#define DMU_BACKUP_HEADER_VERSION (2ULL) +/* + * Field manipulation macros for the drr_versioninfo field of the + * send stream header. + */ + +/* + * Header types for zfs send streams. 
+ */ +typedef enum drr_headertype { + DMU_SUBSTREAM = 0x1, + DMU_COMPOUNDSTREAM = 0x2 +} drr_headertype_t; + +#define DMU_GET_STREAM_HDRTYPE(vi) BF64_GET((vi), 0, 2) +#define DMU_SET_STREAM_HDRTYPE(vi, x) BF64_SET((vi), 0, 2, x) + +#define DMU_GET_FEATUREFLAGS(vi) BF64_GET((vi), 2, 30) +#define DMU_SET_FEATUREFLAGS(vi, x) BF64_SET((vi), 2, 30, x) + +/* + * Feature flags for zfs send streams (flags in drr_versioninfo) + */ + +#define DMU_BACKUP_FEATURE_DEDUP (0x1) +#define DMU_BACKUP_FEATURE_DEDUPPROPS (0x2) + +/* + * Mask of all supported backup features + */ +#define DMU_BACKUP_FEATURE_MASK (DMU_BACKUP_FEATURE_DEDUP | \ + DMU_BACKUP_FEATURE_DEDUPPROPS) + +/* Are all features in the given flag word currently supported? */ +#define DMU_STREAM_SUPPORTED(x) (!((x) & ~DMU_BACKUP_FEATURE_MASK)) + +/* + * The drr_versioninfo field of the dmu_replay_record has the + * following layout: + * + * 64 56 48 40 32 24 16 8 0 + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * | reserved | feature-flags |C|S| + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * + * The low order two bits indicate the header type: SUBSTREAM (0x1) + * or COMPOUNDSTREAM (0x2). Using two bits for this is historical: + * this field used to be a version number, where the two version types + * were 1 and 2. Using two bits for this allows earlier versions of + * the code to be able to recognize send streams that don't use any + * of the features indicated by feature flags. + */ + #define DMU_BACKUP_MAGIC 0x2F5bacbacULL #define DRR_FLAG_CLONE (1<<0) #define DRR_FLAG_CI_DATA (1<<1) +/* + * flags in the drr_checksumflags field in the DRR_WRITE and + * DRR_WRITE_BYREF blocks + */ +#define DRR_CHECKSUM_DEDUP (1<<0) + +#define DRR_IS_DEDUP_CAPABLE(flags) ((flags) & DRR_CHECKSUM_DEDUP) + /* * zfs ioctl command structure */ typedef struct dmu_replay_record { enum { DRR_BEGIN, DRR_OBJECT, DRR_FREEOBJECTS, - DRR_WRITE, DRR_FREE, DRR_END, + DRR_WRITE, DRR_FREE, DRR_END, DRR_WRITE_BYREF, + DRR_NUMTYPES } drr_type; uint32_t drr_payloadlen; union { struct drr_begin { uint64_t drr_magic; - uint64_t drr_version; + uint64_t drr_versioninfo; /* was drr_version */ uint64_t drr_creation_time; dmu_objset_type_t drr_type; uint32_t drr_flags; @@ -76,6 +134,7 @@ typedef struct dmu_replay_record { } drr_begin; struct drr_end { zio_cksum_t drr_checksum; + uint64_t drr_toguid; } drr_end; struct drr_object { uint64_t drr_object; @@ -83,14 +142,16 @@ typedef struct dmu_replay_record { dmu_object_type_t drr_bonustype; uint32_t drr_blksz; uint32_t drr_bonuslen; - uint8_t drr_checksum; + uint8_t drr_checksumtype; uint8_t drr_compress; uint8_t drr_pad[6]; + uint64_t drr_toguid; /* bonus content follows */ } drr_object; struct drr_freeobjects { uint64_t drr_firstobj; uint64_t drr_numobjs; + uint64_t drr_toguid; } drr_freeobjects; struct drr_write { uint64_t drr_object; @@ -98,13 +159,35 @@ typedef struct dmu_replay_record { uint32_t drr_pad; uint64_t drr_offset; uint64_t drr_length; + uint64_t drr_toguid; + uint8_t drr_checksumtype; + uint8_t drr_checksumflags; + uint8_t drr_pad2[6]; + ddt_key_t drr_key; /* deduplication key */ /* content follows */ } drr_write; struct drr_free { uint64_t drr_object; uint64_t drr_offset; uint64_t drr_length; + uint64_t drr_toguid; } drr_free; + struct drr_write_byref { + /* where to put the data */ + uint64_t drr_object; + uint64_t drr_offset; + uint64_t drr_length; + uint64_t drr_toguid; + /* where to find the prior copy of the data */ + uint64_t drr_refguid; + uint64_t drr_refobject; 
+ uint64_t drr_refoffset; + /* properties of the data */ + uint8_t drr_checksumtype; + uint8_t drr_checksumflags; + uint8_t drr_pad2[6]; + ddt_key_t drr_key; /* deduplication key */ + } drr_write_byref; } drr_u; } dmu_replay_record_t; @@ -118,7 +201,11 @@ typedef struct zinject_record { uint32_t zi_error; uint64_t zi_type; uint32_t zi_freq; - uint32_t zi_pad; /* pad out to 64 bit alignment */ + uint32_t zi_failfast; + char zi_func[MAXNAMELEN]; + uint32_t zi_iotype; + int32_t zi_duration; + uint64_t zi_timer; } zinject_record_t; #define ZINJECT_NULL 0x1 @@ -148,6 +235,7 @@ typedef struct zfs_cmd { char zc_name[MAXPATHLEN]; char zc_value[MAXPATHLEN * 2]; char zc_string[MAXNAMELEN]; + char zc_top_ds[MAXPATHLEN]; uint64_t zc_guid; uint64_t zc_nvlist_conf; /* really (char *) */ uint64_t zc_nvlist_conf_size; @@ -162,15 +250,27 @@ typedef struct zfs_cmd { uint64_t zc_history_len; uint64_t zc_history_offset; uint64_t zc_obj; + uint64_t zc_iflags; /* internal to zfs(7fs) */ zfs_share_t zc_share; dmu_objset_stats_t zc_objset_stats; struct drr_begin zc_begin_record; zinject_record_t zc_inject_record; + boolean_t zc_defer_destroy; + boolean_t zc_temphold; } zfs_cmd_t; +typedef struct zfs_useracct { + char zu_domain[256]; + uid_t zu_rid; + uint32_t zu_pad; + uint64_t zu_space; +} zfs_useracct_t; + #define ZVOL_MAX_MINOR (1 << 16) #define ZFS_MIN_MINOR (ZVOL_MAX_MINOR + 1) +#define ZPOOL_EXPORT_AFTER_SPLIT 0x1 + #ifdef _KERNEL typedef struct zfs_creat { @@ -185,7 +285,7 @@ extern int zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr); extern int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr); extern int zfs_busy(void); -extern int zfs_unmount_snap(char *, void *); +extern int zfs_unmount_snap(const char *, void *); #endif /* _KERNEL */ diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_vfsops.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_vfsops.h index 87b75e6e75b5a..e961b756107b3 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_vfsops.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_vfsops.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_FS_ZFS_VFSOPS_H #define _SYS_FS_ZFS_VFSOPS_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -49,13 +47,13 @@ struct zfsvfs { uint64_t z_root; /* id of root znode */ uint64_t z_unlinkedobj; /* id of unlinked zapobj */ uint64_t z_max_blksz; /* maximum block size for files */ - uint64_t z_assign; /* TXG_NOWAIT or set by zil_replay() */ uint64_t z_fuid_obj; /* fuid table object number */ uint64_t z_fuid_size; /* fuid table size */ avl_tree_t z_fuid_idx; /* fuid tree keyed by index */ avl_tree_t z_fuid_domain; /* fuid tree keyed by domain */ krwlock_t z_fuid_lock; /* fuid lock */ boolean_t z_fuid_loaded; /* fuid tables are loaded */ + boolean_t z_fuid_dirty; /* need to sync fuid table ? 
*/ struct zfs_fuid_info *z_fuid_replay; /* fuid info for replay */ zilog_t *z_log; /* intent log pointer */ uint_t z_acl_mode; /* acl chmod/mode behavior */ @@ -74,8 +72,12 @@ struct zfsvfs { boolean_t z_issnap; /* true if this is a snapshot */ boolean_t z_vscan; /* virus scan on/off */ boolean_t z_use_fuids; /* version allows fuids */ - kmutex_t z_online_recv_lock; /* recv in prog grabs as WRITER */ + boolean_t z_replay; /* set during ZIL replay */ uint64_t z_version; /* ZPL version */ + uint64_t z_shares_dir; /* hidden shares dir */ + kmutex_t z_lock; + uint64_t z_userquota_obj; + uint64_t z_groupquota_obj; #define ZFS_OBJ_MTX_SZ 64 kmutex_t z_hold_mtx[ZFS_OBJ_MTX_SZ]; /* znode hold locks */ }; @@ -130,8 +132,20 @@ typedef struct zfid_long { extern uint_t zfs_fsyncer_key; -extern int zfs_suspend_fs(zfsvfs_t *zfsvfs, char *osname, int *mode); -extern int zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname, int mode); +extern int zfs_suspend_fs(zfsvfs_t *zfsvfs); +extern int zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname); +extern int zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, + const char *domain, uint64_t rid, uint64_t *valuep); +extern int zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, + uint64_t *cookiep, void *vbuf, uint64_t *bufsizep); +extern int zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, + const char *domain, uint64_t rid, uint64_t quota); +extern boolean_t zfs_usergroup_overquota(zfsvfs_t *zfsvfs, + boolean_t isgroup, uint64_t fuid); +extern int zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers); +extern int zfsvfs_create(const char *name, zfsvfs_t **zfvp); +extern void zfsvfs_free(zfsvfs_t *zfsvfs); +extern int zfs_check_global_label(const char *dsname, const char *hexsl); #ifdef __cplusplus } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_znode.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_znode.h index a5416525c7a37..a064627f157b6 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_znode.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_znode.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -57,6 +57,7 @@ extern "C" { #define ZFS_OPAQUE 0x0000010000000000 #define ZFS_AV_QUARANTINED 0x0000020000000000 #define ZFS_AV_MODIFIED 0x0000040000000000 +#define ZFS_REPARSE 0x0000080000000000 #define ZFS_ATTR_SET(zp, attr, value) \ { \ @@ -77,6 +78,7 @@ extern "C" { #define ZFS_ACL_DEFAULTED 0x20 /* ACL should be defaulted */ #define ZFS_ACL_AUTO_INHERIT 0x40 /* ACL should be inherited */ #define ZFS_BONUS_SCANSTAMP 0x80 /* Scanstamp in bonus area */ +#define ZFS_NO_EXECS_DENIED 0x100 /* exec was given to everyone */ /* * Is ID ephemeral? @@ -93,12 +95,15 @@ extern "C" { /* * Special attributes for master node. + * "userquota@" and "groupquota@" are also valid (from + * zfs_userquota_prop_prefixes[]). 
*/ #define ZFS_FSID "FSID" #define ZFS_UNLINKED_SET "DELETE_QUEUE" #define ZFS_ROOT_OBJ "ROOT" #define ZPL_VERSION_STR "VERSION" #define ZFS_FUID_TABLES "FUID" +#define ZFS_SHARES_DIR "SHARES" #define ZFS_MAX_BLOCKSIZE (SPA_MAXBLOCKSIZE) @@ -171,6 +176,7 @@ typedef struct znode_phys { typedef struct zfs_dirlock { char *dl_name; /* directory entry being locked */ uint32_t dl_sharecnt; /* 0 if exclusive, > 0 if shared */ + uint8_t dl_namelock; /* 1 if z_name_lock is NOT held */ uint16_t dl_namesize; /* set if dl_name was allocated */ kcondvar_t dl_cv; /* wait for entry to be unlocked */ struct znode *dl_dzp; /* directory znode */ @@ -182,7 +188,6 @@ typedef struct znode { vnode_t *z_vnode; uint64_t z_id; /* object ID for this znode */ kmutex_t z_lock; /* znode modification lock */ - krwlock_t z_map_lock; /* page map lock */ krwlock_t z_parent_lock; /* parent lock for directories */ krwlock_t z_name_lock; /* "master" lock for dirent locks */ zfs_dirlock_t *z_dirlocks; /* directory entry lock list */ @@ -198,6 +203,7 @@ typedef struct znode { uint64_t z_gen; /* generation (same as zp_gen) */ uint32_t z_sync_cnt; /* synchronous open count */ kmutex_t z_acl_lock; /* acl data lock */ + zfs_acl_t *z_acl_cached; /* cached acl */ list_node_t z_link_node; /* all znodes in fs link */ /* * These are dmu managed fields. @@ -310,7 +316,6 @@ extern int zfs_create_op_tables(); extern int zfs_sync(vfs_t *vfsp, short flag, cred_t *cr); extern dev_t zfs_cmpldev(uint64_t); extern int zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value); -extern int zfs_set_version(const char *name, uint64_t newvers); extern int zfs_get_stats(objset_t *os, nvlist_t *nv); extern void zfs_znode_dmu_fini(znode_t *); @@ -337,6 +342,7 @@ extern void zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp, vsecattr_t *vsecp, zfs_fuid_info_t *fuidp); extern void zfs_xvattr_set(znode_t *zp, xvattr_t *xvap); extern void zfs_upgrade(zfsvfs_t *zfsvfs, dmu_tx_t *tx); +extern int zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx); extern caddr_t zfs_map_page(page_t *, enum seg_rw); extern void zfs_unmap_page(page_t *, caddr_t); diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zil.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zil.h index 4d02d14f70756..b603241db733c 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zil.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zil.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -55,28 +55,40 @@ typedef struct zil_header { uint64_t zh_claim_txg; /* txg in which log blocks were claimed */ uint64_t zh_replay_seq; /* highest replayed sequence number */ blkptr_t zh_log; /* log chain */ - uint64_t zh_claim_seq; /* highest claimed sequence number */ - uint64_t zh_pad[5]; + uint64_t zh_claim_blk_seq; /* highest claimed block sequence number */ + uint64_t zh_flags; /* header flags */ + uint64_t zh_claim_lr_seq; /* highest claimed lr sequence number */ + uint64_t zh_pad[3]; } zil_header_t; /* - * Log block trailer - structure at the end of the header and each log block + * zh_flags bit settings + */ +#define ZIL_REPLAY_NEEDED 0x1 /* replay needed - internal only */ +#define ZIL_CLAIM_LR_SEQ_VALID 0x2 /* zh_claim_lr_seq field is valid */ + +/* + * Log block chaining. * - * The zit_bt contains a zbt_cksum which for the intent log is + * Log blocks are chained together. 
Originally they were chained at the + * end of the block. For performance reasons the chain was moved to the + * beginning of the block which allows writes for only the data being used. + * The older position is supported for backwards compatibility. + * + * The zio_eck_t contains a zec_cksum which for the intent log is * the sequence number of this log block. A seq of 0 is invalid. - * The zbt_cksum is checked by the SPA against the sequence + * The zec_cksum is checked by the SPA against the sequence * number passed in the blk_cksum field of the blkptr_t */ -typedef struct zil_trailer { - uint64_t zit_pad; - blkptr_t zit_next_blk; /* next block in chain */ - uint64_t zit_nused; /* bytes in log block used */ - zio_block_tail_t zit_bt; /* block trailer */ -} zil_trailer_t; +typedef struct zil_chain { + uint64_t zc_pad; + blkptr_t zc_next_blk; /* next block in chain */ + uint64_t zc_nused; /* bytes in log block used */ + zio_eck_t zc_eck; /* block trailer */ +} zil_chain_t; #define ZIL_MIN_BLKSZ 4096ULL #define ZIL_MAX_BLKSZ SPA_MAXBLOCKSIZE -#define ZIL_BLK_DATA_SZ(lwb) ((lwb)->lwb_sz - sizeof (zil_trailer_t)) /* * The words of a log block checksum. @@ -133,7 +145,8 @@ typedef enum zil_create { #define TX_MKDIR_ACL 17 /* mkdir with ACL */ #define TX_MKDIR_ATTR 18 /* mkdir with attr */ #define TX_MKDIR_ACL_ATTR 19 /* mkdir with ACL + attrs */ -#define TX_MAX_TYPE 20 /* Max transaction type */ +#define TX_WRITE2 20 /* dmu_sync EALREADY write */ +#define TX_MAX_TYPE 21 /* Max transaction type */ /* * The transactions for mkdir, symlink, remove, rmdir, link, and rename @@ -142,6 +155,20 @@ typedef enum zil_create { */ #define TX_CI ((uint64_t)0x1 << 63) /* case-insensitive behavior requested */ +/* + * Transactions for write, truncate, setattr, acl_v0, and acl can be logged + * out of order. For convenience in the code, all such records must have + * lr_foid at the same offset. + */ +#define TX_OOO(txtype) \ + ((txtype) == TX_WRITE || \ + (txtype) == TX_TRUNCATE || \ + (txtype) == TX_SETATTR || \ + (txtype) == TX_ACL_V0 || \ + (txtype) == TX_ACL || \ + (txtype) == TX_WRITE2) + + /* * Format of log records. * The fields are carefully defined to allow them to be aligned @@ -161,6 +188,14 @@ typedef struct { /* common log record header */ uint64_t lrc_seq; /* see comment above */ } lr_t; +/* + * Common start of all out-of-order record types (TX_OOO() above). + */ +typedef struct { + lr_t lr_common; /* common portion of log record */ + uint64_t lr_foid; /* object id */ +} lr_ooo_t; + /* * Handle option extended vattr attributes. * @@ -251,7 +286,7 @@ typedef struct { uint64_t lr_foid; /* file object to write */ uint64_t lr_offset; /* offset to write to */ uint64_t lr_length; /* user data length to write */ - uint64_t lr_blkoff; /* offset represented by lr_blkptr */ + uint64_t lr_blkoff; /* no longer used */ blkptr_t lr_blkptr; /* spa block pointer for replay */ /* write data will follow for small writes */ } lr_write_t; @@ -299,13 +334,34 @@ typedef struct { */ /* - * ZFS intent log transaction structure + * Writes are handled in three different ways: + * + * WR_INDIRECT: + * In this mode, if we need to commit the write later, then the block + * is immediately written into the file system (using dmu_sync), + * and a pointer to the block is put into the log record. + * When the txg commits the block is linked in. + * This saves additionally writing the data into the log record. 
+ * There are a few requirements for this to occur: + * - write is greater than zfs/zvol_immediate_write_sz + * - not using slogs (as slogs are assumed to always be faster + * than writing into the main pool) + * - the write occupies only one block + * WR_COPIED: + * If we know we'll immediately be committing the + * transaction (FSYNC or FDSYNC), then we allocate a larger + * log record here for the data and copy the data in. + * WR_NEED_COPY: + * Otherwise we don't allocate a buffer, and *if* we need to + * flush the write later then a buffer is allocated and + * we retrieve the data using the dmu. */ typedef enum { WR_INDIRECT, /* indirect - a large write (dmu_sync() data */ /* and put blkptr in log, rather than actual data) */ WR_COPIED, /* immediate - data is copied into lr_write_t */ WR_NEED_COPY, /* immediate - data needs to be copied if pushed */ + WR_NUM_STATES /* number of states */ } itx_wr_state_t; typedef struct itx { @@ -318,27 +374,14 @@ typedef struct itx { /* followed by type-specific part of lr_xx_t and its immediate data */ } itx_t; - -/* - * zgd_t is passed through dmu_sync() to the callback routine zfs_get_done() - * to handle the cleanup of the dmu_sync() buffer write - */ -typedef struct { - zilog_t *zgd_zilog; /* zilog */ - blkptr_t *zgd_bp; /* block pointer */ - struct rl *zgd_rl; /* range lock */ -} zgd_t; - - -typedef void zil_parse_blk_func_t(zilog_t *zilog, blkptr_t *bp, void *arg, +typedef int zil_parse_blk_func_t(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t txg); -typedef void zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg, +typedef int zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg, uint64_t txg); typedef int zil_replay_func_t(); -typedef void zil_replay_cleaner_t(); typedef int zil_get_data_t(void *arg, lr_write_t *lr, char *dbuf, zio_t *zio); -extern uint64_t zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, +extern int zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg); extern void zil_init(void); @@ -350,28 +393,31 @@ extern void zil_free(zilog_t *zilog); extern zilog_t *zil_open(objset_t *os, zil_get_data_t *get_data); extern void zil_close(zilog_t *zilog); -extern void zil_replay(objset_t *os, void *arg, uint64_t *txgp, - zil_replay_func_t *replay_func[TX_MAX_TYPE], - zil_replay_cleaner_t *replay_cleaner); +extern void zil_replay(objset_t *os, void *arg, + zil_replay_func_t *replay_func[TX_MAX_TYPE]); +extern boolean_t zil_replaying(zilog_t *zilog, dmu_tx_t *tx); extern void zil_destroy(zilog_t *zilog, boolean_t keep_first); extern void zil_rollback_destroy(zilog_t *zilog, dmu_tx_t *tx); extern itx_t *zil_itx_create(uint64_t txtype, size_t lrsize); +extern void zil_itx_destroy(itx_t *itx); extern uint64_t zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx); extern void zil_commit(zilog_t *zilog, uint64_t seq, uint64_t oid); -extern int zil_claim(char *osname, void *txarg); -extern int zil_check_log_chain(char *osname, void *txarg); -extern int zil_clear_log_chain(char *osname, void *txarg); +extern int zil_vdev_offline(const char *osname, void *txarg); +extern int zil_claim(const char *osname, void *txarg); +extern int zil_check_log_chain(const char *osname, void *txarg); extern void zil_sync(zilog_t *zilog, dmu_tx_t *tx); extern void zil_clean(zilog_t *zilog); -extern int zil_is_committed(zilog_t *zilog); extern int zil_suspend(zilog_t *zilog); extern void zil_resume(zilog_t *zilog); -extern void zil_add_block(zilog_t *zilog, blkptr_t 
*bp); +extern void zil_add_block(zilog_t *zilog, const blkptr_t *bp); +extern int zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp); + +extern void zil_set_logbias(zilog_t *zilog, uint64_t slogval); extern int zil_disable; diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zil_impl.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zil_impl.h index 0fc800b96dea9..c46063b0527af 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zil_impl.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zil_impl.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_ZIL_IMPL_H #define _SYS_ZIL_IMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include @@ -45,8 +43,8 @@ typedef struct lwb { int lwb_sz; /* size of block and buffer */ char *lwb_buf; /* log write buffer */ zio_t *lwb_zio; /* zio for this buffer */ + dmu_tx_t *lwb_tx; /* tx for log block allocation */ uint64_t lwb_max_txg; /* highest txg in this lwb */ - txg_handle_t lwb_txgh; /* txg handle for txg_exit() */ list_node_t lwb_node; /* zilog->zl_lwb_list linkage */ } lwb_t; @@ -59,6 +57,8 @@ typedef struct zil_vdev_node { avl_node_t zv_node; /* AVL tree linkage */ } zil_vdev_node_t; +#define ZIL_PREV_BLKS 16 + /* * Stable storage intent log management structure. One per dataset. */ @@ -70,20 +70,27 @@ struct zilog { objset_t *zl_os; /* object set we're logging */ zil_get_data_t *zl_get_data; /* callback to get object content */ zio_t *zl_root_zio; /* log writer root zio */ - uint64_t zl_itx_seq; /* next itx sequence number */ + uint64_t zl_itx_seq; /* next in-core itx sequence number */ + uint64_t zl_lr_seq; /* on-disk log record sequence number */ uint64_t zl_commit_seq; /* committed upto this number */ - uint64_t zl_lr_seq; /* log record sequence number */ + uint64_t zl_commit_lr_seq; /* last committed on-disk lr seq */ uint64_t zl_destroy_txg; /* txg of last zil_destroy() */ - uint64_t zl_replay_seq[TXG_SIZE]; /* seq of last replayed rec */ + uint64_t zl_replayed_seq[TXG_SIZE]; /* last replayed rec seq */ + uint64_t zl_replaying_seq; /* current replay seq number */ uint32_t zl_suspend; /* log suspend count */ kcondvar_t zl_cv_writer; /* log writer thread completion */ kcondvar_t zl_cv_suspend; /* log suspend completion */ uint8_t zl_suspending; /* log is currently suspending */ uint8_t zl_keep_first; /* keep first log block in destroy */ - uint8_t zl_stop_replay; /* don't replay any further */ + uint8_t zl_replay; /* replaying records while set */ uint8_t zl_stop_sync; /* for debugging */ uint8_t zl_writer; /* boolean: write setup in progress */ - uint8_t zl_log_error; /* boolean: log write error */ + uint8_t zl_logbias; /* latency or throughput */ + int zl_parse_error; /* last zil_parse() error */ + uint64_t zl_parse_blk_seq; /* highest blk seq on last parse */ + uint64_t zl_parse_lr_seq; /* highest lr seq on last parse */ + uint64_t zl_parse_blk_count; /* number of blocks parsed */ + uint64_t zl_parse_lr_count; /* number of log records parsed */ list_t zl_itx_list; /* in-memory itx list */ uint64_t zl_itx_list_sz; /* total size of records on list */ uint64_t zl_cur_used; /* current commit log size used */ @@ -92,15 +99,21 @@ struct zilog { kmutex_t zl_vdev_lock; /* protects zl_vdev_tree */ avl_tree_t zl_vdev_tree; /* vdevs to flush in zil_commit() */ taskq_t *zl_clean_taskq; /* runs lwb and itx clean tasks */ - avl_tree_t zl_dva_tree; /* track DVAs 
during log parse */ + avl_tree_t zl_bp_tree; /* track bps during log parse */ clock_t zl_replay_time; /* lbolt of when replay started */ uint64_t zl_replay_blks; /* number of log blocks replayed */ + zil_header_t zl_old_header; /* debugging aid */ + uint_t zl_prev_blks[ZIL_PREV_BLKS]; /* size - sector rounded */ + uint_t zl_prev_rotor; /* rotor for zl_prev[] */ }; -typedef struct zil_dva_node { +typedef struct zil_bp_node { dva_t zn_dva; avl_node_t zn_node; -} zil_dva_node_t; +} zil_bp_node_t; + +#define ZIL_MAX_LOG_DATA (SPA_MAXBLOCKSIZE - sizeof (zil_chain_t) - \ + sizeof (lr_write_t)) #ifdef __cplusplus } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zio.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zio.h index 4de78dfee0141..b81b6a4392b0a 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zio.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zio.h @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -38,12 +38,15 @@ extern "C" { #endif -#define ZBT_MAGIC 0x210da7ab10c7a11ULL /* zio data bloc tail */ +/* + * Embedded checksum + */ +#define ZEC_MAGIC 0x210da7ab10c7a11ULL -typedef struct zio_block_tail { - uint64_t zbt_magic; /* for validation, endianness */ - zio_cksum_t zbt_cksum; /* 256-bit checksum */ -} zio_block_tail_t; +typedef struct zio_eck { + uint64_t zec_magic; /* for validation, endianness */ + zio_cksum_t zec_cksum; /* 256-bit checksum */ +} zio_eck_t; /* * Gang block headers are self-checksumming and contain an array @@ -51,16 +54,16 @@ typedef struct zio_block_tail { */ #define SPA_GANGBLOCKSIZE SPA_MINBLOCKSIZE #define SPA_GBH_NBLKPTRS ((SPA_GANGBLOCKSIZE - \ - sizeof (zio_block_tail_t)) / sizeof (blkptr_t)) + sizeof (zio_eck_t)) / sizeof (blkptr_t)) #define SPA_GBH_FILLER ((SPA_GANGBLOCKSIZE - \ - sizeof (zio_block_tail_t) - \ + sizeof (zio_eck_t) - \ (SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\ sizeof (uint64_t)) typedef struct zio_gbh { blkptr_t zg_blkptr[SPA_GBH_NBLKPTRS]; uint64_t zg_filler[SPA_GBH_FILLER]; - zio_block_tail_t zg_tail; + zio_eck_t zg_tail; } zio_gbh_phys_t; enum zio_checksum { @@ -73,12 +76,19 @@ enum zio_checksum { ZIO_CHECKSUM_FLETCHER_2, ZIO_CHECKSUM_FLETCHER_4, ZIO_CHECKSUM_SHA256, + ZIO_CHECKSUM_ZILOG2, ZIO_CHECKSUM_FUNCTIONS }; -#define ZIO_CHECKSUM_ON_VALUE ZIO_CHECKSUM_FLETCHER_2 +#define ZIO_CHECKSUM_ON_VALUE ZIO_CHECKSUM_FLETCHER_4 #define ZIO_CHECKSUM_DEFAULT ZIO_CHECKSUM_ON +#define ZIO_CHECKSUM_MASK 0xffULL +#define ZIO_CHECKSUM_VERIFY (1 << 8) + +#define ZIO_DEDUPCHECKSUM ZIO_CHECKSUM_SHA256 +#define ZIO_DEDUPDITTO_MIN 100 + enum zio_compress { ZIO_COMPRESS_INHERIT = 0, ZIO_COMPRESS_ON, @@ -94,12 +104,19 @@ enum zio_compress { ZIO_COMPRESS_GZIP_7, ZIO_COMPRESS_GZIP_8, ZIO_COMPRESS_GZIP_9, + ZIO_COMPRESS_ZLE, ZIO_COMPRESS_FUNCTIONS }; #define ZIO_COMPRESS_ON_VALUE ZIO_COMPRESS_LZJB #define ZIO_COMPRESS_DEFAULT ZIO_COMPRESS_OFF +#define BOOTFS_COMPRESS_VALID(compress) \ + ((compress) == ZIO_COMPRESS_LZJB || \ + ((compress) == ZIO_COMPRESS_ON && \ + ZIO_COMPRESS_ON_VALUE == ZIO_COMPRESS_LZJB) || \ + (compress) == ZIO_COMPRESS_OFF) + #define ZIO_FAILURE_MODE_WAIT 0 #define ZIO_FAILURE_MODE_CONTINUE 1 #define ZIO_FAILURE_MODE_PANIC 2 @@ -107,65 +124,88 @@ enum zio_compress { #define ZIO_PRIORITY_NOW (zio_priority_table[0]) #define ZIO_PRIORITY_SYNC_READ (zio_priority_table[1]) #define ZIO_PRIORITY_SYNC_WRITE (zio_priority_table[2]) -#define ZIO_PRIORITY_ASYNC_READ 
(zio_priority_table[3]) -#define ZIO_PRIORITY_ASYNC_WRITE (zio_priority_table[4]) -#define ZIO_PRIORITY_FREE (zio_priority_table[5]) -#define ZIO_PRIORITY_CACHE_FILL (zio_priority_table[6]) -#define ZIO_PRIORITY_LOG_WRITE (zio_priority_table[7]) -#define ZIO_PRIORITY_RESILVER (zio_priority_table[8]) -#define ZIO_PRIORITY_SCRUB (zio_priority_table[9]) -#define ZIO_PRIORITY_TABLE_SIZE 10 - -#define ZIO_FLAG_MUSTSUCCEED 0x00000 -#define ZIO_FLAG_CANFAIL 0x00001 -#define ZIO_FLAG_SPECULATIVE 0x00002 -#define ZIO_FLAG_CONFIG_WRITER 0x00004 -#define ZIO_FLAG_DONT_RETRY 0x00008 - -#define ZIO_FLAG_DONT_CACHE 0x00010 -#define ZIO_FLAG_DONT_QUEUE 0x00020 -#define ZIO_FLAG_DONT_AGGREGATE 0x00040 -#define ZIO_FLAG_DONT_PROPAGATE 0x00080 - -#define ZIO_FLAG_IO_BYPASS 0x00100 -#define ZIO_FLAG_IO_REPAIR 0x00200 -#define ZIO_FLAG_IO_RETRY 0x00400 -#define ZIO_FLAG_IO_REWRITE 0x00800 - -#define ZIO_FLAG_PROBE 0x01000 -#define ZIO_FLAG_RESILVER 0x02000 -#define ZIO_FLAG_SCRUB 0x04000 -#define ZIO_FLAG_SCRUB_THREAD 0x08000 - -#define ZIO_FLAG_GANG_CHILD 0x10000 - -#define ZIO_FLAG_GANG_INHERIT \ - (ZIO_FLAG_CANFAIL | \ - ZIO_FLAG_SPECULATIVE | \ - ZIO_FLAG_CONFIG_WRITER | \ - ZIO_FLAG_DONT_RETRY | \ - ZIO_FLAG_DONT_CACHE | \ - ZIO_FLAG_DONT_AGGREGATE | \ - ZIO_FLAG_RESILVER | \ - ZIO_FLAG_SCRUB | \ - ZIO_FLAG_SCRUB_THREAD) - -#define ZIO_FLAG_VDEV_INHERIT \ - (ZIO_FLAG_GANG_INHERIT | \ - ZIO_FLAG_IO_REPAIR | \ - ZIO_FLAG_IO_RETRY | \ - ZIO_FLAG_PROBE) +#define ZIO_PRIORITY_LOG_WRITE (zio_priority_table[3]) +#define ZIO_PRIORITY_CACHE_FILL (zio_priority_table[4]) +#define ZIO_PRIORITY_AGG (zio_priority_table[5]) +#define ZIO_PRIORITY_FREE (zio_priority_table[6]) +#define ZIO_PRIORITY_ASYNC_WRITE (zio_priority_table[7]) +#define ZIO_PRIORITY_ASYNC_READ (zio_priority_table[8]) +#define ZIO_PRIORITY_RESILVER (zio_priority_table[9]) +#define ZIO_PRIORITY_SCRUB (zio_priority_table[10]) +#define ZIO_PRIORITY_TABLE_SIZE 11 #define ZIO_PIPELINE_CONTINUE 0x100 #define ZIO_PIPELINE_STOP 0x101 +enum zio_flag { + /* + * Flags inherited by gang, ddt, and vdev children, + * and that must be equal for two zios to aggregate + */ + ZIO_FLAG_DONT_AGGREGATE = 1 << 0, + ZIO_FLAG_IO_REPAIR = 1 << 1, + ZIO_FLAG_SELF_HEAL = 1 << 2, + ZIO_FLAG_RESILVER = 1 << 3, + ZIO_FLAG_SCRUB = 1 << 4, + ZIO_FLAG_SCRUB_THREAD = 1 << 5, + +#define ZIO_FLAG_AGG_INHERIT (ZIO_FLAG_CANFAIL - 1) + + /* + * Flags inherited by ddt, gang, and vdev children. + */ + ZIO_FLAG_CANFAIL = 1 << 6, /* must be first for INHERIT */ + ZIO_FLAG_SPECULATIVE = 1 << 7, + ZIO_FLAG_CONFIG_WRITER = 1 << 8, + ZIO_FLAG_DONT_RETRY = 1 << 9, + ZIO_FLAG_DONT_CACHE = 1 << 10, + ZIO_FLAG_NODATA = 1 << 11, + ZIO_FLAG_INDUCE_DAMAGE = 1 << 12, + +#define ZIO_FLAG_DDT_INHERIT (ZIO_FLAG_IO_RETRY - 1) +#define ZIO_FLAG_GANG_INHERIT (ZIO_FLAG_IO_RETRY - 1) + + /* + * Flags inherited by vdev children. + */ + ZIO_FLAG_IO_RETRY = 1 << 13, /* must be first for INHERIT */ + ZIO_FLAG_PROBE = 1 << 14, + ZIO_FLAG_TRYHARD = 1 << 15, + ZIO_FLAG_OPTIONAL = 1 << 16, + +#define ZIO_FLAG_VDEV_INHERIT (ZIO_FLAG_DONT_QUEUE - 1) + + /* + * Flags not inherited by any children. 
+ */ + ZIO_FLAG_DONT_QUEUE = 1 << 17, /* must be first for INHERIT */ + ZIO_FLAG_DONT_PROPAGATE = 1 << 18, + ZIO_FLAG_IO_BYPASS = 1 << 19, + ZIO_FLAG_IO_REWRITE = 1 << 20, + ZIO_FLAG_RAW = 1 << 21, + ZIO_FLAG_GANG_CHILD = 1 << 22, + ZIO_FLAG_DDT_CHILD = 1 << 23, + ZIO_FLAG_GODFATHER = 1 << 24 +}; + +#define ZIO_FLAG_MUSTSUCCEED 0 + +#define ZIO_DDT_CHILD_FLAGS(zio) \ + (((zio)->io_flags & ZIO_FLAG_DDT_INHERIT) | \ + ZIO_FLAG_DDT_CHILD | ZIO_FLAG_CANFAIL) + #define ZIO_GANG_CHILD_FLAGS(zio) \ (((zio)->io_flags & ZIO_FLAG_GANG_INHERIT) | \ ZIO_FLAG_GANG_CHILD | ZIO_FLAG_CANFAIL) +#define ZIO_VDEV_CHILD_FLAGS(zio) \ + (((zio)->io_flags & ZIO_FLAG_VDEV_INHERIT) | \ + ZIO_FLAG_CANFAIL) + enum zio_child { ZIO_CHILD_VDEV = 0, ZIO_CHILD_GANG, + ZIO_CHILD_DDT, ZIO_CHILD_LOGICAL, ZIO_CHILD_TYPES }; @@ -183,7 +223,6 @@ enum zio_wait_type { #define ECKSUM EBADE #define EFRAGS EBADR -typedef struct zio zio_t; typedef void zio_done_func_t(zio_t *zio); extern uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE]; @@ -192,18 +231,15 @@ extern char *zio_type_name[ZIO_TYPES]; /* * A bookmark is a four-tuple <objset, object, level, blkid> that uniquely * identifies any block in the pool. By convention, the meta-objset (MOS) - * is objset 0, the meta-dnode is object 0, the root block (osphys_t) is - * level -1 of the meta-dnode, and intent log blocks (which are chained - * off the root block) have blkid == sequence number. In summary: + * is objset 0, and the meta-dnode is object 0. This covers all blocks + * except root blocks and ZIL blocks, which are defined as follows: * - * mos is objset 0 - * meta-dnode is object 0 - * root block is <objset, 0, -1, 0> - * intent log is <objset, 0, -1, blkid> + * Root blocks (objset_phys_t) are object 0, level -1: <objset, 0, -1, 0>. + * ZIL blocks are bookmarked <objset, 0, -2, blkid == ZIL sequence number>. + * dmu_sync()ed ZIL data blocks are bookmarked <objset, object, -2, blkid == ZIL offset>. * - * Note: this structure is called a bookmark because its first purpose was - * to remember where to resume a pool-wide traverse. The absolute ordering - * for block visitation during traversal is defined in compare_bookmark(). + * Note: this structure is called a bookmark because its original purpose + * was to remember where to resume a pool-wide traverse. * * Note: this structure is passed between userland and the kernel. 
* Therefore it must not change size or alignment between 32/64 bit @@ -216,14 +252,66 @@ typedef struct zbookmark { uint64_t zb_blkid; } zbookmark_t; +#define SET_BOOKMARK(zb, objset, object, level, blkid) \ +{ \ + (zb)->zb_objset = objset; \ + (zb)->zb_object = object; \ + (zb)->zb_level = level; \ + (zb)->zb_blkid = blkid; \ +} + +#define ZB_DESTROYED_OBJSET (-1ULL) + +#define ZB_ROOT_OBJECT (0ULL) +#define ZB_ROOT_LEVEL (-1LL) +#define ZB_ROOT_BLKID (0ULL) + +#define ZB_ZIL_OBJECT (0ULL) +#define ZB_ZIL_LEVEL (-2LL) + typedef struct zio_prop { enum zio_checksum zp_checksum; enum zio_compress zp_compress; dmu_object_type_t zp_type; uint8_t zp_level; - uint8_t zp_ndvas; + uint8_t zp_copies; + uint8_t zp_dedup; + uint8_t zp_dedup_verify; } zio_prop_t; +typedef struct zio_cksum_report zio_cksum_report_t; + +typedef void zio_cksum_finish_f(zio_cksum_report_t *rep, + const void *good_data); +typedef void zio_cksum_free_f(void *cbdata, size_t size); + +struct zio_bad_cksum; /* defined in zio_checksum.h */ + +struct zio_cksum_report { + struct zio_cksum_report *zcr_next; + nvlist_t *zcr_ereport; + nvlist_t *zcr_detector; + void *zcr_cbdata; + size_t zcr_cbinfo; /* passed to zcr_free() */ + uint64_t zcr_align; + uint64_t zcr_length; + zio_cksum_finish_f *zcr_finish; + zio_cksum_free_f *zcr_free; + + /* internal use only */ + struct zio_bad_cksum *zcr_ckinfo; /* information from failure */ +}; + +typedef void zio_vsd_cksum_report_f(zio_t *zio, zio_cksum_report_t *zcr, + void *arg); + +zio_vsd_cksum_report_f zio_vsd_default_cksum_report; + +typedef struct zio_vsd_ops { + zio_done_func_t *vsd_free; + zio_vsd_cksum_report_f *vsd_cksum_report; +} zio_vsd_ops_t; + typedef struct zio_gang_node { zio_gbh_phys_t *gn_gbh; struct zio_gang_node *gn_child[SPA_GBH_NBLKPTRS]; @@ -254,6 +342,13 @@ typedef int zio_pipe_stage_t(zio_t *zio); #define ZIO_REEXECUTE_NOW 0x01 #define ZIO_REEXECUTE_SUSPEND 0x02 +typedef struct zio_link { + zio_t *zl_parent; + zio_t *zl_child; + list_node_t zl_parent_node; + list_node_t zl_child_node; +} zio_link_t; + struct zio { /* Core information about this I/O */ zbookmark_t io_bookmark; @@ -263,15 +358,15 @@ struct zio { int io_cmd; uint8_t io_priority; uint8_t io_reexecute; - uint8_t io_async_root; + uint8_t io_state[ZIO_WAIT_TYPES]; uint64_t io_txg; spa_t *io_spa; blkptr_t *io_bp; + blkptr_t *io_bp_override; blkptr_t io_bp_copy; - zio_t *io_parent; - zio_t *io_child; - zio_t *io_sibling_prev; - zio_t *io_sibling_next; + list_t io_parent_list; + list_t io_child_list; + zio_link_t *io_walk_link; zio_t *io_logical; zio_transform_t *io_transform_stack; @@ -279,35 +374,40 @@ struct zio { zio_done_func_t *io_ready; zio_done_func_t *io_done; void *io_private; + int64_t io_prev_space_delta; /* DMU private */ blkptr_t io_bp_orig; /* Data represented by this I/O */ void *io_data; + void *io_orig_data; uint64_t io_size; + uint64_t io_orig_size; /* Stuff for the vdev stack */ vdev_t *io_vd; void *io_vsd; - zio_done_func_t *io_vsd_free; + const zio_vsd_ops_t *io_vsd_ops; + uint64_t io_offset; uint64_t io_deadline; avl_node_t io_offset_node; avl_node_t io_deadline_node; avl_tree_t *io_vdev_tree; - zio_t *io_delegate_list; - zio_t *io_delegate_next; /* Internal pipeline state */ - int io_flags; - zio_stage_t io_stage; - uint32_t io_pipeline; - int io_orig_flags; - zio_stage_t io_orig_stage; - uint32_t io_orig_pipeline; + enum zio_flag io_flags; + enum zio_stage io_stage; + enum zio_stage io_pipeline; + enum zio_flag io_orig_flags; + enum zio_stage io_orig_stage; + enum zio_stage 
io_orig_pipeline; int io_error; int io_child_error[ZIO_CHILD_TYPES]; uint64_t io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES]; + uint64_t io_child_count; + uint64_t io_parent_count; uint64_t *io_stall; + zio_t *io_gang_leader; zio_gang_node_t *io_gang_tree; void *io_executor; void *io_waiter; @@ -315,59 +415,69 @@ struct zio { kcondvar_t io_cv; /* FMA state */ + zio_cksum_report_t *io_cksum_report; uint64_t io_ena; }; -extern zio_t *zio_null(zio_t *pio, spa_t *spa, - zio_done_func_t *done, void *private, int flags); +extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, + zio_done_func_t *done, void *private, enum zio_flag flags); extern zio_t *zio_root(spa_t *spa, - zio_done_func_t *done, void *private, int flags); + zio_done_func_t *done, void *private, enum zio_flag flags); extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, void *data, uint64_t size, zio_done_func_t *done, void *private, - int priority, int flags, const zbookmark_t *zb); + int priority, enum zio_flag flags, const zbookmark_t *zb); extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - void *data, uint64_t size, zio_prop_t *zp, + void *data, uint64_t size, const zio_prop_t *zp, zio_done_func_t *ready, zio_done_func_t *done, void *private, - int priority, int flags, const zbookmark_t *zb); + int priority, enum zio_flag flags, const zbookmark_t *zb); extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, uint64_t size, zio_done_func_t *done, void *private, - int priority, int flags, zbookmark_t *zb); + int priority, enum zio_flag flags, zbookmark_t *zb); -extern void zio_skip_write(zio_t *zio); +extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies); -extern zio_t *zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - zio_done_func_t *done, void *private, int flags); +extern void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp); -extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - zio_done_func_t *done, void *private, int flags); +extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, + const blkptr_t *bp, + zio_done_func_t *done, void *private, enum zio_flag flags); extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, - zio_done_func_t *done, void *private, int priority, int flags); + zio_done_func_t *done, void *private, int priority, enum zio_flag flags); extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, void *data, int checksum, - zio_done_func_t *done, void *private, int priority, int flags, + zio_done_func_t *done, void *private, int priority, enum zio_flag flags, boolean_t labels); extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, void *data, int checksum, - zio_done_func_t *done, void *private, int priority, int flags, + zio_done_func_t *done, void *private, int priority, enum zio_flag flags, boolean_t labels); -extern int zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, - blkptr_t *old_bp, uint64_t txg); -extern void zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg); +extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, + const blkptr_t *bp, enum zio_flag flags); + +extern int zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, + blkptr_t *old_bp, uint64_t size, boolean_t use_slog); +extern void zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp); extern void zio_flush(zio_t *zio, vdev_t *vd); +extern void zio_shrink(zio_t *zio, uint64_t size); extern int 
zio_wait(zio_t *zio); extern void zio_nowait(zio_t *zio); extern void zio_execute(zio_t *zio); extern void zio_interrupt(zio_t *zio); +extern zio_t *zio_walk_parents(zio_t *cio); +extern zio_t *zio_walk_children(zio_t *pio); +extern zio_t *zio_unique_parent(zio_t *cio); +extern void zio_add_child(zio_t *pio, zio_t *cio); + extern void *zio_buf_alloc(size_t size); extern void zio_buf_free(void *buf, size_t size); extern void *zio_data_buf_alloc(size_t size); @@ -377,11 +487,11 @@ extern void zio_resubmit_stage_async(void *); extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset, void *data, uint64_t size, int type, int priority, - int flags, zio_done_func_t *done, void *private); + enum zio_flag flags, zio_done_func_t *done, void *private); extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size, int type, int priority, - int flags, zio_done_func_t *done, void *private); + enum zio_flag flags, zio_done_func_t *done, void *private); extern void zio_vdev_io_bypass(zio_t *zio); extern void zio_vdev_io_reissue(zio_t *zio); @@ -390,11 +500,15 @@ extern void zio_vdev_io_redone(zio_t *zio); extern void zio_checksum_verified(zio_t *zio); extern int zio_worst_error(int e1, int e2); -extern uint8_t zio_checksum_select(uint8_t child, uint8_t parent); -extern uint8_t zio_compress_select(uint8_t child, uint8_t parent); +extern enum zio_checksum zio_checksum_select(enum zio_checksum child, + enum zio_checksum parent); +extern enum zio_checksum zio_checksum_dedup_select(spa_t *spa, + enum zio_checksum child, enum zio_checksum parent); +extern enum zio_compress zio_compress_select(enum zio_compress child, + enum zio_compress parent); extern void zio_suspend(spa_t *spa, zio_t *zio); -extern void zio_resume(spa_t *spa); +extern int zio_resume(spa_t *spa); extern void zio_resume_wait(spa_t *spa); /* @@ -413,9 +527,30 @@ extern int zio_inject_fault(char *name, int flags, int *id, extern int zio_inject_list_next(int *id, char *name, size_t buflen, struct zinject_record *record); extern int zio_clear_fault(int id); +extern void zio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type); extern int zio_handle_fault_injection(zio_t *zio, int error); -extern int zio_handle_device_injection(vdev_t *vd, int error); +extern int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error); extern int zio_handle_label_injection(zio_t *zio, int error); +extern void zio_handle_ignored_writes(zio_t *zio); + +/* + * Checksum ereport functions + */ +extern void zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, struct zio *zio, + uint64_t offset, uint64_t length, void *arg, struct zio_bad_cksum *info); +extern void zfs_ereport_finish_checksum(zio_cksum_report_t *report, + const void *good_data, const void *bad_data, boolean_t drop_if_identical); + +extern void zfs_ereport_send_interim_checksum(zio_cksum_report_t *report); +extern void zfs_ereport_free_checksum(zio_cksum_report_t *report); + +/* If we have the good data in hand, this function can be used */ +extern void zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, + struct zio *zio, uint64_t offset, uint64_t length, + const void *good_data, const void *bad_data, struct zio_bad_cksum *info); + +/* Called from spa_sync(), but primarily an injection handler */ +extern void spa_handle_ignored_writes(spa_t *spa); #ifdef __cplusplus } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zio_checksum.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zio_checksum.h index 
da407399da060..d1a5f34d52234 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zio_checksum.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zio_checksum.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -27,6 +27,7 @@ #define _SYS_ZIO_CHECKSUM_H #include +#include #ifdef __cplusplus extern "C" { @@ -43,28 +44,30 @@ typedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp); typedef struct zio_checksum_info { zio_checksum_t *ci_func[2]; /* checksum function for each byteorder */ int ci_correctable; /* number of correctable bits */ - int ci_zbt; /* uses zio block tail? */ + int ci_eck; /* uses zio embedded checksum? */ + int ci_dedup; /* strong enough for dedup? */ char *ci_name; /* descriptive name */ } zio_checksum_info_t; +typedef struct zio_bad_cksum { + zio_cksum_t zbc_expected; + zio_cksum_t zbc_actual; + const char *zbc_checksum_name; + uint8_t zbc_byteswapped; + uint8_t zbc_injected; + uint8_t zbc_has_cksum; /* expected/actual valid */ +} zio_bad_cksum_t; + extern zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS]; /* * Checksum routines. */ -extern zio_checksum_t fletcher_2_native; -extern zio_checksum_t fletcher_4_native; -extern zio_checksum_t fletcher_4_incremental_native; - -extern zio_checksum_t fletcher_2_byteswap; -extern zio_checksum_t fletcher_4_byteswap; -extern zio_checksum_t fletcher_4_incremental_byteswap; - extern zio_checksum_t zio_checksum_SHA256; extern void zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, void *data, uint64_t size); -extern int zio_checksum_error(zio_t *zio); +extern int zio_checksum_error(zio_t *zio, zio_bad_cksum_t *out); #ifdef __cplusplus } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zio_compress.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zio_compress.h index 66ee8d45b3b67..30bed1a676e32 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zio_compress.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zio_compress.h @@ -20,15 +20,13 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_ZIO_COMPRESS_H #define _SYS_ZIO_COMPRESS_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #ifdef __cplusplus @@ -66,14 +64,18 @@ extern size_t gzip_compress(void *src, void *dst, size_t s_len, size_t d_len, int level); extern int gzip_decompress(void *src, void *dst, size_t s_len, size_t d_len, int level); +extern size_t zle_compress(void *src, void *dst, size_t s_len, size_t d_len, + int level); +extern int zle_decompress(void *src, void *dst, size_t s_len, size_t d_len, + int level); /* * Compress and decompress data if necessary. 
*/ -extern int zio_compress_data(int cpfunc, void *src, uint64_t srcsize, - void **destp, uint64_t *destsizep, uint64_t *destbufsizep); -extern int zio_decompress_data(int cpfunc, void *src, uint64_t srcsize, - void *dest, uint64_t destsize); +extern size_t zio_compress_data(enum zio_compress c, void *src, void *dst, + size_t s_len); +extern int zio_decompress_data(enum zio_compress c, void *src, void *dst, + size_t s_len, size_t d_len); #ifdef __cplusplus } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zio_impl.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zio_impl.h index e7503b733cc05..d90bd8bd59217 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zio_impl.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zio_impl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -34,104 +34,136 @@ extern "C" { #endif /* - * I/O Groups: pipeline stage definitions. + * zio pipeline stage definitions */ -typedef enum zio_stage { - ZIO_STAGE_OPEN = 0, /* RWFCI */ +enum zio_stage { + ZIO_STAGE_OPEN = 1 << 0, /* RWFCI */ - ZIO_STAGE_ISSUE_ASYNC, /* -W--- */ + ZIO_STAGE_READ_BP_INIT = 1 << 1, /* R---- */ + ZIO_STAGE_FREE_BP_INIT = 1 << 2, /* --F-- */ + ZIO_STAGE_ISSUE_ASYNC = 1 << 3, /* RWF-- */ + ZIO_STAGE_WRITE_BP_INIT = 1 << 4, /* -W--- */ - ZIO_STAGE_READ_BP_INIT, /* R---- */ - ZIO_STAGE_WRITE_BP_INIT, /* -W--- */ + ZIO_STAGE_CHECKSUM_GENERATE = 1 << 5, /* -W--- */ - ZIO_STAGE_CHECKSUM_GENERATE, /* -W--- */ + ZIO_STAGE_DDT_READ_START = 1 << 6, /* R---- */ + ZIO_STAGE_DDT_READ_DONE = 1 << 7, /* R---- */ + ZIO_STAGE_DDT_WRITE = 1 << 8, /* -W--- */ + ZIO_STAGE_DDT_FREE = 1 << 9, /* --F-- */ - ZIO_STAGE_GANG_ASSEMBLE, /* RWFC- */ - ZIO_STAGE_GANG_ISSUE, /* RWFC- */ + ZIO_STAGE_GANG_ASSEMBLE = 1 << 10, /* RWFC- */ + ZIO_STAGE_GANG_ISSUE = 1 << 11, /* RWFC- */ - ZIO_STAGE_DVA_ALLOCATE, /* -W--- */ - ZIO_STAGE_DVA_FREE, /* --F-- */ - ZIO_STAGE_DVA_CLAIM, /* ---C- */ + ZIO_STAGE_DVA_ALLOCATE = 1 << 12, /* -W--- */ + ZIO_STAGE_DVA_FREE = 1 << 13, /* --F-- */ + ZIO_STAGE_DVA_CLAIM = 1 << 14, /* ---C- */ - ZIO_STAGE_READY, /* RWFCI */ + ZIO_STAGE_READY = 1 << 15, /* RWFCI */ - ZIO_STAGE_VDEV_IO_START, /* RW--I */ - ZIO_STAGE_VDEV_IO_DONE, /* RW--I */ - ZIO_STAGE_VDEV_IO_ASSESS, /* RW--I */ + ZIO_STAGE_VDEV_IO_START = 1 << 16, /* RW--I */ + ZIO_STAGE_VDEV_IO_DONE = 1 << 17, /* RW--I */ + ZIO_STAGE_VDEV_IO_ASSESS = 1 << 18, /* RW--I */ - ZIO_STAGE_CHECKSUM_VERIFY, /* R---- */ + ZIO_STAGE_CHECKSUM_VERIFY = 1 << 19, /* R---- */ - ZIO_STAGE_DONE, /* RWFCI */ - ZIO_STAGES -} zio_stage_t; + ZIO_STAGE_DONE = 1 << 20 /* RWFCI */ +}; -#define ZIO_INTERLOCK_STAGES \ - ((1U << ZIO_STAGE_READY) | \ - (1U << ZIO_STAGE_DONE)) +#define ZIO_INTERLOCK_STAGES \ + (ZIO_STAGE_READY | \ + ZIO_STAGE_DONE) -#define ZIO_INTERLOCK_PIPELINE \ +#define ZIO_INTERLOCK_PIPELINE \ ZIO_INTERLOCK_STAGES -#define ZIO_VDEV_IO_STAGES \ - ((1U << ZIO_STAGE_VDEV_IO_START) | \ - (1U << ZIO_STAGE_VDEV_IO_DONE) | \ - (1U << ZIO_STAGE_VDEV_IO_ASSESS)) +#define ZIO_VDEV_IO_STAGES \ + (ZIO_STAGE_VDEV_IO_START | \ + ZIO_STAGE_VDEV_IO_DONE | \ + ZIO_STAGE_VDEV_IO_ASSESS) -#define ZIO_VDEV_CHILD_PIPELINE \ - (ZIO_VDEV_IO_STAGES | \ - (1U << ZIO_STAGE_DONE)) +#define ZIO_VDEV_CHILD_PIPELINE \ + (ZIO_VDEV_IO_STAGES | \ + ZIO_STAGE_DONE) -#define ZIO_READ_COMMON_STAGES \ - (ZIO_INTERLOCK_STAGES | \ - ZIO_VDEV_IO_STAGES | \ - (1U << ZIO_STAGE_CHECKSUM_VERIFY)) +#define 
ZIO_READ_COMMON_STAGES \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_VDEV_IO_STAGES | \ + ZIO_STAGE_CHECKSUM_VERIFY) -#define ZIO_READ_PHYS_PIPELINE \ +#define ZIO_READ_PHYS_PIPELINE \ ZIO_READ_COMMON_STAGES -#define ZIO_READ_PIPELINE \ - (ZIO_READ_COMMON_STAGES | \ - (1U << ZIO_STAGE_READ_BP_INIT)) +#define ZIO_READ_PIPELINE \ + (ZIO_READ_COMMON_STAGES | \ + ZIO_STAGE_READ_BP_INIT) -#define ZIO_WRITE_COMMON_STAGES \ - (ZIO_INTERLOCK_STAGES | \ - ZIO_VDEV_IO_STAGES | \ - (1U << ZIO_STAGE_ISSUE_ASYNC) | \ - (1U << ZIO_STAGE_CHECKSUM_GENERATE)) - -#define ZIO_WRITE_PHYS_PIPELINE \ - ZIO_WRITE_COMMON_STAGES - -#define ZIO_REWRITE_PIPELINE \ - (ZIO_WRITE_COMMON_STAGES | \ - (1U << ZIO_STAGE_WRITE_BP_INIT)) - -#define ZIO_WRITE_PIPELINE \ - (ZIO_WRITE_COMMON_STAGES | \ - (1U << ZIO_STAGE_WRITE_BP_INIT) | \ - (1U << ZIO_STAGE_DVA_ALLOCATE)) - -#define ZIO_GANG_STAGES \ - ((1U << ZIO_STAGE_GANG_ASSEMBLE) | \ - (1U << ZIO_STAGE_GANG_ISSUE)) +#define ZIO_DDT_CHILD_READ_PIPELINE \ + ZIO_READ_COMMON_STAGES -#define ZIO_FREE_PIPELINE \ - (ZIO_INTERLOCK_STAGES | \ - (1U << ZIO_STAGE_DVA_FREE)) +#define ZIO_DDT_READ_PIPELINE \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_STAGE_READ_BP_INIT | \ + ZIO_STAGE_DDT_READ_START | \ + ZIO_STAGE_DDT_READ_DONE) -#define ZIO_CLAIM_PIPELINE \ - (ZIO_INTERLOCK_STAGES | \ - (1U << ZIO_STAGE_DVA_CLAIM)) +#define ZIO_WRITE_COMMON_STAGES \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_VDEV_IO_STAGES | \ + ZIO_STAGE_ISSUE_ASYNC | \ + ZIO_STAGE_CHECKSUM_GENERATE) -#define ZIO_IOCTL_PIPELINE \ - (ZIO_INTERLOCK_STAGES | \ - (1U << ZIO_STAGE_VDEV_IO_START) | \ - (1U << ZIO_STAGE_VDEV_IO_ASSESS)) +#define ZIO_WRITE_PHYS_PIPELINE \ + ZIO_WRITE_COMMON_STAGES -#define ZIO_CONFIG_LOCK_BLOCKING_STAGES \ - ((1U << ZIO_STAGE_VDEV_IO_START) | \ - (1U << ZIO_STAGE_DVA_ALLOCATE) | \ - (1U << ZIO_STAGE_DVA_CLAIM)) +#define ZIO_REWRITE_PIPELINE \ + (ZIO_WRITE_COMMON_STAGES | \ + ZIO_STAGE_WRITE_BP_INIT) + +#define ZIO_WRITE_PIPELINE \ + (ZIO_WRITE_COMMON_STAGES | \ + ZIO_STAGE_WRITE_BP_INIT | \ + ZIO_STAGE_DVA_ALLOCATE) + +#define ZIO_DDT_CHILD_WRITE_PIPELINE \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_VDEV_IO_STAGES | \ + ZIO_STAGE_DVA_ALLOCATE) + +#define ZIO_DDT_WRITE_PIPELINE \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_STAGE_ISSUE_ASYNC | \ + ZIO_STAGE_WRITE_BP_INIT | \ + ZIO_STAGE_CHECKSUM_GENERATE | \ + ZIO_STAGE_DDT_WRITE) + +#define ZIO_GANG_STAGES \ + (ZIO_STAGE_GANG_ASSEMBLE | \ + ZIO_STAGE_GANG_ISSUE) + +#define ZIO_FREE_PIPELINE \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_STAGE_FREE_BP_INIT | \ + ZIO_STAGE_DVA_FREE) + +#define ZIO_DDT_FREE_PIPELINE \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_STAGE_FREE_BP_INIT | \ + ZIO_STAGE_ISSUE_ASYNC | \ + ZIO_STAGE_DDT_FREE) + +#define ZIO_CLAIM_PIPELINE \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_STAGE_DVA_CLAIM) + +#define ZIO_IOCTL_PIPELINE \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_STAGE_VDEV_IO_START | \ + ZIO_STAGE_VDEV_IO_ASSESS) + +#define ZIO_BLOCKING_STAGES \ + (ZIO_STAGE_DVA_ALLOCATE | \ + ZIO_STAGE_DVA_CLAIM | \ + ZIO_STAGE_VDEV_IO_START) extern void zio_inject_init(void); extern void zio_inject_fini(void); diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zvol.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zvol.h index 06adc667e1243..6284a4154a661 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zvol.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zvol.h @@ -20,15 +20,13 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _SYS_ZVOL_H #define _SYS_ZVOL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #ifdef __cplusplus @@ -43,10 +41,10 @@ extern int zvol_check_volsize(uint64_t volsize, uint64_t blocksize); extern int zvol_check_volblocksize(uint64_t volblocksize); extern int zvol_get_stats(objset_t *os, nvlist_t *nv); extern void zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); -extern int zvol_create_minor(const char *, major_t); +extern int zvol_create_minor(const char *); extern int zvol_remove_minor(const char *); +extern void zvol_remove_minors(const char *); extern int zvol_set_volsize(const char *, major_t, uint64_t); -extern int zvol_set_volblocksize(const char *, uint64_t); extern int zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr); extern int zvol_dump(dev_t dev, caddr_t addr, daddr_t offset, int nblocks); diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/txg.c b/external/cddl/osnet/dist/uts/common/fs/zfs/txg.c index 2bbf2f086c154..fb62f108940fc 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/txg.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/txg.c @@ -19,13 +19,14 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #include #include #include +#include #include #include @@ -57,12 +58,20 @@ txg_init(dsl_pool_t *dp, uint64_t txg) for (i = 0; i < TXG_SIZE; i++) { cv_init(&tx->tx_cpu[c].tc_cv[i], NULL, CV_DEFAULT, NULL); + list_create(&tx->tx_cpu[c].tc_callbacks[i], + sizeof (dmu_tx_callback_t), + offsetof(dmu_tx_callback_t, dcb_node)); } } - rw_init(&tx->tx_suspend, NULL, RW_DEFAULT, NULL); mutex_init(&tx->tx_sync_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&tx->tx_sync_more_cv, NULL, CV_DEFAULT, NULL); + cv_init(&tx->tx_sync_done_cv, NULL, CV_DEFAULT, NULL); + cv_init(&tx->tx_quiesce_more_cv, NULL, CV_DEFAULT, NULL); + cv_init(&tx->tx_quiesce_done_cv, NULL, CV_DEFAULT, NULL); + cv_init(&tx->tx_exit_cv, NULL, CV_DEFAULT, NULL); + tx->tx_open_txg = txg; } @@ -77,17 +86,27 @@ txg_fini(dsl_pool_t *dp) ASSERT(tx->tx_threads == 0); - rw_destroy(&tx->tx_suspend); mutex_destroy(&tx->tx_sync_lock); + cv_destroy(&tx->tx_sync_more_cv); + cv_destroy(&tx->tx_sync_done_cv); + cv_destroy(&tx->tx_quiesce_more_cv); + cv_destroy(&tx->tx_quiesce_done_cv); + cv_destroy(&tx->tx_exit_cv); + for (c = 0; c < max_ncpus; c++) { int i; mutex_destroy(&tx->tx_cpu[c].tc_lock); - for (i = 0; i < TXG_SIZE; i++) + for (i = 0; i < TXG_SIZE; i++) { cv_destroy(&tx->tx_cpu[c].tc_cv[i]); + list_destroy(&tx->tx_cpu[c].tc_callbacks[i]); + } } + if (tx->tx_commit_cb_taskq != NULL) + taskq_destroy(tx->tx_commit_cb_taskq); + kmem_free(tx->tx_cpu, max_ncpus * sizeof (tx_cpu_t)); bzero(tx, sizeof (tx_state_t)); @@ -147,7 +166,8 @@ txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, uint64_t time) CALLB_CPR_SAFE_BEGIN(cpr); if (time) - (void) cv_timedwait(cv, &tx->tx_sync_lock, lbolt + time); + (void) cv_timedwait(cv, &tx->tx_sync_lock, + ddi_get_lbolt() + time); else cv_wait(cv, &tx->tx_sync_lock); @@ -167,7 +187,11 @@ txg_sync_stop(dsl_pool_t *dp) * Finish off any work in progress. */ ASSERT(tx->tx_threads == 2); - txg_wait_synced(dp, 0); + + /* + * We need to ensure that we've vacated the deferred space_maps. + */ + txg_wait_synced(dp, tx->tx_open_txg + TXG_DEFER_SIZE); /* * Wake all sync threads and wait for them to die. 
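[Editor's illustration -- not part of the patch. The per-CPU tc_callbacks lists created in the hunk above hold DMU commit callbacks until their txg has been synced; the next hunk adds txg_register_callbacks() and the tx_commit_cb taskq that finally runs them. Below is a minimal sketch of how a caller might use this path, assuming the dmu_tx_callback_register() interface added elsewhere in the same changeset; example_commit_done() and example_touch_object() are hypothetical names.]

#include <sys/dmu.h>
#include <sys/cmn_err.h>

/*
 * Runs from the tx_commit_cb taskq after spa_sync() has written the txg
 * that carried this transaction (error is 0 on success).
 */
static void
example_commit_done(void *arg, int error)
{
	cmn_err(CE_NOTE, "commit callback fired for %p, error %d", arg, error);
}

static int
example_touch_object(objset_t *os, uint64_t object)
{
	dmu_tx_t *tx = dmu_tx_create(os);
	int error;

	dmu_tx_hold_bonus(tx, object);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error != 0) {
		dmu_tx_abort(tx);
		return (error);
	}

	/*
	 * The callback is queued on the tx, moved onto the owning tx_cpu's
	 * tc_callbacks list at commit, and dispatched by
	 * txg_dispatch_callbacks() once the txg has synced.
	 */
	dmu_tx_callback_register(tx, example_commit_done, NULL);
	dmu_tx_commit(tx);
	return (0);
}

[End of editor's illustration.]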
@@ -216,6 +240,17 @@ txg_rele_to_quiesce(txg_handle_t *th) mutex_exit(&tc->tc_lock); } +void +txg_register_callbacks(txg_handle_t *th, list_t *tx_callbacks) +{ + tx_cpu_t *tc = th->th_cpu; + int g = th->th_txg & TXG_MASK; + + mutex_enter(&tc->tc_lock); + list_move_tail(&tc->tc_callbacks[g], tx_callbacks); + mutex_exit(&tc->tc_lock); +} + void txg_rele_to_sync(txg_handle_t *th) { @@ -266,9 +301,59 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg) } } +static void +txg_do_callbacks(list_t *cb_list) +{ + dmu_tx_do_callbacks(cb_list, 0); + + list_destroy(cb_list); + + kmem_free(cb_list, sizeof (list_t)); +} + +/* + * Dispatch the commit callbacks registered on this txg to worker threads. + */ +static void +txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg) +{ + int c; + tx_state_t *tx = &dp->dp_tx; + list_t *cb_list; + + for (c = 0; c < max_ncpus; c++) { + tx_cpu_t *tc = &tx->tx_cpu[c]; + /* No need to lock tx_cpu_t at this point */ + + int g = txg & TXG_MASK; + + if (list_is_empty(&tc->tc_callbacks[g])) + continue; + + if (tx->tx_commit_cb_taskq == NULL) { + /* + * Commit callback taskq hasn't been created yet. + */ + tx->tx_commit_cb_taskq = taskq_create("tx_commit_cb", + max_ncpus, minclsyspri, max_ncpus, max_ncpus * 2, + TASKQ_PREPOPULATE); + } + + cb_list = kmem_alloc(sizeof (list_t), KM_SLEEP); + list_create(cb_list, sizeof (dmu_tx_callback_t), + offsetof(dmu_tx_callback_t, dcb_node)); + + list_move_tail(&tc->tc_callbacks[g], cb_list); + + (void) taskq_dispatch(tx->tx_commit_cb_taskq, (task_func_t *) + txg_do_callbacks, cb_list, TQ_SLEEP); + } +} + static void txg_sync_thread(dsl_pool_t *dp) { + spa_t *spa = dp->dp_spa; tx_state_t *tx = &dp->dp_tx; callb_cpr_t cpr; uint64_t start, delta; @@ -287,14 +372,15 @@ txg_sync_thread(dsl_pool_t *dp) */ timer = (delta >= timeout ? 0 : timeout - delta); while ((dp->dp_scrub_func == SCRUB_FUNC_NONE || - spa_shutting_down(dp->dp_spa)) && + spa_load_state(spa) != SPA_LOAD_NONE || + spa_shutting_down(spa)) && !tx->tx_exiting && timer > 0 && tx->tx_synced_txg >= tx->tx_sync_txg_waiting && tx->tx_quiesced_txg == 0) { dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n", tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp); txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, timer); - delta = lbolt - start; + delta = ddi_get_lbolt() - start; timer = (delta > timeout ? 0 : timeout - delta); } @@ -312,8 +398,6 @@ txg_sync_thread(dsl_pool_t *dp) if (tx->tx_exiting) txg_thread_exit(tx, &cpr, &tx->tx_sync_thread); - rw_enter(&tx->tx_suspend, RW_WRITER); - /* * Consume the quiesced txg which has been handed off to * us. This may cause the quiescing thread to now be @@ -323,22 +407,24 @@ txg_sync_thread(dsl_pool_t *dp) tx->tx_quiesced_txg = 0; tx->tx_syncing_txg = txg; cv_broadcast(&tx->tx_quiesce_more_cv); - rw_exit(&tx->tx_suspend); dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); mutex_exit(&tx->tx_sync_lock); - start = lbolt; - spa_sync(dp->dp_spa, txg); - delta = lbolt - start; + start = ddi_get_lbolt(); + spa_sync(spa, txg); + delta = ddi_get_lbolt() - start; mutex_enter(&tx->tx_sync_lock); - rw_enter(&tx->tx_suspend, RW_WRITER); tx->tx_synced_txg = txg; tx->tx_syncing_txg = 0; - rw_exit(&tx->tx_suspend); cv_broadcast(&tx->tx_sync_done_cv); + + /* + * Dispatch commit callbacks to worker threads. 
+ */ + txg_dispatch_callbacks(dp, txg); } } @@ -395,7 +481,7 @@ void txg_delay(dsl_pool_t *dp, uint64_t txg, int ticks) { tx_state_t *tx = &dp->dp_tx; - int timeout = lbolt + ticks; + int timeout = ddi_get_lbolt() + ticks; /* don't delay if this txg could transition to quiesing immediately */ if (tx->tx_open_txg > txg || @@ -408,7 +494,7 @@ txg_delay(dsl_pool_t *dp, uint64_t txg, int ticks) return; } - while (lbolt < timeout && + while (ddi_get_lbolt() < timeout && tx->tx_syncing_txg < txg-1 && !txg_stalled(dp)) (void) cv_timedwait(&tx->tx_quiesce_more_cv, &tx->tx_sync_lock, timeout); @@ -424,7 +510,7 @@ txg_wait_synced(dsl_pool_t *dp, uint64_t txg) mutex_enter(&tx->tx_sync_lock); ASSERT(tx->tx_threads == 2); if (txg == 0) - txg = tx->tx_open_txg; + txg = tx->tx_open_txg + TXG_DEFER_SIZE; if (tx->tx_sync_txg_waiting < txg) tx->tx_sync_txg_waiting = txg; dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", @@ -475,21 +561,6 @@ txg_sync_waiting(dsl_pool_t *dp) tx->tx_quiesced_txg != 0); } -void -txg_suspend(dsl_pool_t *dp) -{ - tx_state_t *tx = &dp->dp_tx; - /* XXX some code paths suspend when they are already suspended! */ - rw_enter(&tx->tx_suspend, RW_READER); -} - -void -txg_resume(dsl_pool_t *dp) -{ - tx_state_t *tx = &dp->dp_tx; - rw_exit(&tx->tx_suspend); -} - /* * Per-txg object lists. */ diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/vdev.c b/external/cddl/osnet/dist/uts/common/fs/zfs/vdev.c index 16a27e514a41b..48082c8bf9479 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/vdev.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/vdev.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -39,6 +39,7 @@ #include #include #include +#include /* * Virtual device management. */ @@ -53,6 +54,7 @@ static vdev_ops_t *vdev_ops_table[] = { &vdev_disk_ops, &vdev_file_ops, &vdev_missing_ops, + &vdev_hole_ops, NULL }; @@ -83,9 +85,8 @@ vdev_default_asize(vdev_t *vd, uint64_t psize) { uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift); uint64_t csize; - uint64_t c; - for (c = 0; c < vd->vdev_children; c++) { + for (int c = 0; c < vd->vdev_children; c++) { csize = vdev_psize_to_asize(vd->vdev_child[c], psize); asize = MAX(asize, csize); } @@ -94,40 +95,47 @@ vdev_default_asize(vdev_t *vd, uint64_t psize) } /* - * Get the replaceable or attachable device size. - * If the parent is a mirror or raidz, the replaceable size is the minimum - * psize of all its children. For the rest, just return our own psize. - * - * e.g. - * psize rsize - * root - - - * mirror/raidz - - - * disk1 20g 20g - * disk2 40g 20g - * disk3 80g 80g + * Get the minimum allocatable size. We define the allocatable size as + * the vdev's asize rounded to the nearest metaslab. This allows us to + * replace or attach devices which don't have the same physical size but + * can still satisfy the same number of allocations. */ uint64_t -vdev_get_rsize(vdev_t *vd) +vdev_get_min_asize(vdev_t *vd) { - vdev_t *pvd, *cvd; - uint64_t c, rsize; + vdev_t *pvd = vd->vdev_parent; - pvd = vd->vdev_parent; + /* + * If our parent is NULL (inactive spare or cache) or is the root, + * just return our own asize. + */ + if (pvd == NULL) + return (vd->vdev_asize); + + /* + * The top-level vdev just returns the allocatable size rounded + * to the nearest metaslab. 
+ */ + if (vd == vd->vdev_top) + return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift)); /* - * If our parent is NULL or the root, just return our own psize. + * The allocatable space for a raidz vdev is N * sizeof(smallest child), + * so each child must provide at least 1/Nth of its asize. */ - if (pvd == NULL || pvd->vdev_parent == NULL) - return (vd->vdev_psize); + if (pvd->vdev_ops == &vdev_raidz_ops) + return (pvd->vdev_min_asize / pvd->vdev_children); - rsize = 0; + return (pvd->vdev_min_asize); +} - for (c = 0; c < pvd->vdev_children; c++) { - cvd = pvd->vdev_child[c]; - rsize = MIN(rsize - 1, cvd->vdev_psize - 1) + 1; - } +void +vdev_set_min_asize(vdev_t *vd) +{ + vd->vdev_min_asize = vdev_get_min_asize(vd); - return (rsize); + for (int c = 0; c < vd->vdev_children; c++) + vdev_set_min_asize(vd->vdev_child[c]); } vdev_t * @@ -148,13 +156,12 @@ vdev_lookup_top(spa_t *spa, uint64_t vdev) vdev_t * vdev_lookup_by_guid(vdev_t *vd, uint64_t guid) { - int c; vdev_t *mvd; if (vd->vdev_guid == guid) return (vd); - for (c = 0; c < vd->vdev_children; c++) + for (int c = 0; c < vd->vdev_children; c++) if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) != NULL) return (mvd); @@ -250,17 +257,17 @@ vdev_compact_children(vdev_t *pvd) { vdev_t **newchild, *cvd; int oldc = pvd->vdev_children; - int newc, c; + int newc; ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); - for (c = newc = 0; c < oldc; c++) + for (int c = newc = 0; c < oldc; c++) if (pvd->vdev_child[c]) newc++; newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP); - for (c = newc = 0; c < oldc; c++) { + for (int c = newc = 0; c < oldc; c++) { if ((cvd = pvd->vdev_child[c]) != NULL) { newchild[newc] = cvd; cvd->vdev_id = newc++; @@ -275,7 +282,7 @@ vdev_compact_children(vdev_t *pvd) /* * Allocate and minimally initialize a vdev_t. */ -static vdev_t * +vdev_t * vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) { vdev_t *vd; @@ -287,21 +294,18 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) spa->spa_root_vdev = vd; } - if (guid == 0) { + if (guid == 0 && ops != &vdev_hole_ops) { if (spa->spa_root_vdev == vd) { /* * The root vdev's guid will also be the pool guid, * which must be unique among all pools. */ - while (guid == 0 || spa_guid_exists(guid, 0)) - guid = spa_get_random(-1ULL); + guid = spa_generate_guid(NULL); } else { /* * Any other vdev's guid must be unique within the pool. 
*/ - while (guid == 0 || - spa_guid_exists(spa_guid(spa), guid)) - guid = spa_get_random(-1ULL); + guid = spa_generate_guid(spa); } ASSERT(!spa_guid_exists(spa_guid(spa), guid)); } @@ -312,12 +316,15 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) vd->vdev_guid_sum = guid; vd->vdev_ops = ops; vd->vdev_state = VDEV_STATE_CLOSED; + vd->vdev_ishole = (ops == &vdev_hole_ops); mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); - space_map_create(&vd->vdev_dtl_map, 0, -1ULL, 0, &vd->vdev_dtl_lock); - space_map_create(&vd->vdev_dtl_scrub, 0, -1ULL, 0, &vd->vdev_dtl_lock); + for (int t = 0; t < DTL_TYPES; t++) { + space_map_create(&vd->vdev_dtl[t], 0, -1ULL, 0, + &vd->vdev_dtl_lock); + } txg_list_create(&vd->vdev_ms_list, offsetof(struct metaslab, ms_txg_node)); txg_list_create(&vd->vdev_dtl_list, @@ -370,6 +377,9 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, } else if (alloctype == VDEV_ALLOC_L2CACHE) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (EINVAL); + } else if (alloctype == VDEV_ALLOC_ROOTPOOL) { + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) + return (EINVAL); } /* @@ -386,6 +396,9 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, if (islog && spa_version(spa) < SPA_VERSION_SLOGS) return (ENOTSUP); + if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES) + return (ENOTSUP); + /* * Set the nparity property for RAID-Z vdevs. */ @@ -393,23 +406,24 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, if (ops == &vdev_raidz_ops) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) { - /* - * Currently, we can only support 2 parity devices. - */ - if (nparity == 0 || nparity > 2) + if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) return (EINVAL); /* - * Older versions can only support 1 parity device. + * Previous versions could only support 1 or 2 parity + * device. */ - if (nparity == 2 && - spa_version(spa) < SPA_VERSION_RAID6) + if (nparity > 1 && + spa_version(spa) < SPA_VERSION_RAIDZ2) + return (ENOTSUP); + if (nparity > 2 && + spa_version(spa) < SPA_VERSION_RAIDZ3) return (ENOTSUP); } else { /* * We require the parity to be specified for SPAs that * support multiple parity levels. */ - if (spa_version(spa) >= SPA_VERSION_RAID6) + if (spa_version(spa) >= SPA_VERSION_RAIDZ2) return (EINVAL); /* * Otherwise, we default to 1 parity device for RAID-Z. @@ -433,6 +447,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, &vd->vdev_physpath) == 0) vd->vdev_physpath = spa_strdup(vd->vdev_physpath); + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0) + vd->vdev_fru = spa_strdup(vd->vdev_fru); /* * Set the whole_disk property. If it's not specified, leave the value @@ -446,19 +462,25 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, * Look for the 'not present' flag. This will only be set if the device * was not present at the time of import. */ - if (!spa->spa_import_faulted) - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, - &vd->vdev_not_present); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, + &vd->vdev_not_present); /* * Get the alignment requirement. 
*/ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift); + /* + * Retrieve the vdev creation time. + */ + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, + &vd->vdev_crtxg); + /* * If we're a top-level vdev, try to load the allocation parameters. */ - if (parent && !parent->vdev_parent && alloctype == VDEV_ALLOC_LOAD) { + if (parent && !parent->vdev_parent && + (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, &vd->vdev_ms_array); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, @@ -467,32 +489,63 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, &vd->vdev_asize); } + if (parent && !parent->vdev_parent) { + ASSERT(alloctype == VDEV_ALLOC_LOAD || + alloctype == VDEV_ALLOC_ADD || + alloctype == VDEV_ALLOC_SPLIT || + alloctype == VDEV_ALLOC_ROOTPOOL); + vd->vdev_mg = metaslab_group_create(islog ? + spa_log_class(spa) : spa_normal_class(spa), vd); + } + /* * If we're a leaf vdev, try to load the DTL object and other state. */ if (vd->vdev_ops->vdev_op_leaf && - (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE)) { + (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE || + alloctype == VDEV_ALLOC_ROOTPOOL)) { if (alloctype == VDEV_ALLOC_LOAD) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL, - &vd->vdev_dtl.smo_object); + &vd->vdev_dtl_smo.smo_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE, &vd->vdev_unspare); } + + if (alloctype == VDEV_ALLOC_ROOTPOOL) { + uint64_t spare = 0; + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE, + &spare) == 0 && spare) + spa_spare_add(vd); + } + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &vd->vdev_offline); /* * When importing a pool, we want to ignore the persistent fault * state, as the diagnosis made on another system may not be - * valid in the current context. + * valid in the current context. Local vdevs will + * remain in the faulted state. */ - if (spa->spa_load_state == SPA_LOAD_OPEN) { + if (spa_load_state(spa) == SPA_LOAD_OPEN) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, &vd->vdev_faulted); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED, &vd->vdev_degraded); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, &vd->vdev_removed); + + if (vd->vdev_faulted || vd->vdev_degraded) { + char *aux; + + vd->vdev_label_aux = + VDEV_AUX_ERR_EXCEEDED; + if (nvlist_lookup_string(nv, + ZPOOL_CONFIG_AUX_STATE, &aux) == 0 && + strcmp(aux, "external") == 0) + vd->vdev_label_aux = VDEV_AUX_EXTERNAL; + } } } @@ -509,7 +562,6 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, void vdev_free(vdev_t *vd) { - int c; spa_t *spa = vd->vdev_spa; /* @@ -519,11 +571,12 @@ vdev_free(vdev_t *vd) vdev_close(vd); ASSERT(!list_link_active(&vd->vdev_config_dirty_node)); + ASSERT(!list_link_active(&vd->vdev_state_dirty_node)); /* * Free all children. */ - for (c = 0; c < vd->vdev_children; c++) + for (int c = 0; c < vd->vdev_children; c++) vdev_free(vd->vdev_child[c]); ASSERT(vd->vdev_child == NULL); @@ -532,8 +585,10 @@ vdev_free(vdev_t *vd) /* * Discard allocation state. 
*/ - if (vd == vd->vdev_top) + if (vd->vdev_mg != NULL) { vdev_metaslab_fini(vd); + metaslab_group_destroy(vd->vdev_mg); + } ASSERT3U(vd->vdev_stat.vs_space, ==, 0); ASSERT3U(vd->vdev_stat.vs_dspace, ==, 0); @@ -558,6 +613,8 @@ vdev_free(vdev_t *vd) spa_strfree(vd->vdev_devid); if (vd->vdev_physpath) spa_strfree(vd->vdev_physpath); + if (vd->vdev_fru) + spa_strfree(vd->vdev_fru); if (vd->vdev_isspare) spa_spare_remove(vd); @@ -566,12 +623,14 @@ vdev_free(vdev_t *vd) txg_list_destroy(&vd->vdev_ms_list); txg_list_destroy(&vd->vdev_dtl_list); + mutex_enter(&vd->vdev_dtl_lock); - space_map_unload(&vd->vdev_dtl_map); - space_map_destroy(&vd->vdev_dtl_map); - space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL); - space_map_destroy(&vd->vdev_dtl_scrub); + for (int t = 0; t < DTL_TYPES; t++) { + space_map_unload(&vd->vdev_dtl[t]); + space_map_destroy(&vd->vdev_dtl[t]); + } mutex_exit(&vd->vdev_dtl_lock); + mutex_destroy(&vd->vdev_dtl_lock); mutex_destroy(&vd->vdev_stat_lock); mutex_destroy(&vd->vdev_probe_lock); @@ -649,14 +708,12 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd) static void vdev_top_update(vdev_t *tvd, vdev_t *vd) { - int c; - if (vd == NULL) return; vd->vdev_top = tvd; - for (c = 0; c < vd->vdev_children; c++) + for (int c = 0; c < vd->vdev_children; c++) vdev_top_update(tvd, vd->vdev_child[c]); } @@ -675,8 +732,10 @@ vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops) mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops); mvd->vdev_asize = cvd->vdev_asize; + mvd->vdev_min_asize = cvd->vdev_min_asize; mvd->vdev_ashift = cvd->vdev_ashift; mvd->vdev_state = cvd->vdev_state; + mvd->vdev_crtxg = cvd->vdev_crtxg; vdev_remove_child(pvd, cvd); vdev_add_child(pvd, mvd); @@ -709,14 +768,19 @@ vdev_remove_parent(vdev_t *cvd) vdev_remove_child(mvd, cvd); vdev_remove_child(pvd, mvd); + /* * If cvd will replace mvd as a top-level vdev, preserve mvd's guid. * Otherwise, we could have detached an offline device, and when we * go to import the pool we'll think we have two top-level vdevs, * instead of a different version of the same top-level vdev. */ - if (mvd->vdev_top == mvd) - cvd->vdev_guid = cvd->vdev_guid_sum = mvd->vdev_guid; + if (mvd->vdev_top == mvd) { + uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid; + cvd->vdev_orig_guid = cvd->vdev_guid; + cvd->vdev_guid += guid_delta; + cvd->vdev_guid_sum += guid_delta; + } cvd->vdev_id = mvd->vdev_id; vdev_add_child(pvd, cvd); vdev_top_update(cvd->vdev_top, cvd->vdev_top); @@ -733,25 +797,32 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; - metaslab_class_t *mc; uint64_t m; uint64_t oldc = vd->vdev_ms_count; uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift; metaslab_t **mspp; int error; - if (vd->vdev_ms_shift == 0) /* not being allocated from yet */ + ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER)); + + /* + * This vdev is not being allocated from yet or is a hole. + */ + if (vd->vdev_ms_shift == 0) return (0); - ASSERT(oldc <= newc); + ASSERT(!vd->vdev_ishole); - if (vd->vdev_islog) - mc = spa->spa_log_class; - else - mc = spa->spa_normal_class; + /* + * Compute the raidz-deflation ratio. Note, we hard-code + * in 128k (1 << 17) because it is the current "typical" blocksize. + * Even if SPA_MAXBLOCKSIZE changes, this algorithm must never change, + * or we will inconsistently account for existing bp's. 
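The ratio computed just below is a fixed-point scale factor: the 128K reference block's physical size divided by its allocated size expressed in 512-byte units. A small worked example, assuming SPA_MINBLOCKSHIFT is 9 and using an invented 25% parity overhead in place of the real vdev_psize_to_asize():

#include <stdint.h>
#include <stdio.h>

#define	SPA_MINBLOCKSHIFT	9	/* 512-byte units */

/*
 * Illustrative stand-in for vdev_psize_to_asize(): pretend the raidz
 * layout adds 25% of allocated overhead on top of the physical size.
 * Real values depend on ashift, nparity, and the number of children.
 */
static uint64_t
example_psize_to_asize(uint64_t psize)
{
	return (psize + (psize >> 2));
}

int
main(void)
{
	uint64_t psize = 1ULL << 17;	/* the hard-coded 128K reference size */
	uint64_t deflate_ratio =
	    psize / (example_psize_to_asize(psize) >> SPA_MINBLOCKSHIFT);

	/*
	 * A plain disk would give 131072 / 256 = 512; the assumed 25%
	 * overhead gives 131072 / 320 = 409, i.e. deflated space is
	 * counted at roughly 80% of raw space.
	 */
	(void) printf("deflate ratio = %llu\n",
	    (unsigned long long)deflate_ratio);
	return (0);
}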
+ */ + vd->vdev_deflate_ratio = (1 << 17) / + (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT); - if (vd->vdev_mg == NULL) - vd->vdev_mg = metaslab_group_create(mc, vd); + ASSERT(oldc <= newc); mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP); @@ -768,7 +839,8 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) if (txg == 0) { uint64_t object = 0; error = dmu_read(mos, vd->vdev_ms_array, - m * sizeof (uint64_t), sizeof (uint64_t), &object); + m * sizeof (uint64_t), sizeof (uint64_t), &object, + DMU_READ_PREFETCH); if (error) return (error); if (object != 0) { @@ -786,6 +858,15 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) m << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg); } + if (txg == 0) + spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER); + + if (oldc == 0) + metaslab_group_activate(vd->vdev_mg); + + if (txg == 0) + spa_config_exit(spa, SCL_ALLOC, FTAG); + return (0); } @@ -796,6 +877,7 @@ vdev_metaslab_fini(vdev_t *vd) uint64_t count = vd->vdev_ms_count; if (vd->vdev_ms != NULL) { + metaslab_group_passivate(vd->vdev_mg); for (m = 0; m < count; m++) if (vd->vdev_ms[m] != NULL) metaslab_fini(vd->vdev_ms[m]); @@ -808,22 +890,22 @@ typedef struct vdev_probe_stats { boolean_t vps_readable; boolean_t vps_writeable; int vps_flags; - zio_t *vps_root; - vdev_t *vps_vd; } vdev_probe_stats_t; static void vdev_probe_done(zio_t *zio) { + spa_t *spa = zio->io_spa; + vdev_t *vd = zio->io_vd; vdev_probe_stats_t *vps = zio->io_private; - vdev_t *vd = vps->vps_vd; + + ASSERT(vd->vdev_probe_zio != NULL); if (zio->io_type == ZIO_TYPE_READ) { - ASSERT(zio->io_vd == vd); if (zio->io_error == 0) vps->vps_readable = 1; - if (zio->io_error == 0 && (spa_mode & FWRITE)) { - zio_nowait(zio_write_phys(vps->vps_root, vd, + if (zio->io_error == 0 && spa_writeable(spa)) { + zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd, zio->io_offset, zio->io_size, zio->io_data, ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE)); @@ -831,26 +913,34 @@ vdev_probe_done(zio_t *zio) zio_buf_free(zio->io_data, zio->io_size); } } else if (zio->io_type == ZIO_TYPE_WRITE) { - ASSERT(zio->io_vd == vd); if (zio->io_error == 0) vps->vps_writeable = 1; zio_buf_free(zio->io_data, zio->io_size); } else if (zio->io_type == ZIO_TYPE_NULL) { - ASSERT(zio->io_vd == NULL); - ASSERT(zio == vps->vps_root); + zio_t *pio; vd->vdev_cant_read |= !vps->vps_readable; vd->vdev_cant_write |= !vps->vps_writeable; if (vdev_readable(vd) && - (vdev_writeable(vd) || !(spa_mode & FWRITE))) { + (vdev_writeable(vd) || !spa_writeable(spa))) { zio->io_error = 0; } else { ASSERT(zio->io_error != 0); zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, - zio->io_spa, vd, NULL, 0, 0); + spa, vd, NULL, 0, 0); zio->io_error = ENXIO; } + + mutex_enter(&vd->vdev_probe_lock); + ASSERT(vd->vdev_probe_zio == zio); + vd->vdev_probe_zio = NULL; + mutex_exit(&vd->vdev_probe_lock); + + while ((pio = zio_walk_parents(zio)) != NULL) + if (!vdev_accessible(vd, pio)) + pio->io_error = ENXIO; + kmem_free(vps, sizeof (*vps)); } } @@ -861,53 +951,139 @@ vdev_probe_done(zio_t *zio) * but the first (which we leave alone in case it contains a VTOC). 
*/ zio_t * -vdev_probe(vdev_t *vd, zio_t *pio) +vdev_probe(vdev_t *vd, zio_t *zio) { spa_t *spa = vd->vdev_spa; - vdev_probe_stats_t *vps; - zio_t *zio; + vdev_probe_stats_t *vps = NULL; + zio_t *pio; - vps = kmem_zalloc(sizeof (*vps), KM_SLEEP); + ASSERT(vd->vdev_ops->vdev_op_leaf); - vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE | - ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_DONT_RETRY; + /* + * Don't probe the probe. + */ + if (zio && (zio->io_flags & ZIO_FLAG_PROBE)) + return (NULL); - if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { - /* - * vdev_cant_read and vdev_cant_write can only transition - * from TRUE to FALSE when we have the SCL_ZIO lock as writer; - * otherwise they can only transition from FALSE to TRUE. - * This ensures that any zio looking at these values can - * assume that failures persist for the life of the I/O. - * That's important because when a device has intermittent - * connectivity problems, we want to ensure that they're - * ascribed to the device (ENXIO) and not the zio (EIO). - * - * Since we hold SCL_ZIO as writer here, clear both values - * so the probe can reevaluate from first principles. - */ - vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER; - vd->vdev_cant_read = B_FALSE; - vd->vdev_cant_write = B_FALSE; + /* + * To prevent 'probe storms' when a device fails, we create + * just one probe i/o at a time. All zios that want to probe + * this vdev will become parents of the probe io. + */ + mutex_enter(&vd->vdev_probe_lock); + + if ((pio = vd->vdev_probe_zio) == NULL) { + vps = kmem_zalloc(sizeof (*vps), KM_SLEEP); + + vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE | + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE | + ZIO_FLAG_TRYHARD; + + if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { + /* + * vdev_cant_read and vdev_cant_write can only + * transition from TRUE to FALSE when we have the + * SCL_ZIO lock as writer; otherwise they can only + * transition from FALSE to TRUE. This ensures that + * any zio looking at these values can assume that + * failures persist for the life of the I/O. That's + * important because when a device has intermittent + * connectivity problems, we want to ensure that + * they're ascribed to the device (ENXIO) and not + * the zio (EIO). + * + * Since we hold SCL_ZIO as writer here, clear both + * values so the probe can reevaluate from first + * principles. 
+ */ + vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER; + vd->vdev_cant_read = B_FALSE; + vd->vdev_cant_write = B_FALSE; + } + + vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd, + vdev_probe_done, vps, + vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE); + + if (zio != NULL) { + vd->vdev_probe_wanted = B_TRUE; + spa_async_request(spa, SPA_ASYNC_PROBE); + } } - ASSERT(vd->vdev_ops->vdev_op_leaf); + if (zio != NULL) + zio_add_child(zio, pio); - zio = zio_null(pio, spa, vdev_probe_done, vps, vps->vps_flags); + mutex_exit(&vd->vdev_probe_lock); - vps->vps_root = zio; - vps->vps_vd = vd; + if (vps == NULL) { + ASSERT(zio != NULL); + return (NULL); + } for (int l = 1; l < VDEV_LABELS; l++) { - zio_nowait(zio_read_phys(zio, vd, + zio_nowait(zio_read_phys(pio, vd, vdev_label_offset(vd->vdev_psize, l, - offsetof(vdev_label_t, vl_pad)), - VDEV_SKIP_SIZE, zio_buf_alloc(VDEV_SKIP_SIZE), + offsetof(vdev_label_t, vl_pad2)), + VDEV_PAD_SIZE, zio_buf_alloc(VDEV_PAD_SIZE), ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE)); } - return (zio); + if (zio == NULL) + return (pio); + + zio_nowait(pio); + return (NULL); +} + +static void +vdev_open_child(void *arg) +{ + vdev_t *vd = arg; + + vd->vdev_open_thread = curthread; + vd->vdev_open_error = vdev_open(vd); + vd->vdev_open_thread = NULL; +} + +boolean_t +vdev_uses_zvols(vdev_t *vd) +{ + if (vd->vdev_path && strncmp(vd->vdev_path, ZVOL_DIR, + strlen(ZVOL_DIR)) == 0) + return (B_TRUE); + for (int c = 0; c < vd->vdev_children; c++) + if (vdev_uses_zvols(vd->vdev_child[c])) + return (B_TRUE); + return (B_FALSE); +} + +void +vdev_open_children(vdev_t *vd) +{ + taskq_t *tq; + int children = vd->vdev_children; + + /* + * in order to handle pools on top of zvols, do the opens + * in a single thread so that the same thread holds the + * spa_namespace_lock + */ + if (vdev_uses_zvols(vd)) { + for (int c = 0; c < children; c++) + vd->vdev_child[c]->vdev_open_error = + vdev_open(vd->vdev_child[c]); + return; + } + tq = taskq_create("vdev_open", children, minclsyspri, + children, children, TASKQ_PREPOPULATE); + + for (int c = 0; c < children; c++) + VERIFY(taskq_dispatch(tq, vdev_open_child, vd->vdev_child[c], + TQ_SLEEP) != NULL); + + taskq_destroy(tq); } /* @@ -916,22 +1092,33 @@ vdev_probe(vdev_t *vd, zio_t *pio) int vdev_open(vdev_t *vd) { + spa_t *spa = vd->vdev_spa; int error; - int c; uint64_t osize = 0; uint64_t asize, psize; uint64_t ashift = 0; + ASSERT(vd->vdev_open_thread == curthread || + spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); ASSERT(vd->vdev_state == VDEV_STATE_CLOSED || vd->vdev_state == VDEV_STATE_CANT_OPEN || vd->vdev_state == VDEV_STATE_OFFLINE); vd->vdev_stat.vs_aux = VDEV_AUX_NONE; + vd->vdev_cant_read = B_FALSE; + vd->vdev_cant_write = B_FALSE; + vd->vdev_min_asize = vdev_get_min_asize(vd); + /* + * If this vdev is not removed, check its fault status. If it's + * faulted, bail out of the open. + */ if (!vd->vdev_removed && vd->vdev_faulted) { ASSERT(vd->vdev_children == 0); + ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || + vd->vdev_label_aux == VDEV_AUX_EXTERNAL); vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, - VDEV_AUX_ERR_EXCEEDED); + vd->vdev_label_aux); return (ENXIO); } else if (vd->vdev_offline) { ASSERT(vd->vdev_children == 0); @@ -941,8 +1128,13 @@ vdev_open(vdev_t *vd) error = vd->vdev_ops->vdev_op_open(vd, &osize, &ashift); + /* + * Reset the vdev_reopening flag so that we actually close + * the vdev on error. 
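The net effect of the logic above is that every zio that hits an error on a vdev shares the one outstanding probe and inherits its verdict, rather than each issuing its own. A toy model of that sharing with simplified stand-in structures (not the real zio_t or its interfaces):

#include <errno.h>
#include <stddef.h>
#include <stdio.h>

#define	MAX_PARENTS	8

typedef struct toy_zio {
	int io_error;
} toy_zio_t;

typedef struct toy_probe {
	toy_zio_t *parents[MAX_PARENTS];
	int nparents;
} toy_probe_t;

static toy_probe_t *probe;	/* plays the role of vd->vdev_probe_zio */

static void
toy_probe_issue(toy_zio_t *zio)
{
	static toy_probe_t the_probe;

	if (probe == NULL) {		/* first failure creates the probe */
		the_probe.nparents = 0;
		probe = &the_probe;
	}
	probe->parents[probe->nparents++] = zio;	/* later callers just join */
}

static void
toy_probe_done(int vdev_accessible)
{
	/* Mirrors the zio_walk_parents() pass in vdev_probe_done(). */
	for (int i = 0; i < probe->nparents; i++)
		if (!vdev_accessible)
			probe->parents[i]->io_error = ENXIO;
	probe = NULL;
}

int
main(void)
{
	toy_zio_t a = { 0 }, b = { 0 };

	toy_probe_issue(&a);	/* creates the probe */
	toy_probe_issue(&b);	/* reuses it -- no probe storm */
	toy_probe_done(0);	/* device inaccessible: both parents get ENXIO */
	(void) printf("a=%d b=%d\n", a.io_error, b.io_error);
	return (0);
}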
+ */ + vd->vdev_reopening = B_FALSE; if (zio_injection_enabled && error == 0) - error = zio_handle_device_injection(vd, ENXIO); + error = zio_handle_device_injection(vd, NULL, ENXIO); if (error) { if (vd->vdev_removed && @@ -956,20 +1148,40 @@ vdev_open(vdev_t *vd) vd->vdev_removed = B_FALSE; + /* + * Recheck the faulted flag now that we have confirmed that + * the vdev is accessible. If we're faulted, bail. + */ + if (vd->vdev_faulted) { + ASSERT(vd->vdev_children == 0); + ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || + vd->vdev_label_aux == VDEV_AUX_EXTERNAL); + vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, + vd->vdev_label_aux); + return (ENXIO); + } + if (vd->vdev_degraded) { ASSERT(vd->vdev_children == 0); vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, VDEV_AUX_ERR_EXCEEDED); } else { - vd->vdev_state = VDEV_STATE_HEALTHY; + vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0); } - for (c = 0; c < vd->vdev_children; c++) + /* + * For hole or missing vdevs we just return success. + */ + if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) + return (0); + + for (int c = 0; c < vd->vdev_children; c++) { if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) { vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); break; } + } osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t)); @@ -994,6 +1206,15 @@ vdev_open(vdev_t *vd) vd->vdev_psize = psize; + /* + * Make sure the allocatable size hasn't shrunk. + */ + if (asize < vd->vdev_min_asize) { + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_BAD_LABEL); + return (EINVAL); + } + if (vd->vdev_asize == 0) { /* * This is the first-ever open, so use the computed values. @@ -1010,25 +1231,18 @@ vdev_open(vdev_t *vd) VDEV_AUX_BAD_LABEL); return (EINVAL); } + } - /* - * Make sure the device hasn't shrunk. - */ - if (asize < vd->vdev_asize) { - vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_BAD_LABEL); - return (EINVAL); - } + /* + * If all children are healthy and the asize has increased, + * then we've experienced dynamic LUN growth. If automatic + * expansion is enabled then use the additional space. + */ + if (vd->vdev_state == VDEV_STATE_HEALTHY && asize > vd->vdev_asize && + (vd->vdev_expanding || spa->spa_autoexpand)) + vd->vdev_asize = asize; - /* - * If all children are healthy and the asize has increased, - * then we've experienced dynamic LUN growth. - */ - if (vd->vdev_state == VDEV_STATE_HEALTHY && - asize > vd->vdev_asize) { - vd->vdev_asize = asize; - } - } + vdev_set_min_asize(vd); /* * Ensure we can issue some IO before declaring the @@ -1041,30 +1255,14 @@ vdev_open(vdev_t *vd) return (error); } - /* - * If this is a top-level vdev, compute the raidz-deflation - * ratio. Note, we hard-code in 128k (1<<17) because it is the - * current "typical" blocksize. Even if SPA_MAXBLOCKSIZE - * changes, this algorithm must never change, or we will - * inconsistently account for existing bp's. - */ - if (vd->vdev_top == vd) { - vd->vdev_deflate_ratio = (1<<17) / - (vdev_psize_to_asize(vd, 1<<17) >> SPA_MINBLOCKSHIFT); - } - /* * If a leaf vdev has a DTL, and seems healthy, then kick off a - * resilver. But don't do this if we are doing a reopen for a - * scrub, since this would just restart the scrub we are already - * doing. + * resilver. But don't do this if we are doing a reopen for a scrub, + * since this would just restart the scrub we are already doing. 
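The size checks in vdev_open() above compare the probed asize against the recorded minimum rather than the previous asize, and only consume growth when expansion was requested. A condensed sketch of that decision with simplified fields (the real code also validates the first-open asize against the label):

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

typedef struct toy_vdev {
	uint64_t asize;		/* currently recorded allocatable size */
	uint64_t min_asize;	/* metaslab-rounded minimum */
	int expanding;		/* explicit expand request */
	int autoexpand;		/* pool-wide autoexpand property */
	int healthy;
} toy_vdev_t;

static int
toy_check_asize(toy_vdev_t *vd, uint64_t new_asize)
{
	if (new_asize < vd->min_asize)
		return (EINVAL);		/* device shrank below the minimum */

	if (vd->asize == 0) {
		vd->asize = new_asize;		/* first-ever open */
	} else if (vd->healthy && new_asize > vd->asize &&
	    (vd->expanding || vd->autoexpand)) {
		vd->asize = new_asize;		/* dynamic LUN growth */
	}
	return (0);
}

int
main(void)
{
	toy_vdev_t vd = { .asize = 10, .min_asize = 8, .healthy = 1 };

	(void) printf("%d\n", toy_check_asize(&vd, 7));	/* EINVAL: shrank */
	(void) printf("%d\n", toy_check_asize(&vd, 12));	/* 0: growth ignored */
	vd.autoexpand = 1;
	(void) printf("%d %llu\n", toy_check_asize(&vd, 12),
	    (unsigned long long)vd.asize);			/* 0 12: grown */
	return (0);
}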
*/ - if (vd->vdev_children == 0 && !vd->vdev_spa->spa_scrub_reopen) { - mutex_enter(&vd->vdev_dtl_lock); - if (vd->vdev_dtl_map.sm_space != 0 && vdev_writeable(vd)) - spa_async_request(vd->vdev_spa, SPA_ASYNC_RESILVER); - mutex_exit(&vd->vdev_dtl_lock); - } + if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen && + vdev_resilver_needed(vd, NULL, NULL)) + spa_async_request(spa, SPA_ASYNC_RESILVER); return (0); } @@ -1083,12 +1281,11 @@ int vdev_validate(vdev_t *vd) { spa_t *spa = vd->vdev_spa; - int c; nvlist_t *label; - uint64_t guid, top_guid; + uint64_t guid = 0, top_guid; uint64_t state; - for (c = 0; c < vd->vdev_children; c++) + for (int c = 0; c < vd->vdev_children; c++) if (vdev_validate(vd->vdev_child[c]) != 0) return (EBADF); @@ -1098,6 +1295,8 @@ vdev_validate(vdev_t *vd) * overwrite the previous state. */ if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { + uint64_t aux_guid = 0; + nvlist_t *nvl; if ((label = vdev_label_read_config(vd)) == NULL) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, @@ -1105,6 +1304,18 @@ vdev_validate(vdev_t *vd) return (0); } + /* + * Determine if this vdev has been split off into another + * pool. If so, then refuse to open it. + */ + if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID, + &aux_guid) == 0 && aux_guid == spa_guid(spa)) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_SPLIT_POOL); + nvlist_free(label); + return (0); + } + if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 || guid != spa_guid(spa)) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, @@ -1113,6 +1324,11 @@ vdev_validate(vdev_t *vd) return (0); } + if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl) + != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID, + &aux_guid) != 0) + aux_guid = 0; + /* * If this vdev just became a top-level vdev because its * sibling was detached, it will have adopted the parent's @@ -1120,12 +1336,16 @@ vdev_validate(vdev_t *vd) * Fortunately, either version of the label will have the * same top guid, so if we're a top-level vdev, we can * safely compare to that instead. + * + * If we split this vdev off instead, then we also check the + * original pool's guid. We don't want to consider the vdev + * corrupt if it is partway through a split operation. */ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 || nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID, &top_guid) != 0 || - (vd->vdev_guid != guid && + ((vd->vdev_guid != guid && vd->vdev_guid != aux_guid) && (vd->vdev_guid != top_guid || vd != vd->vdev_top))) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); @@ -1143,7 +1363,12 @@ vdev_validate(vdev_t *vd) nvlist_free(label); - if (spa->spa_load_state == SPA_LOAD_OPEN && + /* + * If spa->spa_load_verbatim is true, no need to check the + * state of the pool. + */ + if (!spa->spa_load_verbatim && + spa_load_state(spa) == SPA_LOAD_OPEN && state != POOL_STATE_ACTIVE) return (EBADF); @@ -1165,12 +1390,24 @@ vdev_validate(vdev_t *vd) void vdev_close(vdev_t *vd) { + spa_t *spa = vd->vdev_spa; + vdev_t *pvd = vd->vdev_parent; + + ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); + + /* + * If our parent is reopening, then we are as well, unless we are + * going offline. 
+ */ + if (pvd != NULL && pvd->vdev_reopening) + vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline); + vd->vdev_ops->vdev_op_close(vd); vdev_cache_purge(vd); /* - * We record the previous state before we close it, so that if we are + * We record the previous state before we close it, so that if we are * doing a reopen(), we don't generate FMA ereports if we notice that * it's still faulted. */ @@ -1183,6 +1420,12 @@ vdev_close(vdev_t *vd) vd->vdev_stat.vs_aux = VDEV_AUX_NONE; } +/* + * Reopen all interior vdevs and any unopened leaves. We don't actually + * reopen leaf vdevs which had previously been opened as they might deadlock + * on the spa_config_lock. Instead we only obtain the leaf's physical size. + * If the leaf has never been opened then open it, as usual. + */ void vdev_reopen(vdev_t *vd) { @@ -1190,6 +1433,8 @@ vdev_reopen(vdev_t *vd) ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); + /* set the reopening flag unless we're taking the vdev offline */ + vd->vdev_reopening = !vd->vdev_offline; vdev_close(vd); (void) vdev_open(vd); @@ -1201,12 +1446,9 @@ vdev_reopen(vdev_t *vd) if (vd->vdev_aux) { (void) vdev_validate_aux(vd); if (vdev_readable(vd) && vdev_writeable(vd) && - !l2arc_vdev_present(vd)) { - uint64_t size = vdev_get_rsize(vd); - l2arc_add_vdev(spa, vd, - VDEV_LABEL_START_SIZE, - size - VDEV_LABEL_START_SIZE); - } + vd->vdev_aux == &spa->spa_l2cache && + !l2arc_vdev_present(vd)) + l2arc_add_vdev(spa, vd); } else { (void) vdev_validate(vd); } @@ -1246,32 +1488,21 @@ vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing) return (0); } -/* - * The is the latter half of vdev_create(). It is distinct because it - * involves initiating transactions in order to do metaslab creation. - * For creation, we want to try to create all vdevs at once and then undo it - * if anything fails; this is much harder if we have pending transactions. - */ void -vdev_init(vdev_t *vd, uint64_t txg) +vdev_metaslab_set_size(vdev_t *vd) { /* * Aim for roughly 200 metaslabs per vdev. */ vd->vdev_ms_shift = highbit(vd->vdev_asize / 200); vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT); - - /* - * Initialize the vdev's metaslabs. This can't fail because - * there's nothing to read when creating all new metaslabs. - */ - VERIFY(vdev_metaslab_init(vd, txg) == 0); } void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg) { ASSERT(vd == vd->vdev_top); + ASSERT(!vd->vdev_ishole); ASSERT(ISP2(flags)); if (flags & VDD_METASLAB) @@ -1283,34 +1514,88 @@ vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg) (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg); } +/* + * DTLs. + * + * A vdev's DTL (dirty time log) is the set of transaction groups for which + * the vdev has less than perfect replication. There are three kinds of DTL: + * + * DTL_MISSING: txgs for which the vdev has no valid copies of the data + * + * DTL_PARTIAL: txgs for which data is available, but not fully replicated + * + * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon + * scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of + * txgs that was scrubbed. + * + * DTL_OUTAGE: txgs which cannot currently be read, whether due to + * persistent errors or just some device being offline. + * Unlike the other three, the DTL_OUTAGE map is not generally + * maintained; it's only computed when needed, typically to + * determine whether a device can be detached. 
+ * + * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device + * either has the data or it doesn't. + * + * For interior vdevs such as mirror and RAID-Z the picture is more complex. + * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because + * if any child is less than fully replicated, then so is its parent. + * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs, + * comprising only those txgs which appear in 'maxfaults' or more children; + * those are the txgs we don't have enough replication to read. For example, + * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2); + * thus, its DTL_MISSING consists of the set of txgs that appear in more than + * two child DTL_MISSING maps. + * + * It should be clear from the above that to compute the DTLs and outage maps + * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps. + * Therefore, that is all we keep on disk. When loading the pool, or after + * a configuration change, we generate all other DTLs from first principles. + */ void -vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size) +vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) { + space_map_t *sm = &vd->vdev_dtl[t]; + + ASSERT(t < DTL_TYPES); + ASSERT(vd != vd->vdev_spa->spa_root_vdev); + mutex_enter(sm->sm_lock); if (!space_map_contains(sm, txg, size)) space_map_add(sm, txg, size); mutex_exit(sm->sm_lock); } -int -vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size) +boolean_t +vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) { - int dirty; + space_map_t *sm = &vd->vdev_dtl[t]; + boolean_t dirty = B_FALSE; - /* - * Quick test without the lock -- covers the common case that - * there are no dirty time segments. - */ - if (sm->sm_space == 0) - return (0); + ASSERT(t < DTL_TYPES); + ASSERT(vd != vd->vdev_spa->spa_root_vdev); mutex_enter(sm->sm_lock); - dirty = space_map_contains(sm, txg, size); + if (sm->sm_space != 0) + dirty = space_map_contains(sm, txg, size); mutex_exit(sm->sm_lock); return (dirty); } +boolean_t +vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t) +{ + space_map_t *sm = &vd->vdev_dtl[t]; + boolean_t empty; + + mutex_enter(sm->sm_lock); + empty = (sm->sm_space == 0); + mutex_exit(sm->sm_lock); + + return (empty); +} + /* * Reassess DTLs after a config change or scrub completion. */ @@ -1318,11 +1603,19 @@ void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) { spa_t *spa = vd->vdev_spa; - int c; + avl_tree_t reftree; + int minref; + + ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); - ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); + for (int c = 0; c < vd->vdev_children; c++) + vdev_dtl_reassess(vd->vdev_child[c], txg, + scrub_txg, scrub_done); - if (vd->vdev_children == 0) { + if (vd == spa->spa_root_vdev || vd->vdev_ishole || vd->vdev_aux) + return; + + if (vd->vdev_ops->vdev_op_leaf) { mutex_enter(&vd->vdev_dtl_lock); if (scrub_txg != 0 && (spa->spa_scrub_started || spa->spa_scrub_errors == 0)) { @@ -1333,12 +1626,38 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) * will be valid, so excise the old region and * fold in the scrub dtl. Otherwise, leave the * dtl as-is if there was an error. + * + * There's little trick here: to excise the beginning + * of the DTL_MISSING map, we put it into a reference + * tree and then add a segment with refcnt -1 that + * covers the range [0, scrub_txg). 
This means + * that each txg in that range has refcnt -1 or 0. + * We then add DTL_SCRUB with a refcnt of 2, so that + * entries in the range [0, scrub_txg) will have a + * positive refcnt -- either 1 or 2. We then convert + * the reference tree into the new DTL_MISSING map. */ - space_map_excise(&vd->vdev_dtl_map, 0, scrub_txg); - space_map_union(&vd->vdev_dtl_map, &vd->vdev_dtl_scrub); + space_map_ref_create(&reftree); + space_map_ref_add_map(&reftree, + &vd->vdev_dtl[DTL_MISSING], 1); + space_map_ref_add_seg(&reftree, 0, scrub_txg, -1); + space_map_ref_add_map(&reftree, + &vd->vdev_dtl[DTL_SCRUB], 2); + space_map_ref_generate_map(&reftree, + &vd->vdev_dtl[DTL_MISSING], 1); + space_map_ref_destroy(&reftree); } + space_map_vacate(&vd->vdev_dtl[DTL_PARTIAL], NULL, NULL); + space_map_walk(&vd->vdev_dtl[DTL_MISSING], + space_map_add, &vd->vdev_dtl[DTL_PARTIAL]); if (scrub_done) - space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL); + space_map_vacate(&vd->vdev_dtl[DTL_SCRUB], NULL, NULL); + space_map_vacate(&vd->vdev_dtl[DTL_OUTAGE], NULL, NULL); + if (!vdev_readable(vd)) + space_map_add(&vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL); + else + space_map_walk(&vd->vdev_dtl[DTL_MISSING], + space_map_add, &vd->vdev_dtl[DTL_OUTAGE]); mutex_exit(&vd->vdev_dtl_lock); if (txg != 0) @@ -1346,35 +1665,36 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) return; } - /* - * Make sure the DTLs are always correct under the scrub lock. - */ - if (vd == spa->spa_root_vdev) - mutex_enter(&spa->spa_scrub_lock); - mutex_enter(&vd->vdev_dtl_lock); - space_map_vacate(&vd->vdev_dtl_map, NULL, NULL); - space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL); - mutex_exit(&vd->vdev_dtl_lock); - - for (c = 0; c < vd->vdev_children; c++) { - vdev_t *cvd = vd->vdev_child[c]; - vdev_dtl_reassess(cvd, txg, scrub_txg, scrub_done); - mutex_enter(&vd->vdev_dtl_lock); - space_map_union(&vd->vdev_dtl_map, &cvd->vdev_dtl_map); - space_map_union(&vd->vdev_dtl_scrub, &cvd->vdev_dtl_scrub); - mutex_exit(&vd->vdev_dtl_lock); + for (int t = 0; t < DTL_TYPES; t++) { + /* account for child's outage in parent's missing map */ + int s = (t == DTL_MISSING) ? DTL_OUTAGE: t; + if (t == DTL_SCRUB) + continue; /* leaf vdevs only */ + if (t == DTL_PARTIAL) + minref = 1; /* i.e. 
non-zero */ + else if (vd->vdev_nparity != 0) + minref = vd->vdev_nparity + 1; /* RAID-Z */ + else + minref = vd->vdev_children; /* any kind of mirror */ + space_map_ref_create(&reftree); + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + mutex_enter(&cvd->vdev_dtl_lock); + space_map_ref_add_map(&reftree, &cvd->vdev_dtl[s], 1); + mutex_exit(&cvd->vdev_dtl_lock); + } + space_map_ref_generate_map(&reftree, &vd->vdev_dtl[t], minref); + space_map_ref_destroy(&reftree); } - - if (vd == spa->spa_root_vdev) - mutex_exit(&spa->spa_scrub_lock); + mutex_exit(&vd->vdev_dtl_lock); } static int vdev_dtl_load(vdev_t *vd) { spa_t *spa = vd->vdev_spa; - space_map_obj_t *smo = &vd->vdev_dtl; + space_map_obj_t *smo = &vd->vdev_dtl_smo; objset_t *mos = spa->spa_meta_objset; dmu_buf_t *db; int error; @@ -1384,6 +1704,8 @@ vdev_dtl_load(vdev_t *vd) if (smo->smo_object == 0) return (0); + ASSERT(!vd->vdev_ishole); + if ((error = dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)) != 0) return (error); @@ -1392,7 +1714,8 @@ vdev_dtl_load(vdev_t *vd) dmu_buf_rele(db, FTAG); mutex_enter(&vd->vdev_dtl_lock); - error = space_map_load(&vd->vdev_dtl_map, NULL, SM_ALLOC, smo, mos); + error = space_map_load(&vd->vdev_dtl[DTL_MISSING], + NULL, SM_ALLOC, smo, mos); mutex_exit(&vd->vdev_dtl_lock); return (error); @@ -1402,14 +1725,16 @@ void vdev_dtl_sync(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; - space_map_obj_t *smo = &vd->vdev_dtl; - space_map_t *sm = &vd->vdev_dtl_map; + space_map_obj_t *smo = &vd->vdev_dtl_smo; + space_map_t *sm = &vd->vdev_dtl[DTL_MISSING]; objset_t *mos = spa->spa_meta_objset; space_map_t smsync; kmutex_t smlock; dmu_buf_t *db; dmu_tx_t *tx; + ASSERT(!vd->vdev_ishole); + tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); if (vd->vdev_detached) { @@ -1460,6 +1785,37 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg) dmu_tx_commit(tx); } +/* + * Determine whether the specified vdev can be offlined/detached/removed + * without losing data. + */ +boolean_t +vdev_dtl_required(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + vdev_t *tvd = vd->vdev_top; + uint8_t cant_read = vd->vdev_cant_read; + boolean_t required; + + ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); + + if (vd == spa->spa_root_vdev || vd == tvd) + return (B_TRUE); + + /* + * Temporarily mark the device as unreadable, and then determine + * whether this results in any DTL outages in the top-level vdev. + * If not, we can safely offline/detach/remove the device. + */ + vd->vdev_cant_read = B_TRUE; + vdev_dtl_reassess(tvd, 0, 0, B_FALSE); + required = !vdev_dtl_empty(tvd, DTL_OUTAGE); + vd->vdev_cant_read = cant_read; + vdev_dtl_reassess(tvd, 0, 0, B_FALSE); + + return (required); +} + /* * Determine if resilver is needed, and if so the txg range. 
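Under the minref rule above, a txg lands in a raidz2 top-level's DTL_MISSING only when at least nparity + 1 = 3 children miss it, whereas a mirror needs every child to miss it and DTL_PARTIAL needs just one. A reference-counting toy for a single txg, standing in for the space_map_ref_*() machinery:

#include <stdio.h>

/*
 * For one txg, count how many children have it in their per-child DTL
 * and apply the parent's minref threshold.  The real code does the same
 * thing over whole segment maps via space_map_ref_*().
 */
static int
parent_missing(const int *child_missing, int children, int minref)
{
	int refs = 0;

	for (int c = 0; c < children; c++)
		refs += child_missing[c];
	return (refs >= minref);
}

int
main(void)
{
	int missing[6] = { 1, 1, 0, 0, 0, 0 };	/* txg missing on two children */
	int nparity = 2;			/* raidz2 */

	/* raidz2: needs nparity + 1 = 3 children missing -> still readable */
	(void) printf("raidz2 missing: %d\n",
	    parent_missing(missing, 6, nparity + 1));

	/* 6-way mirror: needs all children missing -> still readable */
	(void) printf("mirror missing: %d\n",
	    parent_missing(missing, 6, 6));

	/* DTL_PARTIAL: any one child missing makes the parent partial */
	(void) printf("partial: %d\n", parent_missing(missing, 6, 1));
	return (0);
}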
*/ @@ -1472,19 +1828,19 @@ vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp) if (vd->vdev_children == 0) { mutex_enter(&vd->vdev_dtl_lock); - if (vd->vdev_dtl_map.sm_space != 0 && vdev_writeable(vd)) { + if (vd->vdev_dtl[DTL_MISSING].sm_space != 0 && + vdev_writeable(vd)) { space_seg_t *ss; - ss = avl_first(&vd->vdev_dtl_map.sm_root); + ss = avl_first(&vd->vdev_dtl[DTL_MISSING].sm_root); thismin = ss->ss_start - 1; - ss = avl_last(&vd->vdev_dtl_map.sm_root); + ss = avl_last(&vd->vdev_dtl[DTL_MISSING].sm_root); thismax = ss->ss_end; needed = B_TRUE; } mutex_exit(&vd->vdev_dtl_lock); } else { - int c; - for (c = 0; c < vd->vdev_children; c++) { + for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; uint64_t cmin, cmax; @@ -1506,18 +1862,16 @@ vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp) void vdev_load(vdev_t *vd) { - int c; - /* * Recursively load all children. */ - for (c = 0; c < vd->vdev_children; c++) + for (int c = 0; c < vd->vdev_children; c++) vdev_load(vd->vdev_child[c]); /* * If this is a top-level vdev, initialize its metaslabs. */ - if (vd == vd->vdev_top && + if (vd == vd->vdev_top && !vd->vdev_ishole && (vd->vdev_ashift == 0 || vd->vdev_asize == 0 || vdev_metaslab_init(vd, 0) != 0)) vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, @@ -1573,13 +1927,55 @@ vdev_validate_aux(vdev_t *vd) return (0); } +void +vdev_remove(vdev_t *vd, uint64_t txg) +{ + spa_t *spa = vd->vdev_spa; + objset_t *mos = spa->spa_meta_objset; + dmu_tx_t *tx; + + tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); + + if (vd->vdev_dtl_smo.smo_object) { + ASSERT3U(vd->vdev_dtl_smo.smo_alloc, ==, 0); + (void) dmu_object_free(mos, vd->vdev_dtl_smo.smo_object, tx); + vd->vdev_dtl_smo.smo_object = 0; + } + + if (vd->vdev_ms != NULL) { + for (int m = 0; m < vd->vdev_ms_count; m++) { + metaslab_t *msp = vd->vdev_ms[m]; + + if (msp == NULL || msp->ms_smo.smo_object == 0) + continue; + + ASSERT3U(msp->ms_smo.smo_alloc, ==, 0); + (void) dmu_object_free(mos, msp->ms_smo.smo_object, tx); + msp->ms_smo.smo_object = 0; + } + } + + if (vd->vdev_ms_array) { + (void) dmu_object_free(mos, vd->vdev_ms_array, tx); + vd->vdev_ms_array = 0; + vd->vdev_ms_shift = 0; + } + dmu_tx_commit(tx); +} + void vdev_sync_done(vdev_t *vd, uint64_t txg) { metaslab_t *msp; + boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg)); + + ASSERT(!vd->vdev_ishole); while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) metaslab_sync_done(msp, txg); + + if (reassess) + metaslab_sync_reassess(vd->vdev_mg); } void @@ -1590,6 +1986,8 @@ vdev_sync(vdev_t *vd, uint64_t txg) metaslab_t *msp; dmu_tx_t *tx; + ASSERT(!vd->vdev_ishole); + if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) { ASSERT(vd == vd->vdev_top); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); @@ -1600,6 +1998,9 @@ vdev_sync(vdev_t *vd, uint64_t txg) dmu_tx_commit(tx); } + if (vd->vdev_removing) + vdev_remove(vd, txg); + while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) { metaslab_sync(msp, txg); (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg)); @@ -1622,11 +2023,11 @@ vdev_psize_to_asize(vdev_t *vd, uint64_t psize) * not be opened, and no I/O is attempted. 
*/ int -vdev_fault(spa_t *spa, uint64_t guid) +vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux) { vdev_t *vd; - spa_vdev_state_enter(spa); + spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENODEV)); @@ -1634,19 +2035,27 @@ vdev_fault(spa_t *spa, uint64_t guid) if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); + /* + * We don't directly use the aux state here, but if we do a + * vdev_reopen(), we need this value to be present to remember why we + * were faulted. + */ + vd->vdev_label_aux = aux; + /* * Faulted state takes precedence over degraded. */ vd->vdev_faulted = 1ULL; vd->vdev_degraded = 0ULL; - vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, VDEV_AUX_ERR_EXCEEDED); + vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux); /* * If marking the vdev as faulted cause the top-level vdev to become * unavailable, then back off and simply mark the vdev as degraded * instead. */ - if (vdev_is_dead(vd->vdev_top) && vd->vdev_aux == NULL) { + if (vdev_is_dead(vd->vdev_top) && !vd->vdev_islog && + vd->vdev_aux == NULL) { vd->vdev_degraded = 1ULL; vd->vdev_faulted = 0ULL; @@ -1656,10 +2065,8 @@ vdev_fault(spa_t *spa, uint64_t guid) */ vdev_reopen(vd); - if (vdev_readable(vd)) { - vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, - VDEV_AUX_ERR_EXCEEDED); - } + if (vdev_readable(vd)) + vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); } return (spa_vdev_state_exit(spa, vd, 0)); @@ -1671,11 +2078,11 @@ vdev_fault(spa_t *spa, uint64_t guid) * as I/O is concerned. */ int -vdev_degrade(spa_t *spa, uint64_t guid) +vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux) { vdev_t *vd; - spa_vdev_state_enter(spa); + spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENODEV)); @@ -1692,7 +2099,7 @@ vdev_degrade(spa_t *spa, uint64_t guid) vd->vdev_degraded = 1ULL; if (!vdev_is_dead(vd)) vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, - VDEV_AUX_ERR_EXCEEDED); + aux); return (spa_vdev_state_exit(spa, vd, 0)); } @@ -1706,9 +2113,9 @@ vdev_degrade(spa_t *spa, uint64_t guid) int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) { - vdev_t *vd; + vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev; - spa_vdev_state_enter(spa); + spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENODEV)); @@ -1716,13 +2123,26 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); + tvd = vd->vdev_top; vd->vdev_offline = B_FALSE; vd->vdev_tmpoffline = B_FALSE; vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE); vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT); - vdev_reopen(vd->vdev_top); + + /* XXX - L2ARC 1.0 does not support expansion */ + if (!vd->vdev_aux) { + for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) + pvd->vdev_expanding = !!(flags & ZFS_ONLINE_EXPAND); + } + + vdev_reopen(tvd); vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE; + if (!vd->vdev_aux) { + for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) + pvd->vdev_expanding = B_FALSE; + } + if (newstate) *newstate = vd->vdev_state; if ((flags & ZFS_ONLINE_UNSPARE) && @@ -1731,19 +2151,26 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) vd->vdev_parent->vdev_child[0] == vd) vd->vdev_unspare = 
B_TRUE; - (void) spa_vdev_state_exit(spa, vd, 0); - - VERIFY3U(spa_scrub(spa, POOL_SCRUB_RESILVER), ==, 0); + if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) { - return (0); + /* XXX - L2ARC 1.0 does not support expansion */ + if (vd->vdev_aux) + return (spa_vdev_state_exit(spa, vd, ENOTSUP)); + spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); + } + return (spa_vdev_state_exit(spa, vd, 0)); } -int -vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) +static int +vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags) { - vdev_t *vd; + vdev_t *vd, *tvd; + int error = 0; + uint64_t generation; + metaslab_group_t *mg; - spa_vdev_state_enter(spa); +top: + spa_vdev_state_enter(spa, SCL_ALLOC); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENODEV)); @@ -1751,32 +2178,76 @@ vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); + tvd = vd->vdev_top; + mg = tvd->vdev_mg; + generation = spa->spa_config_generation + 1; + /* * If the device isn't already offline, try to offline it. */ if (!vd->vdev_offline) { /* - * If this device's top-level vdev has a non-empty DTL, - * don't allow the device to be offlined. - * - * XXX -- make this more precise by allowing the offline - * as long as the remaining devices don't have any DTL holes. + * If this device has the only valid copy of some data, + * don't allow it to be offlined. Log devices are always + * expendable. */ - if (vd->vdev_top->vdev_dtl_map.sm_space != 0) + if (!tvd->vdev_islog && vd->vdev_aux == NULL && + vdev_dtl_required(vd)) return (spa_vdev_state_exit(spa, NULL, EBUSY)); + /* + * If the top-level is a slog and it has had allocations + * then proceed. We check that the vdev's metaslab group + * is not NULL since it's possible that we may have just + * added this vdev but not yet initialized its metaslabs. + */ + if (tvd->vdev_islog && mg != NULL) { + /* + * Prevent any future allocations. + */ + metaslab_group_passivate(mg); + (void) spa_vdev_state_exit(spa, vd, 0); + + error = spa_offline_log(spa); + + spa_vdev_state_enter(spa, SCL_ALLOC); + + /* + * Check to see if the config has changed. + */ + if (error || generation != spa->spa_config_generation) { + metaslab_group_activate(mg); + if (error) + return (spa_vdev_state_exit(spa, + vd, error)); + (void) spa_vdev_state_exit(spa, vd, 0); + goto top; + } + ASSERT3U(tvd->vdev_stat.vs_alloc, ==, 0); + } + /* * Offline this device and reopen its top-level vdev. - * If this action results in the top-level vdev becoming - * unusable, undo it and fail the request. + * If the top-level vdev is a log device then just offline + * it. Otherwise, if this action results in the top-level + * vdev becoming unusable, undo it and fail the request. */ vd->vdev_offline = B_TRUE; - vdev_reopen(vd->vdev_top); - if (vdev_is_dead(vd->vdev_top) && vd->vdev_aux == NULL) { + vdev_reopen(tvd); + + if (!tvd->vdev_islog && vd->vdev_aux == NULL && + vdev_is_dead(tvd)) { vd->vdev_offline = B_FALSE; - vdev_reopen(vd->vdev_top); + vdev_reopen(tvd); return (spa_vdev_state_exit(spa, NULL, EBUSY)); } + + /* + * Add the device back into the metaslab rotor so that + * once we online the device it's open for business. 
+ */ + if (tvd->vdev_islog && mg != NULL) + metaslab_group_activate(mg); } vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY); @@ -1784,6 +2255,18 @@ vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) return (spa_vdev_state_exit(spa, vd, 0)); } +int +vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) +{ + int error; + + mutex_enter(&spa->spa_vdev_top_lock); + error = vdev_offline_locked(spa, guid, flags); + mutex_exit(&spa->spa_vdev_top_lock); + + return (error); +} + /* * Clear the error counts associated with this vdev. Unlike vdev_online() and * vdev_offline(), we assume the spa config is locked. We also clear all @@ -1815,12 +2298,21 @@ vdev_clear(spa_t *spa, vdev_t *vd) if (vd->vdev_faulted || vd->vdev_degraded || !vdev_readable(vd) || !vdev_writeable(vd)) { + /* + * When reopening in reponse to a clear event, it may be due to + * a fmadm repair request. In this case, if the device is + * still broken, we want to still post the ereport again. + */ + vd->vdev_forcefault = B_TRUE; + vd->vdev_faulted = vd->vdev_degraded = 0; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; vdev_reopen(vd); + vd->vdev_forcefault = B_FALSE; + if (vd != rvd) vdev_state_dirty(vd->vdev_top); @@ -1829,12 +2321,30 @@ vdev_clear(spa_t *spa, vdev_t *vd) spa_event_notify(spa, vd, ESC_ZFS_VDEV_CLEAR); } + + /* + * When clearing a FMA-diagnosed fault, we always want to + * unspare the device, as we assume that the original spare was + * done in response to the FMA fault. + */ + if (!vdev_is_dead(vd) && vd->vdev_parent != NULL && + vd->vdev_parent->vdev_ops == &vdev_spare_ops && + vd->vdev_parent->vdev_child[0] == vd) + vd->vdev_unspare = B_TRUE; } boolean_t vdev_is_dead(vdev_t *vd) { - return (vd->vdev_state < VDEV_STATE_DEGRADED); + /* + * Holes and missing devices are always considered "dead". + * This simplifies the code since we don't have to check for + * these types of devices in the various code paths. + * Instead we rely on the fact that we skip over dead devices + * before issuing I/O to them. + */ + return (vd->vdev_state < VDEV_STATE_DEGRADED || vd->vdev_ishole || + vd->vdev_ops == &vdev_missing_ops); } boolean_t @@ -1852,14 +2362,18 @@ vdev_writeable(vdev_t *vd) boolean_t vdev_allocatable(vdev_t *vd) { + uint64_t state = vd->vdev_state; + /* - * We currently allow allocations from vdevs which maybe in the + * We currently allow allocations from vdevs which may be in the * process of reopening (i.e. VDEV_STATE_CLOSED). If the device * fails to reopen then we'll catch it later when we're holding - * the proper locks. + * the proper locks. Note that we have to get the vdev state + * in a local variable because although it changes atomically, + * we're asking two separate questions about it. 
*/ - return (!(vdev_is_dead(vd) && vd->vdev_state != VDEV_STATE_CLOSED) && - !vd->vdev_cant_write); + return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) && + !vd->vdev_cant_write && !vd->vdev_ishole && !vd->vdev_removing); } boolean_t @@ -1892,7 +2406,9 @@ vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) vs->vs_scrub_errors = vd->vdev_spa->spa_scrub_errors; vs->vs_timestamp = gethrtime() - vs->vs_timestamp; vs->vs_state = vd->vdev_state; - vs->vs_rsize = vdev_get_rsize(vd); + vs->vs_rsize = vdev_get_min_asize(vd); + if (vd->vdev_ops->vdev_op_leaf) + vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; mutex_exit(&vd->vdev_stat_lock); /* @@ -1928,7 +2444,8 @@ vdev_clear_stats(vdev_t *vd) void vdev_stat_update(zio_t *zio, uint64_t psize) { - vdev_t *rvd = zio->io_spa->spa_root_vdev; + spa_t *spa = zio->io_spa; + vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd = zio->io_vd ? zio->io_vd : rvd; vdev_t *pvd; uint64_t txg = zio->io_txg; @@ -1961,61 +2478,106 @@ vdev_stat_update(zio_t *zio, uint64_t psize) return; ASSERT(vd == zio->io_vd); - if (!(flags & ZIO_FLAG_IO_BYPASS)) { - mutex_enter(&vd->vdev_stat_lock); - vs->vs_ops[type]++; - vs->vs_bytes[type] += psize; - mutex_exit(&vd->vdev_stat_lock); - } + + if (flags & ZIO_FLAG_IO_BYPASS) + return; + + mutex_enter(&vd->vdev_stat_lock); + if (flags & ZIO_FLAG_IO_REPAIR) { - ASSERT(zio->io_delegate_list == NULL); - mutex_enter(&vd->vdev_stat_lock); if (flags & ZIO_FLAG_SCRUB_THREAD) vs->vs_scrub_repaired += psize; - else + if (flags & ZIO_FLAG_SELF_HEAL) vs->vs_self_healed += psize; - mutex_exit(&vd->vdev_stat_lock); } + + vs->vs_ops[type]++; + vs->vs_bytes[type] += psize; + + mutex_exit(&vd->vdev_stat_lock); return; } if (flags & ZIO_FLAG_SPECULATIVE) return; + /* + * If this is an I/O error that is going to be retried, then ignore the + * error. Otherwise, the user may interpret B_FAILFAST I/O errors as + * hard errors, when in reality they can happen for any number of + * innocuous reasons (bus resets, MPxIO link failure, etc). + */ + if (zio->io_error == EIO && + !(zio->io_flags & ZIO_FLAG_IO_RETRY)) + return; + + /* + * Intent logs writes won't propagate their error to the root + * I/O so don't mark these types of failures as pool-level + * errors. + */ + if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) + return; + mutex_enter(&vd->vdev_stat_lock); - if (type == ZIO_TYPE_READ) { + if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) { if (zio->io_error == ECKSUM) vs->vs_checksum_errors++; else vs->vs_read_errors++; } - if (type == ZIO_TYPE_WRITE) + if (type == ZIO_TYPE_WRITE && !vdev_is_dead(vd)) vs->vs_write_errors++; mutex_exit(&vd->vdev_stat_lock); - if (type == ZIO_TYPE_WRITE && txg != 0 && vd->vdev_children == 0) { - if (flags & ZIO_FLAG_SCRUB_THREAD) { - ASSERT(flags & ZIO_FLAG_IO_REPAIR); - for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent) - vdev_dtl_dirty(&pvd->vdev_dtl_scrub, txg, 1); - } - if (!(flags & ZIO_FLAG_IO_REPAIR)) { - if (vdev_dtl_contains(&vd->vdev_dtl_map, txg, 1)) + if (type == ZIO_TYPE_WRITE && txg != 0 && + (!(flags & ZIO_FLAG_IO_REPAIR) || + (flags & ZIO_FLAG_SCRUB_THREAD) || + spa->spa_claiming)) { + /* + * This is either a normal write (not a repair), or it's + * a repair induced by the scrub thread, or it's a repair + * made by zil_claim() during spa_load() in the first txg. + * In the normal case, we commit the DTL change in the same + * txg as the block was born. 
In the scrub-induced repair + * case, we know that scrubs run in first-pass syncing context, + * so we commit the DTL change in spa_syncing_txg(spa). + * In the zil_claim() case, we commit in spa_first_txg(spa). + * + * We currently do not make DTL entries for failed spontaneous + * self-healing writes triggered by normal (non-scrubbing) + * reads, because we have no transactional context in which to + * do so -- and it's not clear that it'd be desirable anyway. + */ + if (vd->vdev_ops->vdev_op_leaf) { + uint64_t commit_txg = txg; + if (flags & ZIO_FLAG_SCRUB_THREAD) { + ASSERT(flags & ZIO_FLAG_IO_REPAIR); + ASSERT(spa_sync_pass(spa) == 1); + vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1); + commit_txg = spa_syncing_txg(spa); + } else if (spa->spa_claiming) { + ASSERT(flags & ZIO_FLAG_IO_REPAIR); + commit_txg = spa_first_txg(spa); + } + ASSERT(commit_txg >= spa_syncing_txg(spa)); + if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1)) return; - vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); - for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent) - vdev_dtl_dirty(&pvd->vdev_dtl_map, txg, 1); + for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) + vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1); + vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg); } + if (vd != rvd) + vdev_dtl_dirty(vd, DTL_MISSING, txg, 1); } } void vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete) { - int c; vdev_stat_t *vs = &vd->vdev_stat; - for (c = 0; c < vd->vdev_children; c++) + for (int c = 0; c < vd->vdev_children; c++) vdev_scrub_stat_update(vd->vdev_child[c], type, complete); mutex_enter(&vd->vdev_stat_lock); @@ -2040,15 +2602,18 @@ vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete) } /* - * Update the in-core space usage stats for this vdev and the root vdev. + * Update the in-core space usage stats for this vdev, its metaslab class, + * and the root vdev. */ void -vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta, - boolean_t update_root) +vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta, + int64_t space_delta) { int64_t dspace_delta = space_delta; spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; + metaslab_group_t *mg = vd->vdev_mg; + metaslab_class_t *mc = mg ? mg->mg_class : NULL; ASSERT(vd == vd->vdev_top); @@ -2059,32 +2624,31 @@ vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta, * childrens', thus not accurate enough for us. */ ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0); + ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache); dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio; mutex_enter(&vd->vdev_stat_lock); - vd->vdev_stat.vs_space += space_delta; vd->vdev_stat.vs_alloc += alloc_delta; + vd->vdev_stat.vs_space += space_delta; vd->vdev_stat.vs_dspace += dspace_delta; mutex_exit(&vd->vdev_stat_lock); - if (update_root) { - ASSERT(rvd == vd->vdev_parent); - ASSERT(vd->vdev_ms_count != 0); - - /* - * Don't count non-normal (e.g. intent log) space as part of - * the pool's capacity. 
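The commit txg chosen above depends only on what kind of write failed: normal writes use the block's birth txg, scrub-induced repairs use the syncing txg, and zil_claim() repairs use the pool's first txg. A compact restatement of that selection with assumed stand-in fields and flags (the real code keys off zio flags and spa state):

#include <stdint.h>
#include <stdio.h>

#define	FLAG_IO_REPAIR		0x1
#define	FLAG_SCRUB_THREAD	0x2

typedef struct toy_spa {
	uint64_t syncing_txg;
	uint64_t first_txg;
	int claiming;		/* zil_claim() running during spa_load() */
} toy_spa_t;

/* Pick the txg in which the DTL dirtying for a failed write is committed. */
static uint64_t
commit_txg_for(const toy_spa_t *spa, uint64_t birth_txg, int flags)
{
	if (flags & FLAG_SCRUB_THREAD)
		return (spa->syncing_txg);	/* scrub-induced repair */
	if (spa->claiming)
		return (spa->first_txg);	/* zil_claim() repair */
	return (birth_txg);			/* normal write */
}

int
main(void)
{
	toy_spa_t spa = { .syncing_txg = 1200, .first_txg = 1000, .claiming = 0 };

	(void) printf("%llu\n", (unsigned long long)
	    commit_txg_for(&spa, 1200, 0));				/* 1200 */
	(void) printf("%llu\n", (unsigned long long)
	    commit_txg_for(&spa, 950, FLAG_IO_REPAIR | FLAG_SCRUB_THREAD)); /* 1200 */
	spa.claiming = 1;
	(void) printf("%llu\n", (unsigned long long)
	    commit_txg_for(&spa, 990, FLAG_IO_REPAIR));			/* 1000 */
	return (0);
}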
- */ - if (vd->vdev_mg->mg_class != spa->spa_normal_class) - return; - + if (mc == spa_normal_class(spa)) { mutex_enter(&rvd->vdev_stat_lock); - rvd->vdev_stat.vs_space += space_delta; rvd->vdev_stat.vs_alloc += alloc_delta; + rvd->vdev_stat.vs_space += space_delta; rvd->vdev_stat.vs_dspace += dspace_delta; mutex_exit(&rvd->vdev_stat_lock); } + + if (mc != NULL) { + ASSERT(rvd == vd->vdev_parent); + ASSERT(vd->vdev_ms_count != 0); + + metaslab_class_space_update(mc, + alloc_delta, defer_delta, space_delta, dspace_delta); + } } /* @@ -2100,8 +2664,8 @@ vdev_config_dirty(vdev_t *vd) int c; /* - * If this is an aux vdev (as with l2cache devices), then we update the - * vdev config manually and set the sync flag. + * If this is an aux vdev (as with l2cache and spare devices), then we + * update the vdev config manually and set the sync flag. */ if (vd->vdev_aux != NULL) { spa_aux_vdev_t *sav = vd->vdev_aux; @@ -2123,8 +2687,11 @@ vdev_config_dirty(vdev_t *vd) sav->sav_sync = B_TRUE; - VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, - ZPOOL_CONFIG_L2CACHE, &aux, &naux) == 0); + if (nvlist_lookup_nvlist_array(sav->sav_config, + ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) { + VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, + ZPOOL_CONFIG_SPARES, &aux, &naux) == 0); + } ASSERT(c < naux); @@ -2154,7 +2721,8 @@ vdev_config_dirty(vdev_t *vd) } else { ASSERT(vd == vd->vdev_top); - if (!list_link_active(&vd->vdev_config_dirty_node)) + if (!list_link_active(&vd->vdev_config_dirty_node) && + !vd->vdev_ishole) list_insert_head(&spa->spa_config_dirty_list, vd); } } @@ -2195,7 +2763,7 @@ vdev_state_dirty(vdev_t *vd) (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_STATE, RW_READER))); - if (!list_link_active(&vd->vdev_state_dirty_node)) + if (!list_link_active(&vd->vdev_state_dirty_node) && !vd->vdev_ishole) list_insert_head(&spa->spa_state_dirty_list, vd); } @@ -2218,18 +2786,24 @@ vdev_state_clean(vdev_t *vd) void vdev_propagate_state(vdev_t *vd) { - vdev_t *rvd = vd->vdev_spa->spa_root_vdev; + spa_t *spa = vd->vdev_spa; + vdev_t *rvd = spa->spa_root_vdev; int degraded = 0, faulted = 0; int corrupted = 0; - int c; vdev_t *child; if (vd->vdev_children > 0) { - for (c = 0; c < vd->vdev_children; c++) { + for (int c = 0; c < vd->vdev_children; c++) { child = vd->vdev_child[c]; + /* + * Don't factor holes into the decision. + */ + if (child->vdev_ishole) + continue; + if (!vdev_readable(child) || - (!vdev_writeable(child) && (spa_mode & FWRITE))) { + (!vdev_writeable(child) && spa_writeable(spa))) { /* * Root special: if there is a top-level log * device, treat the root vdev as if it were @@ -2300,6 +2874,19 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) if (vdev_is_dead(vd) && vd->vdev_ops->vdev_op_leaf) vd->vdev_ops->vdev_op_close(vd); + /* + * If we have brought this vdev back into service, we need + * to notify fmd so that it can gracefully repair any outstanding + * cases due to a missing device. We do this in all cases, even those + * that probably don't correlate to a repaired fault. This is sure to + * catch all cases, and we let the zfs-retire agent sort it out. If + * this is a transient state it's OK, as the retire agent will + * double-check the state of the vdev before repairing it. 
+ */ + if (state == VDEV_STATE_HEALTHY && vd->vdev_ops->vdev_op_leaf && + vd->vdev_prevstate != state) + zfs_post_state_change(spa, vd); + if (vd->vdev_removed && state == VDEV_STATE_CANT_OPEN && (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) { @@ -2315,11 +2902,6 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) vd->vdev_state = VDEV_STATE_REMOVED; vd->vdev_stat.vs_aux = VDEV_AUX_NONE; } else if (state == VDEV_STATE_REMOVED) { - /* - * Indicate to the ZFS DE that this device has been removed, and - * any recent errors should be ignored. - */ - zfs_post_remove(spa, vd); vd->vdev_removed = B_TRUE; } else if (state == VDEV_STATE_CANT_OPEN) { /* @@ -2328,8 +2910,7 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) * begin with. Failure to open such a device is not considered * an error. */ - if (spa->spa_load_state == SPA_LOAD_IMPORT && - !spa->spa_import_faulted && + if (spa_load_state(spa) == SPA_LOAD_IMPORT && vd->vdev_ops->vdev_op_leaf) vd->vdev_not_present = 1; @@ -2388,8 +2969,8 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) vd->vdev_removed = B_FALSE; } - if (!isopen) - vdev_propagate_state(vd); + if (!isopen && vd->vdev_parent) + vdev_propagate_state(vd->vdev_parent); } /* @@ -2401,8 +2982,6 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) boolean_t vdev_is_bootable(vdev_t *vd) { - int c; - if (!vd->vdev_ops->vdev_op_leaf) { char *vdev_type = vd->vdev_ops->vdev_op_type; @@ -2417,9 +2996,71 @@ vdev_is_bootable(vdev_t *vd) return (B_FALSE); } - for (c = 0; c < vd->vdev_children; c++) { + for (int c = 0; c < vd->vdev_children; c++) { if (!vdev_is_bootable(vd->vdev_child[c])) return (B_FALSE); } return (B_TRUE); } + +/* + * Load the state from the original vdev tree (ovd) which + * we've retrieved from the MOS config object. If the original + * vdev was offline then we transfer that state to the device + * in the current vdev tree (nvd). + */ +void +vdev_load_log_state(vdev_t *nvd, vdev_t *ovd) +{ + spa_t *spa = nvd->vdev_spa; + + ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); + ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid); + + for (int c = 0; c < nvd->vdev_children; c++) + vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]); + + if (nvd->vdev_ops->vdev_op_leaf && ovd->vdev_offline) { + /* + * It would be nice to call vdev_offline() + * directly but the pool isn't fully loaded and + * the txg threads have not been started yet. + */ + nvd->vdev_offline = ovd->vdev_offline; + vdev_reopen(nvd->vdev_top); + } +} + +/* + * Expand a vdev if possible. + */ +void +vdev_expand(vdev_t *vd, uint64_t txg) +{ + ASSERT(vd->vdev_top == vd); + ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); + + if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count) { + VERIFY(vdev_metaslab_init(vd, txg) == 0); + vdev_config_dirty(vd); + } +} + +/* + * Split a vdev. 
+ */ +void +vdev_split(vdev_t *vd) +{ + vdev_t *cvd, *pvd = vd->vdev_parent; + + vdev_remove_child(pvd, vd); + vdev_compact_children(pvd); + + cvd = pvd->vdev_child[0]; + if (pvd->vdev_children == 1) { + vdev_remove_parent(cvd); + cvd->vdev_splitting = B_TRUE; + } + vdev_propagate_state(cvd); +} diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_cache.c b/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_cache.c index 5a7b59f6ed845..688d541344cbc 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_cache.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_cache.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -172,7 +172,7 @@ vdev_cache_allocate(zio_t *zio) ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP); ve->ve_offset = offset; - ve->ve_lastused = lbolt; + ve->ve_lastused = ddi_get_lbolt(); ve->ve_data = zio_buf_alloc(VCBS); avl_add(&vc->vc_offset_tree, ve); @@ -189,9 +189,9 @@ vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio) ASSERT(MUTEX_HELD(&vc->vc_lock)); ASSERT(ve->ve_fill_io == NULL); - if (ve->ve_lastused != lbolt) { + if (ve->ve_lastused != ddi_get_lbolt()) { avl_remove(&vc->vc_lastused_tree, ve); - ve->ve_lastused = lbolt; + ve->ve_lastused = ddi_get_lbolt(); avl_add(&vc->vc_lastused_tree, ve); } @@ -203,23 +203,23 @@ vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio) * Fill a previously allocated cache entry with data. */ static void -vdev_cache_fill(zio_t *zio) +vdev_cache_fill(zio_t *fio) { - vdev_t *vd = zio->io_vd; + vdev_t *vd = fio->io_vd; vdev_cache_t *vc = &vd->vdev_cache; - vdev_cache_entry_t *ve = zio->io_private; - zio_t *dio; + vdev_cache_entry_t *ve = fio->io_private; + zio_t *pio; - ASSERT(zio->io_size == VCBS); + ASSERT(fio->io_size == VCBS); /* * Add data to the cache. */ mutex_enter(&vc->vc_lock); - ASSERT(ve->ve_fill_io == zio); - ASSERT(ve->ve_offset == zio->io_offset); - ASSERT(ve->ve_data == zio->io_data); + ASSERT(ve->ve_fill_io == fio); + ASSERT(ve->ve_offset == fio->io_offset); + ASSERT(ve->ve_data == fio->io_data); ve->ve_fill_io = NULL; @@ -228,20 +228,13 @@ vdev_cache_fill(zio_t *zio) * any reads that were queued up before the missed update are still * valid, so we can satisfy them from this line before we evict it. 
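Reviewer note: the vdev_cache hunk below replaces the cache's private delegate list with standard ZIO parent/child links. A compact summary of the resulting relationships, as I read the change (not additional driver code):

/*
 * Delegation model for a cache line L after this change (sketch):
 *
 *	read zio A misses L      -> fill zio fio is created,
 *	                            ve_fill_io = fio,
 *	                            zio_add_child(A, fio), A bypasses
 *	                            the normal vdev I/O stage
 *	read zio B hits L while
 *	fio is still in flight   -> zio_add_child(B, fio), B bypasses too
 *	fio completes            -> vdev_cache_fill() walks its parents
 *	                            with zio_walk_parents() and satisfies
 *	                            A and B from the line via
 *	                            vdev_cache_hit()
 *
 * Completion and error propagation now ride the generic parent/child
 * machinery instead of the removed io_delegate_list walk.
 */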
*/ - for (dio = zio->io_delegate_list; dio; dio = dio->io_delegate_next) - vdev_cache_hit(vc, ve, dio); + while ((pio = zio_walk_parents(fio)) != NULL) + vdev_cache_hit(vc, ve, pio); - if (zio->io_error || ve->ve_missed_update) + if (fio->io_error || ve->ve_missed_update) vdev_cache_evict(vc, ve); mutex_exit(&vc->vc_lock); - - while ((dio = zio->io_delegate_list) != NULL) { - zio->io_delegate_list = dio->io_delegate_next; - dio->io_delegate_next = NULL; - dio->io_error = zio->io_error; - zio_execute(dio); - } } /* @@ -284,9 +277,8 @@ vdev_cache_read(zio_t *zio) } if ((fio = ve->ve_fill_io) != NULL) { - zio->io_delegate_next = fio->io_delegate_list; - fio->io_delegate_list = zio; zio_vdev_io_bypass(zio); + zio_add_child(zio, fio); mutex_exit(&vc->vc_lock); VDCSTAT_BUMP(vdc_stat_delegations); return (0); @@ -296,7 +288,6 @@ vdev_cache_read(zio_t *zio) zio_vdev_io_bypass(zio); mutex_exit(&vc->vc_lock); - zio_execute(zio); VDCSTAT_BUMP(vdc_stat_hits); return (0); } @@ -313,8 +304,8 @@ vdev_cache_read(zio_t *zio) ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve); ve->ve_fill_io = fio; - fio->io_delegate_list = zio; zio_vdev_io_bypass(zio); + zio_add_child(zio, fio); mutex_exit(&vc->vc_lock); zio_nowait(fio); diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_disk.c b/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_disk.c index 35d4e2a9200db..08e28b274902a 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_disk.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_disk.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -47,6 +47,7 @@ typedef struct vdev_disk_buf { static int vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) { + spa_t *spa = vd->vdev_spa; vdev_disk_t *dvd; struct dk_minfo dkm; int error; @@ -61,6 +62,16 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) return (EINVAL); } + /* + * Reopen the device if it's not currently open. Otherwise, + * just update the physical size of the device. + */ + if (vd->vdev_tsd != NULL) { + ASSERT(vd->vdev_reopening); + dvd = vd->vdev_tsd; + goto skip_open; + } + dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP); /* @@ -78,12 +89,6 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) * * 3. Otherwise, the device may have moved. Try opening the device * by the devid instead. - * - * If the vdev is part of the root pool, we avoid opening it by path. - * We do this because there is no /dev path available early in boot, - * and if we try to open the device by path at a later point, we can - * deadlock when devfsadm attempts to open the underlying backing store - * file. 
*/ if (vd->vdev_devid != NULL) { if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid, @@ -95,7 +100,7 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) error = EINVAL; /* presume failure */ - if (vd->vdev_path != NULL && !spa_is_root(vd->vdev_spa)) { + if (vd->vdev_path != NULL) { ddi_devid_t devid; if (vd->vdev_wholedisk == -1ULL) { @@ -105,18 +110,18 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) (void) snprintf(buf, len, "%ss0", vd->vdev_path); - if (ldi_open_by_name(buf, spa_mode, kcred, + if (ldi_open_by_name(buf, spa_mode(spa), kcred, &lh, zfs_li) == 0) { spa_strfree(vd->vdev_path); vd->vdev_path = buf; vd->vdev_wholedisk = 1ULL; - (void) ldi_close(lh, spa_mode, kcred); + (void) ldi_close(lh, spa_mode(spa), kcred); } else { kmem_free(buf, len); } } - error = ldi_open_by_name(vd->vdev_path, spa_mode, kcred, + error = ldi_open_by_name(vd->vdev_path, spa_mode(spa), kcred, &dvd->vd_lh, zfs_li); /* @@ -126,7 +131,8 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) ldi_get_devid(dvd->vd_lh, &devid) == 0) { if (ddi_devid_compare(devid, dvd->vd_devid) != 0) { error = EINVAL; - (void) ldi_close(dvd->vd_lh, spa_mode, kcred); + (void) ldi_close(dvd->vd_lh, spa_mode(spa), + kcred); dvd->vd_lh = NULL; } ddi_devid_free(devid); @@ -146,7 +152,7 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) */ if (error != 0 && vd->vdev_devid != NULL) error = ldi_open_by_devid(dvd->vd_devid, dvd->vd_minor, - spa_mode, kcred, &dvd->vd_lh, zfs_li); + spa_mode(spa), kcred, &dvd->vd_lh, zfs_li); /* * If all else fails, then try opening by physical path (if available) @@ -156,8 +162,8 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) */ if (error) { if (vd->vdev_physpath != NULL && - (dev = ddi_pathname_to_dev_t(vd->vdev_physpath)) != ENODEV) - error = ldi_open_by_dev(&dev, OTYP_BLK, spa_mode, + (dev = ddi_pathname_to_dev_t(vd->vdev_physpath)) != NODEV) + error = ldi_open_by_dev(&dev, OTYP_BLK, spa_mode(spa), kcred, &dvd->vd_lh, zfs_li); /* @@ -165,10 +171,9 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) * as above. This hasn't been used in a very long time and we * don't need to propagate its oddities to this edge condition. */ - if (error && vd->vdev_path != NULL && - !spa_is_root(vd->vdev_spa)) - error = ldi_open_by_name(vd->vdev_path, spa_mode, kcred, - &dvd->vd_lh, zfs_li); + if (error && vd->vdev_path != NULL) + error = ldi_open_by_name(vd->vdev_path, spa_mode(spa), + kcred, &dvd->vd_lh, zfs_li); } if (error) { @@ -201,6 +206,7 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) kmem_free(physpath, MAXPATHLEN); } +skip_open: /* * Determine the actual size of the device. 
*/ @@ -243,7 +249,7 @@ vdev_disk_close(vdev_t *vd) { vdev_disk_t *dvd = vd->vdev_tsd; - if (dvd == NULL) + if (vd->vdev_reopening || dvd == NULL) return; if (dvd->vd_minor != NULL) @@ -253,7 +259,7 @@ vdev_disk_close(vdev_t *vd) ddi_devid_free(dvd->vd_devid); if (dvd->vd_lh != NULL) - (void) ldi_close(dvd->vd_lh, spa_mode, kcred); + (void) ldi_close(dvd->vd_lh, spa_mode(vd->vdev_spa), kcred); kmem_free(dvd, sizeof (vdev_disk_t)); vd->vdev_tsd = NULL; @@ -314,6 +320,11 @@ vdev_disk_ioctl_free(zio_t *zio) kmem_free(zio->io_vsd, sizeof (struct dk_callback)); } +static const zio_vsd_ops_t vdev_disk_vsd_ops = { + vdev_disk_ioctl_free, + zio_vsd_default_cksum_report +}; + static void vdev_disk_ioctl_done(void *zio_arg, int error) { @@ -354,7 +365,7 @@ vdev_disk_io_start(zio_t *zio) } zio->io_vsd = dkc = kmem_alloc(sizeof (*dkc), KM_SLEEP); - zio->io_vsd_free = vdev_disk_ioctl_free; + zio->io_vsd_ops = &vdev_disk_vsd_ops; dkc->dkc_callback = vdev_disk_ioctl_done; dkc->dkc_flag = FLUSH_VOLATILE; @@ -400,8 +411,9 @@ vdev_disk_io_start(zio_t *zio) bioinit(bp); bp->b_flags = B_BUSY | B_NOCACHE | - (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE) | - ((zio->io_flags & ZIO_FLAG_IO_RETRY) ? 0 : B_FAILFAST); + (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE); + if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD))) + bp->b_flags |= B_FAILFAST; bp->b_bcount = zio->io_size; bp->b_un.b_addr = zio->io_data; bp->b_lblkno = lbtodb(zio->io_offset); @@ -425,12 +437,19 @@ vdev_disk_io_done(zio_t *zio) * asynchronous removal of the device. Otherwise, probe the device and * make sure it's still accessible. */ - if (zio->io_error == EIO) { + if (zio->io_error == EIO && !vd->vdev_remove_wanted) { vdev_disk_t *dvd = vd->vdev_tsd; int state = DKIO_NONE; if (ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state, FKIOCTL, kcred, NULL) == 0 && state != DKIO_INSERTED) { + /* + * We post the resource as soon as possible, instead of + * when the async removal actually happens, because the + * DE is using this information to discard previous I/O + * errors. 
+ */ + zfs_post_remove(zio->io_spa, vd); vd->vdev_remove_wanted = B_TRUE; spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); } @@ -469,7 +488,7 @@ vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config) if (devid != NULL && ddi_devid_str_decode(devid, &tmpdevid, &minor_name) == 0) { error = ldi_open_by_devid(tmpdevid, minor_name, - spa_mode, kcred, &vd_lh, zfs_li); + FREAD, kcred, &vd_lh, zfs_li); ddi_devid_free(tmpdevid); ddi_devid_str_free(minor_name); } @@ -486,14 +505,14 @@ vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config) size = P2ALIGN_TYPED(s, sizeof (vdev_label_t), uint64_t); label = kmem_alloc(sizeof (vdev_label_t), KM_SLEEP); + *config = NULL; for (l = 0; l < VDEV_LABELS; l++) { uint64_t offset, state, txg = 0; /* read vdev label */ offset = vdev_label_offset(size, l, 0); if (vdev_disk_physio(vd_lh, (caddr_t)label, - VDEV_SKIP_SIZE + VDEV_BOOT_HEADER_SIZE + - VDEV_PHYS_SIZE, offset, B_READ) != 0) + VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, B_READ) != 0) continue; if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist, @@ -521,6 +540,8 @@ vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config) kmem_free(label, sizeof (vdev_label_t)); (void) ldi_close(vd_lh, FREAD, kcred); + if (*config == NULL) + error = EIDRM; return (error); } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_file.c b/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_file.c index dc0e920bfc521..779e88edb9f24 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_file.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_file.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -51,6 +51,16 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) return (EINVAL); } + /* + * Reopen the device if it's not currently open. Otherwise, + * just update the physical size of the device. + */ + if (vd->vdev_tsd != NULL) { + ASSERT(vd->vdev_reopening); + vf = vd->vdev_tsd; + goto skip_open; + } + vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP); /* @@ -61,7 +71,7 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) */ ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/'); error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE, - spa_mode | FOFFMAX, 0, &vp, 0, 0, rootdir, -1); + spa_mode(vd->vdev_spa) | FOFFMAX, 0, &vp, 0, 0, rootdir, -1); if (error) { vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; @@ -79,6 +89,8 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) return (ENODEV); } #endif + +skip_open: /* * Determine the physical size of the file. */ @@ -100,12 +112,13 @@ vdev_file_close(vdev_t *vd) { vdev_file_t *vf = vd->vdev_tsd; - if (vf == NULL) + if (vd->vdev_reopening || vf == NULL) return; if (vf->vf_vnode != NULL) { (void) VOP_PUTPAGE(vf->vf_vnode, 0, 0, B_INVAL, kcred, NULL); - (void) VOP_CLOSE(vf->vf_vnode, spa_mode, 1, 0, kcred, NULL); + (void) VOP_CLOSE(vf->vf_vnode, spa_mode(vd->vdev_spa), 1, 0, + kcred, NULL); VN_RELE(vf->vf_vnode); } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_label.c b/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_label.c index 9c56d66364d56..d11b3df7c67e4 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_label.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_label.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -233,6 +233,10 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH, vd->vdev_physpath) == 0); + if (vd->vdev_fru != NULL) + VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_FRU, + vd->vdev_fru) == 0); + if (vd->vdev_nparity != 0) { ASSERT(strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_RAIDZ) == 0); @@ -242,8 +246,10 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, * into a crufty old storage pool. */ ASSERT(vd->vdev_nparity == 1 || - (vd->vdev_nparity == 2 && - spa_version(spa) >= SPA_VERSION_RAID6)); + (vd->vdev_nparity <= 2 && + spa_version(spa) >= SPA_VERSION_RAIDZ2) || + (vd->vdev_nparity <= 3 && + spa_version(spa) >= SPA_VERSION_RAIDZ3)); /* * Note that we'll add the nparity tag even on storage pools @@ -277,9 +283,13 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, vd->vdev_islog) == 0); } - if (vd->vdev_dtl.smo_object != 0) + if (vd->vdev_dtl_smo.smo_object != 0) VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_DTL, - vd->vdev_dtl.smo_object) == 0); + vd->vdev_dtl_smo.smo_object) == 0); + + if (vd->vdev_crtxg) + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, + vd->vdev_crtxg) == 0); if (getstats) { vdev_stat_t vs; @@ -292,6 +302,8 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, nvlist_t **child; int c; + ASSERT(!vd->vdev_ishole); + child = kmem_alloc(vd->vdev_children * sizeof (nvlist_t *), KM_SLEEP); @@ -308,6 +320,8 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, kmem_free(child, vd->vdev_children * sizeof (nvlist_t *)); } else { + const char *aux = NULL; + if (vd->vdev_offline && !vd->vdev_tmpoffline) VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE, B_TRUE) == 0); @@ -323,11 +337,67 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, if (vd->vdev_unspare) VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_UNSPARE, B_TRUE) == 0); + if (vd->vdev_ishole) + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_HOLE, + B_TRUE) == 0); + + switch (vd->vdev_stat.vs_aux) { + case VDEV_AUX_ERR_EXCEEDED: + aux = "err_exceeded"; + break; + + case VDEV_AUX_EXTERNAL: + aux = "external"; + break; + } + + if (aux != NULL) + VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_AUX_STATE, + aux) == 0); + + if (vd->vdev_splitting && vd->vdev_orig_guid != 0LL) { + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ORIG_GUID, + vd->vdev_orig_guid) == 0); + } } return (nv); } +/* + * Generate a view of the top-level vdevs. If we currently have holes + * in the namespace, then generate an array which contains a list of holey + * vdevs. Additionally, add the number of top-level children that currently + * exist. 
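For illustration, the two entries produced by the function that follows can be read back with ordinary libnvpair calls. A minimal, userland-style sketch (purely illustrative; "config" is assumed to be an nvlist generated below, and error handling is omitted):

static void
dump_hole_info(nvlist_t *config)
{
	uint64_t *holes;
	uint64_t children;
	uint_t nholes = 0;

	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
	    &children) == 0);

	/* The hole array is only added when at least one hole exists. */
	if (nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_HOLE_ARRAY,
	    &holes, &nholes) == 0) {
		for (uint_t i = 0; i < nholes; i++)
			(void) printf("child %llu of %llu is a hole\n",
			    (u_longlong_t)holes[i], (u_longlong_t)children);
	}
}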
+ */ +void +vdev_top_config_generate(spa_t *spa, nvlist_t *config) +{ + vdev_t *rvd = spa->spa_root_vdev; + uint64_t *array; + uint_t idx; + + array = kmem_alloc(rvd->vdev_children * sizeof (uint64_t), KM_SLEEP); + + idx = 0; + for (int c = 0; c < rvd->vdev_children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + + if (tvd->vdev_ishole) + array[idx++] = c; + } + + if (idx) { + VERIFY(nvlist_add_uint64_array(config, ZPOOL_CONFIG_HOLE_ARRAY, + array, idx) == 0); + } + + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN, + rvd->vdev_children) == 0); + + kmem_free(array, rvd->vdev_children * sizeof (uint64_t)); +} + nvlist_t * vdev_label_read_config(vdev_t *vd) { @@ -335,8 +405,8 @@ vdev_label_read_config(vdev_t *vd) nvlist_t *config = NULL; vdev_phys_t *vp; zio_t *zio; - int flags = - ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; + int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | + ZIO_FLAG_SPECULATIVE; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); @@ -345,6 +415,7 @@ vdev_label_read_config(vdev_t *vd) vp = zio_buf_alloc(sizeof (vdev_phys_t)); +retry: for (int l = 0; l < VDEV_LABELS; l++) { zio = zio_root(spa, NULL, NULL, flags); @@ -364,6 +435,11 @@ vdev_label_read_config(vdev_t *vd) } } + if (config == NULL && !(flags & ZIO_FLAG_TRYHARD)) { + flags |= ZIO_FLAG_TRYHARD; + goto retry; + } + zio_buf_free(vp, sizeof (vdev_phys_t)); return (config); @@ -488,7 +564,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) spa_t *spa = vd->vdev_spa; nvlist_t *label; vdev_phys_t *vp; - vdev_boot_header_t *vb; + char *pad2; uberblock_t *ub; zio_t *zio; char *buf; @@ -504,6 +580,9 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) crtxg, reason)) != 0) return (error); + /* Track the creation time for this vdev */ + vd->vdev_crtxg = crtxg; + if (!vd->vdev_ops->vdev_op_leaf) return (0); @@ -516,7 +595,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) /* * Determine if the vdev is in use. */ - if (reason != VDEV_LABEL_REMOVE && + if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPLIT && vdev_inuse(vd, crtxg, reason, &spare_guid, &l2cache_guid)) return (EBUSY); @@ -542,7 +621,8 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) */ if (reason == VDEV_LABEL_SPARE) return (0); - ASSERT(reason == VDEV_LABEL_REPLACE); + ASSERT(reason == VDEV_LABEL_REPLACE || + reason == VDEV_LABEL_SPLIT); } if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPARE && @@ -607,7 +687,11 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0); } else { - label = spa_config_generate(spa, vd, 0ULL, B_FALSE); + uint64_t txg = 0ULL; + + if (reason == VDEV_LABEL_SPLIT) + txg = spa->spa_uberblock.ub_txg; + label = spa_config_generate(spa, vd, txg, B_FALSE); /* * Add our creation time. This allows us to detect multiple @@ -629,27 +713,22 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) return (error == EFAULT ? ENAMETOOLONG : EINVAL); } - /* - * Initialize boot block header. - */ - vb = zio_buf_alloc(sizeof (vdev_boot_header_t)); - bzero(vb, sizeof (vdev_boot_header_t)); - vb->vb_magic = VDEV_BOOT_MAGIC; - vb->vb_version = VDEV_BOOT_VERSION; - vb->vb_offset = VDEV_BOOT_OFFSET; - vb->vb_size = VDEV_BOOT_SIZE; - /* * Initialize uberblock template. 
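For orientation, the label writes below touch a layout that, per my reading of the corresponding vdev_impl.h changes (not part of this hunk), looks as follows; treat the exact sizes as assumptions rather than something this patch states:

/*
 * Assumed layout of one 256K vdev label after this change:
 *
 *	vl_pad1		  8K	skipped, never written here
 *	vl_pad2		  8K	explicitly zeroed (old boot header area)
 *	vl_vdev_phys	112K	packed config nvlist
 *	vl_uberblock	128K	uberblock ring (VDEV_UBERBLOCK_RING)
 */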
*/ - ub = zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd)); - bzero(ub, VDEV_UBERBLOCK_SIZE(vd)); + ub = zio_buf_alloc(VDEV_UBERBLOCK_RING); + bzero(ub, VDEV_UBERBLOCK_RING); *ub = spa->spa_uberblock; ub->ub_txg = 0; + /* Initialize the 2nd padding area. */ + pad2 = zio_buf_alloc(VDEV_PAD_SIZE); + bzero(pad2, VDEV_PAD_SIZE); + /* * Write everything in parallel. */ +retry: zio = zio_root(spa, NULL, NULL, flags); for (int l = 0; l < VDEV_LABELS; l++) { @@ -658,22 +737,30 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t), NULL, NULL, flags); - vdev_label_write(zio, vd, l, vb, - offsetof(vdev_label_t, vl_boot_header), - sizeof (vdev_boot_header_t), NULL, NULL, flags); + /* + * Skip the 1st padding area. + * Zero out the 2nd padding area where it might have + * left over data from previous filesystem format. + */ + vdev_label_write(zio, vd, l, pad2, + offsetof(vdev_label_t, vl_pad2), + VDEV_PAD_SIZE, NULL, NULL, flags); - for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { - vdev_label_write(zio, vd, l, ub, - VDEV_UBERBLOCK_OFFSET(vd, n), - VDEV_UBERBLOCK_SIZE(vd), NULL, NULL, flags); - } + vdev_label_write(zio, vd, l, ub, + offsetof(vdev_label_t, vl_uberblock), + VDEV_UBERBLOCK_RING, NULL, NULL, flags); } error = zio_wait(zio); + if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) { + flags |= ZIO_FLAG_TRYHARD; + goto retry; + } + nvlist_free(label); - zio_buf_free(ub, VDEV_UBERBLOCK_SIZE(vd)); - zio_buf_free(vb, sizeof (vdev_boot_header_t)); + zio_buf_free(pad2, VDEV_PAD_SIZE); + zio_buf_free(ub, VDEV_UBERBLOCK_RING); zio_buf_free(vp, sizeof (vdev_phys_t)); /* @@ -730,6 +817,7 @@ vdev_uberblock_compare(uberblock_t *ub1, uberblock_t *ub2) static void vdev_uberblock_load_done(zio_t *zio) { + spa_t *spa = zio->io_spa; zio_t *rio = zio->io_private; uberblock_t *ub = zio->io_data; uberblock_t *ubbest = rio->io_private; @@ -738,7 +826,8 @@ vdev_uberblock_load_done(zio_t *zio) if (zio->io_error == 0 && uberblock_verify(ub) == 0) { mutex_enter(&rio->io_lock); - if (vdev_uberblock_compare(ub, ubbest) > 0) + if (ub->ub_txg <= spa->spa_load_max_txg && + vdev_uberblock_compare(ub, ubbest) > 0) *ubbest = *ub; mutex_exit(&rio->io_lock); } @@ -751,8 +840,8 @@ vdev_uberblock_load(zio_t *zio, vdev_t *vd, uberblock_t *ubbest) { spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; - int flags = - ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; + int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | + ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD; if (vd == rvd) { ASSERT(zio == NULL); @@ -955,7 +1044,10 @@ vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags) for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd)) { uint64_t *good_writes = kmem_zalloc(sizeof (uint64_t), KM_SLEEP); - zio_t *vio = zio_null(zio, spa, + + ASSERT(!vd->vdev_ishole); + + zio_t *vio = zio_null(zio, spa, NULL, (vd->vdev_islog || vd->vdev_aux != NULL) ? vdev_label_sync_ignore_done : vdev_label_sync_top_done, good_writes, flags); @@ -990,7 +1082,7 @@ vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags) * at any time, you can just call it again, and it will resume its work. 
*/ int -vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg) +vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg, boolean_t tryhard) { spa_t *spa = svd[0]->vdev_spa; uberblock_t *ub = &spa->spa_uberblock; @@ -999,6 +1091,16 @@ vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg) int error; int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; + /* + * Normally, we don't want to try too hard to write every label and + * uberblock. If there is a flaky disk, we don't want the rest of the + * sync process to block while we retry. But if we can't write a + * single label out, we should retry with ZIO_FLAG_TRYHARD before + * bailing out and declaring the pool faulted. + */ + if (tryhard) + flags |= ZIO_FLAG_TRYHARD; + ASSERT(ub->ub_txg <= txg); /* diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_mirror.c b/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_mirror.c index c4629ff45087c..ac2a9b0f4dddf 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_mirror.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_mirror.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -60,6 +60,11 @@ vdev_mirror_map_free(zio_t *zio) kmem_free(mm, offsetof(mirror_map_t, mm_child[mm->mm_children])); } +static const zio_vsd_ops_t vdev_mirror_vsd_ops = { + vdev_mirror_map_free, + zio_vsd_default_cksum_report +}; + static mirror_map_t * vdev_mirror_map_alloc(zio_t *zio) { @@ -117,28 +122,28 @@ vdev_mirror_map_alloc(zio_t *zio) } zio->io_vsd = mm; - zio->io_vsd_free = vdev_mirror_map_free; + zio->io_vsd_ops = &vdev_mirror_vsd_ops; return (mm); } static int vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) { - vdev_t *cvd; - uint64_t c; int numerrors = 0; - int ret, lasterror = 0; + int lasterror = 0; if (vd->vdev_children == 0) { vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; return (EINVAL); } - for (c = 0; c < vd->vdev_children; c++) { - cvd = vd->vdev_child[c]; + vdev_open_children(vd); - if ((ret = vdev_open(cvd)) != 0) { - lasterror = ret; + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if (cvd->vdev_open_error) { + lasterror = cvd->vdev_open_error; numerrors++; continue; } @@ -158,9 +163,7 @@ vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) static void vdev_mirror_close(vdev_t *vd) { - uint64_t c; - - for (c = 0; c < vd->vdev_children; c++) + for (int c = 0; c < vd->vdev_children; c++) vdev_close(vd->vdev_child[c]); } @@ -180,11 +183,16 @@ vdev_mirror_scrub_done(zio_t *zio) mirror_child_t *mc = zio->io_private; if (zio->io_error == 0) { - zio_t *pio = zio->io_parent; - mutex_enter(&pio->io_lock); - ASSERT3U(zio->io_size, >=, pio->io_size); - bcopy(zio->io_data, pio->io_data, pio->io_size); - mutex_exit(&pio->io_lock); + zio_t *pio; + + mutex_enter(&zio->io_lock); + while ((pio = zio_walk_parents(zio)) != NULL) { + mutex_enter(&pio->io_lock); + ASSERT3U(zio->io_size, >=, pio->io_size); + bcopy(zio->io_data, pio->io_data, pio->io_size); + mutex_exit(&pio->io_lock); + } + mutex_exit(&zio->io_lock); } zio_buf_free(zio->io_data, zio->io_size); @@ -206,7 +214,7 @@ vdev_mirror_child_select(zio_t *zio) uint64_t txg = zio->io_txg; int i, c; - ASSERT(zio->io_bp == NULL || zio->io_bp->blk_birth == txg); + ASSERT(zio->io_bp == NULL || BP_PHYSICAL_BIRTH(zio->io_bp) == txg); /* * Try to find a child whose DTL doesn't contain the block to read. 
@@ -225,7 +233,7 @@ vdev_mirror_child_select(zio_t *zio) mc->mc_skipped = 1; continue; } - if (!vdev_dtl_contains(&mc->mc_vd->vdev_dtl_map, txg, 1)) + if (!vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1)) return (c); mc->mc_error = ESTALE; mc->mc_skipped = 1; @@ -282,20 +290,10 @@ vdev_mirror_io_start(zio_t *zio) ASSERT(zio->io_type == ZIO_TYPE_WRITE); /* - * If this is a resilvering I/O to a replacing vdev, - * only the last child should be written -- unless the - * first child happens to have a DTL entry here as well. - * All other writes go to all children. + * Writes go to all children. */ - if ((zio->io_flags & ZIO_FLAG_RESILVER) && mm->mm_replacing && - !vdev_dtl_contains(&mm->mm_child[0].mc_vd->vdev_dtl_map, - zio->io_txg, 1)) { - c = mm->mm_children - 1; - children = 1; - } else { - c = 0; - children = mm->mm_children; - } + c = 0; + children = mm->mm_children; } while (children--) { @@ -398,7 +396,7 @@ vdev_mirror_io_done(zio_t *zio) ASSERT(zio->io_error != 0); } - if (good_copies && (spa_mode & FWRITE) && + if (good_copies && spa_writeable(zio->io_spa) && (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER) || ((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_replacing))) { @@ -419,7 +417,7 @@ vdev_mirror_io_done(zio_t *zio) if (mc->mc_tried) continue; if (!(zio->io_flags & ZIO_FLAG_SCRUB) && - !vdev_dtl_contains(&mc->mc_vd->vdev_dtl_map, + !vdev_dtl_contains(mc->mc_vd, DTL_PARTIAL, zio->io_txg, 1)) continue; mc->mc_error = ESTALE; @@ -429,7 +427,8 @@ vdev_mirror_io_done(zio_t *zio) mc->mc_vd, mc->mc_offset, zio->io_data, zio->io_size, ZIO_TYPE_WRITE, zio->io_priority, - ZIO_FLAG_IO_REPAIR, NULL, NULL)); + ZIO_FLAG_IO_REPAIR | (unexpected_errors ? + ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); } } } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_missing.c b/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_missing.c index 731f7d3dcec90..e1bf7d86a361f 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_missing.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_missing.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -48,8 +48,8 @@ vdev_missing_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) * VDEV_AUX_BAD_GUID_SUM. So we pretend to succeed, knowing that we * will fail the GUID sum check before ever trying to open the pool. */ - *psize = SPA_MINDEVSIZE; - *ashift = SPA_MINBLOCKSHIFT; + *psize = 0; + *ashift = 0; return (0); } @@ -83,3 +83,14 @@ vdev_ops_t vdev_missing_ops = { VDEV_TYPE_MISSING, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; + +vdev_ops_t vdev_hole_ops = { + vdev_missing_open, + vdev_missing_close, + vdev_default_asize, + vdev_missing_io_start, + vdev_missing_io_done, + NULL, + VDEV_TYPE_HOLE, /* name of this vdev type */ + B_TRUE /* leaf vdev */ +}; diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_queue.c b/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_queue.c index 46fca0e3b629f..5a0d3ee97029d 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_queue.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_queue.c @@ -19,12 +19,11 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #include -#include #include #include #include @@ -38,20 +37,24 @@ * of i/os pending to each device (before it starts ramping up to * max_pending). */ -int zfs_vdev_max_pending = 35; +int zfs_vdev_max_pending = 10; int zfs_vdev_min_pending = 4; -/* deadline = pri + (lbolt >> time_shift) */ +/* deadline = pri + ddi_get_lbolt64() >> time_shift) */ int zfs_vdev_time_shift = 6; /* exponential I/O issue ramp-up rate */ int zfs_vdev_ramp_rate = 2; /* - * i/os will be aggregated into a single large i/o up to - * zfs_vdev_aggregation_limit bytes long. + * To reduce IOPs, we aggregate small adjacent I/Os into one large I/O. + * For read I/Os, we also aggregate across small adjacency gaps; for writes + * we include spans of optional I/Os to aid aggregation at the disk even when + * they aren't able to help us aggregate at this level. */ int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE; +int zfs_vdev_read_gap_limit = 32 << 10; +int zfs_vdev_write_gap_limit = 4 << 10; /* * Virtual device vector for disk I/O scheduling. @@ -149,34 +152,36 @@ vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) static void vdev_queue_agg_io_done(zio_t *aio) { - zio_t *dio; - uint64_t offset = 0; + zio_t *pio; - while ((dio = aio->io_delegate_list) != NULL) { + while ((pio = zio_walk_parents(aio)) != NULL) if (aio->io_type == ZIO_TYPE_READ) - bcopy((char *)aio->io_data + offset, dio->io_data, - dio->io_size); - offset += dio->io_size; - aio->io_delegate_list = dio->io_delegate_next; - dio->io_delegate_next = NULL; - dio->io_error = aio->io_error; - zio_execute(dio); - } - ASSERT3U(offset, ==, aio->io_size); + bcopy((char *)aio->io_data + (pio->io_offset - + aio->io_offset), pio->io_data, pio->io_size); zio_buf_free(aio->io_data, aio->io_size); } -#define IS_ADJACENT(io, nio) \ - ((io)->io_offset + (io)->io_size == (nio)->io_offset) +/* + * Compute the range spanned by two i/os, which is the endpoint of the last + * (lio->io_offset + lio->io_size) minus start of the first (fio->io_offset). + * Conveniently, the gap between fio and lio is given by -IO_SPAN(lio, fio); + * thus fio and lio are adjacent if and only if IO_SPAN(lio, fio) == 0. + */ +#define IO_SPAN(fio, lio) ((lio)->io_offset + (lio)->io_size - (fio)->io_offset) +#define IO_GAP(fio, lio) (-IO_SPAN(lio, fio)) static zio_t * vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit) { - zio_t *fio, *lio, *aio, *dio; - avl_tree_t *tree; - uint64_t size; - + zio_t *fio, *lio, *aio, *dio, *nio, *mio; + avl_tree_t *t; + int flags; + uint64_t maxspan = zfs_vdev_aggregation_limit; + uint64_t maxgap; + int stretch; + +again: ASSERT(MUTEX_HELD(&vq->vq_lock)); if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit || @@ -185,58 +190,150 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit) fio = lio = avl_first(&vq->vq_deadline_tree); - tree = fio->io_vdev_tree; - size = fio->io_size; + t = fio->io_vdev_tree; + flags = fio->io_flags & ZIO_FLAG_AGG_INHERIT; + maxgap = (t == &vq->vq_read_tree) ? zfs_vdev_read_gap_limit : 0; + + if (!(flags & ZIO_FLAG_DONT_AGGREGATE)) { + /* + * We can aggregate I/Os that are sufficiently adjacent and of + * the same flavor, as expressed by the AGG_INHERIT flags. + * The latter requirement is necessary so that certain + * attributes of the I/O, such as whether it's a normal I/O + * or a scrub/resilver, can be preserved in the aggregate. + * We can include optional I/Os, but don't allow them + * to begin a range as they add no benefit in that situation. + */ + + /* + * We keep track of the last non-optional I/O. 
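A quick numeric check of the two macros, using hypothetical I/Os:

/*
 * Worked example.  fio: offset 0x1000, size 0x800
 *                  lio: offset 0x2000, size 0x400
 *
 *	IO_SPAN(fio, lio) = 0x2000 + 0x400 - 0x1000 = 0x1400
 *	IO_GAP(fio, lio)  = -(0x1000 + 0x800 - 0x2000) = 0x800
 *
 * The gap is zero (the I/Os are adjacent) exactly when fio ends where
 * lio begins.  For reads, gaps up to zfs_vdev_read_gap_limit (32K) are
 * still aggregated; the gap bytes are read into the aggregate buffer
 * and simply not copied to any parent.  For writes, as I read the
 * stretch logic below, trailing ZIO_FLAG_OPTIONAL I/Os are kept only
 * when they lead, with no further gaps, to a mandatory write that
 * starts within zfs_vdev_write_gap_limit (4K) of the end of the last
 * mandatory write already in the aggregate.
 */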
+ */ + mio = (fio->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : fio; + + /* + * Walk backwards through sufficiently contiguous I/Os + * recording the last non-option I/O. + */ + while ((dio = AVL_PREV(t, fio)) != NULL && + (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && + IO_SPAN(dio, lio) <= maxspan && + IO_GAP(dio, fio) <= maxgap) { + fio = dio; + if (mio == NULL && !(fio->io_flags & ZIO_FLAG_OPTIONAL)) + mio = fio; + } - while ((dio = AVL_PREV(tree, fio)) != NULL && IS_ADJACENT(dio, fio) && - !((dio->io_flags | fio->io_flags) & ZIO_FLAG_DONT_AGGREGATE) && - size + dio->io_size <= zfs_vdev_aggregation_limit) { - dio->io_delegate_next = fio; - fio = dio; - size += dio->io_size; - } + /* + * Skip any initial optional I/Os. + */ + while ((fio->io_flags & ZIO_FLAG_OPTIONAL) && fio != lio) { + fio = AVL_NEXT(t, fio); + ASSERT(fio != NULL); + } + + /* + * Walk forward through sufficiently contiguous I/Os. + */ + while ((dio = AVL_NEXT(t, lio)) != NULL && + (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && + IO_SPAN(fio, dio) <= maxspan && + IO_GAP(lio, dio) <= maxgap) { + lio = dio; + if (!(lio->io_flags & ZIO_FLAG_OPTIONAL)) + mio = lio; + } + + /* + * Now that we've established the range of the I/O aggregation + * we must decide what to do with trailing optional I/Os. + * For reads, there's nothing to do. While we are unable to + * aggregate further, it's possible that a trailing optional + * I/O would allow the underlying device to aggregate with + * subsequent I/Os. We must therefore determine if the next + * non-optional I/O is close enough to make aggregation + * worthwhile. + */ + stretch = B_FALSE; + if (t != &vq->vq_read_tree && mio != NULL) { + nio = lio; + while ((dio = AVL_NEXT(t, nio)) != NULL && + IO_GAP(nio, dio) == 0 && + IO_GAP(mio, dio) <= zfs_vdev_write_gap_limit) { + nio = dio; + if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) { + stretch = B_TRUE; + break; + } + } + } - while ((dio = AVL_NEXT(tree, lio)) != NULL && IS_ADJACENT(lio, dio) && - !((lio->io_flags | dio->io_flags) & ZIO_FLAG_DONT_AGGREGATE) && - size + dio->io_size <= zfs_vdev_aggregation_limit) { - lio->io_delegate_next = dio; - lio = dio; - size += dio->io_size; + if (stretch) { + /* This may be a no-op. 
*/ + VERIFY((dio = AVL_NEXT(t, lio)) != NULL); + dio->io_flags &= ~ZIO_FLAG_OPTIONAL; + } else { + while (lio != mio && lio != fio) { + ASSERT(lio->io_flags & ZIO_FLAG_OPTIONAL); + lio = AVL_PREV(t, lio); + ASSERT(lio != NULL); + } + } } if (fio != lio) { - char *buf = zio_buf_alloc(size); - uint64_t offset = 0; - + uint64_t size = IO_SPAN(fio, lio); ASSERT(size <= zfs_vdev_aggregation_limit); aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset, - buf, size, fio->io_type, ZIO_PRIORITY_NOW, - ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, + zio_buf_alloc(size), size, fio->io_type, ZIO_PRIORITY_AGG, + flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, vdev_queue_agg_io_done, NULL); - aio->io_delegate_list = fio; - - for (dio = fio; dio != NULL; dio = dio->io_delegate_next) { + nio = fio; + do { + dio = nio; + nio = AVL_NEXT(t, dio); ASSERT(dio->io_type == aio->io_type); - ASSERT(dio->io_vdev_tree == tree); - if (dio->io_type == ZIO_TYPE_WRITE) - bcopy(dio->io_data, buf + offset, dio->io_size); - offset += dio->io_size; + ASSERT(dio->io_vdev_tree == t); + + if (dio->io_flags & ZIO_FLAG_NODATA) { + ASSERT(dio->io_type == ZIO_TYPE_WRITE); + bzero((char *)aio->io_data + (dio->io_offset - + aio->io_offset), dio->io_size); + } else if (dio->io_type == ZIO_TYPE_WRITE) { + bcopy(dio->io_data, (char *)aio->io_data + + (dio->io_offset - aio->io_offset), + dio->io_size); + } + + zio_add_child(dio, aio); vdev_queue_io_remove(vq, dio); zio_vdev_io_bypass(dio); - } - - ASSERT(offset == size); + zio_execute(dio); + } while (dio != lio); avl_add(&vq->vq_pending_tree, aio); return (aio); } - ASSERT(fio->io_vdev_tree == tree); + ASSERT(fio->io_vdev_tree == t); vdev_queue_io_remove(vq, fio); + /* + * If the I/O is or was optional and therefore has no data, we need to + * simply discard it. We need to drop the vdev queue's lock to avoid a + * deadlock that we could encounter since this I/O will complete + * immediately. + */ + if (fio->io_flags & ZIO_FLAG_NODATA) { + mutex_exit(&vq->vq_lock); + zio_vdev_io_bypass(fio); + zio_execute(fio); + mutex_enter(&vq->vq_lock); + goto again; + } + avl_add(&vq->vq_pending_tree, fio); return (fio); @@ -262,7 +359,8 @@ vdev_queue_io(zio_t *zio) mutex_enter(&vq->vq_lock); - zio->io_deadline = (lbolt64 >> zfs_vdev_time_shift) + zio->io_priority; + zio->io_deadline = (ddi_get_lbolt64() >> zfs_vdev_time_shift) + + zio->io_priority; vdev_queue_io_add(vq, zio); diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_raidz.c b/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_raidz.c index 69e314468ee47..aa031dd25bd48 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_raidz.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_raidz.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -35,12 +35,27 @@ /* * Virtual device vector for RAID-Z. * - * This vdev supports both single and double parity. For single parity, we - * use a simple XOR of all the data columns. For double parity, we use both - * the simple XOR as well as a technique described in "The mathematics of - * RAID-6" by H. Peter Anvin. This technique defines a Galois field, GF(2^8), - * over the integers expressable in a single byte. Briefly, the operations on - * the field are defined as follows: + * This vdev supports single, double, and triple parity. For single parity, + * we use a simple XOR of all the data columns. 
For double or triple parity, + * we use a special case of Reed-Solomon coding. This extends the + * technique described in "The mathematics of RAID-6" by H. Peter Anvin by + * drawing on the system described in "A Tutorial on Reed-Solomon Coding for + * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the + * former is also based. The latter is designed to provide higher performance + * for writes. + * + * Note that the Plank paper claimed to support arbitrary N+M, but was then + * amended six years later identifying a critical flaw that invalidates its + * claims. Nevertheless, the technique can be adapted to work for up to + * triple parity. For additional parity, the amendment "Note: Correction to + * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding + * is viable, but the additional complexity means that write performance will + * suffer. + * + * All of the methods above operate on a Galois field, defined over the + * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements + * can be expressed with a single byte. Briefly, the operations on the + * field are defined as follows: * * o addition (+) is represented by a bitwise XOR * o subtraction (-) is therefore identical to addition: A + B = A - B @@ -55,22 +70,32 @@ * (A * 2)_0 = A_7 * * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)). + * As an aside, this multiplication is derived from the error correcting + * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1. * * Observe that any number in the field (except for 0) can be expressed as a * power of 2 -- a generator for the field. We store a table of the powers of * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather - * than field addition). The inverse of a field element A (A^-1) is A^254. + * than field addition). The inverse of a field element A (A^-1) is therefore + * A ^ (255 - 1) = A^254. * - * The two parity columns, P and Q, over several data columns, D_0, ... D_n-1, - * can be expressed by field operations: + * The up-to-three parity columns, P, Q, R over several data columns, + * D_0, ... D_n-1, can be expressed by field operations: * * P = D_0 + D_1 + ... + D_n-2 + D_n-1 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1 + * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1 + * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1 * - * See the reconstruction code below for how P and Q can used individually or - * in concert to recover missing data columns. + * We chose 1, 2, and 4 as our generators because 1 corresponds to the trival + * XOR operation, and 2 and 4 can be computed quickly and generate linearly- + * independent coefficients. (There are no additional coefficients that have + * this property which is why the uncorrected Plank method breaks down.) + * + * See the reconstruction code below for how P, Q and R can used individually + * or in concert to recover missing data columns. */ typedef struct raidz_col { @@ -78,27 +103,60 @@ typedef struct raidz_col { uint64_t rc_offset; /* device offset */ uint64_t rc_size; /* I/O size */ void *rc_data; /* I/O data */ + void *rc_gdata; /* used to store the "good" version */ int rc_error; /* I/O error for this device */ uint8_t rc_tried; /* Did we attempt this I/O column? */ uint8_t rc_skipped; /* Did we skip this I/O column? 
*/ } raidz_col_t; typedef struct raidz_map { - uint64_t rm_cols; /* Column count */ + uint64_t rm_cols; /* Regular column count */ + uint64_t rm_scols; /* Count including skipped columns */ uint64_t rm_bigcols; /* Number of oversized columns */ uint64_t rm_asize; /* Actual total I/O size */ uint64_t rm_missingdata; /* Count of missing data devices */ uint64_t rm_missingparity; /* Count of missing parity devices */ uint64_t rm_firstdatacol; /* First data column/parity count */ + uint64_t rm_nskip; /* Skipped sectors for padding */ + uint64_t rm_skipstart; /* Column index of padding start */ + void *rm_datacopy; /* rm_asize-buffer of copied data */ + uintptr_t rm_reports; /* # of referencing checksum reports */ + uint8_t rm_freed; /* map no longer has referencing ZIO */ + uint8_t rm_ecksuminjected; /* checksum error was injected */ raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ } raidz_map_t; #define VDEV_RAIDZ_P 0 #define VDEV_RAIDZ_Q 1 +#define VDEV_RAIDZ_R 2 + +#define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0)) +#define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x))) + +/* + * We provide a mechanism to perform the field multiplication operation on a + * 64-bit value all at once rather than a byte at a time. This works by + * creating a mask from the top bit in each byte and using that to + * conditionally apply the XOR of 0x1d. + */ +#define VDEV_RAIDZ_64MUL_2(x, mask) \ +{ \ + (mask) = (x) & 0x8080808080808080ULL; \ + (mask) = ((mask) << 1) - ((mask) >> 7); \ + (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \ + ((mask) & 0x1d1d1d1d1d1d1d1d); \ +} -#define VDEV_RAIDZ_MAXPARITY 2 +#define VDEV_RAIDZ_64MUL_4(x, mask) \ +{ \ + VDEV_RAIDZ_64MUL_2((x), mask); \ + VDEV_RAIDZ_64MUL_2((x), mask); \ +} -#define VDEV_RAIDZ_MUL_2(a) (((a) << 1) ^ (((a) & 0x80) ? 0x1d : 0)) +/* + * Force reconstruction to use the general purpose method. + */ +int vdev_raidz_default_to_general; /* * These two tables represent powers and logs of 2 in the Galois field defined @@ -173,6 +231,8 @@ static const uint8_t vdev_raidz_log2[256] = { 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf, }; +static void vdev_raidz_generate_parity(raidz_map_t *rm); + /* * Multiply a given number by 2 raised to the given power. 
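As a concrete check of the field arithmetic described above, the following stand-alone snippet (illustrative only, not part of the driver; the data bytes are arbitrary) computes P and Q for three one-byte columns and verifies that the 64-bit macro is just the byte-at-a-time multiply applied to eight packed bytes:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Copied from the definition above. */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

/* Multiply by 2 in GF(2^8), one byte at a time, as described above. */
static uint8_t
gf_mul2(uint8_t a)
{
	return ((a << 1) ^ ((a & 0x80) ? 0x1d : 0));
}

int
main(void)
{
	uint8_t d[3] = { 0x01, 0x02, 0x04 };	/* hypothetical data columns */
	uint8_t b[8] = { 0x00, 0x01, 0x7f, 0x80, 0x8e, 0xc3, 0xff, 0x55 };
	uint64_t x, mask;

	/* P is the plain XOR; Q is Horner-evaluated: (D0*2 + D1)*2 + D2. */
	uint8_t p = d[0] ^ d[1] ^ d[2];
	uint8_t q = gf_mul2(gf_mul2(d[0]) ^ d[1]) ^ d[2];
	(void) printf("P = %02x, Q = %02x\n", p, q);	/* P = 07, Q = 04 */

	/* The 64-bit macro matches gf_mul2() in every byte lane. */
	(void) memcpy(&x, b, sizeof (x));
	VDEV_RAIDZ_64MUL_2(x, mask);
	for (int i = 0; i < 8; i++)
		assert(((uint8_t *)&x)[i] == gf_mul2(b[i]));

	return (0);
}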
*/ @@ -193,17 +253,184 @@ vdev_raidz_exp2(uint_t a, int exp) } static void -vdev_raidz_map_free(zio_t *zio) +vdev_raidz_map_free(raidz_map_t *rm) { - raidz_map_t *rm = zio->io_vsd; int c; + size_t size; - for (c = 0; c < rm->rm_firstdatacol; c++) + for (c = 0; c < rm->rm_firstdatacol; c++) { zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); - kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols])); + if (rm->rm_col[c].rc_gdata != NULL) + zio_buf_free(rm->rm_col[c].rc_gdata, + rm->rm_col[c].rc_size); + } + + size = 0; + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) + size += rm->rm_col[c].rc_size; + + if (rm->rm_datacopy != NULL) + zio_buf_free(rm->rm_datacopy, size); + + kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])); +} + +static void +vdev_raidz_map_free_vsd(zio_t *zio) +{ + raidz_map_t *rm = zio->io_vsd; + + ASSERT3U(rm->rm_freed, ==, 0); + rm->rm_freed = 1; + + if (rm->rm_reports == 0) + vdev_raidz_map_free(rm); +} + +/*ARGSUSED*/ +static void +vdev_raidz_cksum_free(void *arg, size_t ignored) +{ + raidz_map_t *rm = arg; + + ASSERT3U(rm->rm_reports, >, 0); + + if (--rm->rm_reports == 0 && rm->rm_freed != 0) + vdev_raidz_map_free(rm); +} + +static void +vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data) +{ + raidz_map_t *rm = zcr->zcr_cbdata; + size_t c = zcr->zcr_cbinfo; + size_t x; + + const char *good = NULL; + const char *bad = rm->rm_col[c].rc_data; + + if (good_data == NULL) { + zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE); + return; + } + + if (c < rm->rm_firstdatacol) { + /* + * The first time through, calculate the parity blocks for + * the good data (this relies on the fact that the good + * data never changes for a given logical ZIO) + */ + if (rm->rm_col[0].rc_gdata == NULL) { + char *bad_parity[VDEV_RAIDZ_MAXPARITY]; + char *buf; + + /* + * Set up the rm_col[]s to generate the parity for + * good_data, first saving the parity bufs and + * replacing them with buffers to hold the result. + */ + for (x = 0; x < rm->rm_firstdatacol; x++) { + bad_parity[x] = rm->rm_col[x].rc_data; + rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata = + zio_buf_alloc(rm->rm_col[x].rc_size); + } + + /* fill in the data columns from good_data */ + buf = (char *)good_data; + for (; x < rm->rm_cols; x++) { + rm->rm_col[x].rc_data = buf; + buf += rm->rm_col[x].rc_size; + } + + /* + * Construct the parity from the good data. + */ + vdev_raidz_generate_parity(rm); + + /* restore everything back to its original state */ + for (x = 0; x < rm->rm_firstdatacol; x++) + rm->rm_col[x].rc_data = bad_parity[x]; + + buf = rm->rm_datacopy; + for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) { + rm->rm_col[x].rc_data = buf; + buf += rm->rm_col[x].rc_size; + } + } + + ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL); + good = rm->rm_col[c].rc_gdata; + } else { + /* adjust good_data to point at the start of our column */ + good = good_data; + + for (x = rm->rm_firstdatacol; x < c; x++) + good += rm->rm_col[x].rc_size; + } + + /* we drop the ereport if it ends up that the data was good */ + zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE); +} + +/* + * Invoked indirectly by zfs_ereport_start_checksum(), called + * below when our read operation fails completely. The main point + * is to keep a copy of everything we read from disk, so that at + * vdev_raidz_cksum_finish() time we can compare it with the good data. 
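To summarize the resulting object lifetime (my reading of the functions above and below, not an authoritative statement):

/*
 * raidz_map_t lifetime with checksum reports outstanding (sketch):
 *
 *	vdev_raidz_cksum_report()	rm_reports++, copy data aside
 *	vdev_raidz_map_free_vsd()	rm_freed = 1; frees immediately
 *					only if rm_reports == 0
 *	vdev_raidz_cksum_free()		--rm_reports; frees when it hits 0
 *					and rm_freed is already set
 *
 * Whichever of the ZIO and the last referencing checksum report is
 * torn down second ends up calling vdev_raidz_map_free().
 */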
+ */ +static void +vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) +{ + size_t c = (size_t)(uintptr_t)arg; + caddr_t buf; + + raidz_map_t *rm = zio->io_vsd; + size_t size; + + /* set up the report and bump the refcount */ + zcr->zcr_cbdata = rm; + zcr->zcr_cbinfo = c; + zcr->zcr_finish = vdev_raidz_cksum_finish; + zcr->zcr_free = vdev_raidz_cksum_free; + + rm->rm_reports++; + ASSERT3U(rm->rm_reports, >, 0); + + if (rm->rm_datacopy != NULL) + return; + + /* + * It's the first time we're called for this raidz_map_t, so we need + * to copy the data aside; there's no guarantee that our zio's buffer + * won't be re-used for something else. + * + * Our parity data is already in separate buffers, so there's no need + * to copy them. + */ + + size = 0; + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) + size += rm->rm_col[c].rc_size; + + buf = rm->rm_datacopy = zio_buf_alloc(size); + + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + raidz_col_t *col = &rm->rm_col[c]; + + bcopy(col->rc_data, buf, col->rc_size); + col->rc_data = buf; + + buf += col->rc_size; + } + ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size); } +static const zio_vsd_ops_t vdev_raidz_vsd_ops = { + vdev_raidz_map_free_vsd, + vdev_raidz_cksum_report +}; + static raidz_map_t * vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, uint64_t nparity) @@ -213,24 +440,40 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, uint64_t s = zio->io_size >> unit_shift; uint64_t f = b % dcols; uint64_t o = (b / dcols) << unit_shift; - uint64_t q, r, c, bc, col, acols, coff, devidx; + uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; q = s / (dcols - nparity); r = s - q * (dcols - nparity); bc = (r == 0 ? 0 : r + nparity); + tot = s + nparity * (q + (r == 0 ? 0 : 1)); + + if (q == 0) { + acols = bc; + scols = MIN(dcols, roundup(bc, nparity + 1)); + } else { + acols = dcols; + scols = dcols; + } - acols = (q == 0 ? 
bc : dcols); + ASSERT3U(acols, <=, scols); - rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP); + rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP); rm->rm_cols = acols; + rm->rm_scols = scols; rm->rm_bigcols = bc; - rm->rm_asize = 0; + rm->rm_skipstart = bc; rm->rm_missingdata = 0; rm->rm_missingparity = 0; rm->rm_firstdatacol = nparity; + rm->rm_datacopy = NULL; + rm->rm_reports = 0; + rm->rm_freed = 0; + rm->rm_ecksuminjected = 0; - for (c = 0; c < acols; c++) { + asize = 0; + + for (c = 0; c < scols; c++) { col = f + c; coff = o; if (col >= dcols) { @@ -239,15 +482,27 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, } rm->rm_col[c].rc_devidx = col; rm->rm_col[c].rc_offset = coff; - rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift; rm->rm_col[c].rc_data = NULL; + rm->rm_col[c].rc_gdata = NULL; rm->rm_col[c].rc_error = 0; rm->rm_col[c].rc_tried = 0; rm->rm_col[c].rc_skipped = 0; - rm->rm_asize += rm->rm_col[c].rc_size; + + if (c >= acols) + rm->rm_col[c].rc_size = 0; + else if (c < bc) + rm->rm_col[c].rc_size = (q + 1) << unit_shift; + else + rm->rm_col[c].rc_size = q << unit_shift; + + asize += rm->rm_col[c].rc_size; } - rm->rm_asize = roundup(rm->rm_asize, (nparity + 1) << unit_shift); + ASSERT3U(asize, ==, tot << unit_shift); + rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift); + rm->rm_nskip = roundup(tot, nparity + 1) - tot; + ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift); + ASSERT3U(rm->rm_nskip, <=, nparity); for (c = 0; c < rm->rm_firstdatacol; c++) rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size); @@ -272,6 +527,11 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, * Unfortunately, this decision created an implicit on-disk format * requirement that we need to support for all eternity, but only * for single-parity RAID-Z. + * + * If we intend to skip a sector in the zeroth column for padding + * we must make sure to note this swap. We will never intend to + * skip the first column since at least one data and one parity + * column must appear in each row. 
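A worked example may make the geometry above easier to follow; the vdev parameters are hypothetical:

/*
 * Example: 5-wide raidz1 (dcols = 5, nparity = 1), 512-byte sectors
 * (unit_shift = 9), and a 1K write, i.e. s = 2 sectors:
 *
 *	q     = s / (dcols - nparity)        = 2 / 4 = 0
 *	r     = s - q * (dcols - nparity)    = 2
 *	bc    = r + nparity                  = 3
 *	tot   = s + nparity * (q + 1)        = 3
 *	acols = bc                           = 3  (P plus two data columns)
 *	scols = MIN(dcols, roundup(bc, 2))   = 4  (one zero-size skip column)
 *
 *	asize    = 3 * 512                   = 1536
 *	rm_asize = roundup(1536, 2 * 512)    = 2048
 *	rm_nskip = roundup(3, 2) - 3         = 1  (one padding sector)
 *
 * The padding rounds each allocation up to a multiple of (nparity + 1)
 * sectors, which, as I read it, keeps free space from being chopped
 * into runs too small to hold one data sector plus its parity.
 */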
*/ ASSERT(rm->rm_cols >= 2); ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); @@ -283,10 +543,13 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; rm->rm_col[1].rc_devidx = devidx; rm->rm_col[1].rc_offset = o; + + if (rm->rm_skipstart == 0) + rm->rm_skipstart = 1; } zio->io_vsd = rm; - zio->io_vsd_free = vdev_raidz_map_free; + zio->io_vsd_ops = &vdev_raidz_vsd_ops; return (rm); } @@ -305,12 +568,12 @@ vdev_raidz_generate_parity_p(raidz_map_t *rm) if (c == rm->rm_firstdatacol) { ASSERT(ccount == pcount); - for (i = 0; i < ccount; i++, p++, src++) { + for (i = 0; i < ccount; i++, src++, p++) { *p = *src; } } else { ASSERT(ccount <= pcount); - for (i = 0; i < ccount; i++, p++, src++) { + for (i = 0; i < ccount; i++, src++, p++) { *p ^= *src; } } @@ -320,10 +583,10 @@ vdev_raidz_generate_parity_p(raidz_map_t *rm) static void vdev_raidz_generate_parity_pq(raidz_map_t *rm) { - uint64_t *q, *p, *src, pcount, ccount, mask, i; + uint64_t *p, *q, *src, pcnt, ccnt, mask, i; int c; - pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); + pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == rm->rm_col[VDEV_RAIDZ_Q].rc_size); @@ -331,55 +594,138 @@ vdev_raidz_generate_parity_pq(raidz_map_t *rm) src = rm->rm_col[c].rc_data; p = rm->rm_col[VDEV_RAIDZ_P].rc_data; q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; - ccount = rm->rm_col[c].rc_size / sizeof (src[0]); + + ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); if (c == rm->rm_firstdatacol) { - ASSERT(ccount == pcount || ccount == 0); - for (i = 0; i < ccount; i++, p++, q++, src++) { - *q = *src; + ASSERT(ccnt == pcnt || ccnt == 0); + for (i = 0; i < ccnt; i++, src++, p++, q++) { *p = *src; + *q = *src; } - for (; i < pcount; i++, p++, q++, src++) { - *q = 0; + for (; i < pcnt; i++, src++, p++, q++) { *p = 0; + *q = 0; } } else { - ASSERT(ccount <= pcount); + ASSERT(ccnt <= pcnt); /* - * Rather than multiplying each byte individually (as - * described above), we are able to handle 8 at once - * by generating a mask based on the high bit in each - * byte and using that to conditionally XOR in 0x1d. + * Apply the algorithm described above by multiplying + * the previous result and adding in the new value. */ - for (i = 0; i < ccount; i++, p++, q++, src++) { - mask = *q & 0x8080808080808080ULL; - mask = (mask << 1) - (mask >> 7); - *q = ((*q << 1) & 0xfefefefefefefefeULL) ^ - (mask & 0x1d1d1d1d1d1d1d1dULL); + for (i = 0; i < ccnt; i++, src++, p++, q++) { + *p ^= *src; + + VDEV_RAIDZ_64MUL_2(*q, mask); *q ^= *src; + } + + /* + * Treat short columns as though they are full of 0s. + * Note that there's therefore nothing needed for P. 
+ */ + for (; i < pcnt; i++, q++) { + VDEV_RAIDZ_64MUL_2(*q, mask); + } + } + } +} + +static void +vdev_raidz_generate_parity_pqr(raidz_map_t *rm) +{ + uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i; + int c; + + pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); + ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == + rm->rm_col[VDEV_RAIDZ_Q].rc_size); + ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == + rm->rm_col[VDEV_RAIDZ_R].rc_size); + + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + src = rm->rm_col[c].rc_data; + p = rm->rm_col[VDEV_RAIDZ_P].rc_data; + q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; + r = rm->rm_col[VDEV_RAIDZ_R].rc_data; + + ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); + + if (c == rm->rm_firstdatacol) { + ASSERT(ccnt == pcnt || ccnt == 0); + for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { + *p = *src; + *q = *src; + *r = *src; + } + for (; i < pcnt; i++, src++, p++, q++, r++) { + *p = 0; + *q = 0; + *r = 0; + } + } else { + ASSERT(ccnt <= pcnt); + + /* + * Apply the algorithm described above by multiplying + * the previous result and adding in the new value. + */ + for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { *p ^= *src; + + VDEV_RAIDZ_64MUL_2(*q, mask); + *q ^= *src; + + VDEV_RAIDZ_64MUL_4(*r, mask); + *r ^= *src; } /* * Treat short columns as though they are full of 0s. + * Note that there's therefore nothing needed for P. */ - for (; i < pcount; i++, q++) { - mask = *q & 0x8080808080808080ULL; - mask = (mask << 1) - (mask >> 7); - *q = ((*q << 1) & 0xfefefefefefefefeULL) ^ - (mask & 0x1d1d1d1d1d1d1d1dULL); + for (; i < pcnt; i++, q++, r++) { + VDEV_RAIDZ_64MUL_2(*q, mask); + VDEV_RAIDZ_64MUL_4(*r, mask); } } } } +/* + * Generate RAID parity in the first virtual columns according to the number of + * parity columns available. + */ static void -vdev_raidz_reconstruct_p(raidz_map_t *rm, int x) +vdev_raidz_generate_parity(raidz_map_t *rm) +{ + switch (rm->rm_firstdatacol) { + case 1: + vdev_raidz_generate_parity_p(rm); + break; + case 2: + vdev_raidz_generate_parity_pq(rm); + break; + case 3: + vdev_raidz_generate_parity_pqr(rm); + break; + default: + cmn_err(CE_PANIC, "invalid RAID-Z configuration"); + } +} + +static int +vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts) { uint64_t *dst, *src, xcount, ccount, count, i; + int x = tgts[0]; int c; + ASSERT(ntgts == 1); + ASSERT(x >= rm->rm_firstdatacol); + ASSERT(x < rm->rm_cols); + xcount = rm->rm_col[x].rc_size / sizeof (src[0]); ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0])); ASSERT(xcount > 0); @@ -404,15 +750,20 @@ vdev_raidz_reconstruct_p(raidz_map_t *rm, int x) *dst ^= *src; } } + + return (1 << VDEV_RAIDZ_P); } -static void -vdev_raidz_reconstruct_q(raidz_map_t *rm, int x) +static int +vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts) { uint64_t *dst, *src, xcount, ccount, count, mask, i; uint8_t *b; + int x = tgts[0]; int c, j, exp; + ASSERT(ntgts == 1); + xcount = rm->rm_col[x].rc_size / sizeof (src[0]); ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0])); @@ -436,23 +787,13 @@ vdev_raidz_reconstruct_q(raidz_map_t *rm, int x) } } else { - /* - * For an explanation of this, see the comment in - * vdev_raidz_generate_parity_pq() above. 
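The open-coded doubling removed in this hunk is what the new VDEV_RAIDZ_64MUL_2()/VDEV_RAIDZ_64MUL_4() macros stand for. Their definitions are not shown in this hunk, so the following is only a sketch reconstructed from the deleted open-coded lines:

/*
 * Multiply the eight packed bytes of a 64-bit word by 2 in GF(2^8),
 * reducing with the 0x11d polynomial; the multiply-by-4 form is simply
 * two doublings.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}

This is the per-word step the Q parity column needs once per data column; applying it twice gives the multiply-by-4 used for the new R column.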
- */ for (i = 0; i < count; i++, dst++, src++) { - mask = *dst & 0x8080808080808080ULL; - mask = (mask << 1) - (mask >> 7); - *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^ - (mask & 0x1d1d1d1d1d1d1d1dULL); + VDEV_RAIDZ_64MUL_2(*dst, mask); *dst ^= *src; } for (; i < xcount; i++, dst++) { - mask = *dst & 0x8080808080808080ULL; - mask = (mask << 1) - (mask >> 7); - *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^ - (mask & 0x1d1d1d1d1d1d1d1dULL); + VDEV_RAIDZ_64MUL_2(*dst, mask); } } } @@ -467,15 +808,20 @@ vdev_raidz_reconstruct_q(raidz_map_t *rm, int x) *b = vdev_raidz_exp2(*b, exp); } } + + return (1 << VDEV_RAIDZ_Q); } -static void -vdev_raidz_reconstruct_pq(raidz_map_t *rm, int x, int y) +static int +vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) { uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp; void *pdata, *qdata; uint64_t xsize, ysize, i; + int x = tgts[0]; + int y = tgts[1]; + ASSERT(ntgts == 2); ASSERT(x < y); ASSERT(x >= rm->rm_firstdatacol); ASSERT(y < rm->rm_cols); @@ -553,15 +899,554 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int x, int y) */ rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata; rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata; + + return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q)); +} + +/* BEGIN CSTYLED */ +/* + * In the general case of reconstruction, we must solve the system of linear + * equations defined by the coeffecients used to generate parity as well as + * the contents of the data and parity disks. This can be expressed with + * vectors for the original data (D) and the actual data (d) and parity (p) + * and a matrix composed of the identity matrix (I) and a dispersal matrix (V): + * + * __ __ __ __ + * | | __ __ | p_0 | + * | V | | D_0 | | p_m-1 | + * | | x | : | = | d_0 | + * | I | | D_n-1 | | : | + * | | ~~ ~~ | d_n-1 | + * ~~ ~~ ~~ ~~ + * + * I is simply a square identity matrix of size n, and V is a vandermonde + * matrix defined by the coeffecients we chose for the various parity columns + * (1, 2, 4). Note that these values were chosen both for simplicity, speedy + * computation as well as linear separability. + * + * __ __ __ __ + * | 1 .. 1 1 1 | | p_0 | + * | 2^n-1 .. 4 2 1 | __ __ | : | + * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 | + * | 1 .. 0 0 0 | | D_1 | | d_0 | + * | 0 .. 0 0 0 | x | D_2 | = | d_1 | + * | : : : : | | : | | d_2 | + * | 0 .. 1 0 0 | | D_n-1 | | : | + * | 0 .. 0 1 0 | ~~ ~~ | : | + * | 0 .. 0 0 1 | | d_n-1 | + * ~~ ~~ ~~ ~~ + * + * Note that I, V, d, and p are known. To compute D, we must invert the + * matrix and use the known data and parity values to reconstruct the unknown + * data values. We begin by removing the rows in V|I and d|p that correspond + * to failed or missing columns; we then make V|I square (n x n) and d|p + * sized n by removing rows corresponding to unused parity from the bottom up + * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)' + * using Gauss-Jordan elimination. 
In the example below we use m=3 parity + * columns, n=8 data columns, with errors in d_1, d_2, and p_1: + * __ __ + * | 1 1 1 1 1 1 1 1 | + * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks + * | 19 205 116 29 64 16 4 1 | / / + * | 1 0 0 0 0 0 0 0 | / / + * | 0 1 0 0 0 0 0 0 | <--' / + * (V|I) = | 0 0 1 0 0 0 0 0 | <---' + * | 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * __ __ + * | 1 1 1 1 1 1 1 1 | + * | 128 64 32 16 8 4 2 1 | + * | 19 205 116 29 64 16 4 1 | + * | 1 0 0 0 0 0 0 0 | + * | 0 1 0 0 0 0 0 0 | + * (V|I)' = | 0 0 1 0 0 0 0 0 | + * | 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * + * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We + * have carefully chosen the seed values 1, 2, and 4 to ensure that this + * matrix is not singular. + * __ __ + * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | + * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | + * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | + * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * __ __ + * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | + * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | + * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | + * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * __ __ + * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | + * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | + * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 | + * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * __ __ + * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | + * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | + * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 | + * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * __ __ + * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | + * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | + * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | + * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * __ __ + * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | + * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 | + * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | + * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * __ __ + * | 0 0 1 0 0 0 0 0 | + * | 167 100 5 41 159 169 217 208 | + * | 166 100 4 40 158 168 216 209 | + * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * + * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values + * of the missing data. 
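As a concrete check on the rows above: the second row holds powers of 2 in GF(2^8) (2^7 ... 2^0 = 128, 64, 32, 16, 8, 4, 2, 1 for n = 8) and the third row holds powers of 4. Because 2^8 reduces to 0x1d = 29 under the 0x11d polynomial, 4^4 = 29, 4^5 = 116, 4^6 = 205 and 4^7 = 19, which is where the otherwise odd-looking row 19 205 116 29 64 16 4 1 comes from; vdev_raidz_matrix_init() below regenerates these entries from vdev_raidz_pow2[] rather than storing them.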
+ * + * As is apparent from the example above, the only non-trivial rows in the + * inverse matrix correspond to the data disks that we're trying to + * reconstruct. Indeed, those are the only rows we need as the others would + * only be useful for reconstructing data known or assumed to be valid. For + * that reason, we only build the coefficients in the rows that correspond to + * targeted columns. + */ +/* END CSTYLED */ + +static void +vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map, + uint8_t **rows) +{ + int i, j; + int pow; + + ASSERT(n == rm->rm_cols - rm->rm_firstdatacol); + + /* + * Fill in the missing rows of interest. + */ + for (i = 0; i < nmap; i++) { + ASSERT3S(0, <=, map[i]); + ASSERT3S(map[i], <=, 2); + + pow = map[i] * n; + if (pow > 255) + pow -= 255; + ASSERT(pow <= 255); + + for (j = 0; j < n; j++) { + pow -= map[i]; + if (pow < 0) + pow += 255; + rows[i][j] = vdev_raidz_pow2[pow]; + } + } +} + +static void +vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, + uint8_t **rows, uint8_t **invrows, const uint8_t *used) +{ + int i, j, ii, jj; + uint8_t log; + + /* + * Assert that the first nmissing entries from the array of used + * columns correspond to parity columns and that subsequent entries + * correspond to data columns. + */ + for (i = 0; i < nmissing; i++) { + ASSERT3S(used[i], <, rm->rm_firstdatacol); + } + for (; i < n; i++) { + ASSERT3S(used[i], >=, rm->rm_firstdatacol); + } + + /* + * First initialize the storage where we'll compute the inverse rows. + */ + for (i = 0; i < nmissing; i++) { + for (j = 0; j < n; j++) { + invrows[i][j] = (i == j) ? 1 : 0; + } + } + + /* + * Subtract all trivial rows from the rows of consequence. + */ + for (i = 0; i < nmissing; i++) { + for (j = nmissing; j < n; j++) { + ASSERT3U(used[j], >=, rm->rm_firstdatacol); + jj = used[j] - rm->rm_firstdatacol; + ASSERT3S(jj, <, n); + invrows[i][j] = rows[i][jj]; + rows[i][jj] = 0; + } + } + + /* + * For each of the rows of interest, we must normalize it and subtract + * a multiple of it from the other rows. + */ + for (i = 0; i < nmissing; i++) { + for (j = 0; j < missing[i]; j++) { + ASSERT3U(rows[i][j], ==, 0); + } + ASSERT3U(rows[i][missing[i]], !=, 0); + + /* + * Compute the inverse of the first element and multiply each + * element in the row by that value. + */ + log = 255 - vdev_raidz_log2[rows[i][missing[i]]]; + + for (j = 0; j < n; j++) { + rows[i][j] = vdev_raidz_exp2(rows[i][j], log); + invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log); + } + + for (ii = 0; ii < nmissing; ii++) { + if (i == ii) + continue; + + ASSERT3U(rows[ii][missing[i]], !=, 0); + + log = vdev_raidz_log2[rows[ii][missing[i]]]; + + for (j = 0; j < n; j++) { + rows[ii][j] ^= + vdev_raidz_exp2(rows[i][j], log); + invrows[ii][j] ^= + vdev_raidz_exp2(invrows[i][j], log); + } + } + } + + /* + * Verify that the data that is left in the rows are properly part of + * an identity matrix. 
+ */ + for (i = 0; i < nmissing; i++) { + for (j = 0; j < n; j++) { + if (j == missing[i]) { + ASSERT3U(rows[i][j], ==, 1); + } else { + ASSERT3U(rows[i][j], ==, 0); + } + } + } +} + +static void +vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, + int *missing, uint8_t **invrows, const uint8_t *used) +{ + int i, j, x, cc, c; + uint8_t *src; + uint64_t ccount; + uint8_t *dst[VDEV_RAIDZ_MAXPARITY]; + uint64_t dcount[VDEV_RAIDZ_MAXPARITY]; + uint8_t log, val; + int ll; + uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; + uint8_t *p, *pp; + size_t psize; + + psize = sizeof (invlog[0][0]) * n * nmissing; + p = kmem_alloc(psize, KM_SLEEP); + + for (pp = p, i = 0; i < nmissing; i++) { + invlog[i] = pp; + pp += n; + } + + for (i = 0; i < nmissing; i++) { + for (j = 0; j < n; j++) { + ASSERT3U(invrows[i][j], !=, 0); + invlog[i][j] = vdev_raidz_log2[invrows[i][j]]; + } + } + + for (i = 0; i < n; i++) { + c = used[i]; + ASSERT3U(c, <, rm->rm_cols); + + src = rm->rm_col[c].rc_data; + ccount = rm->rm_col[c].rc_size; + for (j = 0; j < nmissing; j++) { + cc = missing[j] + rm->rm_firstdatacol; + ASSERT3U(cc, >=, rm->rm_firstdatacol); + ASSERT3U(cc, <, rm->rm_cols); + ASSERT3U(cc, !=, c); + + dst[j] = rm->rm_col[cc].rc_data; + dcount[j] = rm->rm_col[cc].rc_size; + } + + ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0); + + for (x = 0; x < ccount; x++, src++) { + if (*src != 0) + log = vdev_raidz_log2[*src]; + + for (cc = 0; cc < nmissing; cc++) { + if (x >= dcount[cc]) + continue; + + if (*src == 0) { + val = 0; + } else { + if ((ll = log + invlog[cc][i]) >= 255) + ll -= 255; + val = vdev_raidz_pow2[ll]; + } + + if (i == 0) + dst[cc][x] = val; + else + dst[cc][x] ^= val; + } + } + } + + kmem_free(p, psize); +} + +static int +vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) +{ + int n, i, c, t, tt; + int nmissing_rows; + int missing_rows[VDEV_RAIDZ_MAXPARITY]; + int parity_map[VDEV_RAIDZ_MAXPARITY]; + + uint8_t *p, *pp; + size_t psize; + + uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; + uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; + uint8_t *used; + + int code = 0; + + + n = rm->rm_cols - rm->rm_firstdatacol; + + /* + * Figure out which data columns are missing. + */ + nmissing_rows = 0; + for (t = 0; t < ntgts; t++) { + if (tgts[t] >= rm->rm_firstdatacol) { + missing_rows[nmissing_rows++] = + tgts[t] - rm->rm_firstdatacol; + } + } + + /* + * Figure out which parity columns to use to help generate the missing + * data columns. + */ + for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { + ASSERT(tt < ntgts); + ASSERT(c < rm->rm_firstdatacol); + + /* + * Skip any targeted parity columns. + */ + if (c == tgts[tt]) { + tt++; + continue; + } + + code |= 1 << c; + + parity_map[i] = c; + i++; + } + + ASSERT(code != 0); + ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY); + + psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * + nmissing_rows * n + sizeof (used[0]) * n; + p = kmem_alloc(psize, KM_SLEEP); + + for (pp = p, i = 0; i < nmissing_rows; i++) { + rows[i] = pp; + pp += n; + invrows[i] = pp; + pp += n; + } + used = pp; + + for (i = 0; i < nmissing_rows; i++) { + used[i] = parity_map[i]; + } + + for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + if (tt < nmissing_rows && + c == missing_rows[tt] + rm->rm_firstdatacol) { + tt++; + continue; + } + + ASSERT3S(i, <, n); + used[i] = c; + i++; + } + + /* + * Initialize the interesting rows of the matrix. + */ + vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows); + + /* + * Invert the matrix. 
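The matrix inversion above scales and combines rows with vdev_raidz_exp2(), and vdev_raidz_matrix_reconstruct() works in log space through vdev_raidz_log2[]/vdev_raidz_pow2[]. vdev_raidz_exp2() itself is not shown in this hunk; the copy below is only a reference sketch of the helper being relied on:

/*
 * Sketch: multiply a GF(2^8) element by 2^exp using the log/antilog tables.
 */
static uint8_t
vdev_raidz_exp2(uint_t a, int exp)
{
	if (a == 0)
		return (0);

	ASSERT(exp >= 0);
	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);

	exp += vdev_raidz_log2[a];
	if (exp > 255)
		exp -= 255;

	return (vdev_raidz_pow2[exp]);
}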
+ */ + vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows, + invrows, used); + + /* + * Reconstruct the missing data using the generated matrix. + */ + vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows, + invrows, used); + + kmem_free(p, psize); + + return (code); } +static int +vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt) +{ + int tgts[VDEV_RAIDZ_MAXPARITY], *dt; + int ntgts; + int i, c; + int code; + int nbadparity, nbaddata; + int parity_valid[VDEV_RAIDZ_MAXPARITY]; + + /* + * The tgts list must already be sorted. + */ + for (i = 1; i < nt; i++) { + ASSERT(t[i] > t[i - 1]); + } + + nbadparity = rm->rm_firstdatacol; + nbaddata = rm->rm_cols - nbadparity; + ntgts = 0; + for (i = 0, c = 0; c < rm->rm_cols; c++) { + if (c < rm->rm_firstdatacol) + parity_valid[c] = B_FALSE; + + if (i < nt && c == t[i]) { + tgts[ntgts++] = c; + i++; + } else if (rm->rm_col[c].rc_error != 0) { + tgts[ntgts++] = c; + } else if (c >= rm->rm_firstdatacol) { + nbaddata--; + } else { + parity_valid[c] = B_TRUE; + nbadparity--; + } + } + + ASSERT(ntgts >= nt); + ASSERT(nbaddata >= 0); + ASSERT(nbaddata + nbadparity == ntgts); + + dt = &tgts[nbadparity]; + + /* + * See if we can use any of our optimized reconstruction routines. + */ + if (!vdev_raidz_default_to_general) { + switch (nbaddata) { + case 1: + if (parity_valid[VDEV_RAIDZ_P]) + return (vdev_raidz_reconstruct_p(rm, dt, 1)); + + ASSERT(rm->rm_firstdatacol > 1); + + if (parity_valid[VDEV_RAIDZ_Q]) + return (vdev_raidz_reconstruct_q(rm, dt, 1)); + + ASSERT(rm->rm_firstdatacol > 2); + break; + + case 2: + ASSERT(rm->rm_firstdatacol > 1); + + if (parity_valid[VDEV_RAIDZ_P] && + parity_valid[VDEV_RAIDZ_Q]) + return (vdev_raidz_reconstruct_pq(rm, dt, 2)); + + ASSERT(rm->rm_firstdatacol > 2); + + break; + } + } + + code = vdev_raidz_reconstruct_general(rm, tgts, ntgts); + ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY)); + ASSERT(code > 0); + return (code); +} static int vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) { vdev_t *cvd; uint64_t nparity = vd->vdev_nparity; - int c, error; + int c; int lasterror = 0; int numerrors = 0; @@ -573,11 +1458,13 @@ vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) return (EINVAL); } + vdev_open_children(vd); + for (c = 0; c < vd->vdev_children; c++) { cvd = vd->vdev_child[c]; - if ((error = vdev_open(cvd)) != 0) { - lasterror = error; + if (cvd->vdev_open_error != 0) { + lasterror = cvd->vdev_open_error; numerrors++; continue; } @@ -636,10 +1523,9 @@ vdev_raidz_io_start(zio_t *zio) vdev_t *vd = zio->io_vd; vdev_t *tvd = vd->vdev_top; vdev_t *cvd; - blkptr_t *bp = zio->io_bp; raidz_map_t *rm; raidz_col_t *rc; - int c; + int c, i; rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children, vd->vdev_nparity); @@ -647,13 +1533,7 @@ vdev_raidz_io_start(zio_t *zio) ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); if (zio->io_type == ZIO_TYPE_WRITE) { - /* - * Generate RAID parity in the first virtual columns. - */ - if (rm->rm_firstdatacol == 1) - vdev_raidz_generate_parity_p(rm); - else - vdev_raidz_generate_parity_pq(rm); + vdev_raidz_generate_parity(rm); for (c = 0; c < rm->rm_cols; c++) { rc = &rm->rm_col[c]; @@ -664,6 +1544,23 @@ vdev_raidz_io_start(zio_t *zio) vdev_raidz_child_done, rc)); } + /* + * Generate optional I/Os for any skipped sectors to improve + * aggregation contiguity. 
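The loop that follows issues one optional child write per pad sector: a NULL data buffer of 1 << ashift bytes, flagged ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, placed immediately after the column it pads. The intent stated above is aggregation: the vdev queue can use these writes to merge the neighbouring column writes into one larger I/O, and because they are optional it remains free to drop them when they do not help.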
+ */ + for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) { + ASSERT(c <= rm->rm_scols); + if (c == rm->rm_scols) + c = 0; + rc = &rm->rm_col[c]; + cvd = vd->vdev_child[rc->rc_devidx]; + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset + rc->rc_size, NULL, + 1 << tvd->vdev_ashift, + zio->io_type, zio->io_priority, + ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); + } + return (ZIO_PIPELINE_CONTINUE); } @@ -671,8 +1568,7 @@ vdev_raidz_io_start(zio_t *zio) /* * Iterate over the columns in reverse order so that we hit the parity - * last -- any errors along the way will force us to read the parity - * data. + * last -- any errors along the way will force us to read the parity. */ for (c = rm->rm_cols - 1; c >= 0; c--) { rc = &rm->rm_col[c]; @@ -687,7 +1583,7 @@ vdev_raidz_io_start(zio_t *zio) rc->rc_skipped = 1; continue; } - if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) { + if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { if (c >= rm->rm_firstdatacol) rm->rm_missingdata++; else @@ -697,7 +1593,7 @@ vdev_raidz_io_start(zio_t *zio) continue; } if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 || - (zio->io_flags & ZIO_FLAG_SCRUB)) { + (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, rc->rc_data, rc->rc_size, zio->io_type, zio->io_priority, 0, @@ -712,19 +1608,42 @@ vdev_raidz_io_start(zio_t *zio) * Report a checksum error for a child of a RAID-Z device. */ static void -raidz_checksum_error(zio_t *zio, raidz_col_t *rc) +raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data) { vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { + zio_bad_cksum_t zbc; + raidz_map_t *rm = zio->io_vsd; + mutex_enter(&vd->vdev_stat_lock); vd->vdev_stat.vs_checksum_errors++; mutex_exit(&vd->vdev_stat_lock); + + zbc.zbc_has_cksum = 0; + zbc.zbc_injected = rm->rm_ecksuminjected; + + zfs_ereport_post_checksum(zio->io_spa, vd, zio, + rc->rc_offset, rc->rc_size, rc->rc_data, bad_data, + &zbc); } +} + +/* + * We keep track of whether or not there were any injected errors, so that + * any ereports we generate can note it. + */ +static int +raidz_checksum_verify(zio_t *zio) +{ + zio_bad_cksum_t zbc; + raidz_map_t *rm = zio->io_vsd; + + int ret = zio_checksum_error(zio, &zbc); + if (ret != 0 && zbc.zbc_injected != 0) + rm->rm_ecksuminjected = 1; - if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) - zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, - zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size); + return (ret); } /* @@ -748,17 +1667,14 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm) bcopy(rc->rc_data, orig[c], rc->rc_size); } - if (rm->rm_firstdatacol == 1) - vdev_raidz_generate_parity_p(rm); - else - vdev_raidz_generate_parity_pq(rm); + vdev_raidz_generate_parity(rm); for (c = 0; c < rm->rm_firstdatacol; c++) { rc = &rm->rm_col[c]; if (!rc->rc_tried || rc->rc_error != 0) continue; if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) { - raidz_checksum_error(zio, rc); + raidz_checksum_error(zio, rc, orig[c]); rc->rc_error = ECKSUM; ret++; } @@ -768,9 +1684,10 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm) return (ret); } -static uint64_t raidz_corrected_p; -static uint64_t raidz_corrected_q; -static uint64_t raidz_corrected_pq; +/* + * Keep statistics on all the ways that we used parity to correct data. 
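Since vdev_raidz_reconstruct_p() returns (1 << VDEV_RAIDZ_P), _q() returns (1 << VDEV_RAIDZ_Q), _pq() returns both bits, and the general routine ORs in one bit per parity column it consumed, the array declared below is indexed by that bitmask: entry 3, for instance, counts reconstructions that drew on both P and Q. This single array subsumes the three raidz_corrected_p/q/pq counters being removed here.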
+ */ +static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY]; static int vdev_raidz_worst_error(raidz_map_t *rm) @@ -783,19 +1700,177 @@ vdev_raidz_worst_error(raidz_map_t *rm) return (error); } +/* + * Iterate over all combinations of bad data and attempt a reconstruction. + * Note that the algorithm below is non-optimal because it doesn't take into + * account how reconstruction is actually performed. For example, with + * triple-parity RAID-Z the reconstruction procedure is the same if column 4 + * is targeted as invalid as if columns 1 and 4 are targeted since in both + * cases we'd only use parity information in column 0. + */ +static int +vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors) +{ + raidz_map_t *rm = zio->io_vsd; + raidz_col_t *rc; + void *orig[VDEV_RAIDZ_MAXPARITY]; + int tstore[VDEV_RAIDZ_MAXPARITY + 2]; + int *tgts = &tstore[1]; + int current, next, i, c, n; + int code, ret = 0; + + ASSERT(total_errors < rm->rm_firstdatacol); + + /* + * This simplifies one edge condition. + */ + tgts[-1] = -1; + + for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) { + /* + * Initialize the targets array by finding the first n columns + * that contain no error. + * + * If there were no data errors, we need to ensure that we're + * always explicitly attempting to reconstruct at least one + * data column. To do this, we simply push the highest target + * up into the data columns. + */ + for (c = 0, i = 0; i < n; i++) { + if (i == n - 1 && data_errors == 0 && + c < rm->rm_firstdatacol) { + c = rm->rm_firstdatacol; + } + + while (rm->rm_col[c].rc_error != 0) { + c++; + ASSERT3S(c, <, rm->rm_cols); + } + + tgts[i] = c++; + } + + /* + * Setting tgts[n] simplifies the other edge condition. + */ + tgts[n] = rm->rm_cols; + + /* + * These buffers were allocated in previous iterations. + */ + for (i = 0; i < n - 1; i++) { + ASSERT(orig[i] != NULL); + } + + orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size); + + current = 0; + next = tgts[current]; + + while (current != n) { + tgts[current] = next; + current = 0; + + /* + * Save off the original data that we're going to + * attempt to reconstruct. + */ + for (i = 0; i < n; i++) { + ASSERT(orig[i] != NULL); + c = tgts[i]; + ASSERT3S(c, >=, 0); + ASSERT3S(c, <, rm->rm_cols); + rc = &rm->rm_col[c]; + bcopy(rc->rc_data, orig[i], rc->rc_size); + } + + /* + * Attempt a reconstruction and exit the outer loop on + * success. + */ + code = vdev_raidz_reconstruct(rm, tgts, n); + if (raidz_checksum_verify(zio) == 0) { + atomic_inc_64(&raidz_corrected[code]); + + for (i = 0; i < n; i++) { + c = tgts[i]; + rc = &rm->rm_col[c]; + ASSERT(rc->rc_error == 0); + if (rc->rc_tried) + raidz_checksum_error(zio, rc, + orig[i]); + rc->rc_error = ECKSUM; + } + + ret = code; + goto done; + } + + /* + * Restore the original data. + */ + for (i = 0; i < n; i++) { + c = tgts[i]; + rc = &rm->rm_col[c]; + bcopy(orig[i], rc->rc_data, rc->rc_size); + } + + do { + /* + * Find the next valid column after the current + * position.. + */ + for (next = tgts[current] + 1; + next < rm->rm_cols && + rm->rm_col[next].rc_error != 0; next++) + continue; + + ASSERT(next <= tgts[current + 1]); + + /* + * If that spot is available, we're done here. + */ + if (next != tgts[current + 1]) + break; + + /* + * Otherwise, find the next valid column after + * the previous position. 
+ */ + for (c = tgts[current - 1] + 1; + rm->rm_col[c].rc_error != 0; c++) + continue; + + tgts[current] = c; + current++; + + } while (current != n); + } + } + n--; +done: + for (i = 0; i < n; i++) { + zio_buf_free(orig[i], rm->rm_col[0].rc_size); + } + + return (ret); +} + static void vdev_raidz_io_done(zio_t *zio) { vdev_t *vd = zio->io_vd; vdev_t *cvd; raidz_map_t *rm = zio->io_vsd; - raidz_col_t *rc, *rc1; + raidz_col_t *rc; int unexpected_errors = 0; int parity_errors = 0; int parity_untried = 0; int data_errors = 0; int total_errors = 0; - int n, c, c1; + int n, c; + int tgts[VDEV_RAIDZ_MAXPARITY]; + int code; ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */ @@ -859,9 +1934,8 @@ vdev_raidz_io_done(zio_t *zio) * any errors. */ if (total_errors <= rm->rm_firstdatacol - parity_untried) { - switch (data_errors) { - case 0: - if (zio_checksum_error(zio) == 0) { + if (data_errors == 0) { + if (raidz_checksum_verify(zio) == 0) { /* * If we read parity information (unnecessarily * as it happens since no reconstruction was @@ -880,9 +1954,7 @@ vdev_raidz_io_done(zio_t *zio) } goto done; } - break; - - case 1: + } else { /* * We either attempt to read all the parity columns or * none of them. If we didn't try to read parity, we @@ -894,45 +1966,38 @@ vdev_raidz_io_done(zio_t *zio) ASSERT(parity_errors < rm->rm_firstdatacol); /* - * Find the column that reported the error. + * Identify the data columns that reported an error. */ + n = 0; for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { rc = &rm->rm_col[c]; - if (rc->rc_error != 0) - break; + if (rc->rc_error != 0) { + ASSERT(n < VDEV_RAIDZ_MAXPARITY); + tgts[n++] = c; + } } - ASSERT(c != rm->rm_cols); - ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || - rc->rc_error == ESTALE); - if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) { - vdev_raidz_reconstruct_p(rm, c); - } else { - ASSERT(rm->rm_firstdatacol > 1); - vdev_raidz_reconstruct_q(rm, c); - } + ASSERT(rm->rm_firstdatacol >= n); - if (zio_checksum_error(zio) == 0) { - if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) - atomic_inc_64(&raidz_corrected_p); - else - atomic_inc_64(&raidz_corrected_q); + code = vdev_raidz_reconstruct(rm, tgts, n); + + if (raidz_checksum_verify(zio) == 0) { + atomic_inc_64(&raidz_corrected[code]); /* - * If there's more than one parity disk that - * was successfully read, confirm that the - * other parity disk produced the correct data. - * This routine is suboptimal in that it - * regenerates both the parity we wish to test - * as well as the parity we just used to - * perform the reconstruction, but this should - * be a relatively uncommon case, and can be - * optimized if it becomes a problem. - * We also regenerate parity when resilvering - * so we can write it out to the failed device - * later. + * If we read more parity disks than were used + * for reconstruction, confirm that the other + * parity disks produced correct data. This + * routine is suboptimal in that it regenerates + * the parity that we already used in addition + * to the parity that we're attempting to + * verify, but this should be a relatively + * uncommon case, and can be optimized if it + * becomes a problem. Note that we regenerate + * parity when resilvering so we can write it + * out to failed devices later. 
*/ - if (parity_errors < rm->rm_firstdatacol - 1 || + if (parity_errors < rm->rm_firstdatacol - n || (zio->io_flags & ZIO_FLAG_RESILVER)) { n = raidz_parity_verify(zio, rm); unexpected_errors += n; @@ -942,46 +2007,6 @@ vdev_raidz_io_done(zio_t *zio) goto done; } - break; - - case 2: - /* - * Two data column errors require double parity. - */ - ASSERT(rm->rm_firstdatacol == 2); - - /* - * Find the two columns that reported errors. - */ - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - rc = &rm->rm_col[c]; - if (rc->rc_error != 0) - break; - } - ASSERT(c != rm->rm_cols); - ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || - rc->rc_error == ESTALE); - - for (c1 = c++; c < rm->rm_cols; c++) { - rc = &rm->rm_col[c]; - if (rc->rc_error != 0) - break; - } - ASSERT(c != rm->rm_cols); - ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || - rc->rc_error == ESTALE); - - vdev_raidz_reconstruct_pq(rm, c1, c); - - if (zio_checksum_error(zio) == 0) { - atomic_inc_64(&raidz_corrected_pq); - goto done; - } - break; - - default: - ASSERT(rm->rm_firstdatacol <= 2); - ASSERT(0); } } @@ -1020,152 +2045,61 @@ vdev_raidz_io_done(zio_t *zio) * errors we detected, and we've attempted to read all columns. There * must, therefore, be one or more additional problems -- silent errors * resulting in invalid data rather than explicit I/O errors resulting - * in absent data. Before we attempt combinatorial reconstruction make - * sure we have a chance of coming up with the right answer. + * in absent data. We check if there is enough additional data to + * possibly reconstruct the data and then perform combinatorial + * reconstruction over all possible combinations. If that fails, + * we're cooked. */ - if (total_errors >= rm->rm_firstdatacol) { + if (total_errors > rm->rm_firstdatacol) { zio->io_error = vdev_raidz_worst_error(rm); - /* - * If there were exactly as many device errors as parity - * columns, yet we couldn't reconstruct the data, then at - * least one device must have returned bad data silently. - */ - if (total_errors == rm->rm_firstdatacol) - zio->io_error = zio_worst_error(zio->io_error, ECKSUM); - goto done; - } - - if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) { - /* - * Attempt to reconstruct the data from parity P. - */ - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - void *orig; - rc = &rm->rm_col[c]; - - orig = zio_buf_alloc(rc->rc_size); - bcopy(rc->rc_data, orig, rc->rc_size); - vdev_raidz_reconstruct_p(rm, c); - - if (zio_checksum_error(zio) == 0) { - zio_buf_free(orig, rc->rc_size); - atomic_inc_64(&raidz_corrected_p); - - /* - * If this child didn't know that it returned - * bad data, inform it. - */ - if (rc->rc_tried && rc->rc_error == 0) - raidz_checksum_error(zio, rc); - rc->rc_error = ECKSUM; - goto done; - } - - bcopy(orig, rc->rc_data, rc->rc_size); - zio_buf_free(orig, rc->rc_size); - } - } - if (rm->rm_firstdatacol > 1 && rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) { + } else if (total_errors < rm->rm_firstdatacol && + (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) { /* - * Attempt to reconstruct the data from parity Q. + * If we didn't use all the available parity for the + * combinatorial reconstruction, verify that the remaining + * parity is correct. 
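Concretely: on a triple-parity map (rm_firstdatacol == 3) where the combinatorial pass succeeded using only P and Q, the returned code is 0x3 rather than 0x7, so the check below regenerates and compares the untouched R parity via raidz_parity_verify() before the block is accepted.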
*/ - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - void *orig; - rc = &rm->rm_col[c]; - - orig = zio_buf_alloc(rc->rc_size); - bcopy(rc->rc_data, orig, rc->rc_size); - vdev_raidz_reconstruct_q(rm, c); - - if (zio_checksum_error(zio) == 0) { - zio_buf_free(orig, rc->rc_size); - atomic_inc_64(&raidz_corrected_q); - - /* - * If this child didn't know that it returned - * bad data, inform it. - */ - if (rc->rc_tried && rc->rc_error == 0) - raidz_checksum_error(zio, rc); - rc->rc_error = ECKSUM; - goto done; - } - - bcopy(orig, rc->rc_data, rc->rc_size); - zio_buf_free(orig, rc->rc_size); - } - } - - if (rm->rm_firstdatacol > 1 && - rm->rm_col[VDEV_RAIDZ_P].rc_error == 0 && - rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) { + if (code != (1 << rm->rm_firstdatacol) - 1) + (void) raidz_parity_verify(zio, rm); + } else { /* - * Attempt to reconstruct the data from both P and Q. + * We're here because either: + * + * total_errors == rm_first_datacol, or + * vdev_raidz_combrec() failed + * + * In either case, there is enough bad data to prevent + * reconstruction. + * + * Start checksum ereports for all children which haven't + * failed, and the IO wasn't speculative. */ - for (c = rm->rm_firstdatacol; c < rm->rm_cols - 1; c++) { - void *orig, *orig1; - rc = &rm->rm_col[c]; - - orig = zio_buf_alloc(rc->rc_size); - bcopy(rc->rc_data, orig, rc->rc_size); - - for (c1 = c + 1; c1 < rm->rm_cols; c1++) { - rc1 = &rm->rm_col[c1]; - - orig1 = zio_buf_alloc(rc1->rc_size); - bcopy(rc1->rc_data, orig1, rc1->rc_size); - - vdev_raidz_reconstruct_pq(rm, c, c1); + zio->io_error = ECKSUM; - if (zio_checksum_error(zio) == 0) { - zio_buf_free(orig, rc->rc_size); - zio_buf_free(orig1, rc1->rc_size); - atomic_inc_64(&raidz_corrected_pq); - - /* - * If these children didn't know they - * returned bad data, inform them. - */ - if (rc->rc_tried && rc->rc_error == 0) - raidz_checksum_error(zio, rc); - if (rc1->rc_tried && rc1->rc_error == 0) - raidz_checksum_error(zio, rc1); - - rc->rc_error = ECKSUM; - rc1->rc_error = ECKSUM; - - goto done; + if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { + for (c = 0; c < rm->rm_cols; c++) { + rc = &rm->rm_col[c]; + if (rc->rc_error == 0) { + zio_bad_cksum_t zbc; + zbc.zbc_has_cksum = 0; + zbc.zbc_injected = + rm->rm_ecksuminjected; + + zfs_ereport_start_checksum( + zio->io_spa, + vd->vdev_child[rc->rc_devidx], + zio, rc->rc_offset, rc->rc_size, + (void *)(uintptr_t)c, &zbc); } - - bcopy(orig1, rc1->rc_data, rc1->rc_size); - zio_buf_free(orig1, rc1->rc_size); } - - bcopy(orig, rc->rc_data, rc->rc_size); - zio_buf_free(orig, rc->rc_size); - } - } - - /* - * All combinations failed to checksum. Generate checksum ereports for - * all children. - */ - zio->io_error = ECKSUM; - - if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { - for (c = 0; c < rm->rm_cols; c++) { - rc = &rm->rm_col[c]; - zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, - zio->io_spa, vd->vdev_child[rc->rc_devidx], zio, - rc->rc_offset, rc->rc_size); } } done: zio_checksum_verified(zio); - if (zio->io_error == 0 && (spa_mode & FWRITE) && + if (zio->io_error == 0 && spa_writeable(zio->io_spa) && (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) { /* * Use the good data we have in hand to repair damaged children. @@ -1180,7 +2114,8 @@ vdev_raidz_io_done(zio_t *zio) zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, rc->rc_data, rc->rc_size, ZIO_TYPE_WRITE, zio->io_priority, - ZIO_FLAG_IO_REPAIR, NULL, NULL)); + ZIO_FLAG_IO_REPAIR | (unexpected_errors ? 
+ ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); } } } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_root.c b/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_root.c index 88383f002b805..524c8e60601d9 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_root.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_root.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -52,7 +52,6 @@ too_many_errors(vdev_t *vd, int numerrors) static int vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) { - int c; int lasterror = 0; int numerrors = 0; @@ -61,15 +60,14 @@ vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) return (EINVAL); } - for (c = 0; c < vd->vdev_children; c++) { + vdev_open_children(vd); + + for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; - int error; - if ((error = vdev_open(cvd)) != 0 && - !cvd->vdev_islog) { - lasterror = error; + if (cvd->vdev_open_error && !cvd->vdev_islog) { + lasterror = cvd->vdev_open_error; numerrors++; - continue; } } @@ -87,9 +85,7 @@ vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) static void vdev_root_close(vdev_t *vd) { - int c; - - for (c = 0; c < vd->vdev_children; c++) + for (int c = 0; c < vd->vdev_children; c++) vdev_close(vd->vdev_child[c]); } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/zap.c b/external/cddl/osnet/dist/uts/common/fs/zfs/zap.c index ca859ec355dab..3be29e971c2d1 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/zap.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/zap.c @@ -19,13 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - - /* * This file contains the top half of the zfs directory structure * implementation. The bottom half is in zap_leaf.c. @@ -45,6 +42,7 @@ #include #include #include +#include #include #include #include @@ -72,7 +70,7 @@ fzap_byteswap(void *vbuf, size_t size) } void -fzap_upgrade(zap_t *zap, dmu_tx_t *tx) +fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags) { dmu_buf_t *db; zap_leaf_t *l; @@ -104,6 +102,7 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx) zp->zap_num_entries = 0; zp->zap_salt = zap->zap_salt; zp->zap_normflags = zap->zap_normflags; + zp->zap_flags = flags; /* block 1 will be the first leaf */ for (i = 0; i < (1<zap_ptrtbl.zt_shift); i++) @@ -317,8 +316,13 @@ zap_ptrtbl_transfer(const uint64_t *src, uint64_t *dst, int n) static int zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx) { - /* In case things go horribly wrong. */ - if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift >= ZAP_HASHBITS-2) + /* + * The pointer table should never use more hash bits than we + * have (otherwise we'd be using useless zero bits to index it). + * If we are within 2 bits of running out, stop growing, since + * this is already an aberrant condition. 
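zap_hashbits(), added later in this patch in zap_micro.c, returns the historical 28 bits for an ordinary ZAP and 48 bits when the new ZAP_FLAG_HASH64 flag is set, so this bound now stops pointer-table growth two bits short of whichever hash width actually applies, instead of the old hard-coded ZAP_HASHBITS - 2.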
+ */ + if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift >= zap_hashbits(zap) - 2) return (ENOSPC); if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) { @@ -702,13 +706,17 @@ zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx) } } - static int -fzap_checksize(const char *name, uint64_t integer_size, uint64_t num_integers) +fzap_checkname(zap_name_t *zn) { - if (name && strlen(name) > ZAP_MAXNAMELEN) - return (E2BIG); + if (zn->zn_key_orig_numints * zn->zn_key_intlen > ZAP_MAXNAMELEN) + return (ENAMETOOLONG); + return (0); +} +static int +fzap_checksize(uint64_t integer_size, uint64_t num_integers) +{ /* Only integer sizes supported by C */ switch (integer_size) { case 1: @@ -726,6 +734,16 @@ fzap_checksize(const char *name, uint64_t integer_size, uint64_t num_integers) return (0); } +static int +fzap_check(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers) +{ + int err; + + if ((err = fzap_checkname(zn)) != 0) + return (err); + return (fzap_checksize(integer_size, num_integers)); +} + /* * Routines for manipulating attributes. */ @@ -738,8 +756,7 @@ fzap_lookup(zap_name_t *zn, int err; zap_entry_handle_t zeh; - err = fzap_checksize(zn->zn_name_orij, integer_size, num_integers); - if (err != 0) + if ((err = fzap_checkname(zn)) != 0) return (err); err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l); @@ -747,8 +764,13 @@ fzap_lookup(zap_name_t *zn, return (err); err = zap_leaf_lookup(l, zn, &zeh); if (err == 0) { + if ((err = fzap_checksize(integer_size, num_integers)) != 0) { + zap_put_leaf(l); + return (err); + } + err = zap_entry_read(&zeh, integer_size, num_integers, buf); - (void) zap_entry_read_name(&zeh, rn_len, realname); + (void) zap_entry_read_name(zn->zn_zap, &zeh, rn_len, realname); if (ncp) { *ncp = zap_entry_normalization_conflict(&zeh, zn, NULL, zn->zn_zap); @@ -771,8 +793,7 @@ fzap_add_cd(zap_name_t *zn, ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); ASSERT(!zap->zap_ismicro); - ASSERT(fzap_checksize(zn->zn_name_orij, - integer_size, num_integers) == 0); + ASSERT(fzap_check(zn, integer_size, num_integers) == 0); err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l); if (err != 0) @@ -786,7 +807,7 @@ fzap_add_cd(zap_name_t *zn, if (err != ENOENT) goto out; - err = zap_entry_create(l, zn->zn_name_orij, zn->zn_hash, cd, + err = zap_entry_create(l, zn, cd, integer_size, num_integers, val, &zeh); if (err == 0) { @@ -809,12 +830,12 @@ fzap_add(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) { - int err = fzap_checksize(zn->zn_name_orij, integer_size, num_integers); + int err = fzap_check(zn, integer_size, num_integers); if (err != 0) return (err); return (fzap_add_cd(zn, integer_size, num_integers, - val, ZAP_MAXCD, tx)); + val, ZAP_NEED_CD, tx)); } int @@ -827,7 +848,7 @@ fzap_update(zap_name_t *zn, zap_t *zap = zn->zn_zap; ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); - err = fzap_checksize(zn->zn_name_orij, integer_size, num_integers); + err = fzap_check(zn, integer_size, num_integers); if (err != 0) return (err); @@ -840,8 +861,8 @@ fzap_update(zap_name_t *zn, ASSERT(err == 0 || err == ENOENT); if (create) { - err = zap_entry_create(l, zn->zn_name_orij, zn->zn_hash, - ZAP_MAXCD, integer_size, num_integers, val, &zeh); + err = zap_entry_create(l, zn, ZAP_NEED_CD, + integer_size, num_integers, val, &zeh); if (err == 0) zap_increment_num_entries(zap, 1, tx); } else { @@ -980,6 +1001,30 @@ zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value) return (zap_lookup(os, obj, name, 8, 1, &value)); } 
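The zap_increment_int() helper added in the next hunk read-modify-writes an 8-byte value keyed by the hex-formatted integer, creating the entry on first use and removing it once the count returns to zero. A minimal calling sketch; os, tx, counts_obj and key are hypothetical names assumed to refer to an open objset, an assigned transaction and an existing ZAP object:

	int err;

	/* First increment creates the entry with value 5. */
	err = zap_increment_int(os, counts_obj, key, 5, tx);
	if (err == 0) {
		/* Dropping it back to zero removes the entry entirely. */
		err = zap_increment_int(os, counts_obj, key, -5, tx);
	}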
+int +zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta, + dmu_tx_t *tx) +{ + char name[20]; + uint64_t value = 0; + int err; + + if (delta == 0) + return (0); + + (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key); + err = zap_lookup(os, obj, name, 8, 1, &value); + if (err != 0 && err != ENOENT) + return (err); + value += delta; + if (value == 0) + err = zap_remove(os, obj, name, tx); + else + err = zap_update(os, obj, name, 8, 1, &value, tx); + return (err); +} + + /* * Routines for iterating over the attributes. */ @@ -1041,7 +1086,7 @@ fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za) err = zap_entry_read(&zeh, 8, 1, &za->za_first_integer); ASSERT(err == 0 || err == EOVERFLOW); } - err = zap_entry_read_name(&zeh, + err = zap_entry_read_name(zap, &zeh, sizeof (za->za_name), za->za_name); ASSERT(err == 0); @@ -1080,6 +1125,31 @@ zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs) } } +int +fzap_cursor_move_to_key(zap_cursor_t *zc, zap_name_t *zn) +{ + int err; + zap_leaf_t *l; + zap_entry_handle_t zeh; + + if (zn->zn_key_orig_numints * zn->zn_key_intlen > ZAP_MAXNAMELEN) + return (ENAMETOOLONG); + + err = zap_deref_leaf(zc->zc_zap, zn->zn_hash, NULL, RW_READER, &l); + if (err != 0) + return (err); + + err = zap_leaf_lookup(l, zn, &zeh); + if (err != 0) + return (err); + + zc->zc_leaf = l; + zc->zc_hash = zeh.zeh_hash; + zc->zc_cd = zeh.zeh_cd; + + return (err); +} + void fzap_get_stats(zap_t *zap, zap_stats_t *zs) { @@ -1134,3 +1204,58 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs) } } } + +int +fzap_count_write(zap_name_t *zn, int add, uint64_t *towrite, + uint64_t *tooverwrite) +{ + zap_t *zap = zn->zn_zap; + zap_leaf_t *l; + int err; + + /* + * Account for the header block of the fatzap. + */ + if (!add && dmu_buf_freeable(zap->zap_dbuf)) { + *tooverwrite += zap->zap_dbuf->db_size; + } else { + *towrite += zap->zap_dbuf->db_size; + } + + /* + * Account for the pointer table blocks. + * If we are adding we need to account for the following cases : + * - If the pointer table is embedded, this operation could force an + * external pointer table. + * - If this already has an external pointer table this operation + * could extend the table. + */ + if (add) { + if (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk == 0) + *towrite += zap->zap_dbuf->db_size; + else + *towrite += (zap->zap_dbuf->db_size * 3); + } + + /* + * Now, check if the block containing leaf is freeable + * and account accordingly. + */ + err = zap_deref_leaf(zap, zn->zn_hash, NULL, RW_READER, &l); + if (err != 0) { + return (err); + } + + if (!add && dmu_buf_freeable(l->l_dbuf)) { + *tooverwrite += l->l_dbuf->db_size; + } else { + /* + * If this an add operation, the leaf block could split. + * Hence, we need to account for an additional leaf block. + */ + *towrite += (add ? 2 : 1) * l->l_dbuf->db_size; + } + + zap_put_leaf(l); + return (0); +} diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/zap_leaf.c b/external/cddl/osnet/dist/uts/common/fs/zfs/zap_leaf.c index da498b6bc9e3f..285d9c56742b1 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/zap_leaf.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/zap_leaf.c @@ -19,24 +19,24 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * The 512-byte leaf is broken into 32 16-byte chunks. 
* chunk number n means l_chunk[n], even though the header precedes it. * the names are stored null-terminated. */ +#include +#include +#include #include +#include #include #include #include -#include -#include static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry); @@ -127,12 +127,12 @@ zap_leaf_byteswap(zap_leaf_phys_t *buf, int size) le = &lc->l_entry; le->le_type = BSWAP_8(le->le_type); - le->le_int_size = BSWAP_8(le->le_int_size); + le->le_value_intlen = BSWAP_8(le->le_value_intlen); le->le_next = BSWAP_16(le->le_next); le->le_name_chunk = BSWAP_16(le->le_name_chunk); - le->le_name_length = BSWAP_16(le->le_name_length); + le->le_name_numints = BSWAP_16(le->le_name_numints); le->le_value_chunk = BSWAP_16(le->le_value_chunk); - le->le_value_length = BSWAP_16(le->le_value_length); + le->le_value_numints = BSWAP_16(le->le_value_numints); le->le_cd = BSWAP_32(le->le_cd); le->le_hash = BSWAP_64(le->le_hash); break; @@ -215,7 +215,7 @@ zap_leaf_chunk_free(zap_leaf_t *l, uint16_t chunk) static uint16_t zap_leaf_array_create(zap_leaf_t *l, const char *buf, - int integer_size, int num_integers) + int integer_size, int num_integers) { uint16_t chunk_head; uint16_t *chunkp = &chunk_head; @@ -273,11 +273,12 @@ zap_leaf_array_free(zap_leaf_t *l, uint16_t *chunkp) static void zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk, int array_int_len, int array_len, int buf_int_len, uint64_t buf_len, - char *buf) + void *buf) { int len = MIN(array_len, buf_len); int byten = 0; uint64_t value = 0; + char *p = buf; ASSERT3U(array_int_len, <=, buf_int_len); @@ -285,7 +286,7 @@ zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk, if (array_int_len == 8 && buf_int_len == 8 && len == 1) { struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; uint8_t *ip = la->la_array; - uint64_t *buf64 = (uint64_t *)buf; + uint64_t *buf64 = buf; *buf64 = (uint64_t)ip[0] << 56 | (uint64_t)ip[1] << 48 | (uint64_t)ip[2] << 40 | (uint64_t)ip[3] << 32 | @@ -300,8 +301,8 @@ zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk, while (chunk != CHAIN_END) { struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; - bcopy(la->la_array, buf, ZAP_LEAF_ARRAY_BYTES); - buf += ZAP_LEAF_ARRAY_BYTES; + bcopy(la->la_array, p, ZAP_LEAF_ARRAY_BYTES); + p += ZAP_LEAF_ARRAY_BYTES; chunk = la->la_next; } return; @@ -316,50 +317,69 @@ zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk, value = (value << 8) | la->la_array[i]; byten++; if (byten == array_int_len) { - stv(buf_int_len, buf, value); + stv(buf_int_len, p, value); byten = 0; len--; if (len == 0) return; - buf += buf_int_len; + p += buf_int_len; } } chunk = la->la_next; } } -/* - * Only to be used on 8-bit arrays. - * array_len is actual len in bytes (not encoded le_value_length). - * namenorm is null-terminated. 
- */ static boolean_t -zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn, int chunk, int array_len) +zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn, + int chunk, int array_numints) { int bseen = 0; + if (zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY) { + uint64_t *thiskey; + boolean_t match; + + ASSERT(zn->zn_key_intlen == sizeof (*thiskey)); + thiskey = kmem_alloc(array_numints * sizeof (*thiskey), + KM_SLEEP); + + zap_leaf_array_read(l, chunk, sizeof (*thiskey), array_numints, + sizeof (*thiskey), array_numints, thiskey); + match = bcmp(thiskey, zn->zn_key_orig, + array_numints * sizeof (*thiskey)) == 0; + kmem_free(thiskey, array_numints * sizeof (*thiskey)); + return (match); + } + + ASSERT(zn->zn_key_intlen == 1); if (zn->zn_matchtype == MT_FIRST) { - char *thisname = kmem_alloc(array_len, KM_SLEEP); + char *thisname = kmem_alloc(array_numints, KM_SLEEP); boolean_t match; - zap_leaf_array_read(l, chunk, 1, array_len, 1, - array_len, thisname); + zap_leaf_array_read(l, chunk, sizeof (char), array_numints, + sizeof (char), array_numints, thisname); match = zap_match(zn, thisname); - kmem_free(thisname, array_len); + kmem_free(thisname, array_numints); return (match); } - /* Fast path for exact matching */ - while (bseen < array_len) { + /* + * Fast path for exact matching. + * First check that the lengths match, so that we don't read + * past the end of the zn_key_orig array. + */ + if (array_numints != zn->zn_key_orig_numints) + return (B_FALSE); + while (bseen < array_numints) { struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; - int toread = MIN(array_len - bseen, ZAP_LEAF_ARRAY_BYTES); + int toread = MIN(array_numints - bseen, ZAP_LEAF_ARRAY_BYTES); ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); - if (bcmp(la->la_array, zn->zn_name_orij + bseen, toread)) + if (bcmp(la->la_array, (char *)zn->zn_key_orig + bseen, toread)) break; chunk = la->la_next; bseen += toread; } - return (bseen == array_len); + return (bseen == array_numints); } /* @@ -394,9 +414,9 @@ zap_leaf_lookup(zap_leaf_t *l, zap_name_t *zn, zap_entry_handle_t *zeh) ASSERT(zn->zn_matchtype == MT_EXACT || (l->l_phys->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED)); if (zap_leaf_array_match(l, zn, le->le_name_chunk, - le->le_name_length)) { - zeh->zeh_num_integers = le->le_value_length; - zeh->zeh_integer_size = le->le_int_size; + le->le_name_numints)) { + zeh->zeh_num_integers = le->le_value_numints; + zeh->zeh_integer_size = le->le_value_intlen; zeh->zeh_cd = le->le_cd; zeh->zeh_hash = le->le_hash; zeh->zeh_chunkp = chunkp; @@ -427,7 +447,7 @@ zap_leaf_lookup_closest(zap_leaf_t *l, { uint16_t chunk; uint64_t besth = -1ULL; - uint32_t bestcd = ZAP_MAXCD; + uint32_t bestcd = -1U; uint16_t bestlh = ZAP_LEAF_HASH_NUMENTRIES(l)-1; uint16_t lh; struct zap_leaf_entry *le; @@ -449,8 +469,8 @@ zap_leaf_lookup_closest(zap_leaf_t *l, besth = le->le_hash; bestcd = le->le_cd; - zeh->zeh_num_integers = le->le_value_length; - zeh->zeh_integer_size = le->le_int_size; + zeh->zeh_num_integers = le->le_value_numints; + zeh->zeh_integer_size = le->le_value_intlen; zeh->zeh_cd = le->le_cd; zeh->zeh_hash = le->le_hash; zeh->zeh_fakechunk = chunk; @@ -460,7 +480,7 @@ zap_leaf_lookup_closest(zap_leaf_t *l, } } - return (bestcd == ZAP_MAXCD ? ENOENT : 0); + return (bestcd == -1U ? 
ENOENT : 0); } int @@ -471,11 +491,12 @@ zap_entry_read(const zap_entry_handle_t *zeh, ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp); ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); - if (le->le_int_size > integer_size) + if (le->le_value_intlen > integer_size) return (EINVAL); - zap_leaf_array_read(zeh->zeh_leaf, le->le_value_chunk, le->le_int_size, - le->le_value_length, integer_size, num_integers, buf); + zap_leaf_array_read(zeh->zeh_leaf, le->le_value_chunk, + le->le_value_intlen, le->le_value_numints, + integer_size, num_integers, buf); if (zeh->zeh_num_integers > num_integers) return (EOVERFLOW); @@ -484,15 +505,21 @@ zap_entry_read(const zap_entry_handle_t *zeh, } int -zap_entry_read_name(const zap_entry_handle_t *zeh, uint16_t buflen, char *buf) +zap_entry_read_name(zap_t *zap, const zap_entry_handle_t *zeh, uint16_t buflen, + char *buf) { struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp); ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); - zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 1, - le->le_name_length, 1, buflen, buf); - if (le->le_name_length > buflen) + if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) { + zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 8, + le->le_name_numints, 8, buflen / 8, buf); + } else { + zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 1, + le->le_name_numints, 1, buflen, buf); + } + if (le->le_name_numints > buflen) return (EOVERFLOW); return (0); } @@ -506,7 +533,7 @@ zap_entry_update(zap_entry_handle_t *zeh, struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, *zeh->zeh_chunkp); delta_chunks = ZAP_LEAF_ARRAY_NCHUNKS(num_integers * integer_size) - - ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_length * le->le_int_size); + ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints * le->le_value_intlen); if ((int)l->l_phys->l_hdr.lh_nfree < delta_chunks) return (EAGAIN); @@ -522,8 +549,8 @@ zap_entry_update(zap_entry_handle_t *zeh, zap_leaf_array_free(l, &le->le_value_chunk); le->le_value_chunk = zap_leaf_array_create(l, buf, integer_size, num_integers); - le->le_value_length = num_integers; - le->le_int_size = integer_size; + le->le_value_numints = num_integers; + le->le_value_intlen = integer_size; return (0); } @@ -550,26 +577,25 @@ zap_entry_remove(zap_entry_handle_t *zeh) } int -zap_entry_create(zap_leaf_t *l, const char *name, uint64_t h, uint32_t cd, +zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd, uint8_t integer_size, uint64_t num_integers, const void *buf, zap_entry_handle_t *zeh) { uint16_t chunk; uint16_t *chunkp; struct zap_leaf_entry *le; - uint64_t namelen, valuelen; + uint64_t valuelen; int numchunks; + uint64_t h = zn->zn_hash; valuelen = integer_size * num_integers; - namelen = strlen(name) + 1; - ASSERT(namelen >= 2); - numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(namelen) + - ZAP_LEAF_ARRAY_NCHUNKS(valuelen); + numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(zn->zn_key_orig_numints * + zn->zn_key_intlen) + ZAP_LEAF_ARRAY_NCHUNKS(valuelen); if (numchunks > ZAP_LEAF_NUMCHUNKS(l)) return (E2BIG); - if (cd == ZAP_MAXCD) { + if (cd == ZAP_NEED_CD) { /* find the lowest unused cd */ if (l->l_phys->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED) { cd = 0; @@ -586,7 +612,7 @@ zap_entry_create(zap_leaf_t *l, const char *name, uint64_t h, uint32_t cd, } } else { /* old unsorted format; do it the O(n^2) way */ - for (cd = 0; cd < ZAP_MAXCD; cd++) { + for (cd = 0; ; cd++) { for (chunk = *LEAF_HASH_ENTPTR(l, h); chunk != CHAIN_END; chunk = le->le_next) { le = ZAP_LEAF_ENTRY(l, chunk); @@ -601,10 +627,10 @@ zap_entry_create(zap_leaf_t *l, const char 
*name, uint64_t h, uint32_t cd, } } /* - * we would run out of space in a block before we could - * have ZAP_MAXCD entries + * We would run out of space in a block before we could + * store enough entries to run out of CD values. */ - ASSERT3U(cd, <, ZAP_MAXCD); + ASSERT3U(cd, <, zap_maxcd(zn->zn_zap)); } if (l->l_phys->l_hdr.lh_nfree < numchunks) @@ -614,12 +640,13 @@ zap_entry_create(zap_leaf_t *l, const char *name, uint64_t h, uint32_t cd, chunk = zap_leaf_chunk_alloc(l); le = ZAP_LEAF_ENTRY(l, chunk); le->le_type = ZAP_CHUNK_ENTRY; - le->le_name_chunk = zap_leaf_array_create(l, name, 1, namelen); - le->le_name_length = namelen; + le->le_name_chunk = zap_leaf_array_create(l, zn->zn_key_orig, + zn->zn_key_intlen, zn->zn_key_orig_numints); + le->le_name_numints = zn->zn_key_orig_numints; le->le_value_chunk = zap_leaf_array_create(l, buf, integer_size, num_integers); - le->le_value_length = num_integers; - le->le_int_size = integer_size; + le->le_value_numints = num_integers; + le->le_value_intlen = integer_size; le->le_hash = h; le->le_cd = cd; @@ -631,7 +658,7 @@ zap_entry_create(zap_leaf_t *l, const char *name, uint64_t h, uint32_t cd, zeh->zeh_leaf = l; zeh->zeh_num_integers = num_integers; - zeh->zeh_integer_size = le->le_int_size; + zeh->zeh_integer_size = le->le_value_intlen; zeh->zeh_cd = le->le_cd; zeh->zeh_hash = le->le_hash; zeh->zeh_chunkp = chunkp; @@ -673,7 +700,7 @@ zap_entry_normalization_conflict(zap_entry_handle_t *zeh, zap_name_t *zn, allocdzn = B_TRUE; } if (zap_leaf_array_match(zeh->zeh_leaf, zn, - le->le_name_chunk, le->le_name_length)) { + le->le_name_chunk, le->le_name_numints)) { if (allocdzn) zap_name_free(zn); return (B_TRUE); @@ -836,9 +863,9 @@ zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs) struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, chunk); - n = 1 + ZAP_LEAF_ARRAY_NCHUNKS(le->le_name_length) + - ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_length * - le->le_int_size); + n = 1 + ZAP_LEAF_ARRAY_NCHUNKS(le->le_name_numints) + + ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints * + le->le_value_intlen); n = MIN(n, ZAP_HISTOGRAM_SIZE-1); zs->zs_entries_using_n_chunks[n]++; diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/zap_micro.c b/external/cddl/osnet/dist/uts/common/fs/zfs/zap_micro.c index abba42775bb76..32ffc966f6a15 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/zap_micro.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/zap_micro.c @@ -19,12 +19,11 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - +#include #include #include #include @@ -38,33 +37,92 @@ #include #endif -static int mzap_upgrade(zap_t **zapp, dmu_tx_t *tx); +static int mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags); +uint64_t +zap_getflags(zap_t *zap) +{ + if (zap->zap_ismicro) + return (0); + return (zap->zap_u.zap_fat.zap_phys->zap_flags); +} -static uint64_t -zap_hash(zap_t *zap, const char *normname) +int +zap_hashbits(zap_t *zap) { - const uint8_t *cp; - uint8_t c; - uint64_t crc = zap->zap_salt; + if (zap_getflags(zap) & ZAP_FLAG_HASH64) + return (48); + else + return (28); +} - /* NB: name must already be normalized, if necessary */ +uint32_t +zap_maxcd(zap_t *zap) +{ + if (zap_getflags(zap) & ZAP_FLAG_HASH64) + return ((1<<16)-1); + else + return (-1U); +} - ASSERT(crc != 0); - ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); - for (cp = (const uint8_t *)normname; (c = *cp) != '\0'; cp++) { - crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF]; - } +static uint64_t +zap_hash(zap_name_t *zn) +{ + zap_t *zap = zn->zn_zap; + uint64_t h = 0; + if (zap_getflags(zap) & ZAP_FLAG_PRE_HASHED_KEY) { + ASSERT(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY); + h = *(uint64_t *)zn->zn_key_orig; + } else { + h = zap->zap_salt; + ASSERT(h != 0); + ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); + + if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) { + int i; + const uint64_t *wp = zn->zn_key_norm; + + ASSERT(zn->zn_key_intlen == 8); + for (i = 0; i < zn->zn_key_norm_numints; wp++, i++) { + int j; + uint64_t word = *wp; + + for (j = 0; j < zn->zn_key_intlen; j++) { + h = (h >> 8) ^ + zfs_crc64_table[(h ^ word) & 0xFF]; + word >>= NBBY; + } + } + } else { + int i, len; + const uint8_t *cp = zn->zn_key_norm; + + /* + * We previously stored the terminating null on + * disk, but didn't hash it, so we need to + * continue to not hash it. (The + * zn_key_*_numints includes the terminating + * null for non-binary keys.) + */ + len = zn->zn_key_norm_numints - 1; + + ASSERT(zn->zn_key_intlen == 1); + for (i = 0; i < len; cp++, i++) { + h = (h >> 8) ^ + zfs_crc64_table[(h ^ *cp) & 0xFF]; + } + } + } /* - * Only use 28 bits, since we need 4 bits in the cookie for the - * collision differentiator. We MUST use the high bits, since - * those are the ones that we first pay attention to when + * Don't use all 64 bits, since we need some in the cookie for + * the collision differentiator. We MUST use the high bits, + * since those are the ones that we first pay attention to when * chosing the bucket. 
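The rewritten zap_hash() above now has three key flavours: a pre-hashed uint64 key is used as-is, a binary uint64 key is fed through the CRC a byte at a time, and a string key keeps the historical behaviour of hashing everything except the terminating null. For orientation, a self-contained sketch of the string case follows; the function names and the polynomial constant are illustrative assumptions, not taken from this patch, and the real table lives in zfs_crc64_table.

#include <stdint.h>

#define	EXAMPLE_CRC64_POLY	0xC96C5795D7870F42ULL	/* assumed value of ZFS_CRC64_POLY */

static uint64_t example_crc64_table[256];

static void
example_crc64_init(void)
{
	for (int i = 0; i < 256; i++) {
		uint64_t c = (uint64_t)i;
		for (int j = 0; j < 8; j++)
			c = (c & 1) ? (c >> 1) ^ EXAMPLE_CRC64_POLY : c >> 1;
		example_crc64_table[i] = c;
	}
}

/* hashbits is zap_hashbits(): 28 for a normal fat ZAP, 48 with ZAP_FLAG_HASH64 */
static uint64_t
example_zap_hash(uint64_t salt, const char *name, int hashbits)
{
	uint64_t h = salt;

	for (const uint8_t *cp = (const uint8_t *)name; *cp != '\0'; cp++)
		h = (h >> 8) ^ example_crc64_table[(h ^ *cp) & 0xFF];

	/* keep only the high-order bits; the low bits stay free for the cd */
	return (h & ~((1ULL << (64 - hashbits)) - 1));
}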
*/ - crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1); + h &= ~((1ULL << (64 - zap_hashbits(zap))) - 1); - return (crc); + return (h); } static int @@ -73,13 +131,15 @@ zap_normalize(zap_t *zap, const char *name, char *namenorm) size_t inlen, outlen; int err; + ASSERT(!(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY)); + inlen = strlen(name) + 1; outlen = ZAP_MAXNAMELEN; err = 0; (void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen, - zap->zap_normflags | U8_TEXTPREP_IGNORE_NULL, U8_UNICODE_LATEST, - &err); + zap->zap_normflags | U8_TEXTPREP_IGNORE_NULL | + U8_TEXTPREP_IGNORE_INVALID, U8_UNICODE_LATEST, &err); return (err); } @@ -87,16 +147,18 @@ zap_normalize(zap_t *zap, const char *name, char *namenorm) boolean_t zap_match(zap_name_t *zn, const char *matchname) { + ASSERT(!(zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY)); + if (zn->zn_matchtype == MT_FIRST) { char norm[ZAP_MAXNAMELEN]; if (zap_normalize(zn->zn_zap, matchname, norm) != 0) return (B_FALSE); - return (strcmp(zn->zn_name_norm, norm) == 0); + return (strcmp(zn->zn_key_norm, norm) == 0); } else { /* MT_BEST or MT_EXACT */ - return (strcmp(zn->zn_name_orij, matchname) == 0); + return (strcmp(zn->zn_key_orig, matchname) == 0); } } @@ -106,30 +168,49 @@ zap_name_free(zap_name_t *zn) kmem_free(zn, sizeof (zap_name_t)); } -/* XXX combine this with zap_lockdir()? */ zap_name_t * -zap_name_alloc(zap_t *zap, const char *name, matchtype_t mt) +zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt) { zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP); zn->zn_zap = zap; - zn->zn_name_orij = name; + zn->zn_key_intlen = sizeof (*key); + zn->zn_key_orig = key; + zn->zn_key_orig_numints = strlen(zn->zn_key_orig) + 1; zn->zn_matchtype = mt; if (zap->zap_normflags) { - if (zap_normalize(zap, name, zn->zn_normbuf) != 0) { + if (zap_normalize(zap, key, zn->zn_normbuf) != 0) { zap_name_free(zn); return (NULL); } - zn->zn_name_norm = zn->zn_normbuf; + zn->zn_key_norm = zn->zn_normbuf; + zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1; } else { if (mt != MT_EXACT) { zap_name_free(zn); return (NULL); } - zn->zn_name_norm = zn->zn_name_orij; + zn->zn_key_norm = zn->zn_key_orig; + zn->zn_key_norm_numints = zn->zn_key_orig_numints; } - zn->zn_hash = zap_hash(zap, zn->zn_name_norm); + zn->zn_hash = zap_hash(zn); + return (zn); +} + +zap_name_t * +zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints) +{ + zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP); + + ASSERT(zap->zap_normflags == 0); + zn->zn_zap = zap; + zn->zn_key_intlen = sizeof (*key); + zn->zn_key_orig = zn->zn_key_norm = key; + zn->zn_key_orig_numints = zn->zn_key_norm_numints = numints; + zn->zn_matchtype = MT_EXACT; + + zn->zn_hash = zap_hash(zn); return (zn); } @@ -188,7 +269,7 @@ mze_insert(zap_t *zap, int chunkid, uint64_t hash, mzap_ent_phys_t *mzep) ASSERT(zap->zap_ismicro); ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - ASSERT(mzep->mze_cd < ZAP_MAXCD); + ASSERT(mzep->mze_cd < zap_maxcd(zap)); mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP); mze->mze_chunkid = chunkid; @@ -208,9 +289,6 @@ mze_find(zap_name_t *zn) ASSERT(zn->zn_zap->zap_ismicro); ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock)); - if (strlen(zn->zn_name_norm) >= sizeof (mze_tofind.mze_phys.mze_name)) - return (NULL); - mze_tofind.mze_hash = zn->zn_hash; mze_tofind.mze_phys.mze_cd = 0; @@ -423,7 +501,7 @@ zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, dprintf("upgrading obj %llu: num_entries=%u\n", obj, zap->zap_m.zap_num_entries); *zapp = zap; - return (mzap_upgrade(zapp, 
tx)); + return (mzap_upgrade(zapp, tx, 0)); } err = dmu_object_set_blocksize(os, obj, newsz, 0, tx); ASSERT3U(err, ==, 0); @@ -443,10 +521,11 @@ zap_unlockdir(zap_t *zap) } static int -mzap_upgrade(zap_t **zapp, dmu_tx_t *tx) +mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags) { mzap_phys_t *mzp; - int i, sz, nchunks, err; + int i, sz, nchunks; + int err = 0; zap_t *zap = *zapp; ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); @@ -456,11 +535,13 @@ mzap_upgrade(zap_t **zapp, dmu_tx_t *tx) bcopy(zap->zap_dbuf->db_data, mzp, sz); nchunks = zap->zap_m.zap_num_chunks; - err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object, - 1ULL << fzap_default_block_shift, 0, tx); - if (err) { - kmem_free(mzp, sz); - return (err); + if (!flags) { + err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object, + 1ULL << fzap_default_block_shift, 0, tx); + if (err) { + kmem_free(mzp, sz); + return (err); + } } dprintf("upgrading obj=%llu with %u chunks\n", @@ -468,10 +549,9 @@ mzap_upgrade(zap_t **zapp, dmu_tx_t *tx) /* XXX destroy the avl later, so we can use the stored hash value */ mze_destroy(zap); - fzap_upgrade(zap, tx); + fzap_upgrade(zap, tx, flags); for (i = 0; i < nchunks; i++) { - int err; mzap_ent_phys_t *mze = &mzp->mz_chunk[i]; zap_name_t *zn; if (mze->mze_name[0] == 0) @@ -491,7 +571,8 @@ mzap_upgrade(zap_t **zapp, dmu_tx_t *tx) } static void -mzap_create_impl(objset_t *os, uint64_t obj, int normflags, dmu_tx_t *tx) +mzap_create_impl(objset_t *os, uint64_t obj, int normflags, zap_flags_t flags, + dmu_tx_t *tx) { dmu_buf_t *db; mzap_phys_t *zp; @@ -512,6 +593,15 @@ mzap_create_impl(objset_t *os, uint64_t obj, int normflags, dmu_tx_t *tx) zp->mz_salt = ((uintptr_t)db ^ (uintptr_t)tx ^ (obj << 1)) | 1ULL; zp->mz_normflags = normflags; dmu_buf_rele(db, FTAG); + + if (flags != 0) { + zap_t *zap; + /* Only fat zap supports flags; upgrade immediately. 
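Only the fat ZAP on-disk format has anywhere to record zap_flags_t, which is why a flags-bearing creation is upgraded out of the microzap on the spot, as shown above. A hypothetical in-kernel caller of the flag- and uint64-key-aware interfaces added a little further down in this file (zap_create_flags(), zap_add_uint64(), zap_lookup_uint64()) might look roughly like this; the object type, block shifts and bonus settings are placeholder assumptions, not a recommendation.

static int
example_uint64_keyed_zap(objset_t *os, dmu_object_type_t ot, dmu_tx_t *tx)
{
	uint64_t key[2] = { 0x1234, 0x5678 };	/* arbitrary two-word binary key */
	uint64_t val = 42, readback = 0;
	uint64_t obj;
	int err;

	obj = zap_create_flags(os, 0, ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY,
	    ot, 12 /* 4K leaf blocks */, 12 /* 4K indirect blocks */,
	    DMU_OT_NONE, 0, tx);

	err = zap_add_uint64(os, obj, key, 2, sizeof (uint64_t), 1, &val, tx);
	if (err == 0)
		err = zap_lookup_uint64(os, obj, key, 2,
		    sizeof (uint64_t), 1, &readback);
	return (err);
}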
*/ + VERIFY(0 == zap_lockdir(os, obj, tx, RW_WRITER, + B_FALSE, B_FALSE, &zap)); + VERIFY3U(0, ==, mzap_upgrade(&zap, tx, flags)); + zap_unlockdir(zap); + } } int @@ -532,7 +622,7 @@ zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags, err = dmu_object_claim(os, obj, ot, 0, bonustype, bonuslen, tx); if (err != 0) return (err); - mzap_create_impl(os, obj, normflags, tx); + mzap_create_impl(os, obj, normflags, 0, tx); return (0); } @@ -549,7 +639,26 @@ zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot, { uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx); - mzap_create_impl(os, obj, normflags, tx); + mzap_create_impl(os, obj, normflags, 0, tx); + return (obj); +} + +uint64_t +zap_create_flags(objset_t *os, int normflags, zap_flags_t flags, + dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +{ + uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx); + + ASSERT(leaf_blockshift >= SPA_MINBLOCKSHIFT && + leaf_blockshift <= SPA_MAXBLOCKSHIFT && + indirect_blockshift >= SPA_MINBLOCKSHIFT && + indirect_blockshift <= SPA_MAXBLOCKSHIFT); + + VERIFY(dmu_object_set_blocksize(os, obj, + 1ULL << leaf_blockshift, indirect_blockshift, tx) == 0); + + mzap_create_impl(os, obj, normflags, flags, tx); return (obj); } @@ -700,6 +809,40 @@ zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name, return (err); } +int +zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf) +{ + zap_t *zap; + int err; + zap_name_t *zn; + + err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); + if (err) + return (err); + zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlockdir(zap); + return (ENOTSUP); + } + + err = fzap_lookup(zn, integer_size, num_integers, buf, + NULL, 0, NULL); + zap_name_free(zn); + zap_unlockdir(zap); + return (err); +} + +int +zap_contains(objset_t *os, uint64_t zapobj, const char *name) +{ + int err = (zap_lookup_norm(os, zapobj, name, 0, + 0, NULL, MT_EXACT, NULL, 0, NULL)); + if (err == EOVERFLOW || err == EINVAL) + err = 0; /* found, but skipped reading the value */ + return (err); +} + int zap_length(objset_t *os, uint64_t zapobj, const char *name, uint64_t *integer_size, uint64_t *num_integers) @@ -735,6 +878,28 @@ zap_length(objset_t *os, uint64_t zapobj, const char *name, return (err); } +int +zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, uint64_t *integer_size, uint64_t *num_integers) +{ + zap_t *zap; + int err; + zap_name_t *zn; + + err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); + if (err) + return (err); + zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlockdir(zap); + return (ENOTSUP); + } + err = fzap_length(zn, integer_size, num_integers); + zap_name_free(zn); + zap_unlockdir(zap); + return (err); +} + static void mzap_addent(zap_name_t *zn, uint64_t value) { @@ -743,20 +908,18 @@ mzap_addent(zap_name_t *zn, uint64_t value) int start = zap->zap_m.zap_alloc_next; uint32_t cd; - dprintf("obj=%llu %s=%llu\n", zap->zap_object, - zn->zn_name_orij, value); ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); #ifdef ZFS_DEBUG for (i = 0; i < zap->zap_m.zap_num_chunks; i++) { mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i]; - ASSERT(strcmp(zn->zn_name_orij, mze->mze_name) != 0); + ASSERT(strcmp(zn->zn_key_orig, mze->mze_name) != 0); } 
#endif cd = mze_find_unused_cd(zap, zn->zn_hash); /* given the limited size of the microzap, this can't happen */ - ASSERT(cd != ZAP_MAXCD); + ASSERT(cd < zap_maxcd(zap)); again: for (i = start; i < zap->zap_m.zap_num_chunks; i++) { @@ -764,7 +927,7 @@ mzap_addent(zap_name_t *zn, uint64_t value) if (mze->mze_name[0] == 0) { mze->mze_value = value; mze->mze_cd = cd; - (void) strcpy(mze->mze_name, zn->zn_name_orij); + (void) strcpy(mze->mze_name, zn->zn_key_orig); zap->zap_m.zap_num_entries++; zap->zap_m.zap_alloc_next = i+1; if (zap->zap_m.zap_alloc_next == @@ -782,7 +945,7 @@ mzap_addent(zap_name_t *zn, uint64_t value) } int -zap_add(objset_t *os, uint64_t zapobj, const char *name, +zap_add(objset_t *os, uint64_t zapobj, const char *key, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) { @@ -795,7 +958,7 @@ zap_add(objset_t *os, uint64_t zapobj, const char *name, err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap); if (err) return (err); - zn = zap_name_alloc(zap, name, MT_EXACT); + zn = zap_name_alloc(zap, key, MT_EXACT); if (zn == NULL) { zap_unlockdir(zap); return (ENOTSUP); @@ -804,10 +967,8 @@ zap_add(objset_t *os, uint64_t zapobj, const char *name, err = fzap_add(zn, integer_size, num_integers, val, tx); zap = zn->zn_zap; /* fzap_add() may change zap */ } else if (integer_size != 8 || num_integers != 1 || - strlen(name) >= MZAP_NAME_LEN) { - dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n", - zapobj, integer_size, num_integers, name); - err = mzap_upgrade(&zn->zn_zap, tx); + strlen(key) >= MZAP_NAME_LEN) { + err = mzap_upgrade(&zn->zn_zap, tx, 0); if (err == 0) err = fzap_add(zn, integer_size, num_integers, val, tx); zap = zn->zn_zap; /* fzap_add() may change zap */ @@ -826,6 +987,31 @@ zap_add(objset_t *os, uint64_t zapobj, const char *name, return (err); } +int +zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx) +{ + zap_t *zap; + int err; + zap_name_t *zn; + + err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap); + if (err) + return (err); + zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlockdir(zap); + return (ENOTSUP); + } + err = fzap_add(zn, integer_size, num_integers, val, tx); + zap = zn->zn_zap; /* fzap_add() may change zap */ + zap_name_free(zn); + if (zap != NULL) /* may be NULL if fzap_add() failed */ + zap_unlockdir(zap); + return (err); +} + int zap_update(objset_t *os, uint64_t zapobj, const char *name, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) @@ -851,7 +1037,7 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name, strlen(name) >= MZAP_NAME_LEN) { dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n", zapobj, integer_size, num_integers, name); - err = mzap_upgrade(&zn->zn_zap, tx); + err = mzap_upgrade(&zn->zn_zap, tx, 0); if (err == 0) err = fzap_update(zn, integer_size, num_integers, val, tx); @@ -873,6 +1059,31 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name, return (err); } +int +zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, + int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) +{ + zap_t *zap; + zap_name_t *zn; + int err; + + err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap); + if (err) + return (err); + zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlockdir(zap); + return (ENOTSUP); + } + err = 
fzap_update(zn, integer_size, num_integers, val, tx); + zap = zn->zn_zap; /* fzap_update() may change zap */ + zap_name_free(zn); + if (zap != NULL) /* may be NULL if fzap_upgrade() failed */ + zap_unlockdir(zap); + return (err); +} + int zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx) { @@ -914,17 +1125,32 @@ zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name, return (err); } +int +zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, dmu_tx_t *tx) +{ + zap_t *zap; + int err; + zap_name_t *zn; + + err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, &zap); + if (err) + return (err); + zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlockdir(zap); + return (ENOTSUP); + } + err = fzap_remove(zn, tx); + zap_name_free(zn); + zap_unlockdir(zap); + return (err); +} + /* * Routines for iterating over the attributes. */ -/* - * We want to keep the high 32 bits of the cursor zero if we can, so - * that 32-bit programs can access this. So use a small hash value so - * we can fit 4 bits of cd into the 32-bit cursor. - * - * [ 4 zero bits | 32-bit collision differentiator | 28-bit hash value ] - */ void zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj, uint64_t serialized) @@ -933,15 +1159,9 @@ zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj, zc->zc_zap = NULL; zc->zc_leaf = NULL; zc->zc_zapobj = zapobj; - if (serialized == -1ULL) { - zc->zc_hash = -1ULL; - zc->zc_cd = 0; - } else { - zc->zc_hash = serialized << (64-ZAP_HASHBITS); - zc->zc_cd = serialized >> ZAP_HASHBITS; - if (zc->zc_cd >= ZAP_MAXCD) /* corrupt serialized */ - zc->zc_cd = 0; - } + zc->zc_serialized = serialized; + zc->zc_hash = 0; + zc->zc_cd = 0; } void @@ -971,10 +1191,21 @@ zap_cursor_serialize(zap_cursor_t *zc) { if (zc->zc_hash == -1ULL) return (-1ULL); - ASSERT((zc->zc_hash & (ZAP_MAXCD-1)) == 0); - ASSERT(zc->zc_cd < ZAP_MAXCD); - return ((zc->zc_hash >> (64-ZAP_HASHBITS)) | - ((uint64_t)zc->zc_cd << ZAP_HASHBITS)); + if (zc->zc_zap == NULL) + return (zc->zc_serialized); + ASSERT((zc->zc_hash & zap_maxcd(zc->zc_zap)) == 0); + ASSERT(zc->zc_cd < zap_maxcd(zc->zc_zap)); + + /* + * We want to keep the high 32 bits of the cursor zero if we can, so + * that 32-bit programs can access this. So usually use a small + * (28-bit) hash value so we can fit 4 bits of cd into the low 32-bits + * of the cursor. + * + * [ collision differentiator | zap_hashbits()-bit hash value ] + */ + return ((zc->zc_hash >> (64 - zap_hashbits(zc->zc_zap))) | + ((uint64_t)zc->zc_cd << zap_hashbits(zc->zc_zap))); } int @@ -989,10 +1220,23 @@ zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za) return (ENOENT); if (zc->zc_zap == NULL) { + int hb; err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL, RW_READER, TRUE, FALSE, &zc->zc_zap); if (err) return (err); + + /* + * To support zap_cursor_init_serialized, advance, retrieve, + * we must add to the existing zc_cd, which may already + * be 1 due to the zap_cursor_advance. 
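Because the hash width is no longer a compile-time constant, a serialized cursor cannot be decoded until the ZAP is locked and its flags are known, which is why the unpacking now happens in zap_cursor_retrieve() instead of at init time. The arithmetic itself round-trips as in this self-contained sketch (invented names; note that the real retrieve path adds the unpacked cd onto zc_cd, which may already be 1 after a zap_cursor_advance()).

#include <stdint.h>

/* hb is zap_hashbits(): 28 normally, 48 for ZAP_FLAG_HASH64 objects */
static uint64_t
example_cursor_pack(uint64_t hash, uint32_t cd, int hb)
{
	/* low hb bits carry the hash, everything above them carries the cd */
	return ((hash >> (64 - hb)) | ((uint64_t)cd << hb));
}

static void
example_cursor_unpack(uint64_t serialized, int hb, uint64_t *hashp, uint32_t *cdp)
{
	*hashp = serialized << (64 - hb);
	*cdp = (uint32_t)(serialized >> hb);
}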
+ */ + ASSERT(zc->zc_hash == 0); + hb = zap_hashbits(zc->zc_zap); + zc->zc_hash = zc->zc_serialized << (64 - hb); + zc->zc_cd += zc->zc_serialized >> hb; + if (zc->zc_cd >= zap_maxcd(zc->zc_zap)) /* corrupt serialized */ + zc->zc_cd = 0; } else { rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); } @@ -1037,12 +1281,46 @@ zap_cursor_advance(zap_cursor_t *zc) if (zc->zc_hash == -1ULL) return; zc->zc_cd++; - if (zc->zc_cd >= ZAP_MAXCD) { - zc->zc_cd = 0; - zc->zc_hash += 1ULL<<(64-ZAP_HASHBITS); - if (zc->zc_hash == 0) /* EOF */ - zc->zc_hash = -1ULL; +} + +int +zap_cursor_move_to_key(zap_cursor_t *zc, const char *name, matchtype_t mt) +{ + int err = 0; + mzap_ent_t *mze; + zap_name_t *zn; + + if (zc->zc_zap == NULL) { + err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL, + RW_READER, TRUE, FALSE, &zc->zc_zap); + if (err) + return (err); + } else { + rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); } + + zn = zap_name_alloc(zc->zc_zap, name, mt); + if (zn == NULL) { + rw_exit(&zc->zc_zap->zap_rwlock); + return (ENOTSUP); + } + + if (!zc->zc_zap->zap_ismicro) { + err = fzap_cursor_move_to_key(zc, zn); + } else { + mze = mze_find(zn); + if (mze == NULL) { + err = ENOENT; + goto out; + } + zc->zc_hash = mze->mze_hash; + zc->zc_cd = mze->mze_phys.mze_cd; + } + +out: + zap_name_free(zn); + rw_exit(&zc->zc_zap->zap_rwlock); + return (err); } int @@ -1067,3 +1345,79 @@ zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs) zap_unlockdir(zap); return (0); } + +int +zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add, + uint64_t *towrite, uint64_t *tooverwrite) +{ + zap_t *zap; + int err = 0; + + + /* + * Since, we don't have a name, we cannot figure out which blocks will + * be affected in this operation. So, account for the worst case : + * - 3 blocks overwritten: target leaf, ptrtbl block, header block + * - 4 new blocks written if adding: + * - 2 blocks for possibly split leaves, + * - 2 grown ptrtbl blocks + * + * This also accomodates the case where an add operation to a fairly + * large microzap results in a promotion to fatzap. + */ + if (name == NULL) { + *towrite += (3 + (add ? 4 : 0)) * SPA_MAXBLOCKSIZE; + return (err); + } + + /* + * We lock the zap with adding == FALSE. Because, if we pass + * the actual value of add, it could trigger a mzap_upgrade(). + * At present we are just evaluating the possibility of this operation + * and hence we donot want to trigger an upgrade. + */ + err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); + if (err) + return (err); + + if (!zap->zap_ismicro) { + zap_name_t *zn = zap_name_alloc(zap, name, MT_EXACT); + if (zn) { + err = fzap_count_write(zn, add, towrite, + tooverwrite); + zap_name_free(zn); + } else { + /* + * We treat this case as similar to (name == NULL) + */ + *towrite += (3 + (add ? 4 : 0)) * SPA_MAXBLOCKSIZE; + } + } else { + /* + * We are here if (name != NULL) and this is a micro-zap. + * We account for the header block depending on whether it + * is freeable. + * + * Incase of an add-operation it is hard to find out + * if this add will promote this microzap to fatzap. + * Hence, we consider the worst case and account for the + * blocks assuming this microzap would be promoted to a + * fatzap. 
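For the name == NULL branch above the reservation is a fixed pessimistic figure rather than anything derived from the object itself. Restated as self-contained arithmetic (SPA_MAXBLOCKSIZE is 128K at the time of this change; the macro below is an invented stand-in):

#include <stdint.h>

#define	EXAMPLE_SPA_MAXBLOCKSIZE	(128ULL * 1024)

static uint64_t
example_zap_blind_towrite(int add)
{
	/*
	 * 3 overwritten blocks: target leaf, ptrtbl block, header block;
	 * plus 4 new blocks when adding: 2 split leaves, 2 grown ptrtbl blocks.
	 */
	return ((3 + (add ? 4 : 0)) * EXAMPLE_SPA_MAXBLOCKSIZE);
}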
+ * + * 1 block overwritten : header block + * 4 new blocks written : 2 new split leaf, 2 grown + * ptrtbl blocks + */ + if (dmu_buf_freeable(zap->zap_dbuf)) + *tooverwrite += SPA_MAXBLOCKSIZE; + else + *towrite += SPA_MAXBLOCKSIZE; + + if (add) { + *towrite += 4 * SPA_MAXBLOCKSIZE; + } + } + + zap_unlockdir(zap); + return (err); +} diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_acl.c b/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_acl.c index 341dc4dfe7436..36e39a320cba0 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_acl.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_acl.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -65,15 +65,16 @@ ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS) #define OWNER_ALLOW_MASK (ACE_WRITE_ACL | ACE_WRITE_OWNER | \ ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS) -#define WRITE_MASK_DATA (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS) #define ZFS_CHECKED_MASKS (ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_DATA| \ ACE_READ_NAMED_ATTRS|ACE_WRITE_DATA|ACE_WRITE_ATTRIBUTES| \ ACE_WRITE_NAMED_ATTRS|ACE_APPEND_DATA|ACE_EXECUTE|ACE_WRITE_OWNER| \ ACE_WRITE_ACL|ACE_DELETE|ACE_DELETE_CHILD|ACE_SYNCHRONIZE) -#define WRITE_MASK (WRITE_MASK_DATA|ACE_WRITE_ATTRIBUTES|ACE_WRITE_ACL|\ - ACE_WRITE_OWNER|ACE_DELETE|ACE_DELETE_CHILD) +#define WRITE_MASK_DATA (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS) +#define WRITE_MASK_ATTRS (ACE_WRITE_ACL|ACE_WRITE_OWNER|ACE_WRITE_ATTRIBUTES| \ + ACE_DELETE|ACE_DELETE_CHILD) +#define WRITE_MASK (WRITE_MASK_DATA|WRITE_MASK_ATTRS) #define OGE_CLEAR (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE) @@ -92,6 +93,8 @@ #define ZFS_ACL_WIDE_FLAGS (V4_ACL_WIDE_FLAGS|ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|\ ZFS_ACL_OBJ_ACE) +#define ALL_MODE_EXECS (S_IXUSR | S_IXGRP | S_IXOTH) + static uint16_t zfs_ace_v0_get_type(void *acep) { @@ -538,8 +541,9 @@ zfs_acl_curr_node(zfs_acl_t *aclp) * ACE FUIDs will be created later. */ int -zfs_copy_ace_2_fuid(vtype_t obj_type, zfs_acl_t *aclp, void *datap, - zfs_ace_t *z_acl, int aclcnt, size_t *size) +zfs_copy_ace_2_fuid(zfsvfs_t *zfsvfs, vtype_t obj_type, zfs_acl_t *aclp, + void *datap, zfs_ace_t *z_acl, int aclcnt, size_t *size, + zfs_fuid_info_t **fuidp, cred_t *cr) { int i; uint16_t entry_type; @@ -555,9 +559,9 @@ zfs_copy_ace_2_fuid(vtype_t obj_type, zfs_acl_t *aclp, void *datap, entry_type = aceptr->z_hdr.z_flags & ACE_TYPE_FLAGS; if (entry_type != ACE_OWNER && entry_type != OWNING_GROUP && entry_type != ACE_EVERYONE) { - if (!aclp->z_has_fuids) - aclp->z_has_fuids = IS_EPHEMERAL(acep->a_who); - aceptr->z_fuid = (uint64_t)acep->a_who; + aceptr->z_fuid = zfs_fuid_create(zfsvfs, acep->a_who, + cr, (entry_type == 0) ? 
+ ZFS_ACE_USER : ZFS_ACE_GROUP, fuidp); } /* @@ -682,7 +686,7 @@ zfs_copy_ace_2_oldace(vtype_t obj_type, zfs_acl_t *aclp, ace_t *acep, * convert old ACL format to new */ void -zfs_acl_xform(znode_t *zp, zfs_acl_t *aclp) +zfs_acl_xform(znode_t *zp, zfs_acl_t *aclp, cred_t *cr) { zfs_oldace_t *oldaclp; int i; @@ -714,9 +718,9 @@ zfs_acl_xform(znode_t *zp, zfs_acl_t *aclp) newaclnode = zfs_acl_node_alloc(aclp->z_acl_count * sizeof (zfs_object_ace_t)); aclp->z_ops = zfs_acl_fuid_ops; - VERIFY(zfs_copy_ace_2_fuid(ZTOV(zp)->v_type, aclp, oldaclp, - newaclnode->z_acldata, aclp->z_acl_count, - &newaclnode->z_size) == 0); + VERIFY(zfs_copy_ace_2_fuid(zp->z_zfsvfs, ZTOV(zp)->v_type, aclp, + oldaclp, newaclnode->z_acldata, aclp->z_acl_count, + &newaclnode->z_size, NULL, cr) == 0); newaclnode->z_ace_count = aclp->z_acl_count; aclp->z_version = ZFS_ACL_VERSION; kmem_free(oldaclp, aclp->z_acl_count * sizeof (zfs_oldace_t)); @@ -770,8 +774,7 @@ zfs_set_ace(zfs_acl_t *aclp, void *acep, uint32_t access_mask, * Also, create FUIDs for any User/Group ACEs */ static uint64_t -zfs_mode_fuid_compute(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, - zfs_fuid_info_t **fuidp, dmu_tx_t *tx) +zfs_mode_compute(znode_t *zp, zfs_acl_t *aclp) { int entry_type; mode_t mode; @@ -780,6 +783,7 @@ zfs_mode_fuid_compute(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, uint64_t who; uint16_t iflags, type; uint32_t access_mask; + boolean_t an_exec_denied = B_FALSE; mode = (zp->z_phys->zp_mode & (S_IFMT | S_ISUID | S_ISGID | S_ISVTX)); @@ -904,17 +908,32 @@ zfs_mode_fuid_compute(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, } } } - } - /* - * Now handle FUID create for user/group ACEs - */ - if (entry_type == 0 || entry_type == ACE_IDENTIFIER_GROUP) { - aclp->z_ops.ace_who_set(acep, - zfs_fuid_create(zp->z_zfsvfs, who, cr, - (entry_type == 0) ? ZFS_ACE_USER : ZFS_ACE_GROUP, - tx, fuidp)); + } else { + /* + * Only care if this IDENTIFIER_GROUP or + * USER ACE denies execute access to someone, + * mode is not affected + */ + if ((access_mask & ACE_EXECUTE) && type == DENY) + an_exec_denied = B_TRUE; } } + + /* + * Failure to allow is effectively a deny, so execute permission + * is denied if it was never mentioned or if we explicitly + * weren't allowed it. + */ + if (!an_exec_denied && + ((seen & ALL_MODE_EXECS) != ALL_MODE_EXECS || + (mode & ALL_MODE_EXECS) != ALL_MODE_EXECS)) + an_exec_denied = B_TRUE; + + if (an_exec_denied) + zp->z_phys->zp_flags &= ~ZFS_NO_EXECS_DENIED; + else + zp->z_phys->zp_flags |= ZFS_NO_EXECS_DENIED; + return (mode); } @@ -954,7 +973,8 @@ zfs_acl_node_read_internal(znode_t *zp, boolean_t will_modify) } /* - * Read an external acl object. + * Read an external acl object. If the intent is to modify, always + * create a new acl and leave any cached acl in place. 
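The caching rule just stated boils down to: readers may share zp->z_acl_cached, writers always get a private copy, and only readers ever populate the cache (zfs_aclset_common(), just below, additionally drops the cached copy when a new ACL is committed). A generic, self-contained sketch of that pattern with invented names:

#include <stdlib.h>

struct example_acl {
	int	placeholder;		/* payload not modelled */
};

struct example_node {
	struct example_acl *cached;	/* plays the role of z_acl_cached */
};

static struct example_acl *
example_acl_load(void)			/* stands in for the dmu_read() path */
{
	return (calloc(1, sizeof (struct example_acl)));
}

static struct example_acl *
example_acl_get(struct example_node *np, int will_modify)
{
	if (np->cached != NULL && !will_modify)
		return (np->cached);		/* shared, read-only copy */

	struct example_acl *aclp = example_acl_load();	/* private copy */

	if (aclp != NULL && !will_modify)
		np->cached = aclp;		/* only readers fill the cache */
	return (aclp);
}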
*/ static int zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify) @@ -968,8 +988,15 @@ zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify) ASSERT(MUTEX_HELD(&zp->z_acl_lock)); + if (zp->z_acl_cached && !will_modify) { + *aclpp = zp->z_acl_cached; + return (0); + } + if (zp->z_phys->zp_acl.z_acl_extern_obj == 0) { *aclpp = zfs_acl_node_read_internal(zp, will_modify); + if (!will_modify) + zp->z_acl_cached = *aclpp; return (0); } @@ -989,7 +1016,7 @@ zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify) aclnode = zfs_acl_node_alloc(aclsize); list_insert_head(&aclp->z_acl, aclnode); error = dmu_read(zp->z_zfsvfs->z_os, extacl, 0, - aclsize, aclnode->z_acldata); + aclsize, aclnode->z_acldata, DMU_READ_PREFETCH); aclnode->z_ace_count = acl_count; aclp->z_acl_count = acl_count; aclp->z_acl_bytes = aclsize; @@ -1003,6 +1030,8 @@ zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify) } *aclpp = aclp; + if (!will_modify) + zp->z_acl_cached = aclp; return (0); } @@ -1014,8 +1043,7 @@ zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify) * already checked the acl and knows whether to inherit. */ int -zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, - zfs_fuid_info_t **fuidp, dmu_tx_t *tx) +zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx) { int error; znode_phys_t *zphys = zp->z_phys; @@ -1026,16 +1054,18 @@ zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_object_type_t otype; zfs_acl_node_t *aclnode; - ASSERT(MUTEX_HELD(&zp->z_lock)); - ASSERT(MUTEX_HELD(&zp->z_acl_lock)); - dmu_buf_will_dirty(zp->z_dbuf, tx); - zphys->zp_mode = zfs_mode_fuid_compute(zp, aclp, cr, fuidp, tx); + if (zp->z_acl_cached) { + zfs_acl_free(zp->z_acl_cached); + zp->z_acl_cached = NULL; + } + + zphys->zp_mode = zfs_mode_compute(zp, aclp); /* - * Decide which opbject type to use. If we are forced to - * use old ACL format than transform ACL into zfs_oldace_t + * Decide which object type to use. If we are forced to + * use old ACL format then transform ACL into zfs_oldace_t * layout. 
*/ if (!zfsvfs->z_use_fuids) { @@ -1043,7 +1073,7 @@ zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, } else { if ((aclp->z_version == ZFS_ACL_VERSION_INITIAL) && (zfsvfs->z_version >= ZPL_VERSION_FUID)) - zfs_acl_xform(zp, aclp); + zfs_acl_xform(zp, aclp, cr); ASSERT(aclp->z_version >= ZFS_ACL_VERSION_FUID); otype = DMU_OT_ACL; } @@ -1125,7 +1155,6 @@ zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, if (ace_trivial_common(aclp, 0, zfs_ace_walk) == 0) zp->z_phys->zp_flags |= ZFS_ACL_TRIVIAL; - zfs_time_stamper_locked(zp, STATE_CHANGED, tx); return (0); } @@ -1336,7 +1365,7 @@ zfs_acl_ace_insert(zfs_acl_t *aclp, void *acep) * Prepend deny ACE */ static void * -zfs_acl_prepend_deny(znode_t *zp, zfs_acl_t *aclp, void *acep, +zfs_acl_prepend_deny(uint64_t uid, zfs_acl_t *aclp, void *acep, mode_t mode) { zfs_acl_node_t *aclnode; @@ -1349,7 +1378,7 @@ zfs_acl_prepend_deny(znode_t *zp, zfs_acl_t *aclp, void *acep, fuid = aclp->z_ops.ace_who_get(acep); flags = aclp->z_ops.ace_flags_get(acep); zfs_set_ace(aclp, newacep, 0, DENY, fuid, (flags & ACE_TYPE_FLAGS)); - zfs_acl_prepend_fixup(aclp, newacep, acep, mode, zp->z_phys->zp_uid); + zfs_acl_prepend_fixup(aclp, newacep, acep, mode, uid); return (newacep); } @@ -1473,9 +1502,9 @@ zfs_fixup_group_entries(zfs_acl_t *aclp, void *acep, void *prevacep, * in PSARC/2002/240 */ static void -zfs_acl_chmod(znode_t *zp, uint64_t mode, zfs_acl_t *aclp) +zfs_acl_chmod(zfsvfs_t *zfsvfs, uint64_t uid, + uint64_t mode, zfs_acl_t *aclp) { - zfsvfs_t *zfsvfs = zp->z_zfsvfs; void *acep = NULL, *prevacep = NULL; uint64_t who; int i; @@ -1485,11 +1514,6 @@ zfs_acl_chmod(znode_t *zp, uint64_t mode, zfs_acl_t *aclp) uint16_t iflags, type; uint32_t access_mask; - ASSERT(MUTEX_HELD(&zp->z_acl_lock)); - ASSERT(MUTEX_HELD(&zp->z_lock)); - - aclp->z_hints = (zp->z_phys->zp_flags & V4_ACL_WIDE_FLAGS); - /* * If discard then just discard all ACL nodes which * represent the ACEs. @@ -1554,17 +1578,15 @@ zfs_acl_chmod(znode_t *zp, uint64_t mode, zfs_acl_t *aclp) if (!reuse_deny) { prevacep = - zfs_acl_prepend_deny(zp, + zfs_acl_prepend_deny(uid, aclp, acep, mode); } else { zfs_acl_prepend_fixup( aclp, prevacep, - acep, mode, - zp->z_phys->zp_uid); + acep, mode, uid); } zfs_fixup_group_entries(aclp, acep, prevacep, mode); - } } } @@ -1623,8 +1645,10 @@ zfs_acl_chmod_setattr(znode_t *zp, zfs_acl_t **aclp, uint64_t mode) mutex_enter(&zp->z_acl_lock); *aclp = NULL; error = zfs_acl_node_read(zp, aclp, B_TRUE); - if (error == 0) - zfs_acl_chmod(zp, mode, *aclp); + if (error == 0) { + (*aclp)->z_hints = zp->z_phys->zp_flags & V4_ACL_WIDE_FLAGS; + zfs_acl_chmod(zp->z_zfsvfs, zp->z_phys->zp_uid, mode, *aclp); + } mutex_exit(&zp->z_acl_lock); mutex_exit(&zp->z_lock); return (error); @@ -1649,9 +1673,8 @@ zfs_restricted_update(zfsvfs_t *zfsvfs, zfs_acl_t *aclp, void *acep) * Should ACE be inherited? 
*/ static int -zfs_ace_can_use(znode_t *zp, uint16_t acep_flags) +zfs_ace_can_use(vtype_t vtype, uint16_t acep_flags) { - int vtype = ZTOV(zp)->v_type; int iflags = (acep_flags & 0xf); if ((vtype == VDIR) && (iflags & ACE_DIRECTORY_INHERIT_ACE)) @@ -1666,10 +1689,9 @@ zfs_ace_can_use(znode_t *zp, uint16_t acep_flags) * inherit inheritable ACEs from parent */ static zfs_acl_t * -zfs_acl_inherit(znode_t *zp, zfs_acl_t *paclp, uint64_t mode, - boolean_t *need_chmod) +zfs_acl_inherit(zfsvfs_t *zfsvfs, vtype_t vtype, zfs_acl_t *paclp, + uint64_t mode, boolean_t *need_chmod) { - zfsvfs_t *zfsvfs = zp->z_zfsvfs; void *pacep; void *acep, *acep2; zfs_acl_node_t *aclnode, *aclnode2; @@ -1680,8 +1702,8 @@ zfs_acl_inherit(znode_t *zp, zfs_acl_t *paclp, uint64_t mode, size_t ace_size; void *data1, *data2; size_t data1sz, data2sz; - boolean_t vdir = ZTOV(zp)->v_type == VDIR; - boolean_t vreg = ZTOV(zp)->v_type == VREG; + boolean_t vdir = vtype == VDIR; + boolean_t vreg = vtype == VREG; boolean_t passthrough, passthrough_x, noallow; passthrough_x = @@ -1710,7 +1732,7 @@ zfs_acl_inherit(znode_t *zp, zfs_acl_t *paclp, uint64_t mode, ace_size = aclp->z_ops.ace_size(pacep); - if (!zfs_ace_can_use(zp, iflags)) + if (!zfs_ace_can_use(vtype, iflags)) continue; /* @@ -1806,55 +1828,73 @@ zfs_acl_inherit(znode_t *zp, zfs_acl_t *paclp, uint64_t mode, * Create file system object initial permissions * including inheritable ACEs. */ -void -zfs_perm_init(znode_t *zp, znode_t *parent, int flag, - vattr_t *vap, dmu_tx_t *tx, cred_t *cr, - zfs_acl_t *setaclp, zfs_fuid_info_t **fuidp) +int +zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr, + vsecattr_t *vsecp, zfs_acl_ids_t *acl_ids) { - uint64_t mode, fuid, fgid; int error; - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - zfs_acl_t *aclp = NULL; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zfs_acl_t *paclp; - xvattr_t *xvap = (xvattr_t *)vap; gid_t gid; boolean_t need_chmod = B_TRUE; - if (setaclp) - aclp = setaclp; + bzero(acl_ids, sizeof (zfs_acl_ids_t)); + acl_ids->z_mode = MAKEIMODE(vap->va_type, vap->va_mode); - mode = MAKEIMODE(vap->va_type, vap->va_mode); + if (vsecp) + if ((error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, cr, + &acl_ids->z_fuidp, &acl_ids->z_aclp)) != 0) + return (error); /* * Determine uid and gid. 
*/ - if ((flag & (IS_ROOT_NODE | IS_REPLAY)) || + if ((flag & IS_ROOT_NODE) || zfsvfs->z_replay || ((flag & IS_XATTR) && (vap->va_type == VDIR))) { - fuid = zfs_fuid_create(zfsvfs, vap->va_uid, cr, - ZFS_OWNER, tx, fuidp); - fgid = zfs_fuid_create(zfsvfs, vap->va_gid, cr, - ZFS_GROUP, tx, fuidp); + acl_ids->z_fuid = zfs_fuid_create(zfsvfs, + (uint64_t)vap->va_uid, cr, + ZFS_OWNER, &acl_ids->z_fuidp); + acl_ids->z_fgid = zfs_fuid_create(zfsvfs, + (uint64_t)vap->va_gid, cr, + ZFS_GROUP, &acl_ids->z_fuidp); gid = vap->va_gid; } else { - fuid = zfs_fuid_create_cred(zfsvfs, ZFS_OWNER, tx, cr, fuidp); - fgid = 0; + acl_ids->z_fuid = zfs_fuid_create_cred(zfsvfs, ZFS_OWNER, + cr, &acl_ids->z_fuidp); + acl_ids->z_fgid = 0; if (vap->va_mask & AT_GID) { - fgid = zfs_fuid_create(zfsvfs, vap->va_gid, cr, - ZFS_GROUP, tx, fuidp); + acl_ids->z_fgid = zfs_fuid_create(zfsvfs, + (uint64_t)vap->va_gid, + cr, ZFS_GROUP, &acl_ids->z_fuidp); gid = vap->va_gid; - if (fgid != parent->z_phys->zp_gid && + if (acl_ids->z_fgid != dzp->z_phys->zp_gid && !groupmember(vap->va_gid, cr) && secpolicy_vnode_create_gid(cr) != 0) - fgid = 0; + acl_ids->z_fgid = 0; } - if (fgid == 0) { - if (parent->z_phys->zp_mode & S_ISGID) { - fgid = parent->z_phys->zp_gid; - gid = zfs_fuid_map_id(zfsvfs, fgid, + if (acl_ids->z_fgid == 0) { + if (dzp->z_phys->zp_mode & S_ISGID) { + char *domain; + uint32_t rid; + + acl_ids->z_fgid = dzp->z_phys->zp_gid; + gid = zfs_fuid_map_id(zfsvfs, acl_ids->z_fgid, cr, ZFS_GROUP); + + if (zfsvfs->z_use_fuids && + IS_EPHEMERAL(acl_ids->z_fgid)) { + domain = zfs_fuid_idx_domain( + &zfsvfs->z_fuid_idx, + FUID_INDEX(acl_ids->z_fgid)); + rid = FUID_RID(acl_ids->z_fgid); + zfs_fuid_node_add(&acl_ids->z_fuidp, + domain, rid, + FUID_INDEX(acl_ids->z_fgid), + acl_ids->z_fgid, ZFS_GROUP); + } } else { - fgid = zfs_fuid_create_cred(zfsvfs, - ZFS_GROUP, tx, cr, fuidp); + acl_ids->z_fgid = zfs_fuid_create_cred(zfsvfs, + ZFS_GROUP, cr, &acl_ids->z_fuidp); gid = crgetgid(cr); } } @@ -1867,57 +1907,60 @@ zfs_perm_init(znode_t *zp, znode_t *parent, int flag, * file's new group, clear the file's set-GID bit. 
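zfs_acl_ids_create() replaces the old zfs_perm_init() flow: the mode, FUIDs and inherited ACL are worked out before any transaction is dirtied, so a caller can still bail out cheaply, for example on a quota check. A hypothetical caller shape using the helpers introduced in this part of the patch (zfs_acl_ids_overquota() and zfs_acl_ids_free() follow just below); returning EDQUOT on the over-quota path is an assumption about the callers elsewhere in the patch.

static int
example_create_with_acl_ids(znode_t *dzp, vattr_t *vap, cred_t *cr,
    vsecattr_t *vsecp, dmu_tx_t *tx)
{
	zfs_acl_ids_t acl_ids;
	int err;

	err = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp, &acl_ids);
	if (err != 0)
		return (err);

	if (zfs_acl_ids_overquota(dzp->z_zfsvfs, &acl_ids)) {
		zfs_acl_ids_free(&acl_ids);
		return (EDQUOT);
	}

	/* ... allocate the znode and apply acl_ids inside the tx here ... */

	zfs_acl_ids_free(&acl_ids);
	return (0);
}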
*/ - if ((parent->z_phys->zp_mode & S_ISGID) && (vap->va_type == VDIR)) { - mode |= S_ISGID; + if (!(flag & IS_ROOT_NODE) && (dzp->z_phys->zp_mode & S_ISGID) && + (vap->va_type == VDIR)) { + acl_ids->z_mode |= S_ISGID; } else { - if ((mode & S_ISGID) && + if ((acl_ids->z_mode & S_ISGID) && secpolicy_vnode_setids_setgids(cr, gid) != 0) - mode &= ~S_ISGID; - } - - zp->z_phys->zp_uid = fuid; - zp->z_phys->zp_gid = fgid; - zp->z_phys->zp_mode = mode; - - if (aclp == NULL) { - mutex_enter(&parent->z_lock); - if ((ZTOV(parent)->v_type == VDIR && - (parent->z_phys->zp_flags & ZFS_INHERIT_ACE)) && - !(zp->z_phys->zp_flags & ZFS_XATTR)) { - mutex_enter(&parent->z_acl_lock); - VERIFY(0 == zfs_acl_node_read(parent, &paclp, B_FALSE)); - mutex_exit(&parent->z_acl_lock); - aclp = zfs_acl_inherit(zp, paclp, mode, &need_chmod); - zfs_acl_free(paclp); + acl_ids->z_mode &= ~S_ISGID; + } + + if (acl_ids->z_aclp == NULL) { + mutex_enter(&dzp->z_lock); + if (!(flag & IS_ROOT_NODE) && (ZTOV(dzp)->v_type == VDIR && + (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)) && + !(dzp->z_phys->zp_flags & ZFS_XATTR)) { + mutex_enter(&dzp->z_acl_lock); + VERIFY(0 == zfs_acl_node_read(dzp, &paclp, B_FALSE)); + mutex_exit(&dzp->z_acl_lock); + acl_ids->z_aclp = zfs_acl_inherit(zfsvfs, + vap->va_type, paclp, acl_ids->z_mode, &need_chmod); } else { - aclp = zfs_acl_alloc(zfs_acl_version_zp(zp)); + acl_ids->z_aclp = + zfs_acl_alloc(zfs_acl_version_zp(dzp)); + } + mutex_exit(&dzp->z_lock); + if (need_chmod) { + acl_ids->z_aclp->z_hints = (vap->va_type == VDIR) ? + ZFS_ACL_AUTO_INHERIT : 0; + zfs_acl_chmod(zfsvfs, acl_ids->z_fuid, + acl_ids->z_mode, acl_ids->z_aclp); } - mutex_exit(&parent->z_lock); - mutex_enter(&zp->z_lock); - mutex_enter(&zp->z_acl_lock); - if (need_chmod) - zfs_acl_chmod(zp, mode, aclp); - } else { - mutex_enter(&zp->z_lock); - mutex_enter(&zp->z_acl_lock); } - /* Force auto_inherit on all new directory objects */ - if (vap->va_type == VDIR) - aclp->z_hints |= ZFS_ACL_AUTO_INHERIT; - - error = zfs_aclset_common(zp, aclp, cr, fuidp, tx); - - /* Set optional attributes if any */ - if (vap->va_mask & AT_XVATTR) - zfs_xvattr_set(zp, xvap); + return (0); +} - mutex_exit(&zp->z_lock); - mutex_exit(&zp->z_acl_lock); - ASSERT3U(error, ==, 0); +/* + * Free ACL and fuid_infop, but not the acl_ids structure + */ +void +zfs_acl_ids_free(zfs_acl_ids_t *acl_ids) +{ + if (acl_ids->z_aclp) + zfs_acl_free(acl_ids->z_aclp); + if (acl_ids->z_fuidp) + zfs_fuid_info_free(acl_ids->z_fuidp); + acl_ids->z_aclp = NULL; + acl_ids->z_fuidp = NULL; +} - if (aclp != setaclp) - zfs_acl_free(aclp); +boolean_t +zfs_acl_ids_overquota(zfsvfs_t *zfsvfs, zfs_acl_ids_t *acl_ids) +{ + return (zfs_usergroup_overquota(zfsvfs, B_FALSE, acl_ids->z_fuid) || + zfs_usergroup_overquota(zfsvfs, B_TRUE, acl_ids->z_fgid)); } /* @@ -1983,8 +2026,6 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) if (mask & VSA_ACE) { size_t aclsz; - zfs_acl_node_t *aclnode = list_head(&aclp->z_acl); - aclsz = count * sizeof (ace_t) + sizeof (ace_object_t) * largeace; @@ -1995,8 +2036,17 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) zfs_copy_fuid_2_ace(zp->z_zfsvfs, aclp, cr, vsecp->vsa_aclentp, !(mask & VSA_ACE_ALLTYPES)); else { - bcopy(aclnode->z_acldata, vsecp->vsa_aclentp, - count * sizeof (ace_t)); + zfs_acl_node_t *aclnode; + void *start = vsecp->vsa_aclentp; + + for (aclnode = list_head(&aclp->z_acl); aclnode; + aclnode = list_next(&aclp->z_acl, aclnode)) { + bcopy(aclnode->z_acldata, start, + aclnode->z_size); 
+ start = (caddr_t)start + aclnode->z_size; + } + ASSERT((caddr_t)start - (caddr_t)vsecp->vsa_aclentp == + aclp->z_acl_bytes); } } if (mask & VSA_ACE_ACLFLAGS) { @@ -2011,14 +2061,12 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) mutex_exit(&zp->z_acl_lock); - zfs_acl_free(aclp); - return (0); } int zfs_vsec_2_aclp(zfsvfs_t *zfsvfs, vtype_t obj_type, - vsecattr_t *vsecp, zfs_acl_t **zaclp) + vsecattr_t *vsecp, cred_t *cr, zfs_fuid_info_t **fuidp, zfs_acl_t **zaclp) { zfs_acl_t *aclp; zfs_acl_node_t *aclnode; @@ -2041,9 +2089,9 @@ zfs_vsec_2_aclp(zfsvfs_t *zfsvfs, vtype_t obj_type, return (error); } } else { - if ((error = zfs_copy_ace_2_fuid(obj_type, aclp, + if ((error = zfs_copy_ace_2_fuid(zfsvfs, obj_type, aclp, vsecp->vsa_aclentp, aclnode->z_acldata, aclcnt, - &aclnode->z_size)) != 0) { + &aclnode->z_size, fuidp, cr)) != 0) { zfs_acl_free(aclp); zfs_acl_node_free(aclnode); return (error); @@ -2084,6 +2132,7 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) int error; zfs_acl_t *aclp; zfs_fuid_info_t *fuidp = NULL; + boolean_t fuid_dirtied; if (mask == 0) return (ENOSYS); @@ -2094,7 +2143,8 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) if (error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr)) return (error); - error = zfs_vsec_2_aclp(zfsvfs, ZTOV(zp)->v_type, vsecp, &aclp); + error = zfs_vsec_2_aclp(zfsvfs, ZTOV(zp)->v_type, vsecp, cr, &fuidp, + &aclp); if (error) return (error); @@ -2106,11 +2156,6 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) aclp->z_hints |= (zp->z_phys->zp_flags & V4_ACL_WIDE_FLAGS); } top: - if (error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr)) { - zfs_acl_free(aclp); - return (error); - } - mutex_enter(&zp->z_lock); mutex_enter(&zp->z_acl_lock); @@ -2135,25 +2180,16 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) } else if (aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } - if (aclp->z_has_fuids) { - if (zfsvfs->z_fuid_obj == 0) { - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); - } else { - dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); - dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - } - } + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); - error = dmu_tx_assign(tx, zfsvfs->z_assign); + error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { mutex_exit(&zp->z_acl_lock); mutex_exit(&zp->z_lock); - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; @@ -2163,14 +2199,18 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) return (error); } - error = zfs_aclset_common(zp, aclp, cr, &fuidp, tx); + error = zfs_aclset_common(zp, aclp, cr, tx); ASSERT(error == 0); + zp->z_acl_cached = aclp; + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + + zfs_time_stamper_locked(zp, STATE_CHANGED, tx); zfs_log_acl(zilog, tx, zp, vsecp, fuidp); if (fuidp) zfs_fuid_info_free(fuidp); - zfs_acl_free(aclp); dmu_tx_commit(tx); done: mutex_exit(&zp->z_acl_lock); @@ -2180,45 +2220,17 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) } /* - * working_mode returns the permissions that were not granted + * Check accesses of interest (AoI) against attributes 
of the dataset + * such as read-only. Returns zero if no AoI conflict with dataset + * attributes, otherwise an appropriate errno is returned. */ static int -zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode, - boolean_t *check_privs, boolean_t skipaclchk, cred_t *cr) +zfs_zaccess_dataset_check(znode_t *zp, uint32_t v4_mode) { - zfs_acl_t *aclp; - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - int error; - uid_t uid = crgetuid(cr); - uint64_t who; - uint16_t type, iflags; - uint16_t entry_type; - uint32_t access_mask; - uint32_t deny_mask = 0; - zfs_ace_hdr_t *acep = NULL; - boolean_t checkit; - uid_t fowner; - uid_t gowner; - - /* - * Short circuit empty requests - */ - if (v4_mode == 0) - return (0); - - *check_privs = B_TRUE; - - if (zfsvfs->z_assign >= TXG_INITIAL) { /* ZIL replay */ - *working_mode = 0; - return (0); - } - - *working_mode = v4_mode; - if ((v4_mode & WRITE_MASK) && (zp->z_zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) && - (!IS_DEVVP(ZTOV(zp)))) { - *check_privs = B_FALSE; + (!IS_DEVVP(ZTOV(zp)) || + (IS_DEVVP(ZTOV(zp)) && (v4_mode & WRITE_MASK_ATTRS)))) { return (EROFS); } @@ -2230,31 +2242,64 @@ zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode, (zp->z_phys->zp_flags & (ZFS_READONLY | ZFS_IMMUTABLE))) || (ZTOV(zp)->v_type == VDIR && (zp->z_phys->zp_flags & ZFS_IMMUTABLE)))) { - *check_privs = B_FALSE; return (EPERM); } if ((v4_mode & (ACE_DELETE | ACE_DELETE_CHILD)) && (zp->z_phys->zp_flags & ZFS_NOUNLINK)) { - *check_privs = B_FALSE; return (EPERM); } if (((v4_mode & (ACE_READ_DATA|ACE_EXECUTE)) && (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED))) { - *check_privs = B_FALSE; return (EACCES); } - /* - * The caller requested that the ACL check be skipped. This - * would only happen if the caller checked VOP_ACCESS() with a - * 32 bit ACE mask and already had the appropriate permissions. - */ - if (skipaclchk) { - *working_mode = 0; - return (0); - } + return (0); +} + +/* + * The primary usage of this function is to loop through all of the + * ACEs in the znode, determining what accesses of interest (AoI) to + * the caller are allowed or denied. The AoI are expressed as bits in + * the working_mode parameter. As each ACE is processed, bits covered + * by that ACE are removed from the working_mode. This removal + * facilitates two things. The first is that when the working mode is + * empty (= 0), we know we've looked at all the AoI. The second is + * that the ACE interpretation rules don't allow a later ACE to undo + * something granted or denied by an earlier ACE. Removing the + * discovered access or denial enforces this rule. At the end of + * processing the ACEs, all AoI that were found to be denied are + * placed into the working_mode, giving the caller a mask of denied + * accesses. Returns: + * 0 if all AoI granted + * EACCESS if the denied mask is non-zero + * other error if abnormal failure (e.g., IO error) + * + * A secondary usage of the function is to determine if any of the + * AoI are granted. If an ACE grants any access in + * the working_mode, we immediately short circuit out of the function. + * This mode is chosen by setting anyaccess to B_TRUE. The + * working_mode is not a denied access mask upon exit if the function + * is used in this manner. 
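The comment above treats working_mode as a shrinking to-do list: each ACE can only decide bits that are still outstanding, and whatever was denied is put back at the end so the caller can see exactly which accesses failed. Stripped of the ZFS types, the bookkeeping behaves like this self-contained model (invented names; the real function additionally matches owner@/group@/everyone@ entry types and supports the anyaccess short-circuit):

#include <stdint.h>

struct example_ace {
	uint32_t mask;		/* access bits this ACE covers */
	int	 deny;		/* nonzero = DENY, zero = ALLOW */
};

/* returns 0 when every requested bit was explicitly allowed */
static uint32_t
example_aces_check(const struct example_ace *aces, int naces, uint32_t requested)
{
	uint32_t working_mode = requested;
	uint32_t deny_mask = 0;

	for (int i = 0; i < naces && working_mode != 0; i++) {
		uint32_t matched = aces[i].mask & working_mode;

		if (matched == 0)
			continue;		/* ACE affects none of the AoI */
		if (aces[i].deny)
			deny_mask |= matched;
		working_mode &= ~matched;	/* these bits are now decided */
	}

	/* undecided bits stay set; explicit denies are put back on top */
	return (working_mode | deny_mask);
}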
+ */ +static int +zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode, + boolean_t anyaccess, cred_t *cr) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zfs_acl_t *aclp; + int error; + uid_t uid = crgetuid(cr); + uint64_t who; + uint16_t type, iflags; + uint16_t entry_type; + uint32_t access_mask; + uint32_t deny_mask = 0; + zfs_ace_hdr_t *acep = NULL; + boolean_t checkit; + uid_t fowner; + uid_t gowner; zfs_fuid_map_ids(zp, cr, &fowner, &gowner); @@ -2268,6 +2313,7 @@ zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode, while (acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask, &iflags, &type)) { + uint32_t mask_matched; if (!zfs_acl_valid_ace_type(type, iflags)) continue; @@ -2275,6 +2321,11 @@ zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode, if (ZTOV(zp)->v_type == VDIR && (iflags & ACE_INHERIT_ONLY_ACE)) continue; + /* Skip ACE if it does not affect any AoI */ + mask_matched = (access_mask & *working_mode); + if (!mask_matched) + continue; + entry_type = (iflags & ACE_TYPE_FLAGS); checkit = B_FALSE; @@ -2306,21 +2357,29 @@ zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode, checkit = B_TRUE; break; } else { - zfs_acl_free(aclp); mutex_exit(&zp->z_acl_lock); return (EIO); } } if (checkit) { - uint32_t mask_matched = (access_mask & *working_mode); - - if (mask_matched) { - if (type == DENY) - deny_mask |= mask_matched; - - *working_mode &= ~mask_matched; + if (type == DENY) { + DTRACE_PROBE3(zfs__ace__denies, + znode_t *, zp, + zfs_ace_hdr_t *, acep, + uint32_t, mask_matched); + deny_mask |= mask_matched; + } else { + DTRACE_PROBE3(zfs__ace__allows, + znode_t *, zp, + zfs_ace_hdr_t *, acep, + uint32_t, mask_matched); + if (anyaccess) { + mutex_exit(&zp->z_acl_lock); + return (0); + } } + *working_mode &= ~mask_matched; } /* Are we done? */ @@ -2329,7 +2388,6 @@ zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode, } mutex_exit(&zp->z_acl_lock); - zfs_acl_free(aclp); /* Put the found 'denies' back on the working mode */ if (deny_mask) { @@ -2342,6 +2400,68 @@ zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode, return (0); } +/* + * Return true if any access whatsoever granted, we don't actually + * care what access is granted. + */ +boolean_t +zfs_has_access(znode_t *zp, cred_t *cr) +{ + uint32_t have = ACE_ALL_PERMS; + + if (zfs_zaccess_aces_check(zp, &have, B_TRUE, cr) != 0) { + uid_t owner; + + owner = zfs_fuid_map_id(zp->z_zfsvfs, + zp->z_phys->zp_uid, cr, ZFS_OWNER); + + return ( + secpolicy_vnode_access(cr, ZTOV(zp), owner, VREAD) == 0 || + secpolicy_vnode_access(cr, ZTOV(zp), owner, VWRITE) == 0 || + secpolicy_vnode_access(cr, ZTOV(zp), owner, VEXEC) == 0 || + secpolicy_vnode_chown(cr, owner) == 0 || + secpolicy_vnode_setdac(cr, owner) == 0 || + secpolicy_vnode_remove(cr) == 0); + } + return (B_TRUE); +} + +static int +zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode, + boolean_t *check_privs, boolean_t skipaclchk, cred_t *cr) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int err; + + *working_mode = v4_mode; + *check_privs = B_TRUE; + + /* + * Short circuit empty requests + */ + if (v4_mode == 0 || zfsvfs->z_replay) { + *working_mode = 0; + return (0); + } + + if ((err = zfs_zaccess_dataset_check(zp, v4_mode)) != 0) { + *check_privs = B_FALSE; + return (err); + } + + /* + * The caller requested that the ACL check be skipped. 
This + * would only happen if the caller checked VOP_ACCESS() with a + * 32 bit ACE mask and already had the appropriate permissions. + */ + if (skipaclchk) { + *working_mode = 0; + return (0); + } + + return (zfs_zaccess_aces_check(zp, working_mode, B_FALSE, cr)); +} + static int zfs_zaccess_append(znode_t *zp, uint32_t *working_mode, boolean_t *check_privs, cred_t *cr) @@ -2353,6 +2473,78 @@ zfs_zaccess_append(znode_t *zp, uint32_t *working_mode, boolean_t *check_privs, check_privs, B_FALSE, cr)); } +int +zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr) +{ + boolean_t owner = B_FALSE; + boolean_t groupmbr = B_FALSE; + boolean_t is_attr; + uid_t fowner; + uid_t gowner; + uid_t uid = crgetuid(cr); + int error; + + if (zdp->z_phys->zp_flags & ZFS_AV_QUARANTINED) + return (EACCES); + + is_attr = ((zdp->z_phys->zp_flags & ZFS_XATTR) && + (ZTOV(zdp)->v_type == VDIR)); + if (is_attr) + goto slow; + + mutex_enter(&zdp->z_acl_lock); + + if (zdp->z_phys->zp_flags & ZFS_NO_EXECS_DENIED) { + mutex_exit(&zdp->z_acl_lock); + return (0); + } + + if (FUID_INDEX(zdp->z_phys->zp_uid) != 0 || + FUID_INDEX(zdp->z_phys->zp_gid) != 0) { + mutex_exit(&zdp->z_acl_lock); + goto slow; + } + + fowner = (uid_t)zdp->z_phys->zp_uid; + gowner = (uid_t)zdp->z_phys->zp_gid; + + if (uid == fowner) { + owner = B_TRUE; + if (zdp->z_phys->zp_mode & S_IXUSR) { + mutex_exit(&zdp->z_acl_lock); + return (0); + } else { + mutex_exit(&zdp->z_acl_lock); + goto slow; + } + } + if (groupmember(gowner, cr)) { + groupmbr = B_TRUE; + if (zdp->z_phys->zp_mode & S_IXGRP) { + mutex_exit(&zdp->z_acl_lock); + return (0); + } else { + mutex_exit(&zdp->z_acl_lock); + goto slow; + } + } + if (!owner && !groupmbr) { + if (zdp->z_phys->zp_mode & S_IXOTH) { + mutex_exit(&zdp->z_acl_lock); + return (0); + } + } + + mutex_exit(&zdp->z_acl_lock); + +slow: + DTRACE_PROBE(zfs__fastpath__execute__access__miss); + ZFS_ENTER(zdp->z_zfsvfs); + error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr); + ZFS_EXIT(zdp->z_zfsvfs); + return (error); +} + /* * Determine whether Access should be granted/denied, invoking least * priv subsytem when a deny is determined. @@ -2447,7 +2639,7 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) owner, checkmode); if (error == 0 && (working_mode & ACE_WRITE_OWNER)) - error = secpolicy_vnode_chown(cr, B_TRUE); + error = secpolicy_vnode_chown(cr, owner); if (error == 0 && (working_mode & ACE_WRITE_ACL)) error = secpolicy_vnode_setdac(cr, owner); @@ -2456,7 +2648,7 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) error = secpolicy_vnode_remove(cr); if (error == 0 && (working_mode & ACE_SYNCHRONIZE)) { - error = secpolicy_vnode_chown(cr, B_FALSE); + error = secpolicy_vnode_chown(cr, owner); } if (error == 0) { /* diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_byteswap.c b/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_byteswap.c index ab97f83eb0af2..cd36696f95007 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_byteswap.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_byteswap.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -63,6 +61,20 @@ zfs_ace_byteswap(void *buf, size_t size, boolean_t zfs_layout) while (ptr < end) { if (zfs_layout) { + /* + * Avoid overrun. 
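The guard described above is the standard defensive pattern for byte-swapping a variable number of records inside a fixed-size buffer: stop as soon as a full record no longer fits before the end pointer and leave the zeroed tail alone. Reduced to a self-contained toy with one fixed record size (the real loop additionally chooses between sizeof (zfs_ace_t), sizeof (ace_t) and the object-ACE sizes per entry):

#include <stddef.h>
#include <stdint.h>

struct example_rec {
	uint32_t a;
	uint16_t b;
	uint16_t c;
};

static uint16_t
example_bswap16(uint16_t x)
{
	return ((uint16_t)((x << 8) | (x >> 8)));
}

static uint32_t
example_bswap32(uint32_t x)
{
	return (((uint32_t)example_bswap16(x & 0xffff) << 16) |
	    example_bswap16(x >> 16));
}

static void
example_swap_records(void *buf, size_t buflen)
{
	uint8_t *ptr = buf;
	uint8_t *end = ptr + buflen;

	while (ptr + sizeof (struct example_rec) <= end) {
		struct example_rec *rec = (struct example_rec *)(void *)ptr;

		rec->a = example_bswap32(rec->a);
		rec->b = example_bswap16(rec->b);
		rec->c = example_bswap16(rec->c);
		ptr += sizeof (*rec);
	}
	/* any partial record or zero padding at the tail is never touched */
}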
Embedded aces can have one + * of several sizes. We don't know exactly + * how many our present, only the size of the + * buffer containing them. That size may be + * larger than needed to hold the aces + * present. As long as we do not do any + * swapping beyond the end of our block we are + * okay. It it safe to swap any non-ace data + * within the block since it is just zeros. + */ + if (ptr + sizeof (zfs_ace_hdr_t) > end) { + break; + } zacep = (zfs_ace_t *)ptr; zacep->z_hdr.z_access_mask = BSWAP_32(zacep->z_hdr.z_access_mask); @@ -71,6 +83,10 @@ zfs_ace_byteswap(void *buf, size_t size, boolean_t zfs_layout) BSWAP_16(zacep->z_hdr.z_type); entry_type = zacep->z_hdr.z_flags & ACE_TYPE_FLAGS; } else { + /* Overrun avoidance */ + if (ptr + sizeof (ace_t) > end) { + break; + } acep = (ace_t *)ptr; acep->a_access_mask = BSWAP_32(acep->a_access_mask); acep->a_flags = BSWAP_16(acep->a_flags); @@ -87,8 +103,14 @@ zfs_ace_byteswap(void *buf, size_t size, boolean_t zfs_layout) break; case ACE_IDENTIFIER_GROUP: default: + /* Overrun avoidance */ if (zfs_layout) { - zacep->z_fuid = BSWAP_64(zacep->z_fuid); + if (ptr + sizeof (zfs_ace_t) <= end) { + zacep->z_fuid = BSWAP_64(zacep->z_fuid); + } else { + entry_size = sizeof (zfs_ace_t); + break; + } } switch (ace_type) { case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: @@ -169,7 +191,8 @@ zfs_znode_byteswap(void *buf, size_t size) if (zp->zp_acl.z_acl_version == ZFS_ACL_VERSION) { zfs_acl_byteswap((void *)&zp->zp_acl.z_ace_data[0], ZFS_ACE_SPACE); - } else + } else { zfs_oldace_byteswap((ace_t *)&zp->zp_acl.z_ace_data[0], ACE_SLOT_CNT); + } } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_ctldir.c b/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_ctldir.c index 208fc36295d07..d09309a3f2cc5 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_ctldir.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_ctldir.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * ZFS control directory (a.k.a. ".zfs") * @@ -116,12 +114,16 @@ snapentry_compare(const void *a, const void *b) vnodeops_t *zfsctl_ops_root; vnodeops_t *zfsctl_ops_snapdir; vnodeops_t *zfsctl_ops_snapshot; +vnodeops_t *zfsctl_ops_shares; +vnodeops_t *zfsctl_ops_shares_dir; static const fs_operation_def_t zfsctl_tops_root[]; static const fs_operation_def_t zfsctl_tops_snapdir[]; static const fs_operation_def_t zfsctl_tops_snapshot[]; +static const fs_operation_def_t zfsctl_tops_shares[]; static vnode_t *zfsctl_mknode_snapdir(vnode_t *); +static vnode_t *zfsctl_mknode_shares(vnode_t *); static vnode_t *zfsctl_snapshot_mknode(vnode_t *, uint64_t objset); static int zfsctl_unmount_snap(zfs_snapentry_t *, int, cred_t *); @@ -129,14 +131,18 @@ static gfs_opsvec_t zfsctl_opsvec[] = { { ".zfs", zfsctl_tops_root, &zfsctl_ops_root }, { ".zfs/snapshot", zfsctl_tops_snapdir, &zfsctl_ops_snapdir }, { ".zfs/snapshot/vnode", zfsctl_tops_snapshot, &zfsctl_ops_snapshot }, + { ".zfs/shares", zfsctl_tops_shares, &zfsctl_ops_shares_dir }, + { ".zfs/shares/vnode", zfsctl_tops_shares, &zfsctl_ops_shares }, { NULL } }; /* - * Root directory elements. We have only a single static entry, 'snapshot'. + * Root directory elements. We only have two entries + * snapshot and shares. 
*/ static gfs_dirent_t zfsctl_root_entries[] = { { "snapshot", zfsctl_mknode_snapdir, GFS_CACHE_VNODE }, + { "shares", zfsctl_mknode_shares, GFS_CACHE_VNODE }, { NULL } }; @@ -168,21 +174,34 @@ zfsctl_fini(void) vn_freevnodeops(zfsctl_ops_snapdir); if (zfsctl_ops_snapshot) vn_freevnodeops(zfsctl_ops_snapshot); + if (zfsctl_ops_shares) + vn_freevnodeops(zfsctl_ops_shares); + if (zfsctl_ops_shares_dir) + vn_freevnodeops(zfsctl_ops_shares_dir); zfsctl_ops_root = NULL; zfsctl_ops_snapdir = NULL; zfsctl_ops_snapshot = NULL; + zfsctl_ops_shares = NULL; + zfsctl_ops_shares_dir = NULL; } /* - * Return the inode number associated with the 'snapshot' directory. + * Return the inode number associated with the 'snapshot' or + * 'shares' directory. */ /* ARGSUSED */ static ino64_t zfsctl_root_inode_cb(vnode_t *vp, int index) { - ASSERT(index == 0); - return (ZFSCTL_INO_SNAPDIR); + zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; + + ASSERT(index <= 2); + + if (index == 0) + return (ZFSCTL_INO_SNAPDIR); + + return (zfsvfs->z_shares_dir); } /* @@ -275,8 +294,13 @@ static int zfsctl_common_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct) { - if (mode & VWRITE) - return (EACCES); + if (flags & V_ACE_MASK) { + if (mode & ACE_ALL_WRITE_PERMS) + return (EACCES); + } else { + if (mode & VWRITE) + return (EACCES); + } return (0); } @@ -287,14 +311,13 @@ zfsctl_common_access(vnode_t *vp, int mode, int flags, cred_t *cr, static void zfsctl_common_getattr(vnode_t *vp, vattr_t *vap) { - zfsctl_node_t *zcp = vp->v_data; timestruc_t now; vap->va_uid = 0; vap->va_gid = 0; vap->va_rdev = 0; /* - * We are a purly virtual object, so we have no + * We are a purely virtual object, so we have no * blocksize or allocated blocks. */ vap->va_blksize = 0; @@ -309,7 +332,6 @@ zfsctl_common_getattr(vnode_t *vp, vattr_t *vap) */ gethrestime(&now); vap->va_atime = now; - vap->va_mtime = vap->va_ctime = zcp->zc_cmtime; } /*ARGSUSED*/ @@ -345,6 +367,30 @@ zfsctl_common_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct) return (0); } + +/*ARGSUSED*/ +static int +zfsctl_shares_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct) +{ + zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; + znode_t *dzp; + int error; + + ZFS_ENTER(zfsvfs); + + if (zfsvfs->z_shares_dir == 0) { + ZFS_EXIT(zfsvfs); + return (ENOTSUP); + } + + if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) { + error = VOP_FID(ZTOV(dzp), fidp, ct); + VN_RELE(ZTOV(dzp)); + } + + ZFS_EXIT(zfsvfs); + return (error); +} /* * .zfs inode namespace * @@ -368,10 +414,12 @@ zfsctl_root_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, caller_context_t *ct) { zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; + zfsctl_node_t *zcp = vp->v_data; ZFS_ENTER(zfsvfs); vap->va_nodeid = ZFSCTL_INO_ROOT; vap->va_nlink = vap->va_size = NROOT_ENTRIES; + vap->va_mtime = vap->va_ctime = zcp->zc_cmtime; zfsctl_common_getattr(vp, vap); ZFS_EXIT(zfsvfs); @@ -411,6 +459,22 @@ zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, return (err); } +static int +zfsctl_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, + caller_context_t *ct) +{ + /* + * We only care about ACL_ENABLED so that libsec can + * display ACL correctly and not default to POSIX draft. 
+ */ + if (cmd == _PC_ACL_ENABLED) { + *valp = _ACL_ACE_ENABLED; + return (0); + } + + return (fs_pathconf(vp, cmd, valp, cr, ct)); +} + static const fs_operation_def_t zfsctl_tops_root[] = { { VOPNAME_OPEN, { .vop_open = zfsctl_common_open } }, { VOPNAME_CLOSE, { .vop_close = zfsctl_common_close } }, @@ -421,6 +485,7 @@ static const fs_operation_def_t zfsctl_tops_root[] = { { VOPNAME_LOOKUP, { .vop_lookup = zfsctl_root_lookup } }, { VOPNAME_SEEK, { .vop_seek = fs_seek } }, { VOPNAME_INACTIVE, { .vop_inactive = gfs_vop_inactive } }, + { VOPNAME_PATHCONF, { .vop_pathconf = zfsctl_pathconf } }, { VOPNAME_FID, { .vop_fid = zfsctl_common_fid } }, { NULL } }; @@ -458,7 +523,7 @@ zfsctl_unmount_snap(zfs_snapentry_t *sep, int fflags, cred_t *cr) VN_RELE(svp); return (error); } - VFS_RELE(svp->v_vfsp); + /* * We can't use VN_RELE(), as that will try to invoke * zfsctl_snapdir_inactive(), which would cause us to destroy @@ -635,7 +700,7 @@ zfsctl_snapdir_remove(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr, if (err) avl_add(&sdp->sd_snaps, sep); else - err = dmu_objset_destroy(snapname); + err = dmu_objset_destroy(snapname, B_FALSE); } else { err = ENOENT; } @@ -671,7 +736,7 @@ zfsctl_snapdir_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, return (err); if (err == 0) { - err = dmu_objset_snapshot(name, dirname, B_FALSE); + err = dmu_objset_snapshot(name, dirname, NULL, B_FALSE); if (err) return (err); err = lookupnameat(dirname, seg, follow, NULL, vpp, dvp); @@ -712,9 +777,6 @@ zfsctl_snapdir_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, ASSERT(dvp->v_type == VDIR); - if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0) - return (0); - /* * If we get a recursive call, that means we got called * from the domount() code while it was trying to look up the @@ -726,6 +788,11 @@ zfsctl_snapdir_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, ZFS_ENTER(zfsvfs); + if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0) { + ZFS_EXIT(zfsvfs); + return (0); + } + if (flags & FIGNORECASE) { boolean_t conflict = B_FALSE; @@ -786,8 +853,7 @@ zfsctl_snapdir_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, */ return (err == EILSEQ ? ENOENT : err); } - if (dmu_objset_open(snapname, DMU_OST_ZFS, - DS_MODE_USER | DS_MODE_READONLY, &snap) != 0) { + if (dmu_objset_hold(snapname, FTAG, &snap) != 0) { mutex_exit(&sdp->sd_lock); ZFS_EXIT(zfsvfs); return (ENOENT); @@ -799,7 +865,7 @@ zfsctl_snapdir_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, *vpp = sep->se_root = zfsctl_snapshot_mknode(dvp, dmu_objset_id(snap)); avl_insert(&sdp->sd_snaps, sep, where); - dmu_objset_close(snap); + dmu_objset_rele(snap, FTAG); domount: mountpoint_len = strlen(refstr_value(dvp->v_vfsp->vfs_mntpt)) + strlen("/.zfs/snapshot/") + strlen(nm) + 1; @@ -824,7 +890,7 @@ zfsctl_snapdir_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, * Return the mounted root rather than the covered mount point. * Takes the GFS vnode at .zfs/snapshot/ and returns * the ZFS vnode mounted on top of the GFS node. This ZFS - * vnode is the root the newly created vfsp. + * vnode is the root of the newly created vfsp. 
*/ VFS_RELE(vfsp); err = traverse(vpp); @@ -857,6 +923,37 @@ zfsctl_snapdir_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, return (err); } +/* ARGSUSED */ +static int +zfsctl_shares_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, + int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, + int *direntflags, pathname_t *realpnp) +{ + zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data; + znode_t *dzp; + int error; + + ZFS_ENTER(zfsvfs); + + if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0) { + ZFS_EXIT(zfsvfs); + return (0); + } + + if (zfsvfs->z_shares_dir == 0) { + ZFS_EXIT(zfsvfs); + return (ENOTSUP); + } + if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) + error = VOP_LOOKUP(ZTOV(dzp), nm, vpp, pnp, + flags, rdir, cr, ct, direntflags, realpnp); + + VN_RELE(ZTOV(dzp)); + ZFS_EXIT(zfsvfs); + + return (error); +} + /* ARGSUSED */ static int zfsctl_snapdir_readdir_cb(vnode_t *vp, void *dp, int *eofp, @@ -901,6 +998,33 @@ zfsctl_snapdir_readdir_cb(vnode_t *vp, void *dp, int *eofp, return (0); } +/* ARGSUSED */ +static int +zfsctl_shares_readdir(vnode_t *vp, uio_t *uiop, cred_t *cr, int *eofp, + caller_context_t *ct, int flags) +{ + zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; + znode_t *dzp; + int error; + + ZFS_ENTER(zfsvfs); + + if (zfsvfs->z_shares_dir == 0) { + ZFS_EXIT(zfsvfs); + return (ENOTSUP); + } + if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) { + error = VOP_READDIR(ZTOV(dzp), uiop, cr, eofp, ct, flags); + VN_RELE(ZTOV(dzp)); + } else { + *eofp = 1; + error = ENOENT; + } + + ZFS_EXIT(zfsvfs); + return (error); +} + /* * pvp is the '.zfs' directory (zfsctl_node_t). * Creates vp, which is '.zfs/snapshot' (zfsctl_snapdir_t). @@ -926,6 +1050,45 @@ zfsctl_mknode_snapdir(vnode_t *pvp) return (vp); } +vnode_t * +zfsctl_mknode_shares(vnode_t *pvp) +{ + vnode_t *vp; + zfsctl_node_t *sdp; + + vp = gfs_dir_create(sizeof (zfsctl_node_t), pvp, + zfsctl_ops_shares, NULL, NULL, MAXNAMELEN, + NULL, NULL); + sdp = vp->v_data; + sdp->zc_cmtime = ((zfsctl_node_t *)pvp->v_data)->zc_cmtime; + return (vp); + +} + +/* ARGSUSED */ +static int +zfsctl_shares_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, + caller_context_t *ct) +{ + zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; + znode_t *dzp; + int error; + + ZFS_ENTER(zfsvfs); + if (zfsvfs->z_shares_dir == 0) { + ZFS_EXIT(zfsvfs); + return (ENOTSUP); + } + if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) { + error = VOP_GETATTR(ZTOV(dzp), vap, flags, cr, ct); + VN_RELE(ZTOV(dzp)); + } + ZFS_EXIT(zfsvfs); + return (error); + + +} + /* ARGSUSED */ static int zfsctl_snapdir_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, @@ -938,6 +1101,7 @@ zfsctl_snapdir_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, zfsctl_common_getattr(vp, vap); vap->va_nodeid = gfs_file_inode(vp); vap->va_nlink = vap->va_size = avl_numnodes(&sdp->sd_snaps) + 2; + vap->va_ctime = vap->va_mtime = dmu_objset_snap_cmtime(zfsvfs->z_os); ZFS_EXIT(zfsvfs); return (0); @@ -976,6 +1140,20 @@ static const fs_operation_def_t zfsctl_tops_snapdir[] = { { NULL } }; +static const fs_operation_def_t zfsctl_tops_shares[] = { + { VOPNAME_OPEN, { .vop_open = zfsctl_common_open } }, + { VOPNAME_CLOSE, { .vop_close = zfsctl_common_close } }, + { VOPNAME_IOCTL, { .error = fs_inval } }, + { VOPNAME_GETATTR, { .vop_getattr = zfsctl_shares_getattr } }, + { VOPNAME_ACCESS, { .vop_access = zfsctl_common_access } }, + { VOPNAME_READDIR, { .vop_readdir = zfsctl_shares_readdir } }, + { VOPNAME_LOOKUP, { 
.vop_lookup = zfsctl_shares_lookup } }, + { VOPNAME_SEEK, { .vop_seek = fs_seek } }, + { VOPNAME_INACTIVE, { .vop_inactive = gfs_vop_inactive } }, + { VOPNAME_FID, { .vop_fid = zfsctl_shares_fid } }, + { NULL } +}; + /* * pvp is the GFS vnode '.zfs/snapshot'. * @@ -993,7 +1171,6 @@ zfsctl_snapshot_mknode(vnode_t *pvp, uint64_t objset) zfsctl_ops_snapshot, NULL, NULL, MAXNAMELEN, NULL, NULL); zcp = vp->v_data; zcp->zc_id = objset; - VFS_HOLD(vp->v_vfsp); return (vp); } @@ -1032,7 +1209,6 @@ zfsctl_snapshot_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) mutex_exit(&sdp->sd_lock); VN_RELE(dvp); - VFS_RELE(vp->v_vfsp); /* * Dispose of the vnode for the snapshot mount point. diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_dir.c b/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_dir.c index 1ec4932646e90..2e3725c2bf1c3 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_dir.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_dir.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -114,6 +114,8 @@ zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, char *name, boolean_t exact, * ZCIEXACT: On a purely case-insensitive file system, * this lookup should be case-sensitive. * ZRENAMING: we are locking for renaming, force narrow locks + * ZHAVELOCK: Don't grab the z_name_lock for this call. The + * current thread already holds it. * * Output arguments: * zpp - pointer to the znode for the entry (NULL if there isn't one) @@ -208,13 +210,20 @@ zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp, /* * Wait until there are no locks on this name. + * + * Don't grab the lock if it is already held. However, we cannot + * have both ZSHARED and ZHAVELOCK together. */ - rw_enter(&dzp->z_name_lock, RW_READER); + ASSERT(!(flag & ZSHARED) || !(flag & ZHAVELOCK)); + if (!(flag & ZHAVELOCK)) + rw_enter(&dzp->z_name_lock, RW_READER); + mutex_enter(&dzp->z_lock); for (;;) { if (dzp->z_unlinked) { mutex_exit(&dzp->z_lock); - rw_exit(&dzp->z_name_lock); + if (!(flag & ZHAVELOCK)) + rw_exit(&dzp->z_name_lock); return (ENOENT); } for (dl = dzp->z_dirlocks; dl != NULL; dl = dl->dl_next) { @@ -224,7 +233,8 @@ zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp, } if (error != 0) { mutex_exit(&dzp->z_lock); - rw_exit(&dzp->z_name_lock); + if (!(flag & ZHAVELOCK)) + rw_exit(&dzp->z_name_lock); return (ENOENT); } if (dl == NULL) { @@ -235,6 +245,7 @@ zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp, cv_init(&dl->dl_cv, NULL, CV_DEFAULT, NULL); dl->dl_name = name; dl->dl_sharecnt = 0; + dl->dl_namelock = 0; dl->dl_namesize = 0; dl->dl_dzp = dzp; dl->dl_next = dzp->z_dirlocks; @@ -246,6 +257,12 @@ zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp, cv_wait(&dl->dl_cv, &dzp->z_lock); } + /* + * If the z_name_lock was NOT held for this dirlock, record it. + */ + if (flag & ZHAVELOCK) + dl->dl_namelock = 1; + if ((flag & ZSHARED) && ++dl->dl_sharecnt > 1 && dl->dl_namesize == 0) { /* * We're the second shared reference to dl.
Make a copy of @@ -325,7 +342,10 @@ zfs_dirent_unlock(zfs_dirlock_t *dl) zfs_dirlock_t **prev_dl, *cur_dl; mutex_enter(&dzp->z_lock); - rw_exit(&dzp->z_name_lock); + + if (!dl->dl_namelock) + rw_exit(&dzp->z_name_lock); + if (dl->dl_sharecnt > 1) { dl->dl_sharecnt--; mutex_exit(&dzp->z_lock); @@ -561,24 +581,6 @@ zfs_rmnode(znode_t *zp) ASSERT(ZTOV(zp)->v_count == 0); ASSERT(zp->z_phys->zp_links == 0); - /* - * If this is a ZIL replay then leave the object in the unlinked set. - * Otherwise we can get a deadlock, because the delete can be - * quite large and span multiple tx's and txgs, but each replay - * creates a tx to atomically run the replay function and mark the - * replay record as complete. We deadlock trying to start a tx in - * a new txg to further the deletion but can't because the replay - * tx hasn't finished. - * - * We actually delete the object if we get a failure to create an - * object in zil_replay_log_record(), or after calling zil_replay(). - */ - if (zfsvfs->z_assign >= TXG_INITIAL) { - zfs_znode_dmu_fini(zp); - zfs_znode_free(zp); - return; - } - /* * If this is an attribute directory, purge its contents. */ @@ -823,44 +825,49 @@ zfs_make_xattrdir(znode_t *zp, vattr_t *vap, vnode_t **xvpp, cred_t *cr) znode_t *xzp; dmu_tx_t *tx; int error; - zfs_fuid_info_t *fuidp = NULL; + zfs_acl_ids_t acl_ids; + boolean_t fuid_dirtied; *xvpp = NULL; if (error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, 0, B_FALSE, cr)) return (error); + if ((error = zfs_acl_ids_create(zp, IS_XATTR, vap, cr, NULL, + &acl_ids)) != 0) + return (error); + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { + zfs_acl_ids_free(&acl_ids); + return (EDQUOT); + } + tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_bonus(tx, zp->z_id); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); - if (IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr))) { - if (zfsvfs->z_fuid_obj == 0) { - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); - } else { - dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); - dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - } - } - error = dmu_tx_assign(tx, zfsvfs->z_assign); + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) + zfs_acl_ids_free(&acl_ids); + if (error == ERESTART) dmu_tx_wait(tx); dmu_tx_abort(tx); return (error); } - zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, 0, NULL, &fuidp); + zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, 0, &acl_ids); + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + ASSERT(xzp->z_phys->zp_parent == zp->z_id); dmu_buf_will_dirty(zp->z_dbuf, tx); zp->z_phys->zp_xattr = xzp->z_id; (void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp, - xzp, "", NULL, fuidp, vap); - if (fuidp) - zfs_fuid_info_free(fuidp); + xzp, "", NULL, acl_ids.z_fuidp, vap); + + zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); *xvpp = ZTOV(xzp); @@ -930,7 +937,7 @@ zfs_get_xattrdir(znode_t *zp, vnode_t **xvpp, cred_t *cr, int flags) error = zfs_make_xattrdir(zp, &va, xvpp, cr); zfs_dirent_unlock(dl); - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + if (error == ERESTART) { /* NB: we already did dmu_tx_wait() if necessary */ goto top; } @@ -959,7 +966,7 @@ zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr) uid_t fowner; zfsvfs_t *zfsvfs = zdp->z_zfsvfs; - if 
(zdp->z_zfsvfs->z_assign >= TXG_INITIAL) /* ZIL replay */ + if (zdp->z_zfsvfs->z_replay) return (0); if ((zdp->z_phys->zp_mode & S_ISVTX) == 0) diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_fm.c b/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_fm.c index 236d69e7e6f07..0b4812666442d 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_fm.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_fm.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -87,21 +88,32 @@ * this pointer is set to NULL, and no ereport will be generated (since it * doesn't actually correspond to any particular device or piece of data, * and the caller will always retry without caching or queueing anyway). + * + * For checksum errors, we want to include more information about the actual + * error which occurs. Accordingly, we build an ereport when the error is + * noticed, but instead of sending it in immediately, we hang it off of the + * io_cksum_report field of the logical IO. When the logical IO completes + * (successfully or not), zfs_ereport_finish_checksum() is called with the + * good and bad versions of the buffer (if available), and we annotate the + * ereport with information about the differences. */ -void -zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, +#ifdef _KERNEL +static void +zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, + const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, uint64_t stateoroffset, uint64_t size) { -#ifdef _KERNEL nvlist_t *ereport, *detector; + uint64_t ena; char class[64]; - int state; /* - * If we are doing a spa_tryimport(), ignore errors. + * If we are doing a spa_tryimport() or in recovery mode, + * ignore errors. */ - if (spa->spa_load_state == SPA_LOAD_TRYIMPORT) + if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT || + spa_load_state(spa) == SPA_LOAD_RECOVER) return; /* @@ -109,7 +121,7 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, * failed, don't bother logging any new ereports - we're just going to * get the same diagnosis anyway. */ - if (spa->spa_load_state != SPA_LOAD_NONE && + if (spa_load_state(spa) != SPA_LOAD_NONE && spa->spa_last_open_failed) return; @@ -130,17 +142,48 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, return; /* - * If the vdev has already been marked as failing due to a - * failed probe, then ignore any subsequent I/O errors, as the - * DE will automatically fault the vdev on the first such - * failure. + * If this I/O is not a retry I/O, don't post an ereport. + * Otherwise, we risk making bad diagnoses based on B_FAILFAST + * I/Os. */ - if (vd != NULL && - (!vdev_readable(vd) || !vdev_writeable(vd)) && - strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) != 0) + if (zio->io_error == EIO && + !(zio->io_flags & ZIO_FLAG_IO_RETRY)) return; + + if (vd != NULL) { + /* + * If the vdev has already been marked as failing due + * to a failed probe, then ignore any subsequent I/O + * errors, as the DE will automatically fault the vdev + * on the first such failure. This also catches cases + * where vdev_remove_wanted is set and the device has + * not yet been asynchronously placed into the REMOVED + * state. 
+ */ + if (zio->io_vd == vd && !vdev_accessible(vd, zio)) + return; + + /* + * Ignore checksum errors for reads from DTL regions of + * leaf vdevs. + */ + if (zio->io_type == ZIO_TYPE_READ && + zio->io_error == ECKSUM && + vd->vdev_ops->vdev_op_leaf && + vdev_dtl_contains(vd, DTL_MISSING, zio->io_txg, 1)) + return; + } } + /* + * For probe failure, we want to avoid posting ereports if we've + * already removed the device in the meantime. + */ + if (vd != NULL && + strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) == 0 && + (vd->vdev_remove_wanted || vd->vdev_state == VDEV_STATE_REMOVED)) + return; + if ((ereport = fm_nvlist_create(NULL)) == NULL) return; @@ -159,7 +202,7 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, * state, use a SPA-wide ENA. Otherwise, if we are in an I/O state, use * a root zio-wide ENA. Otherwise, simply use a unique ENA. */ - if (spa->spa_load_state != SPA_LOAD_NONE) { + if (spa_load_state(spa) != SPA_LOAD_NONE) { if (spa->spa_ena == 0) spa->spa_ena = fm_ena_generate(0, FM_ENA_FMT1); ena = spa->spa_ena; @@ -188,14 +231,6 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, * passed in. */ - /* - * If we are importing a faulted pool, then we treat it like an open, - * not an import. Otherwise, the DE will ignore all faults during - * import, since the default behavior is to mark the devices as - * persistently unavailable, not leave them in the faulted state. - */ - state = spa->spa_import_faulted ? SPA_LOAD_OPEN : spa->spa_load_state; - /* * Generic payload members common to all ereports. */ @@ -203,7 +238,7 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, DATA_TYPE_STRING, spa_name(spa), FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, DATA_TYPE_UINT64, spa_guid(spa), FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, DATA_TYPE_INT32, - state, NULL); + spa_load_state(spa), NULL); if (spa != NULL) { fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE, @@ -222,14 +257,18 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, DATA_TYPE_UINT64, vd->vdev_guid, FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, DATA_TYPE_STRING, vd->vdev_ops->vdev_op_type, NULL); - if (vd->vdev_path) + if (vd->vdev_path != NULL) fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, DATA_TYPE_STRING, vd->vdev_path, NULL); - if (vd->vdev_devid) + if (vd->vdev_devid != NULL) fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID, DATA_TYPE_STRING, vd->vdev_devid, NULL); + if (vd->vdev_fru != NULL) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU, + DATA_TYPE_STRING, vd->vdev_fru, NULL); if (pvd != NULL) { fm_payload_set(ereport, @@ -303,8 +342,339 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, FM_EREPORT_PAYLOAD_ZFS_PREV_STATE, DATA_TYPE_UINT64, stateoroffset, NULL); } + mutex_exit(&spa->spa_errlist_lock); + *ereport_out = ereport; + *detector_out = detector; +} + +/* if it's <= 128 bytes, save the corruption directly */ +#define ZFM_MAX_INLINE (128 / sizeof (uint64_t)) + +#define MAX_RANGES 16 + +typedef struct zfs_ecksum_info { + /* histograms of set and cleared bits by bit number in a 64-bit word */ + uint16_t zei_histogram_set[sizeof (uint64_t) * NBBY]; + uint16_t zei_histogram_cleared[sizeof (uint64_t) * NBBY]; + + /* inline arrays of bits set and cleared. */ + uint64_t zei_bits_set[ZFM_MAX_INLINE]; + uint64_t zei_bits_cleared[ZFM_MAX_INLINE]; + + /* + * for each range, the number of bits set and cleared. 
The Hamming + * distance between the good and bad buffers is the sum of them all. + */ + uint32_t zei_range_sets[MAX_RANGES]; + uint32_t zei_range_clears[MAX_RANGES]; + + struct zei_ranges { + uint32_t zr_start; + uint32_t zr_end; + } zei_ranges[MAX_RANGES]; + + size_t zei_range_count; + uint32_t zei_mingap; + uint32_t zei_allowed_mingap; + +} zfs_ecksum_info_t; + +static void +update_histogram(uint64_t value_arg, uint16_t *hist, uint32_t *count) +{ + size_t i; + size_t bits = 0; + uint64_t value = BE_64(value_arg); + + /* We store the bits in big-endian (largest-first) order */ + for (i = 0; i < 64; i++) { + if (value & (1ull << i)) { + hist[63 - i]++; + ++bits; + } + } + /* update the count of bits changed */ + *count += bits; +} + +/* + * We've now filled up the range array, and need to increase "mingap" and + * shrink the range list accordingly. zei_mingap is always the smallest + * distance between array entries, so we set the new_allowed_gap to be + * one greater than that. We then go through the list, joining together + * any ranges which are closer than the new_allowed_gap. + * + * By construction, there will be at least one. We also update zei_mingap + * to the new smallest gap, to prepare for our next invocation. + */ +static void +shrink_ranges(zfs_ecksum_info_t *eip) +{ + uint32_t mingap = UINT32_MAX; + uint32_t new_allowed_gap = eip->zei_mingap + 1; + + size_t idx, output; + size_t max = eip->zei_range_count; + + struct zei_ranges *r = eip->zei_ranges; + + ASSERT3U(eip->zei_range_count, >, 0); + ASSERT3U(eip->zei_range_count, <=, MAX_RANGES); + + output = idx = 0; + while (idx < max - 1) { + uint32_t start = r[idx].zr_start; + uint32_t end = r[idx].zr_end; + + while (idx < max - 1) { + idx++; + + uint32_t nstart = r[idx].zr_start; + uint32_t nend = r[idx].zr_end; + + uint32_t gap = nstart - end; + if (gap < new_allowed_gap) { + end = nend; + continue; + } + if (gap < mingap) + mingap = gap; + break; + } + r[output].zr_start = start; + r[output].zr_end = end; + output++; + } + ASSERT3U(output, <, eip->zei_range_count); + eip->zei_range_count = output; + eip->zei_mingap = mingap; + eip->zei_allowed_mingap = new_allowed_gap; +} + +static void +add_range(zfs_ecksum_info_t *eip, int start, int end) +{ + struct zei_ranges *r = eip->zei_ranges; + size_t count = eip->zei_range_count; + + if (count >= MAX_RANGES) { + shrink_ranges(eip); + count = eip->zei_range_count; + } + if (count == 0) { + eip->zei_mingap = UINT32_MAX; + eip->zei_allowed_mingap = 1; + } else { + int gap = start - r[count - 1].zr_end; + + if (gap < eip->zei_allowed_mingap) { + r[count - 1].zr_end = end; + return; + } + if (gap < eip->zei_mingap) + eip->zei_mingap = gap; + } + r[count].zr_start = start; + r[count].zr_end = end; + eip->zei_range_count++; +} + +static size_t +range_total_size(zfs_ecksum_info_t *eip) +{ + struct zei_ranges *r = eip->zei_ranges; + size_t count = eip->zei_range_count; + size_t result = 0; + size_t idx; + + for (idx = 0; idx < count; idx++) + result += (r[idx].zr_end - r[idx].zr_start); + + return (result); +} + +static zfs_ecksum_info_t * +annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info, + const uint8_t *goodbuf, const uint8_t *badbuf, size_t size, + boolean_t drop_if_identical) +{ + const uint64_t *good = (const uint64_t *)goodbuf; + const uint64_t *bad = (const uint64_t *)badbuf; + + uint64_t allset = 0; + uint64_t allcleared = 0; + + size_t nui64s = size / sizeof (uint64_t); + + size_t inline_size; + int no_inline = 0; + size_t idx; + size_t range; + + size_t offset = 0; + 
ssize_t start = -1; + + zfs_ecksum_info_t *eip = kmem_zalloc(sizeof (*eip), KM_SLEEP); + + /* don't do any annotation for injected checksum errors */ + if (info != NULL && info->zbc_injected) + return (eip); + + if (info != NULL && info->zbc_has_cksum) { + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED, + DATA_TYPE_UINT64_ARRAY, + sizeof (info->zbc_expected) / sizeof (uint64_t), + (uint64_t *)&info->zbc_expected, + FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL, + DATA_TYPE_UINT64_ARRAY, + sizeof (info->zbc_actual) / sizeof (uint64_t), + (uint64_t *)&info->zbc_actual, + FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO, + DATA_TYPE_STRING, + info->zbc_checksum_name, + NULL); + + if (info->zbc_byteswapped) { + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP, + DATA_TYPE_BOOLEAN, 1, + NULL); + } + } + + if (badbuf == NULL || goodbuf == NULL) + return (eip); + + ASSERT3U(nui64s, <=, UINT16_MAX); + ASSERT3U(size, ==, nui64s * sizeof (uint64_t)); + ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); + ASSERT3U(size, <=, UINT32_MAX); + + /* build up the range list by comparing the two buffers. */ + for (idx = 0; idx < nui64s; idx++) { + if (good[idx] == bad[idx]) { + if (start == -1) + continue; + + add_range(eip, start, idx); + start = -1; + } else { + if (start != -1) + continue; + + start = idx; + } + } + if (start != -1) + add_range(eip, start, idx); + + /* See if it will fit in our inline buffers */ + inline_size = range_total_size(eip); + if (inline_size > ZFM_MAX_INLINE) + no_inline = 1; + + /* + * If there is no change and we want to drop if the buffers are + * identical, do so. + */ + if (inline_size == 0 && drop_if_identical) { + kmem_free(eip, sizeof (*eip)); + return (NULL); + } + + /* + * Now walk through the ranges, filling in the details of the + * differences. Also convert our uint64_t-array offsets to byte + * offsets. 
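(A stand-alone illustration of the set/cleared computation used in the loop below; the program and its sample words are hypothetical and are not part of this change.)

#include <inttypes.h>
#include <stdio.h>

int
main(void)
{
	uint64_t good = 0xff00ff00ff00ff00ULL;	/* hypothetical expected word */
	uint64_t bad  = 0xff01ff00fe00ff00ULL;	/* hypothetical on-disk word */

	uint64_t set = ~good & bad;		/* bits set in bad but not in good */
	uint64_t cleared = good & ~bad;		/* bits set in good but not in bad */

	(void) printf("set=%016" PRIx64 " cleared=%016" PRIx64 "\n",
	    set, cleared);
	return (0);
}

With these sample words the program prints set=0001000000000000 and cleared=0000000001000000, which is the per-word information the loop below accumulates into the inline buffers and bit histograms.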
+ */ + for (range = 0; range < eip->zei_range_count; range++) { + size_t start = eip->zei_ranges[range].zr_start; + size_t end = eip->zei_ranges[range].zr_end; + + for (idx = start; idx < end; idx++) { + uint64_t set, cleared; + + // bits set in bad, but not in good + set = ((~good[idx]) & bad[idx]); + // bits set in good, but not in bad + cleared = (good[idx] & (~bad[idx])); + + allset |= set; + allcleared |= cleared; + + if (!no_inline) { + ASSERT3U(offset, <, inline_size); + eip->zei_bits_set[offset] = set; + eip->zei_bits_cleared[offset] = cleared; + offset++; + } + + update_histogram(set, eip->zei_histogram_set, + &eip->zei_range_sets[range]); + update_histogram(cleared, eip->zei_histogram_cleared, + &eip->zei_range_clears[range]); + } + + /* convert to byte offsets */ + eip->zei_ranges[range].zr_start *= sizeof (uint64_t); + eip->zei_ranges[range].zr_end *= sizeof (uint64_t); + } + eip->zei_allowed_mingap *= sizeof (uint64_t); + inline_size *= sizeof (uint64_t); + + /* fill in ereport */ + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES, + DATA_TYPE_UINT32_ARRAY, 2 * eip->zei_range_count, + (uint32_t *)eip->zei_ranges, + FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_MIN_GAP, + DATA_TYPE_UINT32, eip->zei_allowed_mingap, + FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_SETS, + DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_sets, + FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS, + DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_clears, + NULL); + + if (!no_inline) { + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS, + DATA_TYPE_UINT8_ARRAY, + inline_size, (uint8_t *)eip->zei_bits_set, + FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS, + DATA_TYPE_UINT8_ARRAY, + inline_size, (uint8_t *)eip->zei_bits_cleared, + NULL); + } else { + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM, + DATA_TYPE_UINT16_ARRAY, + NBBY * sizeof (uint64_t), eip->zei_histogram_set, + FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM, + DATA_TYPE_UINT16_ARRAY, + NBBY * sizeof (uint64_t), eip->zei_histogram_cleared, + NULL); + } + return (eip); +} +#endif + +void +zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, + uint64_t stateoroffset, uint64_t size) +{ +#ifdef _KERNEL + nvlist_t *ereport = NULL; + nvlist_t *detector = NULL; + + zfs_ereport_start(&ereport, &detector, + subclass, spa, vd, zio, stateoroffset, size); + + if (ereport == NULL) + return; + fm_ereport_post(ereport, EVCH_SLEEP); fm_nvlist_destroy(ereport, FM_NVA_FREE); @@ -312,6 +682,122 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, #endif } +void +zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, + struct zio *zio, uint64_t offset, uint64_t length, void *arg, + zio_bad_cksum_t *info) +{ + zio_cksum_report_t *report = kmem_zalloc(sizeof (*report), KM_SLEEP); + + if (zio->io_vsd != NULL) + zio->io_vsd_ops->vsd_cksum_report(zio, report, arg); + else + zio_vsd_default_cksum_report(zio, report, arg); + + /* copy the checksum failure information if it was provided */ + if (info != NULL) { + report->zcr_ckinfo = kmem_zalloc(sizeof (*info), KM_SLEEP); + bcopy(info, report->zcr_ckinfo, sizeof (*info)); + } + + report->zcr_align = 1ULL << vd->vdev_top->vdev_ashift; + report->zcr_length = length; + +#ifdef _KERNEL + zfs_ereport_start(&report->zcr_ereport, &report->zcr_detector, + FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio, offset, length); + + if (report->zcr_ereport == NULL) { + report->zcr_free(report->zcr_cbdata, report->zcr_cbinfo); + kmem_free(report, sizeof (*report)); + 
return; + } +#endif + + mutex_enter(&spa->spa_errlist_lock); + report->zcr_next = zio->io_logical->io_cksum_report; + zio->io_logical->io_cksum_report = report; + mutex_exit(&spa->spa_errlist_lock); +} + +void +zfs_ereport_finish_checksum(zio_cksum_report_t *report, + const void *good_data, const void *bad_data, boolean_t drop_if_identical) +{ +#ifdef _KERNEL + zfs_ecksum_info_t *info = NULL; + info = annotate_ecksum(report->zcr_ereport, report->zcr_ckinfo, + good_data, bad_data, report->zcr_length, drop_if_identical); + + if (info != NULL) + fm_ereport_post(report->zcr_ereport, EVCH_SLEEP); + + fm_nvlist_destroy(report->zcr_ereport, FM_NVA_FREE); + fm_nvlist_destroy(report->zcr_detector, FM_NVA_FREE); + report->zcr_ereport = report->zcr_detector = NULL; + + if (info != NULL) + kmem_free(info, sizeof (*info)); +#endif +} + +void +zfs_ereport_free_checksum(zio_cksum_report_t *rpt) +{ +#ifdef _KERNEL + if (rpt->zcr_ereport != NULL) { + fm_nvlist_destroy(rpt->zcr_ereport, + FM_NVA_FREE); + fm_nvlist_destroy(rpt->zcr_detector, + FM_NVA_FREE); + } +#endif + rpt->zcr_free(rpt->zcr_cbdata, rpt->zcr_cbinfo); + + if (rpt->zcr_ckinfo != NULL) + kmem_free(rpt->zcr_ckinfo, sizeof (*rpt->zcr_ckinfo)); + + kmem_free(rpt, sizeof (*rpt)); +} + +void +zfs_ereport_send_interim_checksum(zio_cksum_report_t *report) +{ +#ifdef _KERNEL + fm_ereport_post(report->zcr_ereport, EVCH_SLEEP); +#endif +} + +void +zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, + struct zio *zio, uint64_t offset, uint64_t length, + const void *good_data, const void *bad_data, zio_bad_cksum_t *zbc) +{ +#ifdef _KERNEL + nvlist_t *ereport = NULL; + nvlist_t *detector = NULL; + zfs_ecksum_info_t *info; + + zfs_ereport_start(&ereport, &detector, + FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio, offset, length); + + if (ereport == NULL) + return; + + info = annotate_ecksum(ereport, zbc, good_data, bad_data, length, + B_FALSE); + + if (info != NULL) + fm_ereport_post(ereport, EVCH_SLEEP); + + fm_nvlist_destroy(ereport, FM_NVA_FREE); + fm_nvlist_destroy(detector, FM_NVA_FREE); + + if (info != NULL) + kmem_free(info, sizeof (*info)); +#endif +} + static void zfs_post_common(spa_t *spa, vdev_t *vd, const char *name) { @@ -319,6 +805,9 @@ zfs_post_common(spa_t *spa, vdev_t *vd, const char *name) nvlist_t *resource; char class[64]; + if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT) + return; + if ((resource = fm_nvlist_create(NULL)) == NULL) return; @@ -360,3 +849,15 @@ zfs_post_autoreplace(spa_t *spa, vdev_t *vd) { zfs_post_common(spa, vd, FM_RESOURCE_AUTOREPLACE); } + +/* + * The 'resource.fs.zfs.statechange' event is an internal signal that the + * given vdev has transitioned its state to DEGRADED or HEALTHY. This will + * cause the retire agent to repair any outstanding fault management cases + * open because the device was not found (fault.fs.zfs.device). + */ +void +zfs_post_state_change(spa_t *spa, vdev_t *vd) +{ + zfs_post_common(spa, vd, FM_RESOURCE_STATECHANGE); +} diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_fuid.c b/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_fuid.c index 7cb505258d8f7..dfa4f8daef58d 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_fuid.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_fuid.c @@ -19,12 +19,11 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #include -#include #include #include #include @@ -47,8 +46,10 @@ * During file system initialization the nvlist(s) are read and * two AVL trees are created. One tree is keyed by the index number * and the other by the domain string. Nodes are never removed from - * trees, but new entries may be added. If a new entry is added then the - * on-disk packed nvlist will also be updated. + * trees, but new entries may be added. If a new entry is added then + * the zfsvfs->z_fuid_dirty flag is set to true and the caller will then + * be responsible for calling zfs_fuid_sync() to sync the changes to disk. + * */ #define FUID_IDX "fuid_idx" @@ -97,6 +98,15 @@ domain_compare(const void *arg1, const void *arg2) return (val > 0 ? 1 : -1); } +void +zfs_fuid_avl_tree_create(avl_tree_t *idx_tree, avl_tree_t *domain_tree) +{ + avl_create(idx_tree, idx_compare, + sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_idxnode)); + avl_create(domain_tree, domain_compare, + sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_domnode)); +} + /* * load initial fuid domain and idx trees. This function is used by * both the kernel and zdb. @@ -108,12 +118,9 @@ zfs_fuid_table_load(objset_t *os, uint64_t fuid_obj, avl_tree_t *idx_tree, dmu_buf_t *db; uint64_t fuid_size; - avl_create(idx_tree, idx_compare, - sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_idxnode)); - avl_create(domain_tree, domain_compare, - sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_domnode)); - - VERIFY(0 == dmu_bonus_hold(os, fuid_obj, FTAG, &db)); + ASSERT(fuid_obj != 0); + VERIFY(0 == dmu_bonus_hold(os, fuid_obj, + FTAG, &db)); fuid_size = *(uint64_t *)db->db_data; dmu_buf_rele(db, FTAG); @@ -125,7 +132,8 @@ zfs_fuid_table_load(objset_t *os, uint64_t fuid_obj, avl_tree_t *idx_tree, int i; packed = kmem_alloc(fuid_size, KM_SLEEP); - VERIFY(dmu_read(os, fuid_obj, 0, fuid_size, packed) == 0); + VERIFY(dmu_read(os, fuid_obj, 0, + fuid_size, packed, DMU_READ_PREFETCH) == 0); VERIFY(nvlist_unpack(packed, fuid_size, &nvp, 0) == 0); VERIFY(nvlist_lookup_nvlist_array(nvp, FUID_NVP_ARRAY, @@ -189,10 +197,8 @@ zfs_fuid_idx_domain(avl_tree_t *idx_tree, uint32_t idx) * Load the fuid table(s) into memory. */ static void -zfs_fuid_init(zfsvfs_t *zfsvfs, dmu_tx_t *tx) +zfs_fuid_init(zfsvfs_t *zfsvfs) { - int error = 0; - rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER); if (zfsvfs->z_fuid_loaded) { @@ -200,41 +206,101 @@ zfs_fuid_init(zfsvfs_t *zfsvfs, dmu_tx_t *tx) return; } - if (zfsvfs->z_fuid_obj == 0) { - - /* first make sure we need to allocate object */ - - error = zap_lookup(zfsvfs->z_os, MASTER_NODE_OBJ, - ZFS_FUID_TABLES, 8, 1, &zfsvfs->z_fuid_obj); - if (error == ENOENT && tx != NULL) { - zfsvfs->z_fuid_obj = dmu_object_alloc(zfsvfs->z_os, - DMU_OT_FUID, 1 << 14, DMU_OT_FUID_SIZE, - sizeof (uint64_t), tx); - VERIFY(zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, - ZFS_FUID_TABLES, sizeof (uint64_t), 1, - &zfsvfs->z_fuid_obj, tx) == 0); - } - } + zfs_fuid_avl_tree_create(&zfsvfs->z_fuid_idx, &zfsvfs->z_fuid_domain); + (void) zap_lookup(zfsvfs->z_os, MASTER_NODE_OBJ, + ZFS_FUID_TABLES, 8, 1, &zfsvfs->z_fuid_obj); if (zfsvfs->z_fuid_obj != 0) { zfsvfs->z_fuid_size = zfs_fuid_table_load(zfsvfs->z_os, zfsvfs->z_fuid_obj, &zfsvfs->z_fuid_idx, &zfsvfs->z_fuid_domain); - zfsvfs->z_fuid_loaded = B_TRUE; } + zfsvfs->z_fuid_loaded = B_TRUE; + rw_exit(&zfsvfs->z_fuid_lock); +} + +/* + * sync out AVL trees to persistent storage. 
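(A minimal sketch of the caller contract described above, modeled on the zfs_make_xattrdir() hunk earlier in this patch; the function name and the trimmed error handling are illustrative assumptions, not part of the change.)

/*
 * Sketch only: a transaction that may have dirtied the in-core FUID
 * tables (e.g. via zfs_fuid_create()) is expected to reserve space with
 * zfs_fuid_txhold() and push the tables out with zfs_fuid_sync().
 */
static int
example_fuid_tx(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
{
	boolean_t fuid_dirtied = zfsvfs->z_fuid_dirty;

	if (fuid_dirtied)
		zfs_fuid_txhold(zfsvfs, tx);	/* reserve space for the table */

	if (dmu_tx_assign(tx, TXG_NOWAIT) != 0) {
		dmu_tx_abort(tx);
		return (ERESTART);
	}

	/* ... create objects, log the change, etc. ... */

	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);	/* write the AVL trees to disk */

	dmu_tx_commit(tx);
	return (0);
}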
+ */ +void +zfs_fuid_sync(zfsvfs_t *zfsvfs, dmu_tx_t *tx) +{ + nvlist_t *nvp; + nvlist_t **fuids; + size_t nvsize = 0; + char *packed; + dmu_buf_t *db; + fuid_domain_t *domnode; + int numnodes; + int i; + + if (!zfsvfs->z_fuid_dirty) { + return; + } + + rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER); + + /* + * First see if table needs to be created? + */ + if (zfsvfs->z_fuid_obj == 0) { + zfsvfs->z_fuid_obj = dmu_object_alloc(zfsvfs->z_os, + DMU_OT_FUID, 1 << 14, DMU_OT_FUID_SIZE, + sizeof (uint64_t), tx); + VERIFY(zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, + ZFS_FUID_TABLES, sizeof (uint64_t), 1, + &zfsvfs->z_fuid_obj, tx) == 0); + } + + VERIFY(nvlist_alloc(&nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + numnodes = avl_numnodes(&zfsvfs->z_fuid_idx); + fuids = kmem_alloc(numnodes * sizeof (void *), KM_SLEEP); + for (i = 0, domnode = avl_first(&zfsvfs->z_fuid_domain); domnode; i++, + domnode = AVL_NEXT(&zfsvfs->z_fuid_domain, domnode)) { + VERIFY(nvlist_alloc(&fuids[i], NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_uint64(fuids[i], FUID_IDX, + domnode->f_idx) == 0); + VERIFY(nvlist_add_uint64(fuids[i], FUID_OFFSET, 0) == 0); + VERIFY(nvlist_add_string(fuids[i], FUID_DOMAIN, + domnode->f_ksid->kd_name) == 0); + } + VERIFY(nvlist_add_nvlist_array(nvp, FUID_NVP_ARRAY, + fuids, numnodes) == 0); + for (i = 0; i != numnodes; i++) + nvlist_free(fuids[i]); + kmem_free(fuids, numnodes * sizeof (void *)); + VERIFY(nvlist_size(nvp, &nvsize, NV_ENCODE_XDR) == 0); + packed = kmem_alloc(nvsize, KM_SLEEP); + VERIFY(nvlist_pack(nvp, &packed, &nvsize, + NV_ENCODE_XDR, KM_SLEEP) == 0); + nvlist_free(nvp); + zfsvfs->z_fuid_size = nvsize; + dmu_write(zfsvfs->z_os, zfsvfs->z_fuid_obj, 0, + zfsvfs->z_fuid_size, packed, tx); + kmem_free(packed, zfsvfs->z_fuid_size); + VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, zfsvfs->z_fuid_obj, + FTAG, &db)); + dmu_buf_will_dirty(db, tx); + *(uint64_t *)db->db_data = zfsvfs->z_fuid_size; + dmu_buf_rele(db, FTAG); + + zfsvfs->z_fuid_dirty = B_FALSE; rw_exit(&zfsvfs->z_fuid_lock); } /* * Query domain table for a given domain. * - * If domain isn't found it is added to AVL trees and - * the results are pushed out to disk. + * If domain isn't found and addok is set, it is added to AVL trees and + * the zfsvfs->z_fuid_dirty flag will be set to TRUE. It will then be + * necessary for the caller or another thread to detect the dirty table + * and sync out the changes. */ int -zfs_fuid_find_by_domain(zfsvfs_t *zfsvfs, const char *domain, char **retdomain, - dmu_tx_t *tx) +zfs_fuid_find_by_domain(zfsvfs_t *zfsvfs, const char *domain, + char **retdomain, boolean_t addok) { fuid_domain_t searchnode, *findnode; avl_index_t loc; @@ -246,16 +312,16 @@ zfs_fuid_find_by_domain(zfsvfs_t *zfsvfs, const char *domain, char **retdomain, * for the user nobody. 
*/ if (domain[0] == '\0') { - *retdomain = nulldomain; + if (retdomain) + *retdomain = nulldomain; return (0); } searchnode.f_ksid = ksid_lookupdomain(domain); - if (retdomain) { + if (retdomain) *retdomain = searchnode.f_ksid->kd_name; - } if (!zfsvfs->z_fuid_loaded) - zfs_fuid_init(zfsvfs, tx); + zfs_fuid_init(zfsvfs); retry: rw_enter(&zfsvfs->z_fuid_lock, rw); @@ -265,15 +331,9 @@ zfs_fuid_find_by_domain(zfsvfs_t *zfsvfs, const char *domain, char **retdomain, rw_exit(&zfsvfs->z_fuid_lock); ksiddomain_rele(searchnode.f_ksid); return (findnode->f_idx); - } else { + } else if (addok) { fuid_domain_t *domnode; - nvlist_t *nvp; - nvlist_t **fuids; uint64_t retidx; - size_t nvsize = 0; - char *packed; - dmu_buf_t *db; - int i = 0; if (rw == RW_READER && !rw_tryupgrade(&zfsvfs->z_fuid_lock)) { rw_exit(&zfsvfs->z_fuid_lock); @@ -288,46 +348,12 @@ zfs_fuid_find_by_domain(zfsvfs_t *zfsvfs, const char *domain, char **retdomain, avl_add(&zfsvfs->z_fuid_domain, domnode); avl_add(&zfsvfs->z_fuid_idx, domnode); - /* - * Now resync the on-disk nvlist. - */ - VERIFY(nvlist_alloc(&nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); - - domnode = avl_first(&zfsvfs->z_fuid_domain); - fuids = kmem_alloc(retidx * sizeof (void *), KM_SLEEP); - while (domnode) { - VERIFY(nvlist_alloc(&fuids[i], - NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_uint64(fuids[i], FUID_IDX, - domnode->f_idx) == 0); - VERIFY(nvlist_add_uint64(fuids[i], - FUID_OFFSET, 0) == 0); - VERIFY(nvlist_add_string(fuids[i++], FUID_DOMAIN, - domnode->f_ksid->kd_name) == 0); - domnode = AVL_NEXT(&zfsvfs->z_fuid_domain, domnode); - } - VERIFY(nvlist_add_nvlist_array(nvp, FUID_NVP_ARRAY, - fuids, retidx) == 0); - for (i = 0; i != retidx; i++) - nvlist_free(fuids[i]); - kmem_free(fuids, retidx * sizeof (void *)); - VERIFY(nvlist_size(nvp, &nvsize, NV_ENCODE_XDR) == 0); - packed = kmem_alloc(nvsize, KM_SLEEP); - VERIFY(nvlist_pack(nvp, &packed, &nvsize, - NV_ENCODE_XDR, KM_SLEEP) == 0); - nvlist_free(nvp); - zfsvfs->z_fuid_size = nvsize; - dmu_write(zfsvfs->z_os, zfsvfs->z_fuid_obj, 0, - zfsvfs->z_fuid_size, packed, tx); - kmem_free(packed, zfsvfs->z_fuid_size); - VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, zfsvfs->z_fuid_obj, - FTAG, &db)); - dmu_buf_will_dirty(db, tx); - *(uint64_t *)db->db_data = zfsvfs->z_fuid_size; - dmu_buf_rele(db, FTAG); - + zfsvfs->z_fuid_dirty = B_TRUE; rw_exit(&zfsvfs->z_fuid_lock); return (retidx); + } else { + rw_exit(&zfsvfs->z_fuid_lock); + return (-1); } } @@ -337,7 +363,7 @@ zfs_fuid_find_by_domain(zfsvfs_t *zfsvfs, const char *domain, char **retdomain, * Returns a pointer from an avl node of the domain string. * */ -static char * +const char * zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx) { char *domain; @@ -346,7 +372,7 @@ zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx) return (NULL); if (!zfsvfs->z_fuid_loaded) - zfs_fuid_init(zfsvfs, NULL); + zfs_fuid_init(zfsvfs); rw_enter(&zfsvfs->z_fuid_lock, RW_READER); @@ -374,7 +400,7 @@ zfs_fuid_map_id(zfsvfs_t *zfsvfs, uint64_t fuid, cred_t *cr, zfs_fuid_type_t type) { uint32_t index = FUID_INDEX(fuid); - char *domain; + const char *domain; uid_t id; if (index == 0) @@ -400,7 +426,7 @@ zfs_fuid_map_id(zfsvfs_t *zfsvfs, uint64_t fuid, * If ACL has multiple domains, then keep only one copy of each unique * domain. 
*/ -static void +void zfs_fuid_node_add(zfs_fuid_info_t **fuidpp, const char *domain, uint32_t rid, uint64_t idx, uint64_t id, zfs_fuid_type_t type) { @@ -439,6 +465,7 @@ zfs_fuid_node_add(zfs_fuid_info_t **fuidpp, const char *domain, uint32_t rid, } if (type == ZFS_ACE_USER || type == ZFS_ACE_GROUP) { + /* * Now allocate fuid entry and add it on the end of the list */ @@ -463,7 +490,7 @@ zfs_fuid_node_add(zfs_fuid_info_t **fuidpp, const char *domain, uint32_t rid, */ uint64_t zfs_fuid_create_cred(zfsvfs_t *zfsvfs, zfs_fuid_type_t type, - dmu_tx_t *tx, cred_t *cr, zfs_fuid_info_t **fuidp) + cred_t *cr, zfs_fuid_info_t **fuidp) { uint64_t idx; ksid_t *ksid; @@ -482,6 +509,11 @@ zfs_fuid_create_cred(zfsvfs_t *zfsvfs, zfs_fuid_type_t type, id = crgetuid(cr); else id = crgetgid(cr); + + if (IS_EPHEMERAL(id)) { + return ((uint64_t)(type == ZFS_OWNER ? + UID_NOBODY : GID_NOBODY)); + } } if (!zfsvfs->z_use_fuids || (!IS_EPHEMERAL(id))) @@ -490,7 +522,7 @@ zfs_fuid_create_cred(zfsvfs_t *zfsvfs, zfs_fuid_type_t type, rid = ksid_getrid(ksid); domain = ksid_getdomain(ksid); - idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, tx); + idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, B_TRUE); zfs_fuid_node_add(fuidp, kdomain, rid, idx, id, type); @@ -511,7 +543,7 @@ zfs_fuid_create_cred(zfsvfs_t *zfsvfs, zfs_fuid_type_t type, */ uint64_t zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr, - zfs_fuid_type_t type, dmu_tx_t *tx, zfs_fuid_info_t **fuidpp) + zfs_fuid_type_t type, zfs_fuid_info_t **fuidpp) { const char *domain; char *kdomain; @@ -519,7 +551,6 @@ zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr, uint32_t rid; idmap_stat status; uint64_t idx; - boolean_t is_replay = (zfsvfs->z_assign >= TXG_INITIAL); zfs_fuid_t *zfuid = NULL; zfs_fuid_info_t *fuidp; @@ -534,7 +565,7 @@ zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr, if (!zfsvfs->z_use_fuids || !IS_EPHEMERAL(id) || fuid_idx != 0) return (id); - if (is_replay) { + if (zfsvfs->z_replay) { fuidp = zfsvfs->z_fuid_replay; /* @@ -582,10 +613,11 @@ zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr, } } - idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, tx); + idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, B_TRUE); - if (!is_replay) - zfs_fuid_node_add(fuidpp, kdomain, rid, idx, id, type); + if (!zfsvfs->z_replay) + zfs_fuid_node_add(fuidpp, kdomain, + rid, idx, id, type); else if (zfuid != NULL) { list_remove(&fuidp->z_fuids, zfuid); kmem_free(zfuid, sizeof (zfs_fuid_t)); @@ -659,16 +691,15 @@ boolean_t zfs_groupmember(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr) { ksid_t *ksid = crgetsid(cr, KSID_GROUP); + ksidlist_t *ksidlist = crgetsidlist(cr); uid_t gid; - if (ksid) { + if (ksid && ksidlist) { int i; ksid_t *ksid_groups; - ksidlist_t *ksidlist = crgetsidlist(cr); uint32_t idx = FUID_INDEX(id); uint32_t rid = FUID_RID(id); - ASSERT(ksidlist); ksid_groups = ksidlist->ksl_sids; for (i = 0; i != ksidlist->ksl_nsid; i++) { @@ -678,7 +709,7 @@ zfs_groupmember(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr) return (B_TRUE); } } else { - char *domain; + const char *domain; domain = zfs_fuid_find_by_idx(zfsvfs, idx); ASSERT(domain != NULL); @@ -701,4 +732,19 @@ zfs_groupmember(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr) gid = zfs_fuid_map_id(zfsvfs, id, cr, ZFS_GROUP); return (groupmember(gid, cr)); } + +void +zfs_fuid_txhold(zfsvfs_t *zfsvfs, dmu_tx_t *tx) +{ + if (zfsvfs->z_fuid_obj == 0) { + dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + 
FUID_SIZE_ESTIMATE(zfsvfs)); + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); + } else { + dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); + dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, + FUID_SIZE_ESTIMATE(zfsvfs)); + } +} #endif diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_ioctl.c b/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_ioctl.c index c77892f90ab5b..f3c2c1d1bb349 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_ioctl.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_ioctl.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -36,12 +36,13 @@ #include #include #include +#include #include #include #include #include #include -#include +#include #include #include #include @@ -79,21 +80,36 @@ dev_info_t *zfs_dip; typedef int zfs_ioc_func_t(zfs_cmd_t *); typedef int zfs_secpolicy_func_t(zfs_cmd_t *, cred_t *); +typedef enum { + NO_NAME, + POOL_NAME, + DATASET_NAME +} zfs_ioc_namecheck_t; + typedef struct zfs_ioc_vec { zfs_ioc_func_t *zvec_func; zfs_secpolicy_func_t *zvec_secpolicy; - enum { - NO_NAME, - POOL_NAME, - DATASET_NAME - } zvec_namecheck; + zfs_ioc_namecheck_t zvec_namecheck; boolean_t zvec_his_log; + boolean_t zvec_pool_check; } zfs_ioc_vec_t; -static void clear_props(char *dataset, nvlist_t *props); +/* This array is indexed by zfs_userquota_prop_t */ +static const char *userquota_perms[] = { + ZFS_DELEG_PERM_USERUSED, + ZFS_DELEG_PERM_USERQUOTA, + ZFS_DELEG_PERM_GROUPUSED, + ZFS_DELEG_PERM_GROUPQUOTA, +}; + +static int zfs_ioc_userspace_upgrade(zfs_cmd_t *zc); +static int zfs_check_settable(const char *name, nvpair_t *property, + cred_t *cr); +static int zfs_check_clearable(char *dataset, nvlist_t *props, + nvlist_t **errors); static int zfs_fill_zplprops_root(uint64_t, nvlist_t *, nvlist_t *, boolean_t *); -int zfs_set_prop_nvlist(const char *, nvlist_t *); +int zfs_set_prop_nvlist(const char *, zprop_source_t, nvlist_t *, nvlist_t **); /* _NOTE(PRINTFLIKE(4)) - this is printf-like, but lint is too whiney */ void @@ -163,22 +179,15 @@ history_str_get(zfs_cmd_t *zc) static boolean_t zfs_is_bootfs(const char *name) { - spa_t *spa; - boolean_t ret = B_FALSE; - - if (spa_open(name, &spa, FTAG) == 0) { - if (spa->spa_bootfs) { - objset_t *os; + objset_t *os; - if (dmu_objset_open(name, DMU_OST_ZFS, - DS_MODE_USER | DS_MODE_READONLY, &os) == 0) { - ret = (dmu_objset_id(os) == spa->spa_bootfs); - dmu_objset_close(os); - } - } - spa_close(spa, FTAG); + if (dmu_objset_hold(name, FTAG, &os) == 0) { + boolean_t ret; + ret = (dmu_objset_id(os) == spa_bootfs(dmu_objset_spa(os))); + dmu_objset_rele(os, FTAG); + return (ret); } - return (ret); + return (B_FALSE); } /* @@ -212,13 +221,17 @@ zpl_earlier_version(const char *name, int version) objset_t *os; boolean_t rc = B_TRUE; - if (dmu_objset_open(name, DMU_OST_ANY, - DS_MODE_USER | DS_MODE_READONLY, &os) == 0) { + if (dmu_objset_hold(name, FTAG, &os) == 0) { uint64_t zplversion; + if (dmu_objset_type(os) != DMU_OST_ZFS) { + dmu_objset_rele(os, FTAG); + return (B_TRUE); + } + /* XXX reading from non-owned objset */ if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &zplversion) == 0) rc = zplversion < version; - dmu_objset_close(os); + dmu_objset_rele(os, FTAG); } return (rc); } @@ -318,9 +331,109 @@ zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr) return (error); } +/* + * Policy for setting the security label property. 
+ * + * Returns 0 for success, non-zero for access and other errors. + */ +static int +zfs_set_slabel_policy(const char *name, char *strval, cred_t *cr) +{ + char ds_hexsl[MAXNAMELEN]; + bslabel_t ds_sl, new_sl; + boolean_t new_default = FALSE; + uint64_t zoned; + int needed_priv = -1; + int error; + + /* First get the existing dataset label. */ + error = dsl_prop_get(name, zfs_prop_to_name(ZFS_PROP_MLSLABEL), + 1, sizeof (ds_hexsl), &ds_hexsl, NULL); + if (error) + return (EPERM); + + if (strcasecmp(strval, ZFS_MLSLABEL_DEFAULT) == 0) + new_default = TRUE; + + /* The label must be translatable */ + if (!new_default && (hexstr_to_label(strval, &new_sl) != 0)) + return (EINVAL); + + /* + * In a non-global zone, disallow attempts to set a label that + * doesn't match that of the zone; otherwise no other checks + * are needed. + */ + if (!INGLOBALZONE(curproc)) { + if (new_default || !blequal(&new_sl, CR_SL(CRED()))) + return (EPERM); + return (0); + } + + /* + * For global-zone datasets (i.e., those whose zoned property is + * "off", verify that the specified new label is valid for the + * global zone. + */ + if (dsl_prop_get_integer(name, + zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL)) + return (EPERM); + if (!zoned) { + if (zfs_check_global_label(name, strval) != 0) + return (EPERM); + } + + /* + * If the existing dataset label is nondefault, check if the + * dataset is mounted (label cannot be changed while mounted). + * Get the zfsvfs; if there isn't one, then the dataset isn't + * mounted (or isn't a dataset, doesn't exist, ...). + */ + if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) != 0) { + objset_t *os; + static char *setsl_tag = "setsl_tag"; + + /* + * Try to own the dataset; abort if there is any error, + * (e.g., already mounted, in use, or other error). + */ + error = dmu_objset_own(name, DMU_OST_ZFS, B_TRUE, + setsl_tag, &os); + if (error) + return (EPERM); + + dmu_objset_disown(os, setsl_tag); + + if (new_default) { + needed_priv = PRIV_FILE_DOWNGRADE_SL; + goto out_check; + } + + if (hexstr_to_label(strval, &new_sl) != 0) + return (EPERM); + + if (blstrictdom(&ds_sl, &new_sl)) + needed_priv = PRIV_FILE_DOWNGRADE_SL; + else if (blstrictdom(&new_sl, &ds_sl)) + needed_priv = PRIV_FILE_UPGRADE_SL; + } else { + /* dataset currently has a default label */ + if (!new_default) + needed_priv = PRIV_FILE_UPGRADE_SL; + } + +out_check: + if (needed_priv != -1) + return (PRIV_POLICY(cr, needed_priv, B_FALSE, EPERM, NULL)); + return (0); +} + static int -zfs_secpolicy_setprop(const char *name, zfs_prop_t prop, cred_t *cr) +zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval, + cred_t *cr) { + char *strval; + /* * Check permissions for special properties. */ @@ -342,16 +455,29 @@ zfs_secpolicy_setprop(const char *name, zfs_prop_t prop, cred_t *cr) * quota on things *under* (ie. contained by) * the thing they own. 
*/ - if (dsl_prop_get_integer(name, "zoned", &zoned, + if (dsl_prop_get_integer(dsname, "zoned", &zoned, setpoint)) return (EPERM); - if (!zoned || strlen(name) <= strlen(setpoint)) + if (!zoned || strlen(dsname) <= strlen(setpoint)) return (EPERM); } break; + + case ZFS_PROP_MLSLABEL: + if (!is_system_labeled()) + return (EPERM); + + if (nvpair_value_string(propval, &strval) == 0) { + int err; + + err = zfs_set_slabel_policy(dsname, strval, CRED()); + if (err != 0) + return (err); + } + break; } - return (zfs_secpolicy_write_perms(name, zfs_prop_to_name(prop), cr)); + return (zfs_secpolicy_write_perms(dsname, zfs_prop_to_name(prop), cr)); } int @@ -373,13 +499,8 @@ zfs_secpolicy_fsacl(zfs_cmd_t *zc, cred_t *cr) int zfs_secpolicy_rollback(zfs_cmd_t *zc, cred_t *cr) { - int error; - error = zfs_secpolicy_write_perms(zc->zc_name, - ZFS_DELEG_PERM_ROLLBACK, cr); - if (error == 0) - error = zfs_secpolicy_write_perms(zc->zc_name, - ZFS_DELEG_PERM_MOUNT, cr); - return (error); + return (zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_ROLLBACK, cr)); } int @@ -389,6 +510,30 @@ zfs_secpolicy_send(zfs_cmd_t *zc, cred_t *cr) ZFS_DELEG_PERM_SEND, cr)); } +static int +zfs_secpolicy_deleg_share(zfs_cmd_t *zc, cred_t *cr) +{ + vnode_t *vp; + int error; + + if ((error = lookupname(zc->zc_value, UIO_SYSSPACE, + NO_FOLLOW, NULL, &vp)) != 0) + return (error); + + /* Now make sure mntpnt and dataset are ZFS */ + + if (vp->v_vfsp->vfs_fstype != zfsfstype || + (strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource), + zc->zc_name) != 0)) { + VN_RELE(vp); + return (EPERM); + } + + VN_RELE(vp); + return (dsl_deleg_access(zc->zc_name, + ZFS_DELEG_PERM_SHARE, cr)); +} + int zfs_secpolicy_share(zfs_cmd_t *zc, cred_t *cr) { @@ -398,25 +543,20 @@ zfs_secpolicy_share(zfs_cmd_t *zc, cred_t *cr) if (secpolicy_nfs(cr) == 0) { return (0); } else { - vnode_t *vp; - int error; - - if ((error = lookupname(zc->zc_value, UIO_SYSSPACE, - NO_FOLLOW, NULL, &vp)) != 0) - return (error); - - /* Now make sure mntpnt and dataset are ZFS */ + return (zfs_secpolicy_deleg_share(zc, cr)); + } +} - if (vp->v_vfsp->vfs_fstype != zfsfstype || - (strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource), - zc->zc_name) != 0)) { - VN_RELE(vp); - return (EPERM); - } +int +zfs_secpolicy_smb_acl(zfs_cmd_t *zc, cred_t *cr) +{ + if (!INGLOBALZONE(curproc)) + return (EPERM); - VN_RELE(vp); - return (dsl_deleg_access(zc->zc_name, - ZFS_DELEG_PERM_SHARE, cr)); + if (secpolicy_smb(cr) == 0) { + return (0); + } else { + return (zfs_secpolicy_deleg_share(zc, cr)); } } @@ -460,6 +600,31 @@ zfs_secpolicy_destroy(zfs_cmd_t *zc, cred_t *cr) return (zfs_secpolicy_destroy_perms(zc->zc_name, cr)); } +/* + * Destroying snapshots with delegated permissions requires + * descendent mount and destroy permissions. + * Reassemble the full filesystem@snap name so dsl_deleg_access() + * can do the correct permission check. + * + * Since this routine is used when doing a recursive destroy of snapshots + * and destroying snapshots requires descendent permissions, a successfull + * check of the top level snapshot applies to snapshots of all descendent + * datasets as well. 
+ */ +static int +zfs_secpolicy_destroy_snaps(zfs_cmd_t *zc, cred_t *cr) +{ + int error; + char *dsname; + + dsname = kmem_asprintf("%s@%s", zc->zc_name, zc->zc_value); + + error = zfs_secpolicy_destroy_perms(dsname, cr); + + strfree(dsname); + return (error); +} + /* * Must have sys_config privilege to check the iscsi permission */ @@ -473,7 +638,7 @@ zfs_secpolicy_iscsi(zfs_cmd_t *zc, cred_t *cr) int zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr) { - char parentname[MAXNAMELEN]; + char parentname[MAXNAMELEN]; int error; if ((error = zfs_secpolicy_write_perms(from, @@ -508,7 +673,7 @@ zfs_secpolicy_rename(zfs_cmd_t *zc, cred_t *cr) static int zfs_secpolicy_promote(zfs_cmd_t *zc, cred_t *cr) { - char parentname[MAXNAMELEN]; + char parentname[MAXNAMELEN]; objset_t *clone; int error; @@ -517,20 +682,19 @@ zfs_secpolicy_promote(zfs_cmd_t *zc, cred_t *cr) if (error) return (error); - error = dmu_objset_open(zc->zc_name, DMU_OST_ANY, - DS_MODE_USER | DS_MODE_READONLY, &clone); + error = dmu_objset_hold(zc->zc_name, FTAG, &clone); if (error == 0) { dsl_dataset_t *pclone = NULL; dsl_dir_t *dd; - dd = clone->os->os_dsl_dataset->ds_dir; + dd = clone->os_dsl_dataset->ds_dir; rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); error = dsl_dataset_hold_obj(dd->dd_pool, dd->dd_phys->dd_origin_obj, FTAG, &pclone); rw_exit(&dd->dd_pool->dp_config_rwlock); if (error) { - dmu_objset_close(clone); + dmu_objset_rele(clone, FTAG); return (error); } @@ -538,7 +702,7 @@ zfs_secpolicy_promote(zfs_cmd_t *zc, cred_t *cr) ZFS_DELEG_PERM_MOUNT, cr); dsl_dataset_name(pclone, parentname); - dmu_objset_close(clone); + dmu_objset_rele(clone, FTAG); dsl_dataset_rele(pclone, FTAG); if (error == 0) error = zfs_secpolicy_write_perms(parentname, @@ -567,16 +731,8 @@ zfs_secpolicy_receive(zfs_cmd_t *zc, cred_t *cr) int zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr) { - int error; - - if ((error = zfs_secpolicy_write_perms(name, - ZFS_DELEG_PERM_SNAPSHOT, cr)) != 0) - return (error); - - error = zfs_secpolicy_write_perms(name, - ZFS_DELEG_PERM_MOUNT, cr); - - return (error); + return (zfs_secpolicy_write_perms(name, + ZFS_DELEG_PERM_SNAPSHOT, cr)); } static int @@ -589,8 +745,8 @@ zfs_secpolicy_snapshot(zfs_cmd_t *zc, cred_t *cr) static int zfs_secpolicy_create(zfs_cmd_t *zc, cred_t *cr) { - char parentname[MAXNAMELEN]; - int error; + char parentname[MAXNAMELEN]; + int error; if ((error = zfs_get_parent(zc->zc_name, parentname, sizeof (parentname))) != 0) @@ -638,22 +794,6 @@ zfs_secpolicy_config(zfs_cmd_t *zc, cred_t *cr) return (0); } -/* - * Just like zfs_secpolicy_config, except that we will check for - * mount permission on the dataset for permission to create/remove - * the minor nodes. - */ -static int -zfs_secpolicy_minor(zfs_cmd_t *zc, cred_t *cr) -{ - if (secpolicy_sys_config(cr, B_FALSE) != 0) { - return (dsl_deleg_access(zc->zc_name, - ZFS_DELEG_PERM_MOUNT, cr)); - } - - return (0); -} - /* * Policy for fault injection. Requires all privileges. 
*/ @@ -675,17 +815,80 @@ zfs_secpolicy_inherit(zfs_cmd_t *zc, cred_t *cr) return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_USERPROP, cr)); } else { - if (!zfs_prop_inheritable(prop)) - return (EINVAL); - return (zfs_secpolicy_setprop(zc->zc_name, prop, cr)); + return (zfs_secpolicy_setprop(zc->zc_name, prop, + NULL, cr)); + } +} + +static int +zfs_secpolicy_userspace_one(zfs_cmd_t *zc, cred_t *cr) +{ + int err = zfs_secpolicy_read(zc, cr); + if (err) + return (err); + + if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) + return (EINVAL); + + if (zc->zc_value[0] == 0) { + /* + * They are asking about a posix uid/gid. If it's + * themself, allow it. + */ + if (zc->zc_objset_type == ZFS_PROP_USERUSED || + zc->zc_objset_type == ZFS_PROP_USERQUOTA) { + if (zc->zc_guid == crgetuid(cr)) + return (0); + } else { + if (groupmember(zc->zc_guid, cr)) + return (0); + } } + + return (zfs_secpolicy_write_perms(zc->zc_name, + userquota_perms[zc->zc_objset_type], cr)); +} + +static int +zfs_secpolicy_userspace_many(zfs_cmd_t *zc, cred_t *cr) +{ + int err = zfs_secpolicy_read(zc, cr); + if (err) + return (err); + + if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) + return (EINVAL); + + return (zfs_secpolicy_write_perms(zc->zc_name, + userquota_perms[zc->zc_objset_type], cr)); +} + +static int +zfs_secpolicy_userspace_upgrade(zfs_cmd_t *zc, cred_t *cr) +{ + return (zfs_secpolicy_setprop(zc->zc_name, ZFS_PROP_VERSION, + NULL, cr)); +} + +static int +zfs_secpolicy_hold(zfs_cmd_t *zc, cred_t *cr) +{ + return (zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_HOLD, cr)); +} + +static int +zfs_secpolicy_release(zfs_cmd_t *zc, cred_t *cr) +{ + return (zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_RELEASE, cr)); } /* * Returns the nvlist as specified by the user in the zfs_cmd_t. 
*/ static int -get_nvlist(uint64_t nvl, uint64_t size, nvlist_t **nvp) +get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp) { char *packed; int error; @@ -699,7 +902,8 @@ get_nvlist(uint64_t nvl, uint64_t size, nvlist_t **nvp) packed = kmem_alloc(size, KM_SLEEP); - if ((error = xcopyin((void *)(uintptr_t)nvl, packed, size)) != 0) { + if ((error = ddi_copyin((void *)(uintptr_t)nvl, packed, size, + iflag)) != 0) { kmem_free(packed, size); return (error); } @@ -715,6 +919,41 @@ get_nvlist(uint64_t nvl, uint64_t size, nvlist_t **nvp) return (0); } +static int +fit_error_list(zfs_cmd_t *zc, nvlist_t **errors) +{ + size_t size; + + VERIFY(nvlist_size(*errors, &size, NV_ENCODE_NATIVE) == 0); + + if (size > zc->zc_nvlist_dst_size) { + nvpair_t *more_errors; + int n = 0; + + if (zc->zc_nvlist_dst_size < 1024) + return (ENOMEM); + + VERIFY(nvlist_add_int32(*errors, ZPROP_N_MORE_ERRORS, 0) == 0); + more_errors = nvlist_prev_nvpair(*errors, NULL); + + do { + nvpair_t *pair = nvlist_prev_nvpair(*errors, + more_errors); + VERIFY(nvlist_remove_nvpair(*errors, pair) == 0); + n++; + VERIFY(nvlist_size(*errors, &size, + NV_ENCODE_NATIVE) == 0); + } while (size > zc->zc_nvlist_dst_size); + + VERIFY(nvlist_remove_nvpair(*errors, more_errors) == 0); + VERIFY(nvlist_add_int32(*errors, ZPROP_N_MORE_ERRORS, n) == 0); + ASSERT(nvlist_size(*errors, &size, NV_ENCODE_NATIVE) == 0); + ASSERT(size <= zc->zc_nvlist_dst_size); + } + + return (0); +} + static int put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl) { @@ -730,8 +969,8 @@ put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl) packed = kmem_alloc(size, KM_SLEEP); VERIFY(nvlist_pack(nvl, &packed, &size, NV_ENCODE_NATIVE, KM_SLEEP) == 0); - error = xcopyout(packed, (void *)(uintptr_t)zc->zc_nvlist_dst, - size); + error = ddi_copyout(packed, + (void *)(uintptr_t)zc->zc_nvlist_dst, size, zc->zc_iflags); kmem_free(packed, size); } @@ -739,6 +978,71 @@ put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl) return (error); } +static int +getzfsvfs(const char *dsname, zfsvfs_t **zfvp) +{ + objset_t *os; + int error; + + error = dmu_objset_hold(dsname, FTAG, &os); + if (error) + return (error); + if (dmu_objset_type(os) != DMU_OST_ZFS) { + dmu_objset_rele(os, FTAG); + return (EINVAL); + } + + mutex_enter(&os->os_user_ptr_lock); + *zfvp = dmu_objset_get_user(os); + if (*zfvp) { + VFS_HOLD((*zfvp)->z_vfs); + } else { + error = ESRCH; + } + mutex_exit(&os->os_user_ptr_lock); + dmu_objset_rele(os, FTAG); + return (error); +} + +/* + * Find a zfsvfs_t for a mounted filesystem, or create our own, in which + * case its z_vfs will be NULL, and it will be opened as the owner. + */ +static int +zfsvfs_hold(const char *name, void *tag, zfsvfs_t **zfvp) +{ + int error = 0; + + if (getzfsvfs(name, zfvp) != 0) + error = zfsvfs_create(name, zfvp); + if (error == 0) { + rrw_enter(&(*zfvp)->z_teardown_lock, RW_READER, tag); + if ((*zfvp)->z_unmounted) { + /* + * XXX we could probably try again, since the unmounting + * thread should be just about to disassociate the + * objset from the zfsvfs. 
+ */ + rrw_exit(&(*zfvp)->z_teardown_lock, tag); + return (EBUSY); + } + } + return (error); +} + +static void +zfsvfs_rele(zfsvfs_t *zfsvfs, void *tag) +{ + rrw_exit(&zfsvfs->z_teardown_lock, tag); + + if (zfsvfs->z_vfs) { + VFS_RELE(zfsvfs->z_vfs); + } else { + dmu_objset_disown(zfsvfs->z_os, zfsvfs); + zfsvfs_free(zfsvfs); + } +} + static int zfs_ioc_pool_create(zfs_cmd_t *zc) { @@ -749,11 +1053,12 @@ zfs_ioc_pool_create(zfs_cmd_t *zc) char *buf; if (error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, - &config)) + zc->zc_iflags, &config)) return (error); if (zc->zc_nvlist_src_size != 0 && (error = - get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, &props))) { + get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + zc->zc_iflags, &props))) { nvlist_free(config); return (error); } @@ -792,8 +1097,8 @@ zfs_ioc_pool_create(zfs_cmd_t *zc) /* * Set the remaining root properties */ - if (!error && - (error = zfs_set_prop_nvlist(zc->zc_name, rootprops)) != 0) + if (!error && (error = zfs_set_prop_nvlist(zc->zc_name, + ZPROP_SRC_LOCAL, rootprops, NULL)) != 0) (void) spa_destroy(zc->zc_name); if (buf != NULL) @@ -814,22 +1119,25 @@ zfs_ioc_pool_destroy(zfs_cmd_t *zc) int error; zfs_log_history(zc); error = spa_destroy(zc->zc_name); + if (error == 0) + zvol_remove_minors(zc->zc_name); return (error); } static int zfs_ioc_pool_import(zfs_cmd_t *zc) { - int error; nvlist_t *config, *props = NULL; uint64_t guid; + int error; if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, - &config)) != 0) + zc->zc_iflags, &config)) != 0) return (error); if (zc->zc_nvlist_src_size != 0 && (error = - get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, &props))) { + get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + zc->zc_iflags, &props))) { nvlist_free(config); return (error); } @@ -838,11 +1146,13 @@ zfs_ioc_pool_import(zfs_cmd_t *zc) guid != zc->zc_guid) error = EINVAL; else if (zc->zc_cookie) - error = spa_import_faulted(zc->zc_name, config, - props); + error = spa_import_verbatim(zc->zc_name, config, props); else error = spa_import(zc->zc_name, config, props); + if (zc->zc_nvlist_dst != 0) + (void) put_nvlist(zc, config); + nvlist_free(config); if (props) @@ -856,9 +1166,12 @@ zfs_ioc_pool_export(zfs_cmd_t *zc) { int error; boolean_t force = (boolean_t)zc->zc_cookie; + boolean_t hardforce = (boolean_t)zc->zc_guid; zfs_log_history(zc); - error = spa_export(zc->zc_name, NULL, force); + error = spa_export(zc->zc_name, NULL, force, hardforce); + if (error == 0) + zvol_remove_minors(zc->zc_name); return (error); } @@ -916,7 +1229,7 @@ zfs_ioc_pool_tryimport(zfs_cmd_t *zc) int error; if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, - &tryconfig)) != 0) + zc->zc_iflags, &tryconfig)) != 0) return (error); config = spa_tryimport(tryconfig); @@ -1004,9 +1317,9 @@ zfs_ioc_pool_get_history(zfs_cmd_t *zc) hist_buf = kmem_alloc(size, KM_SLEEP); if ((error = spa_history_get(spa, &zc->zc_history_offset, &zc->zc_history_len, hist_buf)) == 0) { - error = xcopyout(hist_buf, - (char *)(uintptr_t)zc->zc_history, - zc->zc_history_len); + error = ddi_copyout(hist_buf, + (void *)(uintptr_t)zc->zc_history, + zc->zc_history_len, zc->zc_iflags); } spa_close(spa, FTAG); @@ -1025,18 +1338,30 @@ zfs_ioc_dsobj_to_dsname(zfs_cmd_t *zc) return (0); } +/* + * inputs: + * zc_name name of filesystem + * zc_obj object to find + * + * outputs: + * zc_value name of object + */ static int zfs_ioc_obj_to_path(zfs_cmd_t *zc) { - objset_t *osp; + objset_t *os; int error; - if ((error = 
dmu_objset_open(zc->zc_name, DMU_OST_ZFS, - DS_MODE_USER | DS_MODE_READONLY, &osp)) != 0) + /* XXX reading from objset not owned */ + if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os)) != 0) return (error); - error = zfs_obj_to_path(osp, zc->zc_obj, zc->zc_value, + if (dmu_objset_type(os) != DMU_OST_ZFS) { + dmu_objset_rele(os, FTAG); + return (EINVAL); + } + error = zfs_obj_to_path(os, zc->zc_obj, zc->zc_value, sizeof (zc->zc_value)); - dmu_objset_close(osp); + dmu_objset_rele(os, FTAG); return (error); } @@ -1054,7 +1379,7 @@ zfs_ioc_vdev_add(zfs_cmd_t *zc) return (error); error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, - &config); + zc->zc_iflags, &config); (void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache); @@ -1071,7 +1396,8 @@ zfs_ioc_vdev_add(zfs_cmd_t *zc) * * l2cache and spare devices are ok to be added to a rootpool. */ - if (spa->spa_bootfs != 0 && nl2cache == 0 && nspares == 0) { + if (spa_bootfs(spa) != 0 && nl2cache == 0 && nspares == 0) { + nvlist_free(config); spa_close(spa, FTAG); return (EDOM); } @@ -1117,11 +1443,19 @@ zfs_ioc_vdev_set_state(zfs_cmd_t *zc) break; case VDEV_STATE_FAULTED: - error = vdev_fault(spa, zc->zc_guid); + if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED && + zc->zc_obj != VDEV_AUX_EXTERNAL) + zc->zc_obj = VDEV_AUX_ERR_EXCEEDED; + + error = vdev_fault(spa, zc->zc_guid, zc->zc_obj); break; case VDEV_STATE_DEGRADED: - error = vdev_degrade(spa, zc->zc_guid); + if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED && + zc->zc_obj != VDEV_AUX_EXTERNAL) + zc->zc_obj = VDEV_AUX_ERR_EXCEEDED; + + error = vdev_degrade(spa, zc->zc_guid, zc->zc_obj); break; default: @@ -1144,7 +1478,7 @@ zfs_ioc_vdev_attach(zfs_cmd_t *zc) return (error); if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, - &config)) == 0) { + zc->zc_iflags, &config)) == 0) { error = spa_vdev_attach(spa, zc->zc_guid, config, replacing); nvlist_free(config); } @@ -1162,35 +1496,87 @@ zfs_ioc_vdev_detach(zfs_cmd_t *zc) if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); - error = spa_vdev_detach(spa, zc->zc_guid, B_FALSE); + error = spa_vdev_detach(spa, zc->zc_guid, 0, B_FALSE); spa_close(spa, FTAG); return (error); } static int -zfs_ioc_vdev_setpath(zfs_cmd_t *zc) +zfs_ioc_vdev_split(zfs_cmd_t *zc) { spa_t *spa; - char *path = zc->zc_value; - uint64_t guid = zc->zc_guid; + nvlist_t *config, *props = NULL; int error; + boolean_t exp = !!(zc->zc_cookie & ZPOOL_EXPORT_AFTER_SPLIT); - error = spa_open(zc->zc_name, &spa, FTAG); - if (error != 0) + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); - error = spa_vdev_setpath(spa, guid, path); - spa_close(spa, FTAG); - return (error); -} + if (error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, + zc->zc_iflags, &config)) { + spa_close(spa, FTAG); + return (error); + } -/* - * inputs: - * zc_name name of filesystem - * zc_nvlist_dst_size size of buffer for property nvlist - * - * outputs: + if (zc->zc_nvlist_src_size != 0 && (error = + get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + zc->zc_iflags, &props))) { + spa_close(spa, FTAG); + nvlist_free(config); + return (error); + } + + error = spa_vdev_split_mirror(spa, zc->zc_string, config, props, exp); + + spa_close(spa, FTAG); + + nvlist_free(config); + nvlist_free(props); + + return (error); +} + +static int +zfs_ioc_vdev_setpath(zfs_cmd_t *zc) +{ + spa_t *spa; + char *path = zc->zc_value; + uint64_t guid = zc->zc_guid; + int error; + + error = spa_open(zc->zc_name, &spa, FTAG); + if (error != 0) + 
return (error); + + error = spa_vdev_setpath(spa, guid, path); + spa_close(spa, FTAG); + return (error); +} + +static int +zfs_ioc_vdev_setfru(zfs_cmd_t *zc) +{ + spa_t *spa; + char *fru = zc->zc_value; + uint64_t guid = zc->zc_guid; + int error; + + error = spa_open(zc->zc_name, &spa, FTAG); + if (error != 0) + return (error); + + error = spa_vdev_setfru(spa, guid, fru); + spa_close(spa, FTAG); + return (error); +} + +/* + * inputs: + * zc_name name of filesystem + * zc_nvlist_dst_size size of buffer for property nvlist + * + * outputs: * zc_objset_stats stats * zc_nvlist_dst property nvlist * zc_nvlist_dst_size size of property nvlist @@ -1202,20 +1588,20 @@ zfs_ioc_objset_stats(zfs_cmd_t *zc) int error; nvlist_t *nv; - if (error = dmu_objset_open(zc->zc_name, - DMU_OST_ANY, DS_MODE_USER | DS_MODE_READONLY, &os)) + if (error = dmu_objset_hold(zc->zc_name, FTAG, &os)) return (error); dmu_objset_fast_stat(os, &zc->zc_objset_stats); if (zc->zc_nvlist_dst != 0 && - (error = dsl_prop_get_all(os, &nv, FALSE)) == 0) { + (error = dsl_prop_get_all(os, &nv)) == 0) { dmu_objset_stats(os, nv); /* * NB: zvol_get_stats() will read the objset contents, * which we aren't supposed to do with a * DS_MODE_USER hold, because it could be * inconsistent. So this is a bit of a workaround... + * XXX reading with out owning */ if (!zc->zc_objset_stats.dds_inconsistent) { if (dmu_objset_type(os) == DMU_OST_ZVOL) @@ -1225,7 +1611,50 @@ zfs_ioc_objset_stats(zfs_cmd_t *zc) nvlist_free(nv); } - dmu_objset_close(os); + dmu_objset_rele(os, FTAG); + return (error); +} + +/* + * inputs: + * zc_name name of filesystem + * zc_nvlist_dst_size size of buffer for property nvlist + * + * outputs: + * zc_nvlist_dst received property nvlist + * zc_nvlist_dst_size size of received property nvlist + * + * Gets received properties (distinct from local properties on or after + * SPA_VERSION_RECVD_PROPS) for callers who want to differentiate received from + * local property values. + */ +static int +zfs_ioc_objset_recvd_props(zfs_cmd_t *zc) +{ + objset_t *os = NULL; + int error; + nvlist_t *nv; + + if (error = dmu_objset_hold(zc->zc_name, FTAG, &os)) + return (error); + + /* + * Without this check, we would return local property values if the + * caller has not already received properties on or after + * SPA_VERSION_RECVD_PROPS. 
+ */ + if (!dsl_prop_get_hasrecvd(os)) { + dmu_objset_rele(os, FTAG); + return (ENOTSUP); + } + + if (zc->zc_nvlist_dst != 0 && + (error = dsl_prop_get_received(os, &nv)) == 0) { + error = put_nvlist(zc, nv); + nvlist_free(nv); + } + + dmu_objset_rele(os, FTAG); return (error); } @@ -1260,8 +1689,8 @@ zfs_ioc_objset_zplprops(zfs_cmd_t *zc) objset_t *os; int err; - if (err = dmu_objset_open(zc->zc_name, - DMU_OST_ANY, DS_MODE_USER | DS_MODE_READONLY, &os)) + /* XXX reading without owning */ + if (err = dmu_objset_hold(zc->zc_name, FTAG, &os)) return (err); dmu_objset_fast_stat(os, &zc->zc_objset_stats); @@ -1286,30 +1715,25 @@ zfs_ioc_objset_zplprops(zfs_cmd_t *zc) } else { err = ENOENT; } - dmu_objset_close(os); + dmu_objset_rele(os, FTAG); return (err); } -static void -zfs_prefetch_datasets(zfs_cmd_t *zc, objset_t *os, char *p) -{ - uint64_t cookie = 0; - int error; - - do { - error = dmu_dir_list_next(os, - sizeof (zc->zc_name) - (p - zc->zc_name), p, - NULL, &cookie); - } while (error == 0 && !INGLOBALZONE(curproc) && - !zone_dataset_visible(zc->zc_name, NULL) && - !dmu_objset_prefetch(zc->zc_name, NULL)); -} - -static void -zfs_prefetch_snapshots(zfs_cmd_t *zc) +static boolean_t +dataset_name_hidden(const char *name) { - dmu_objset_find(zc->zc_name, dmu_objset_prefetch, - NULL, DS_FIND_SNAPSHOTS); + /* + * Skip over datasets that are not visible in this zone, + * internal datasets (which have a $ in their name), and + * temporary datasets (which have a % in their name). + */ + if (strchr(name, '$') != NULL) + return (B_TRUE); + if (strchr(name, '%') != NULL) + return (B_TRUE); + if (!INGLOBALZONE(curproc) && !zone_dataset_visible(name, NULL)) + return (B_TRUE); + return (B_FALSE); } /* @@ -1320,6 +1744,7 @@ zfs_prefetch_snapshots(zfs_cmd_t *zc) * * outputs: * zc_name name of next filesystem + * zc_cookie zap cursor * zc_objset_stats stats * zc_nvlist_dst property nvlist * zc_nvlist_dst_size size of property nvlist @@ -1330,9 +1755,10 @@ zfs_ioc_dataset_list_next(zfs_cmd_t *zc) objset_t *os; int error; char *p; + size_t orig_len = strlen(zc->zc_name); - if (error = dmu_objset_open(zc->zc_name, - DMU_OST_ANY, DS_MODE_USER | DS_MODE_READONLY, &os)) { +top: + if (error = dmu_objset_hold(zc->zc_name, FTAG, &os)) { if (error == ENOENT) error = ESRCH; return (error); @@ -1343,25 +1769,40 @@ zfs_ioc_dataset_list_next(zfs_cmd_t *zc) (void) strlcat(zc->zc_name, "/", sizeof (zc->zc_name)); p = zc->zc_name + strlen(zc->zc_name); - if (zc->zc_cookie == 0) - zfs_prefetch_datasets(zc, os, p); + /* + * Pre-fetch the datasets. dmu_objset_prefetch() always returns 0 + * but is not declared void because its called by dmu_objset_find(). + */ + if (zc->zc_cookie == 0) { + uint64_t cookie = 0; + int len = sizeof (zc->zc_name) - (p - zc->zc_name); + + while (dmu_dir_list_next(os, len, p, NULL, &cookie) == 0) + (void) dmu_objset_prefetch(p, NULL); + } + do { error = dmu_dir_list_next(os, sizeof (zc->zc_name) - (p - zc->zc_name), p, NULL, &zc->zc_cookie); if (error == ENOENT) error = ESRCH; - } while (error == 0 && !INGLOBALZONE(curproc) && - !zone_dataset_visible(zc->zc_name, NULL)); - dmu_objset_close(os); + } while (error == 0 && dataset_name_hidden(zc->zc_name) && + !(zc->zc_iflags & FKIOCTL)); + dmu_objset_rele(os, FTAG); /* - * If it's a hidden dataset (ie. with a '$' in its name), don't - * try to get stats for it. Userland will skip over it. + * If it's an internal dataset (ie. with a '$' in its name), + * don't try to get stats for it, otherwise we'll return ENOENT. 
*/ - if (error == 0 && strchr(zc->zc_name, '$') == NULL) + if (error == 0 && strchr(zc->zc_name, '$') == NULL) { error = zfs_ioc_objset_stats(zc); /* fill in the stats */ - + if (error == ENOENT) { + /* We lost a race with destroy, get the next one. */ + zc->zc_name[orig_len] = '\0'; + goto top; + } + } return (error); } @@ -1383,30 +1824,38 @@ zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) objset_t *os; int error; - error = dmu_objset_open(zc->zc_name, - DMU_OST_ANY, DS_MODE_USER | DS_MODE_READONLY, &os); +top: + if (zc->zc_cookie == 0) + (void) dmu_objset_find(zc->zc_name, dmu_objset_prefetch, + NULL, DS_FIND_SNAPSHOTS); + + error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error) return (error == ENOENT ? ESRCH : error); - if (zc->zc_cookie == 0) - zfs_prefetch_snapshots(zc); /* * A dataset name of maximum length cannot have any snapshots, * so exit immediately. */ if (strlcat(zc->zc_name, "@", sizeof (zc->zc_name)) >= MAXNAMELEN) { - dmu_objset_close(os); + dmu_objset_rele(os, FTAG); return (ESRCH); } error = dmu_snapshot_list_next(os, sizeof (zc->zc_name) - strlen(zc->zc_name), zc->zc_name + strlen(zc->zc_name), NULL, &zc->zc_cookie, NULL); - dmu_objset_close(os); - if (error == 0) + dmu_objset_rele(os, FTAG); + if (error == 0) { error = zfs_ioc_objset_stats(zc); /* fill in the stats */ - else if (error == ENOENT) + if (error == ENOENT) { + /* We lost a race with destroy, get the next one. */ + *strchr(zc->zc_name, '@') = '\0'; + goto top; + } + } else if (error == ENOENT) { error = ESRCH; + } /* if we failed, undo the @ that we tacked on to zc_name */ if (error) @@ -1414,233 +1863,410 @@ zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) return (error); } -int -zfs_set_prop_nvlist(const char *name, nvlist_t *nvl) +static int +zfs_prop_set_userquota(const char *dsname, nvpair_t *pair) { - nvpair_t *elem; - int error; - uint64_t intval; - char *strval; + const char *propname = nvpair_name(pair); + uint64_t *valary; + unsigned int vallen; + const char *domain; + zfs_userquota_prop_t type; + uint64_t rid; + uint64_t quota; + zfsvfs_t *zfsvfs; + int err; + + if (nvpair_type(pair) == DATA_TYPE_NVLIST) { + nvlist_t *attrs; + VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); + VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, + &pair) == 0); + } + VERIFY(nvpair_value_uint64_array(pair, &valary, &vallen) == 0); + VERIFY(vallen == 3); + type = valary[0]; + rid = valary[1]; + quota = valary[2]; /* - * First validate permission to set all of the properties + * The propname is encoded as + * userquota@<rid>-<domain>. */ - elem = NULL; - while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) { - const char *propname = nvpair_name(elem); - zfs_prop_t prop = zfs_name_to_prop(propname); + domain = strchr(propname, '-') + 1; - if (prop == ZPROP_INVAL) { - /* - * If this is a user-defined property, it must be a - * string, and there is no further validation to do. - */ - if (!zfs_prop_user(propname) || - nvpair_type(elem) != DATA_TYPE_STRING) - return (EINVAL); - - if (error = zfs_secpolicy_write_perms(name, - ZFS_DELEG_PERM_USERPROP, CRED())) - return (error); - continue; - } - - if ((error = zfs_secpolicy_setprop(name, prop, CRED())) != 0) - return (error); - - /* - * Check that this value is valid for this pool version - */ - switch (prop) { - case ZFS_PROP_COMPRESSION: - /* - * If the user specified gzip compression, make sure - * the SPA supports it. We ignore any errors here since - * we'll catch them later. 
- */ - if (nvpair_type(elem) == DATA_TYPE_UINT64 && - nvpair_value_uint64(elem, &intval) == 0) { - if (intval >= ZIO_COMPRESS_GZIP_1 && - intval <= ZIO_COMPRESS_GZIP_9 && - zfs_earlier_version(name, - SPA_VERSION_GZIP_COMPRESSION)) - return (ENOTSUP); + err = zfsvfs_hold(dsname, FTAG, &zfsvfs); + if (err == 0) { + err = zfs_set_userquota(zfsvfs, type, domain, rid, quota); + zfsvfs_rele(zfsvfs, FTAG); + } - /* - * If this is a bootable dataset then - * verify that the compression algorithm - * is supported for booting. We must return - * something other than ENOTSUP since it - * implies a downrev pool version. - */ - if (zfs_is_bootfs(name) && - !BOOTFS_COMPRESS_VALID(intval)) - return (ERANGE); - } - break; + return (err); +} - case ZFS_PROP_COPIES: - if (zfs_earlier_version(name, - SPA_VERSION_DITTO_BLOCKS)) - return (ENOTSUP); - break; +/* + * If the named property is one that has a special function to set its value, + * return 0 on success and a positive error code on failure; otherwise if it is + * not one of the special properties handled by this function, return -1. + * + * XXX: It would be better for callers of the property interface if we handled + * these special cases in dsl_prop.c (in the dsl layer). + */ +static int +zfs_prop_set_special(const char *dsname, zprop_source_t source, + nvpair_t *pair) +{ + const char *propname = nvpair_name(pair); + zfs_prop_t prop = zfs_name_to_prop(propname); + uint64_t intval; + int err; - case ZFS_PROP_SHARESMB: - if (zpl_earlier_version(name, ZPL_VERSION_FUID)) - return (ENOTSUP); - break; + if (prop == ZPROP_INVAL) { + if (zfs_prop_userquota(propname)) + return (zfs_prop_set_userquota(dsname, pair)); + return (-1); + } - case ZFS_PROP_ACLINHERIT: - if (nvpair_type(elem) == DATA_TYPE_UINT64 && - nvpair_value_uint64(elem, &intval) == 0) - if (intval == ZFS_ACL_PASSTHROUGH_X && - zfs_earlier_version(name, - SPA_VERSION_PASSTHROUGH_X)) - return (ENOTSUP); - } + if (nvpair_type(pair) == DATA_TYPE_NVLIST) { + nvlist_t *attrs; + VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); + VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, + &pair) == 0); } - elem = NULL; - while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) { - const char *propname = nvpair_name(elem); - zfs_prop_t prop = zfs_name_to_prop(propname); + if (zfs_prop_get_type(prop) == PROP_TYPE_STRING) + return (-1); - if (prop == ZPROP_INVAL) { - VERIFY(nvpair_value_string(elem, &strval) == 0); - error = dsl_prop_set(name, propname, 1, - strlen(strval) + 1, strval); - if (error == 0) - continue; - else - return (error); - } + VERIFY(0 == nvpair_value_uint64(pair, &intval)); - switch (prop) { - case ZFS_PROP_QUOTA: - if ((error = nvpair_value_uint64(elem, &intval)) != 0 || - (error = dsl_dir_set_quota(name, intval)) != 0) - return (error); - break; + switch (prop) { + case ZFS_PROP_QUOTA: + err = dsl_dir_set_quota(dsname, source, intval); + break; + case ZFS_PROP_REFQUOTA: + err = dsl_dataset_set_quota(dsname, source, intval); + break; + case ZFS_PROP_RESERVATION: + err = dsl_dir_set_reservation(dsname, source, intval); + break; + case ZFS_PROP_REFRESERVATION: + err = dsl_dataset_set_reservation(dsname, source, intval); + break; + case ZFS_PROP_VOLSIZE: + err = zvol_set_volsize(dsname, ddi_driver_major(zfs_dip), + intval); + break; + case ZFS_PROP_VERSION: + { + zfsvfs_t *zfsvfs; - case ZFS_PROP_REFQUOTA: - if ((error = nvpair_value_uint64(elem, &intval)) != 0 || - (error = dsl_dataset_set_quota(name, intval)) != 0) - return (error); + if ((err = zfsvfs_hold(dsname, FTAG, &zfsvfs)) != 0) break; 
- case ZFS_PROP_RESERVATION: - if ((error = nvpair_value_uint64(elem, &intval)) != 0 || - (error = dsl_dir_set_reservation(name, - intval)) != 0) - return (error); - break; + err = zfs_set_version(zfsvfs, intval); + zfsvfs_rele(zfsvfs, FTAG); - case ZFS_PROP_REFRESERVATION: - if ((error = nvpair_value_uint64(elem, &intval)) != 0 || - (error = dsl_dataset_set_reservation(name, - intval)) != 0) - return (error); - break; + if (err == 0 && intval >= ZPL_VERSION_USERSPACE) { + zfs_cmd_t *zc; - case ZFS_PROP_VOLSIZE: - if ((error = nvpair_value_uint64(elem, &intval)) != 0 || - (error = zvol_set_volsize(name, - ddi_driver_major(zfs_dip), intval)) != 0) - return (error); - break; + zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); + (void) strcpy(zc->zc_name, dsname); + (void) zfs_ioc_userspace_upgrade(zc); + kmem_free(zc, sizeof (zfs_cmd_t)); + } + break; + } - case ZFS_PROP_VOLBLOCKSIZE: - if ((error = nvpair_value_uint64(elem, &intval)) != 0 || - (error = zvol_set_volblocksize(name, intval)) != 0) - return (error); - break; + default: + err = -1; + } - case ZFS_PROP_VERSION: - if ((error = nvpair_value_uint64(elem, &intval)) != 0 || - (error = zfs_set_version(name, intval)) != 0) - return (error); - break; + return (err); +} - default: - if (nvpair_type(elem) == DATA_TYPE_STRING) { - if (zfs_prop_get_type(prop) != - PROP_TYPE_STRING) - return (EINVAL); - VERIFY(nvpair_value_string(elem, &strval) == 0); - if ((error = dsl_prop_set(name, - nvpair_name(elem), 1, strlen(strval) + 1, - strval)) != 0) - return (error); - } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { +/* + * This function is best effort. If it fails to set any of the given properties, + * it continues to set as many as it can and returns the first error + * encountered. If the caller provides a non-NULL errlist, it also gives the + * complete list of names of all the properties it failed to set along with the + * corresponding error numbers. The caller is responsible for freeing the + * returned errlist. + * + * If every property is set successfully, zero is returned and the list pointed + * at by errlist is NULL. 
+ */ +int +zfs_set_prop_nvlist(const char *dsname, zprop_source_t source, nvlist_t *nvl, + nvlist_t **errlist) +{ + nvpair_t *pair; + nvpair_t *propval; + int rv = 0; + uint64_t intval; + char *strval; + nvlist_t *genericnvl; + nvlist_t *errors; + nvlist_t *retrynvl; + + VERIFY(nvlist_alloc(&genericnvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_alloc(&retrynvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); + +retry: + pair = NULL; + while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) { + const char *propname = nvpair_name(pair); + zfs_prop_t prop = zfs_name_to_prop(propname); + int err = 0; + + /* decode the property value */ + propval = pair; + if (nvpair_type(pair) == DATA_TYPE_NVLIST) { + nvlist_t *attrs; + VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); + VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, + &propval) == 0); + } + + /* Validate value type */ + if (prop == ZPROP_INVAL) { + if (zfs_prop_user(propname)) { + if (nvpair_type(propval) != DATA_TYPE_STRING) + err = EINVAL; + } else if (zfs_prop_userquota(propname)) { + if (nvpair_type(propval) != + DATA_TYPE_UINT64_ARRAY) + err = EINVAL; + } + } else { + if (nvpair_type(propval) == DATA_TYPE_STRING) { + if (zfs_prop_get_type(prop) != PROP_TYPE_STRING) + err = EINVAL; + } else if (nvpair_type(propval) == DATA_TYPE_UINT64) { const char *unused; - VERIFY(nvpair_value_uint64(elem, &intval) == 0); + VERIFY(nvpair_value_uint64(propval, + &intval) == 0); switch (zfs_prop_get_type(prop)) { case PROP_TYPE_NUMBER: break; case PROP_TYPE_STRING: - return (EINVAL); + err = EINVAL; + break; case PROP_TYPE_INDEX: if (zfs_prop_index_to_string(prop, intval, &unused) != 0) - return (EINVAL); + err = EINVAL; break; default: cmn_err(CE_PANIC, "unknown property type"); - break; } + } else { + err = EINVAL; + } + } + + /* Validate permissions */ + if (err == 0) + err = zfs_check_settable(dsname, pair, CRED()); + + if (err == 0) { + err = zfs_prop_set_special(dsname, source, pair); + if (err == -1) { + /* + * For better performance we build up a list of + * properties to set in a single transaction. + */ + err = nvlist_add_nvpair(genericnvl, pair); + } else if (err != 0 && nvl != retrynvl) { + /* + * This may be a spurious error caused by + * receiving quota and reservation out of order. + * Try again in a second pass. + */ + err = nvlist_add_nvpair(retrynvl, pair); + } + } + + if (err != 0) + VERIFY(nvlist_add_int32(errors, propname, err) == 0); + } - if ((error = dsl_prop_set(name, propname, - 8, 1, &intval)) != 0) - return (error); + if (nvl != retrynvl && !nvlist_empty(retrynvl)) { + nvl = retrynvl; + goto retry; + } + + if (!nvlist_empty(genericnvl) && + dsl_props_set(dsname, source, genericnvl) != 0) { + /* + * If this fails, we still want to set as many properties as we + * can, so try setting them individually. 
+ */ + pair = NULL; + while ((pair = nvlist_next_nvpair(genericnvl, pair)) != NULL) { + const char *propname = nvpair_name(pair); + int err = 0; + + propval = pair; + if (nvpair_type(pair) == DATA_TYPE_NVLIST) { + nvlist_t *attrs; + VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); + VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, + &propval) == 0); + } + + if (nvpair_type(propval) == DATA_TYPE_STRING) { + VERIFY(nvpair_value_string(propval, + &strval) == 0); + err = dsl_prop_set(dsname, propname, source, 1, + strlen(strval) + 1, strval); } else { - return (EINVAL); + VERIFY(nvpair_value_uint64(propval, + &intval) == 0); + err = dsl_prop_set(dsname, propname, source, 8, + 1, &intval); + } + + if (err != 0) { + VERIFY(nvlist_add_int32(errors, propname, + err) == 0); } - break; } } + nvlist_free(genericnvl); + nvlist_free(retrynvl); + + if ((pair = nvlist_next_nvpair(errors, NULL)) == NULL) { + nvlist_free(errors); + errors = NULL; + } else { + VERIFY(nvpair_value_int32(pair, &rv) == 0); + } + + if (errlist == NULL) + nvlist_free(errors); + else + *errlist = errors; + + return (rv); +} + +/* + * Check that all the properties are valid user properties. + */ +static int +zfs_check_userprops(char *fsname, nvlist_t *nvl) +{ + nvpair_t *pair = NULL; + int error = 0; + + while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) { + const char *propname = nvpair_name(pair); + char *valstr; + + if (!zfs_prop_user(propname) || + nvpair_type(pair) != DATA_TYPE_STRING) + return (EINVAL); + if (error = zfs_secpolicy_write_perms(fsname, + ZFS_DELEG_PERM_USERPROP, CRED())) + return (error); + + if (strlen(propname) >= ZAP_MAXNAMELEN) + return (ENAMETOOLONG); + + VERIFY(nvpair_value_string(pair, &valstr) == 0); + if (strlen(valstr) >= ZAP_MAXVALUELEN) + return (E2BIG); + } return (0); } +static void +props_skip(nvlist_t *props, nvlist_t *skipped, nvlist_t **newprops) +{ + nvpair_t *pair; + + VERIFY(nvlist_alloc(newprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + pair = NULL; + while ((pair = nvlist_next_nvpair(props, pair)) != NULL) { + if (nvlist_exists(skipped, nvpair_name(pair))) + continue; + + VERIFY(nvlist_add_nvpair(*newprops, pair) == 0); + } +} + +static int +clear_received_props(objset_t *os, const char *fs, nvlist_t *props, + nvlist_t *skipped) +{ + int err = 0; + nvlist_t *cleared_props = NULL; + props_skip(props, skipped, &cleared_props); + if (!nvlist_empty(cleared_props)) { + /* + * Acts on local properties until the dataset has received + * properties at least once on or after SPA_VERSION_RECVD_PROPS. + */ + zprop_source_t flags = (ZPROP_SRC_NONE | + (dsl_prop_get_hasrecvd(os) ? ZPROP_SRC_RECEIVED : 0)); + err = zfs_set_prop_nvlist(fs, flags, cleared_props, NULL); + } + nvlist_free(cleared_props); + return (err); +} + /* * inputs: * zc_name name of filesystem - * zc_value name of property to inherit + * zc_value name of property to set * zc_nvlist_src{_size} nvlist of properties to apply - * zc_cookie clear existing local props? + * zc_cookie received properties flag * - * outputs: none + * outputs: + * zc_nvlist_dst{_size} error for each unapplied received property */ static int zfs_ioc_set_prop(zfs_cmd_t *zc) { nvlist_t *nvl; + boolean_t received = zc->zc_cookie; + zprop_source_t source = (received ? 
ZPROP_SRC_RECEIVED : + ZPROP_SRC_LOCAL); + nvlist_t *errors = NULL; int error; if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, - &nvl)) != 0) + zc->zc_iflags, &nvl)) != 0) return (error); - if (zc->zc_cookie) { + if (received) { nvlist_t *origprops; objset_t *os; - if (dmu_objset_open(zc->zc_name, DMU_OST_ANY, - DS_MODE_USER | DS_MODE_READONLY, &os) == 0) { - if (dsl_prop_get_all(os, &origprops, TRUE) == 0) { - clear_props(zc->zc_name, origprops); + if (dmu_objset_hold(zc->zc_name, FTAG, &os) == 0) { + if (dsl_prop_get_received(os, &origprops) == 0) { + (void) clear_received_props(os, + zc->zc_name, origprops, nvl); nvlist_free(origprops); } - dmu_objset_close(os); - } + dsl_prop_set_hasrecvd(os); + dmu_objset_rele(os, FTAG); + } } - error = zfs_set_prop_nvlist(zc->zc_name, nvl); + error = zfs_set_prop_nvlist(zc->zc_name, source, nvl, &errors); + + if (zc->zc_nvlist_dst != NULL && errors != NULL) { + (void) put_nvlist(zc, errors); + } + nvlist_free(errors); nvlist_free(nvl); return (error); } @@ -1649,14 +2275,75 @@ zfs_ioc_set_prop(zfs_cmd_t *zc) * inputs: * zc_name name of filesystem * zc_value name of property to inherit + * zc_cookie revert to received value if TRUE * * outputs: none */ static int zfs_ioc_inherit_prop(zfs_cmd_t *zc) { + const char *propname = zc->zc_value; + zfs_prop_t prop = zfs_name_to_prop(propname); + boolean_t received = zc->zc_cookie; + zprop_source_t source = (received + ? ZPROP_SRC_NONE /* revert to received value, if any */ + : ZPROP_SRC_INHERITED); /* explicitly inherit */ + + if (received) { + nvlist_t *dummy; + nvpair_t *pair; + zprop_type_t type; + int err; + + /* + * zfs_prop_set_special() expects properties in the form of an + * nvpair with type info. + */ + if (prop == ZPROP_INVAL) { + if (!zfs_prop_user(propname)) + return (EINVAL); + + type = PROP_TYPE_STRING; + } else if (prop == ZFS_PROP_VOLSIZE || + prop == ZFS_PROP_VERSION) { + return (EINVAL); + } else { + type = zfs_prop_get_type(prop); + } + + VERIFY(nvlist_alloc(&dummy, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + switch (type) { + case PROP_TYPE_STRING: + VERIFY(0 == nvlist_add_string(dummy, propname, "")); + break; + case PROP_TYPE_NUMBER: + case PROP_TYPE_INDEX: + VERIFY(0 == nvlist_add_uint64(dummy, propname, 0)); + break; + default: + nvlist_free(dummy); + return (EINVAL); + } + + pair = nvlist_next_nvpair(dummy, NULL); + err = zfs_prop_set_special(zc->zc_name, source, pair); + nvlist_free(dummy); + if (err != -1) + return (err); /* special property already handled */ + } else { + /* + * Only check this in the non-received case. We want to allow + * 'inherit -S' to revert non-inheritable properties like quota + * and reservation to the received or default values even though + * they are not considered inheritable. + */ + if (prop != ZPROP_INVAL && !zfs_prop_inheritable(prop)) + return (EINVAL); + } + /* the property name has been validated by zfs_secpolicy_inherit() */ - return (dsl_prop_set(zc->zc_name, zc->zc_value, 0, 0, NULL)); + return (dsl_prop_set(zc->zc_name, zc->zc_value, source, 0, 0, NULL)); } static int @@ -1665,11 +2352,32 @@ zfs_ioc_pool_set_props(zfs_cmd_t *zc) nvlist_t *props; spa_t *spa; int error; + nvpair_t *pair; - if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, - &props))) + if (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + zc->zc_iflags, &props)) return (error); + /* + * If the only property is the configfile, then just do a spa_lookup() + * to handle the faulted case. 
+ */ + pair = nvlist_next_nvpair(props, NULL); + if (pair != NULL && strcmp(nvpair_name(pair), + zpool_prop_to_name(ZPOOL_PROP_CACHEFILE)) == 0 && + nvlist_next_nvpair(props, pair) == NULL) { + mutex_enter(&spa_namespace_lock); + if ((spa = spa_lookup(zc->zc_name)) != NULL) { + spa_configfile_set(spa, props, B_FALSE); + spa_config_sync(spa, B_FALSE, B_TRUE); + } + mutex_exit(&spa_namespace_lock); + if (spa != NULL) { + nvlist_free(props); + return (0); + } + } + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) { nvlist_free(props); return (error); @@ -1690,20 +2398,27 @@ zfs_ioc_pool_get_props(zfs_cmd_t *zc) int error; nvlist_t *nvp = NULL; - if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) - return (error); - - error = spa_prop_get(spa, &nvp); + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) { + /* + * If the pool is faulted, there may be properties we can still + * get (such as altroot and cachefile), so attempt to get them + * anyway. + */ + mutex_enter(&spa_namespace_lock); + if ((spa = spa_lookup(zc->zc_name)) != NULL) + error = spa_prop_get(spa, &nvp); + mutex_exit(&spa_namespace_lock); + } else { + error = spa_prop_get(spa, &nvp); + spa_close(spa, FTAG); + } if (error == 0 && zc->zc_nvlist_dst != NULL) error = put_nvlist(zc, nvp); else error = EFAULT; - spa_close(spa, FTAG); - - if (nvp) - nvlist_free(nvp); + nvlist_free(nvp); return (error); } @@ -1719,7 +2434,7 @@ zfs_ioc_iscsi_perm_check(zfs_cmd_t *zc) cred_t *usercred; if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, - &nvp)) != 0) { + zc->zc_iflags, &nvp)) != 0) { return (error); } @@ -1769,7 +2484,7 @@ zfs_ioc_set_fsacl(zfs_cmd_t *zc) nvlist_t *fsaclnv = NULL; if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, - &fsaclnv)) != 0) + zc->zc_iflags, &fsaclnv)) != 0) return (error); /* @@ -1825,30 +2540,6 @@ zfs_ioc_get_fsacl(zfs_cmd_t *zc) return (error); } -/* - * inputs: - * zc_name name of volume - * - * outputs: none - */ -static int -zfs_ioc_create_minor(zfs_cmd_t *zc) -{ - return (zvol_create_minor(zc->zc_name, ddi_driver_major(zfs_dip))); -} - -/* - * inputs: - * zc_name name of volume - * - * outputs: none - */ -static int -zfs_ioc_remove_minor(zfs_cmd_t *zc) -{ - return (zvol_remove_minor(zc->zc_name)); -} - /* * Search the vfs list for a specified resource. Returns a pointer to it * or NULL if no suitable entry is found. The caller of this routine @@ -1906,11 +2597,10 @@ zfs_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) * processing. */ static int -zfs_fill_zplprops_impl(objset_t *os, uint64_t default_zplver, +zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver, boolean_t fuids_ok, nvlist_t *createprops, nvlist_t *zplprops, boolean_t *is_ci) { - uint64_t zplver = default_zplver; uint64_t sense = ZFS_PROP_UNDEFINED; uint64_t norm = ZFS_PROP_UNDEFINED; uint64_t u8 = ZFS_PROP_UNDEFINED; @@ -1998,6 +2688,8 @@ zfs_fill_zplprops(const char *dataset, nvlist_t *createprops, ASSERT(cp != NULL); cp[0] = '\0'; + if (zfs_earlier_version(dataset, SPA_VERSION_USERSPACE)) + zplver = ZPL_VERSION_USERSPACE - 1; if (zfs_earlier_version(dataset, SPA_VERSION_FUID)) { zplver = ZPL_VERSION_FUID - 1; fuids_ok = B_FALSE; @@ -2006,13 +2698,12 @@ zfs_fill_zplprops(const char *dataset, nvlist_t *createprops, /* * Open parent object set so we can inherit zplprop values. 
*/ - if ((error = dmu_objset_open(parentname, DMU_OST_ANY, - DS_MODE_USER | DS_MODE_READONLY, &os)) != 0) + if ((error = dmu_objset_hold(parentname, FTAG, &os)) != 0) return (error); error = zfs_fill_zplprops_impl(os, zplver, fuids_ok, createprops, zplprops, is_ci); - dmu_objset_close(os); + dmu_objset_rele(os, FTAG); return (error); } @@ -2073,7 +2764,7 @@ zfs_ioc_create(zfs_cmd_t *zc) if (zc->zc_nvlist_src != NULL && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, - &nvprops)) != 0) + zc->zc_iflags, &nvprops)) != 0) return (error); zct.zct_zplprops = NULL; @@ -2089,21 +2780,18 @@ zfs_ioc_create(zfs_cmd_t *zc) return (EINVAL); } - error = dmu_objset_open(zc->zc_value, type, - DS_MODE_USER | DS_MODE_READONLY, &clone); + error = dmu_objset_hold(zc->zc_value, FTAG, &clone); if (error) { nvlist_free(nvprops); return (error); } - error = dmu_objset_create(zc->zc_name, type, clone, 0, - NULL, NULL); + error = dmu_objset_clone(zc->zc_name, dmu_objset_ds(clone), 0); + dmu_objset_rele(clone, FTAG); if (error) { - dmu_objset_close(clone); nvlist_free(nvprops); return (error); } - dmu_objset_close(clone); } else { boolean_t is_insensitive = B_FALSE; @@ -2160,7 +2848,7 @@ zfs_ioc_create(zfs_cmd_t *zc) return (error); } } - error = dmu_objset_create(zc->zc_name, type, NULL, + error = dmu_objset_create(zc->zc_name, type, is_insensitive ? DS_FLAG_CI_DATASET : 0, cbfunc, &zct); nvlist_free(zct.zct_zplprops); } @@ -2169,41 +2857,24 @@ zfs_ioc_create(zfs_cmd_t *zc) * It would be nice to do this atomically. */ if (error == 0) { - if ((error = zfs_set_prop_nvlist(zc->zc_name, nvprops)) != 0) - (void) dmu_objset_destroy(zc->zc_name); + error = zfs_set_prop_nvlist(zc->zc_name, ZPROP_SRC_LOCAL, + nvprops, NULL); + if (error != 0) + (void) dmu_objset_destroy(zc->zc_name, B_FALSE); } nvlist_free(nvprops); return (error); } -struct snap_prop_arg { - nvlist_t *nvprops; - const char *snapname; -}; - -static int -set_snap_props(char *name, void *arg) -{ - struct snap_prop_arg *snpa = arg; - int len = strlen(name) + strlen(snpa->snapname) + 2; - char *buf = kmem_alloc(len, KM_SLEEP); - int err; - - (void) snprintf(buf, len, "%s@%s", name, snpa->snapname); - err = zfs_set_prop_nvlist(buf, snpa->nvprops); - if (err) - (void) dmu_objset_destroy(buf); - kmem_free(buf, len); - return (err); -} - /* * inputs: * zc_name name of filesystem * zc_value short name of snapshot * zc_cookie recursive flag + * zc_nvlist_src[_size] property list * - * outputs: none + * outputs: + * zc_value short snapname (i.e. part after the '@') */ static int zfs_ioc_snapshot(zfs_cmd_t *zc) @@ -2217,48 +2888,37 @@ zfs_ioc_snapshot(zfs_cmd_t *zc) if (zc->zc_nvlist_src != NULL && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, - &nvprops)) != 0) + zc->zc_iflags, &nvprops)) != 0) return (error); - error = dmu_objset_snapshot(zc->zc_name, zc->zc_value, recursive); + error = zfs_check_userprops(zc->zc_name, nvprops); + if (error) + goto out; - /* - * It would be nice to do this atomically. 
- */ - if (error == 0) { - struct snap_prop_arg snpa; - snpa.nvprops = nvprops; - snpa.snapname = zc->zc_value; - if (recursive) { - error = dmu_objset_find(zc->zc_name, - set_snap_props, &snpa, DS_FIND_CHILDREN); - if (error) { - (void) dmu_snapshots_destroy(zc->zc_name, - zc->zc_value); - } - } else { - error = set_snap_props(zc->zc_name, &snpa); - } + if (!nvlist_empty(nvprops) && + zfs_earlier_version(zc->zc_name, SPA_VERSION_SNAP_PROPS)) { + error = ENOTSUP; + goto out; } + + error = dmu_objset_snapshot(zc->zc_name, zc->zc_value, + nvprops, recursive); + +out: nvlist_free(nvprops); return (error); } int -zfs_unmount_snap(char *name, void *arg) +zfs_unmount_snap(const char *name, void *arg) { vfs_t *vfsp = NULL; if (arg) { char *snapname = arg; - int len = strlen(name) + strlen(snapname) + 2; - char *buf = kmem_alloc(len, KM_SLEEP); - - (void) strcpy(buf, name); - (void) strcat(buf, "@"); - (void) strcat(buf, snapname); - vfsp = zfs_get_vfs(buf); - kmem_free(buf, len); + char *fullname = kmem_asprintf("%s@%s", name, snapname); + vfsp = zfs_get_vfs(fullname); + strfree(fullname); } else if (strchr(name, '@')) { vfsp = zfs_get_vfs(name); } @@ -2283,8 +2943,9 @@ zfs_unmount_snap(char *name, void *arg) /* * inputs: - * zc_name name of filesystem - * zc_value short name of snapshot + * zc_name name of filesystem + * zc_value short name of snapshot + * zc_defer_destroy mark for deferred destroy * * outputs: none */ @@ -2299,26 +2960,32 @@ zfs_ioc_destroy_snaps(zfs_cmd_t *zc) zfs_unmount_snap, zc->zc_value, DS_FIND_CHILDREN); if (err) return (err); - return (dmu_snapshots_destroy(zc->zc_name, zc->zc_value)); + return (dmu_snapshots_destroy(zc->zc_name, zc->zc_value, + zc->zc_defer_destroy)); } /* * inputs: * zc_name name of dataset to destroy * zc_objset_type type of objset + * zc_defer_destroy mark for deferred destroy * * outputs: none */ static int zfs_ioc_destroy(zfs_cmd_t *zc) { + int err; if (strchr(zc->zc_name, '@') && zc->zc_objset_type == DMU_OST_ZFS) { - int err = zfs_unmount_snap(zc->zc_name, NULL); + err = zfs_unmount_snap(zc->zc_name, NULL); if (err) return (err); } - return (dmu_objset_destroy(zc->zc_name)); + err = dmu_objset_destroy(zc->zc_name, zc->zc_defer_destroy); + if (zc->zc_objset_type == DMU_OST_ZVOL && err == 0) + (void) zvol_remove_minor(zc->zc_name); + return (err); } /* @@ -2330,50 +2997,78 @@ zfs_ioc_destroy(zfs_cmd_t *zc) static int zfs_ioc_rollback(zfs_cmd_t *zc) { - objset_t *os; + dsl_dataset_t *ds, *clone; int error; - zfsvfs_t *zfsvfs = NULL; + zfsvfs_t *zfsvfs; + char *clone_name; - /* - * Get the zfsvfs for the receiving objset. There - * won't be one if we're operating on a zvol, if the - * objset doesn't exist yet, or is not mounted. - */ - error = dmu_objset_open(zc->zc_name, DMU_OST_ANY, DS_MODE_USER, &os); + error = dsl_dataset_hold(zc->zc_name, FTAG, &ds); if (error) return (error); - if (dmu_objset_type(os) == DMU_OST_ZFS) { - mutex_enter(&os->os->os_user_ptr_lock); - zfsvfs = dmu_objset_get_user(os); - if (zfsvfs != NULL) - VFS_HOLD(zfsvfs->z_vfs); - mutex_exit(&os->os->os_user_ptr_lock); + /* must not be a snapshot */ + if (dsl_dataset_is_snapshot(ds)) { + dsl_dataset_rele(ds, FTAG); + return (EINVAL); + } + + /* must have a most recent snapshot */ + if (ds->ds_phys->ds_prev_snap_txg < TXG_INITIAL) { + dsl_dataset_rele(ds, FTAG); + return (EINVAL); } - if (zfsvfs != NULL) { - char *osname; - int mode; + /* + * Create clone of most recent snapshot. 
+ */ + clone_name = kmem_asprintf("%s/%%rollback", zc->zc_name); + error = dmu_objset_clone(clone_name, ds->ds_prev, DS_FLAG_INCONSISTENT); + if (error) + goto out; + + error = dsl_dataset_own(clone_name, B_TRUE, FTAG, &clone); + if (error) + goto out; - osname = kmem_alloc(MAXNAMELEN, KM_SLEEP); - error = zfs_suspend_fs(zfsvfs, osname, &mode); + /* + * Do clone swap. + */ + if (getzfsvfs(zc->zc_name, &zfsvfs) == 0) { + error = zfs_suspend_fs(zfsvfs); if (error == 0) { int resume_err; - ASSERT(strcmp(osname, zc->zc_name) == 0); - error = dmu_objset_rollback(os); - resume_err = zfs_resume_fs(zfsvfs, osname, mode); + if (dsl_dataset_tryown(ds, B_FALSE, FTAG)) { + error = dsl_dataset_clone_swap(clone, ds, + B_TRUE); + dsl_dataset_disown(ds, FTAG); + ds = NULL; + } else { + error = EBUSY; + } + resume_err = zfs_resume_fs(zfsvfs, zc->zc_name); error = error ? error : resume_err; - } else { - dmu_objset_close(os); } - kmem_free(osname, MAXNAMELEN); VFS_RELE(zfsvfs->z_vfs); } else { - error = dmu_objset_rollback(os); + if (dsl_dataset_tryown(ds, B_FALSE, FTAG)) { + error = dsl_dataset_clone_swap(clone, ds, B_TRUE); + dsl_dataset_disown(ds, FTAG); + ds = NULL; + } else { + error = EBUSY; + } } - /* Note, the dmu_objset_rollback() releases the objset for us. */ + /* + * Destroy clone (which also closes it). + */ + (void) dsl_dataset_destroy(clone, FTAG, B_FALSE); + +out: + strfree(clone_name); + if (ds) + dsl_dataset_rele(ds, FTAG); return (error); } @@ -2406,28 +3101,267 @@ zfs_ioc_rename(zfs_cmd_t *zc) if (err) return (err); } + if (zc->zc_objset_type == DMU_OST_ZVOL) + (void) zvol_remove_minor(zc->zc_name); return (dmu_objset_rename(zc->zc_name, zc->zc_value, recursive)); } -static void -clear_props(char *dataset, nvlist_t *props) +static int +zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) +{ + const char *propname = nvpair_name(pair); + boolean_t issnap = (strchr(dsname, '@') != NULL); + zfs_prop_t prop = zfs_name_to_prop(propname); + uint64_t intval; + int err; + + if (prop == ZPROP_INVAL) { + if (zfs_prop_user(propname)) { + if (err = zfs_secpolicy_write_perms(dsname, + ZFS_DELEG_PERM_USERPROP, cr)) + return (err); + return (0); + } + + if (!issnap && zfs_prop_userquota(propname)) { + const char *perm = NULL; + const char *uq_prefix = + zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA]; + const char *gq_prefix = + zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA]; + + if (strncmp(propname, uq_prefix, + strlen(uq_prefix)) == 0) { + perm = ZFS_DELEG_PERM_USERQUOTA; + } else if (strncmp(propname, gq_prefix, + strlen(gq_prefix)) == 0) { + perm = ZFS_DELEG_PERM_GROUPQUOTA; + } else { + /* USERUSED and GROUPUSED are read-only */ + return (EINVAL); + } + + if (err = zfs_secpolicy_write_perms(dsname, perm, cr)) + return (err); + return (0); + } + + return (EINVAL); + } + + if (issnap) + return (EINVAL); + + if (nvpair_type(pair) == DATA_TYPE_NVLIST) { + /* + * dsl_prop_get_all_impl() returns properties in this + * format. + */ + nvlist_t *attrs; + VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); + VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, + &pair) == 0); + } + + /* + * Check that this value is valid for this pool version + */ + switch (prop) { + case ZFS_PROP_COMPRESSION: + /* + * If the user specified gzip compression, make sure + * the SPA supports it. We ignore any errors here since + * we'll catch them later. 
+ */ + if (nvpair_type(pair) == DATA_TYPE_UINT64 && + nvpair_value_uint64(pair, &intval) == 0) { + if (intval >= ZIO_COMPRESS_GZIP_1 && + intval <= ZIO_COMPRESS_GZIP_9 && + zfs_earlier_version(dsname, + SPA_VERSION_GZIP_COMPRESSION)) { + return (ENOTSUP); + } + + if (intval == ZIO_COMPRESS_ZLE && + zfs_earlier_version(dsname, + SPA_VERSION_ZLE_COMPRESSION)) + return (ENOTSUP); + + /* + * If this is a bootable dataset then + * verify that the compression algorithm + * is supported for booting. We must return + * something other than ENOTSUP since it + * implies a downrev pool version. + */ + if (zfs_is_bootfs(dsname) && + !BOOTFS_COMPRESS_VALID(intval)) { + return (ERANGE); + } + } + break; + + case ZFS_PROP_COPIES: + if (zfs_earlier_version(dsname, SPA_VERSION_DITTO_BLOCKS)) + return (ENOTSUP); + break; + + case ZFS_PROP_DEDUP: + if (zfs_earlier_version(dsname, SPA_VERSION_DEDUP)) + return (ENOTSUP); + break; + + case ZFS_PROP_SHARESMB: + if (zpl_earlier_version(dsname, ZPL_VERSION_FUID)) + return (ENOTSUP); + break; + + case ZFS_PROP_ACLINHERIT: + if (nvpair_type(pair) == DATA_TYPE_UINT64 && + nvpair_value_uint64(pair, &intval) == 0) { + if (intval == ZFS_ACL_PASSTHROUGH_X && + zfs_earlier_version(dsname, + SPA_VERSION_PASSTHROUGH_X)) + return (ENOTSUP); + } + break; + } + + return (zfs_secpolicy_setprop(dsname, prop, pair, CRED())); +} + +/* + * Removes properties from the given props list that fail permission checks + * needed to clear them and to restore them in case of a receive error. For each + * property, make sure we have both set and inherit permissions. + * + * Returns the first error encountered if any permission checks fail. If the + * caller provides a non-NULL errlist, it also gives the complete list of names + * of all the properties that failed a permission check along with the + * corresponding error numbers. The caller is responsible for freeing the + * returned errlist. + * + * If every property checks out successfully, zero is returned and the list + * pointed at by errlist is NULL. 
+ */ +static int +zfs_check_clearable(char *dataset, nvlist_t *props, nvlist_t **errlist) { zfs_cmd_t *zc; - nvpair_t *prop; + nvpair_t *pair, *next_pair; + nvlist_t *errors; + int err, rv = 0; if (props == NULL) - return; + return (0); + + VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0); + zc = kmem_alloc(sizeof (zfs_cmd_t), KM_SLEEP); (void) strcpy(zc->zc_name, dataset); - for (prop = nvlist_next_nvpair(props, NULL); prop; - prop = nvlist_next_nvpair(props, prop)) { - (void) strcpy(zc->zc_value, nvpair_name(prop)); - if (zfs_secpolicy_inherit(zc, CRED()) == 0) - (void) zfs_ioc_inherit_prop(zc); + pair = nvlist_next_nvpair(props, NULL); + while (pair != NULL) { + next_pair = nvlist_next_nvpair(props, pair); + + (void) strcpy(zc->zc_value, nvpair_name(pair)); + if ((err = zfs_check_settable(dataset, pair, CRED())) != 0 || + (err = zfs_secpolicy_inherit(zc, CRED())) != 0) { + VERIFY(nvlist_remove_nvpair(props, pair) == 0); + VERIFY(nvlist_add_int32(errors, + zc->zc_value, err) == 0); + } + pair = next_pair; + } + kmem_free(zc, sizeof (zfs_cmd_t)); + + if ((pair = nvlist_next_nvpair(errors, NULL)) == NULL) { + nvlist_free(errors); + errors = NULL; + } else { + VERIFY(nvpair_value_int32(pair, &rv) == 0); + } + + if (errlist == NULL) + nvlist_free(errors); + else + *errlist = errors; + + return (rv); +} + +static boolean_t +propval_equals(nvpair_t *p1, nvpair_t *p2) +{ + if (nvpair_type(p1) == DATA_TYPE_NVLIST) { + /* dsl_prop_get_all_impl() format */ + nvlist_t *attrs; + VERIFY(nvpair_value_nvlist(p1, &attrs) == 0); + VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, + &p1) == 0); + } + + if (nvpair_type(p2) == DATA_TYPE_NVLIST) { + nvlist_t *attrs; + VERIFY(nvpair_value_nvlist(p2, &attrs) == 0); + VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, + &p2) == 0); + } + + if (nvpair_type(p1) != nvpair_type(p2)) + return (B_FALSE); + + if (nvpair_type(p1) == DATA_TYPE_STRING) { + char *valstr1, *valstr2; + + VERIFY(nvpair_value_string(p1, (char **)&valstr1) == 0); + VERIFY(nvpair_value_string(p2, (char **)&valstr2) == 0); + return (strcmp(valstr1, valstr2) == 0); + } else { + uint64_t intval1, intval2; + + VERIFY(nvpair_value_uint64(p1, &intval1) == 0); + VERIFY(nvpair_value_uint64(p2, &intval2) == 0); + return (intval1 == intval2); + } +} + +/* + * Remove properties from props if they are not going to change (as determined + * by comparison with origprops). Remove them from origprops as well, since we + * do not need to clear or restore properties that won't change. 
+ */ +static void +props_reduce(nvlist_t *props, nvlist_t *origprops) +{ + nvpair_t *pair, *next_pair; + + if (origprops == NULL) + return; /* all props need to be received */ + + pair = nvlist_next_nvpair(props, NULL); + while (pair != NULL) { + const char *propname = nvpair_name(pair); + nvpair_t *match; + + next_pair = nvlist_next_nvpair(props, pair); + + if ((nvlist_lookup_nvpair(origprops, propname, + &match) != 0) || !propval_equals(pair, match)) + goto next; /* need to set received value */ + + /* don't clear the existing received value */ + (void) nvlist_remove_nvpair(origprops, match); + /* don't bother receiving the property */ + (void) nvlist_remove_nvpair(props, pair); +next: + pair = next_pair; } - kmem_free(zc, sizeof (zfs_cmd_t)); } +#ifdef DEBUG +static boolean_t zfs_ioc_recv_inject_err; +#endif + /* * inputs: * zc_name name of containing filesystem @@ -2440,6 +3374,8 @@ clear_props(char *dataset, nvlist_t *props) * * outputs: * zc_cookie number of bytes read + * zc_nvlist_dst{_size} error for each unapplied received property + * zc_obj zprop_errflags_t */ static int zfs_ioc_recv(zfs_cmd_t *zc) @@ -2447,15 +3383,18 @@ zfs_ioc_recv(zfs_cmd_t *zc) file_t *fp; objset_t *os; dmu_recv_cookie_t drc; - zfsvfs_t *zfsvfs = NULL; boolean_t force = (boolean_t)zc->zc_guid; - int error, fd; + int fd; + int error = 0; + int props_error = 0; + nvlist_t *errors; offset_t off; - nvlist_t *props = NULL; - nvlist_t *origprops = NULL; + nvlist_t *props = NULL; /* sent properties */ + nvlist_t *origprops = NULL; /* existing properties */ objset_t *origin = NULL; char *tosnap; char tofs[ZFS_MAXNAMELEN]; + boolean_t first_recvd_props = B_FALSE; if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || strchr(zc->zc_value, '@') == NULL || @@ -2464,12 +3403,11 @@ zfs_ioc_recv(zfs_cmd_t *zc) (void) strcpy(tofs, zc->zc_value); tosnap = strchr(tofs, '@'); - *tosnap = '\0'; - tosnap++; + *tosnap++ = '\0'; if (zc->zc_nvlist_src != NULL && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, - &props)) != 0) + zc->zc_iflags, &props)) != 0) return (error); fd = zc->zc_cookie; @@ -2479,105 +3417,182 @@ zfs_ioc_recv(zfs_cmd_t *zc) return (EBADF); } - if (dmu_objset_open(tofs, DMU_OST_ANY, - DS_MODE_USER | DS_MODE_READONLY, &os) == 0) { - /* - * Try to get the zfsvfs for the receiving objset. - * There won't be one if we're operating on a zvol, - * if the objset doesn't exist yet, or is not mounted. - */ - mutex_enter(&os->os->os_user_ptr_lock); - if (zfsvfs = dmu_objset_get_user(os)) { - if (!mutex_tryenter(&zfsvfs->z_online_recv_lock)) { - mutex_exit(&os->os->os_user_ptr_lock); - dmu_objset_close(os); - zfsvfs = NULL; - error = EBUSY; - goto out; - } - VFS_HOLD(zfsvfs->z_vfs); + VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + if (props && dmu_objset_hold(tofs, FTAG, &os) == 0) { + if ((spa_version(os->os_spa) >= SPA_VERSION_RECVD_PROPS) && + !dsl_prop_get_hasrecvd(os)) { + first_recvd_props = B_TRUE; } - mutex_exit(&os->os->os_user_ptr_lock); /* - * If new properties are supplied, they are to completely - * replace the existing ones, so stash away the existing ones. + * If new received properties are supplied, they are to + * completely replace the existing received properties, so stash + * away the existing ones. */ - if (props) - (void) dsl_prop_get_all(os, &origprops, TRUE); + if (dsl_prop_get_received(os, &origprops) == 0) { + nvlist_t *errlist = NULL; + /* + * Don't bother writing a property if its value won't + * change (and avoid the unnecessary security checks). 
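+ * (props_reduce() is what strips those unchanged
+ * entries from both lists.)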
+ * + * The first receive after SPA_VERSION_RECVD_PROPS is a + * special case where we blow away all local properties + * regardless. + */ + if (!first_recvd_props) + props_reduce(props, origprops); + if (zfs_check_clearable(tofs, origprops, + &errlist) != 0) + (void) nvlist_merge(errors, errlist, 0); + nvlist_free(errlist); + } - dmu_objset_close(os); + dmu_objset_rele(os, FTAG); } if (zc->zc_string[0]) { - error = dmu_objset_open(zc->zc_string, DMU_OST_ANY, - DS_MODE_USER | DS_MODE_READONLY, &origin); + error = dmu_objset_hold(zc->zc_string, FTAG, &origin); if (error) goto out; } - error = dmu_recv_begin(tofs, tosnap, &zc->zc_begin_record, - force, origin, zfsvfs != NULL, &drc); + error = dmu_recv_begin(tofs, tosnap, zc->zc_top_ds, + &zc->zc_begin_record, force, origin, &drc); if (origin) - dmu_objset_close(origin); + dmu_objset_rele(origin, FTAG); if (error) goto out; /* - * Reset properties. We do this before we receive the stream - * so that the properties are applied to the new data. + * Set properties before we receive the stream so that they are applied + * to the new data. Note that we must call dmu_recv_stream() if + * dmu_recv_begin() succeeds. */ if (props) { - clear_props(tofs, origprops); + nvlist_t *errlist; + + if (dmu_objset_from_ds(drc.drc_logical_ds, &os) == 0) { + if (drc.drc_newfs) { + if (spa_version(os->os_spa) >= + SPA_VERSION_RECVD_PROPS) + first_recvd_props = B_TRUE; + } else if (origprops != NULL) { + if (clear_received_props(os, tofs, origprops, + first_recvd_props ? NULL : props) != 0) + zc->zc_obj |= ZPROP_ERR_NOCLEAR; + } else { + zc->zc_obj |= ZPROP_ERR_NOCLEAR; + } + dsl_prop_set_hasrecvd(os); + } else if (!drc.drc_newfs) { + zc->zc_obj |= ZPROP_ERR_NOCLEAR; + } + + (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED, + props, &errlist); + (void) nvlist_merge(errors, errlist, 0); + nvlist_free(errlist); + } + + if (fit_error_list(zc, &errors) != 0 || put_nvlist(zc, errors) != 0) { /* - * XXX - Note, this is all-or-nothing; should be best-effort. + * Caller made zc->zc_nvlist_dst less than the minimum expected + * size or supplied an invalid address. */ - (void) zfs_set_prop_nvlist(tofs, props); + props_error = EINVAL; } off = fp->f_offset; error = dmu_recv_stream(&drc, fp->f_vnode, &off); - if (error == 0 && zfsvfs) { - char *osname; - int mode; + if (error == 0) { + zfsvfs_t *zfsvfs = NULL; - /* online recv */ - osname = kmem_alloc(MAXNAMELEN, KM_SLEEP); - error = zfs_suspend_fs(zfsvfs, osname, &mode); - if (error == 0) { - int resume_err; + if (getzfsvfs(tofs, &zfsvfs) == 0) { + /* online recv */ + int end_err; - error = dmu_recv_end(&drc); - resume_err = zfs_resume_fs(zfsvfs, osname, mode); - error = error ? error : resume_err; + error = zfs_suspend_fs(zfsvfs); + /* + * If the suspend fails, then the recv_end will + * likely also fail, and clean up after itself. + */ + end_err = dmu_recv_end(&drc); + if (error == 0) { + int resume_err = + zfs_resume_fs(zfsvfs, tofs); + error = error ? error : resume_err; + } + error = error ? error : end_err; + VFS_RELE(zfsvfs->z_vfs); } else { - dmu_recv_abort_cleanup(&drc); + error = dmu_recv_end(&drc); } - kmem_free(osname, MAXNAMELEN); - } else if (error == 0) { - error = dmu_recv_end(&drc); } zc->zc_cookie = off - fp->f_offset; if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0) fp->f_offset = off; +#ifdef DEBUG + if (zfs_ioc_recv_inject_err) { + zfs_ioc_recv_inject_err = B_FALSE; + error = 1; + } +#endif /* * On error, restore the original props. 
*/ if (error && props) { - clear_props(tofs, props); - (void) zfs_set_prop_nvlist(tofs, origprops); + if (dmu_objset_hold(tofs, FTAG, &os) == 0) { + if (clear_received_props(os, tofs, props, NULL) != 0) { + /* + * We failed to clear the received properties. + * Since we may have left a $recvd value on the + * system, we can't clear the $hasrecvd flag. + */ + zc->zc_obj |= ZPROP_ERR_NORESTORE; + } else if (first_recvd_props) { + dsl_prop_unset_hasrecvd(os); + } + dmu_objset_rele(os, FTAG); + } else if (!drc.drc_newfs) { + /* We failed to clear the received properties. */ + zc->zc_obj |= ZPROP_ERR_NORESTORE; + } + + if (origprops == NULL && !drc.drc_newfs) { + /* We failed to stash the original properties. */ + zc->zc_obj |= ZPROP_ERR_NORESTORE; + } + + /* + * dsl_props_set() will not convert RECEIVED to LOCAL on or + * after SPA_VERSION_RECVD_PROPS, so we need to specify LOCAL + * explictly if we're restoring local properties cleared in the + * first new-style receive. + */ + if (origprops != NULL && + zfs_set_prop_nvlist(tofs, (first_recvd_props ? + ZPROP_SRC_LOCAL : ZPROP_SRC_RECEIVED), + origprops, NULL) != 0) { + /* + * We stashed the original properties but failed to + * restore them. + */ + zc->zc_obj |= ZPROP_ERR_NORESTORE; + } } out: - if (zfsvfs) { - mutex_exit(&zfsvfs->z_online_recv_lock); - VFS_RELE(zfsvfs->z_vfs); - } nvlist_free(props); nvlist_free(origprops); + nvlist_free(errors); releasef(fd); + + if (error == 0) + error = props_error; + return (error); } @@ -2599,8 +3614,7 @@ zfs_ioc_send(zfs_cmd_t *zc) int error; offset_t off; - error = dmu_objset_open(zc->zc_name, DMU_OST_ANY, - DS_MODE_USER | DS_MODE_READONLY, &tosnap); + error = dmu_objset_hold(zc->zc_name, FTAG, &tosnap); if (error) return (error); @@ -2614,20 +3628,19 @@ zfs_ioc_send(zfs_cmd_t *zc) if (cp) *(cp+1) = 0; (void) strncat(buf, zc->zc_value, MAXPATHLEN); - error = dmu_objset_open(buf, DMU_OST_ANY, - DS_MODE_USER | DS_MODE_READONLY, &fromsnap); + error = dmu_objset_hold(buf, FTAG, &fromsnap); kmem_free(buf, MAXPATHLEN); if (error) { - dmu_objset_close(tosnap); + dmu_objset_rele(tosnap, FTAG); return (error); } } fp = getf(zc->zc_cookie); if (fp == NULL) { - dmu_objset_close(tosnap); + dmu_objset_rele(tosnap, FTAG); if (fromsnap) - dmu_objset_close(fromsnap); + dmu_objset_rele(fromsnap, FTAG); return (EBADF); } @@ -2638,8 +3651,8 @@ zfs_ioc_send(zfs_cmd_t *zc) fp->f_offset = off; releasef(zc->zc_cookie); if (fromsnap) - dmu_objset_close(fromsnap); - dmu_objset_close(tosnap); + dmu_objset_rele(fromsnap, FTAG); + dmu_objset_rele(tosnap, FTAG); return (error); } @@ -2715,16 +3728,38 @@ zfs_ioc_clear(zfs_cmd_t *zc) mutex_exit(&spa_namespace_lock); return (EIO); } - if (spa->spa_log_state == SPA_LOG_MISSING) { + if (spa_get_log_state(spa) == SPA_LOG_MISSING) { /* we need to let spa_open/spa_load clear the chains */ - spa->spa_log_state = SPA_LOG_CLEAR; + spa_set_log_state(spa, SPA_LOG_CLEAR); } + spa->spa_last_open_failed = 0; mutex_exit(&spa_namespace_lock); - if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) + if (zc->zc_cookie & ZPOOL_NO_REWIND) { + error = spa_open(zc->zc_name, &spa, FTAG); + } else { + nvlist_t *policy; + nvlist_t *config = NULL; + + if (zc->zc_nvlist_src == NULL) + return (EINVAL); + + if ((error = get_nvlist(zc->zc_nvlist_src, + zc->zc_nvlist_src_size, zc->zc_iflags, &policy)) == 0) { + error = spa_open_rewind(zc->zc_name, &spa, FTAG, + policy, &config); + if (config != NULL) { + (void) put_nvlist(zc, config); + nvlist_free(config); + } + nvlist_free(policy); + } + } + + if (error) 
return (error); - spa_vdev_state_enter(spa); + spa_vdev_state_enter(spa, SCL_NONE); if (zc->zc_guid == 0) { vd = NULL; @@ -2744,11 +3779,12 @@ zfs_ioc_clear(zfs_cmd_t *zc) /* * Resume any suspended I/Os. */ - zio_resume(spa); + if (zio_resume(spa) != 0) + error = EIO; spa_close(spa, FTAG); - return (0); + return (error); } /* @@ -2756,7 +3792,8 @@ zfs_ioc_clear(zfs_cmd_t *zc) * zc_name name of filesystem * zc_value name of origin snapshot * - * outputs: none + * outputs: + * zc_string name of conflicting snapshot, if there is one */ static int zfs_ioc_promote(zfs_cmd_t *zc) @@ -2772,7 +3809,118 @@ zfs_ioc_promote(zfs_cmd_t *zc) *cp = '\0'; (void) dmu_objset_find(zc->zc_value, zfs_unmount_snap, NULL, DS_FIND_SNAPSHOTS); - return (dsl_dataset_promote(zc->zc_name)); + return (dsl_dataset_promote(zc->zc_name, zc->zc_string)); +} + +/* + * Retrieve a single {user|group}{used|quota}@... property. + * + * inputs: + * zc_name name of filesystem + * zc_objset_type zfs_userquota_prop_t + * zc_value domain name (eg. "S-1-234-567-89") + * zc_guid RID/UID/GID + * + * outputs: + * zc_cookie property value + */ +static int +zfs_ioc_userspace_one(zfs_cmd_t *zc) +{ + zfsvfs_t *zfsvfs; + int error; + + if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) + return (EINVAL); + + error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs); + if (error) + return (error); + + error = zfs_userspace_one(zfsvfs, + zc->zc_objset_type, zc->zc_value, zc->zc_guid, &zc->zc_cookie); + zfsvfs_rele(zfsvfs, FTAG); + + return (error); +} + +/* + * inputs: + * zc_name name of filesystem + * zc_cookie zap cursor + * zc_objset_type zfs_userquota_prop_t + * zc_nvlist_dst[_size] buffer to fill (not really an nvlist) + * + * outputs: + * zc_nvlist_dst[_size] data buffer (array of zfs_useracct_t) + * zc_cookie zap cursor + */ +static int +zfs_ioc_userspace_many(zfs_cmd_t *zc) +{ + zfsvfs_t *zfsvfs; + int error; + + error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs); + if (error) + return (error); + + int bufsize = zc->zc_nvlist_dst_size; + void *buf = kmem_alloc(bufsize, KM_SLEEP); + + error = zfs_userspace_many(zfsvfs, zc->zc_objset_type, &zc->zc_cookie, + buf, &zc->zc_nvlist_dst_size); + + if (error == 0) { + error = xcopyout(buf, + (void *)(uintptr_t)zc->zc_nvlist_dst, + zc->zc_nvlist_dst_size); + } + kmem_free(buf, bufsize); + zfsvfs_rele(zfsvfs, FTAG); + + return (error); +} + +/* + * inputs: + * zc_name name of filesystem + * + * outputs: + * none + */ +static int +zfs_ioc_userspace_upgrade(zfs_cmd_t *zc) +{ + objset_t *os; + int error = 0; + zfsvfs_t *zfsvfs; + + if (getzfsvfs(zc->zc_name, &zfsvfs) == 0) { + if (!dmu_objset_userused_enabled(zfsvfs->z_os)) { + /* + * If userused is not enabled, it may be because the + * objset needs to be closed & reopened (to grow the + * objset_phys_t). Suspend/resume the fs will do that. + */ + error = zfs_suspend_fs(zfsvfs); + if (error == 0) + error = zfs_resume_fs(zfsvfs, zc->zc_name); + } + if (error == 0) + error = dmu_objset_userspace_upgrade(zfsvfs->z_os); + VFS_RELE(zfsvfs->z_vfs); + } else { + /* XXX kind of reading contents without owning */ + error = dmu_objset_hold(zc->zc_name, FTAG, &os); + if (error) + return (error); + + error = dmu_objset_userspace_upgrade(os); + dmu_objset_rele(os, FTAG); + } + + return (error); } /* @@ -2888,7 +4036,7 @@ zfs_ioc_share(zfs_cmd_t *zc) if (error = zsmbexport_fs((void *) (uintptr_t)zc->zc_share.z_exportdata, zc->zc_share.z_sharetype == ZFS_SHARE_SMB ? 
- B_TRUE : B_FALSE)) { + B_TRUE: B_FALSE)) { return (error); } break; @@ -2909,64 +4057,350 @@ zfs_ioc_share(zfs_cmd_t *zc) } +ace_t full_access[] = { + {(uid_t)-1, ACE_ALL_PERMS, ACE_EVERYONE, 0} +}; + +/* + * Remove all ACL files in shares dir + */ +static int +zfs_smb_acl_purge(znode_t *dzp) +{ + zap_cursor_t zc; + zap_attribute_t zap; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + int error; + + for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id); + (error = zap_cursor_retrieve(&zc, &zap)) == 0; + zap_cursor_advance(&zc)) { + if ((error = VOP_REMOVE(ZTOV(dzp), zap.za_name, kcred, + NULL, 0)) != 0) + break; + } + zap_cursor_fini(&zc); + return (error); +} + +static int +zfs_ioc_smb_acl(zfs_cmd_t *zc) +{ + vnode_t *vp; + znode_t *dzp; + vnode_t *resourcevp = NULL; + znode_t *sharedir; + zfsvfs_t *zfsvfs; + nvlist_t *nvlist; + char *src, *target; + vattr_t vattr; + vsecattr_t vsec; + int error = 0; + + if ((error = lookupname(zc->zc_value, UIO_SYSSPACE, + NO_FOLLOW, NULL, &vp)) != 0) + return (error); + + /* Now make sure mntpnt and dataset are ZFS */ + + if (vp->v_vfsp->vfs_fstype != zfsfstype || + (strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource), + zc->zc_name) != 0)) { + VN_RELE(vp); + return (EINVAL); + } + + dzp = VTOZ(vp); + zfsvfs = dzp->z_zfsvfs; + ZFS_ENTER(zfsvfs); + + /* + * Create share dir if its missing. + */ + mutex_enter(&zfsvfs->z_lock); + if (zfsvfs->z_shares_dir == 0) { + dmu_tx_t *tx; + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, TRUE, + ZFS_SHARES_DIR); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + } else { + error = zfs_create_share_dir(zfsvfs, tx); + dmu_tx_commit(tx); + } + if (error) { + mutex_exit(&zfsvfs->z_lock); + VN_RELE(vp); + ZFS_EXIT(zfsvfs); + return (error); + } + } + mutex_exit(&zfsvfs->z_lock); + + ASSERT(zfsvfs->z_shares_dir); + if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &sharedir)) != 0) { + VN_RELE(vp); + ZFS_EXIT(zfsvfs); + return (error); + } + + switch (zc->zc_cookie) { + case ZFS_SMB_ACL_ADD: + vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE; + vattr.va_type = VREG; + vattr.va_mode = S_IFREG|0777; + vattr.va_uid = 0; + vattr.va_gid = 0; + + vsec.vsa_mask = VSA_ACE; + vsec.vsa_aclentp = &full_access; + vsec.vsa_aclentsz = sizeof (full_access); + vsec.vsa_aclcnt = 1; + + error = VOP_CREATE(ZTOV(sharedir), zc->zc_string, + &vattr, EXCL, 0, &resourcevp, kcred, 0, NULL, &vsec); + if (resourcevp) + VN_RELE(resourcevp); + break; + + case ZFS_SMB_ACL_REMOVE: + error = VOP_REMOVE(ZTOV(sharedir), zc->zc_string, kcred, + NULL, 0); + break; + + case ZFS_SMB_ACL_RENAME: + if ((error = get_nvlist(zc->zc_nvlist_src, + zc->zc_nvlist_src_size, zc->zc_iflags, &nvlist)) != 0) { + VN_RELE(vp); + ZFS_EXIT(zfsvfs); + return (error); + } + if (nvlist_lookup_string(nvlist, ZFS_SMB_ACL_SRC, &src) || + nvlist_lookup_string(nvlist, ZFS_SMB_ACL_TARGET, + &target)) { + VN_RELE(vp); + VN_RELE(ZTOV(sharedir)); + ZFS_EXIT(zfsvfs); + nvlist_free(nvlist); + return (error); + } + error = VOP_RENAME(ZTOV(sharedir), src, ZTOV(sharedir), target, + kcred, NULL, 0); + nvlist_free(nvlist); + break; + + case ZFS_SMB_ACL_PURGE: + error = zfs_smb_acl_purge(sharedir); + break; + + default: + error = EINVAL; + break; + } + + VN_RELE(vp); + VN_RELE(ZTOV(sharedir)); + + ZFS_EXIT(zfsvfs); + + return (error); +} + +/* + * inputs: + * zc_name name of filesystem + * zc_value short name of snap + * zc_string user-supplied tag for this reference + * zc_cookie recursive flag + * 
zc_temphold set if hold is temporary + * + * outputs: none + */ +static int +zfs_ioc_hold(zfs_cmd_t *zc) +{ + boolean_t recursive = zc->zc_cookie; + + if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0) + return (EINVAL); + + return (dsl_dataset_user_hold(zc->zc_name, zc->zc_value, + zc->zc_string, recursive, zc->zc_temphold)); +} + +/* + * inputs: + * zc_name name of dataset from which we're releasing a user reference + * zc_value short name of snap + * zc_string user-supplied tag for this reference + * zc_cookie recursive flag + * + * outputs: none + */ +static int +zfs_ioc_release(zfs_cmd_t *zc) +{ + boolean_t recursive = zc->zc_cookie; + + if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0) + return (EINVAL); + + return (dsl_dataset_user_release(zc->zc_name, zc->zc_value, + zc->zc_string, recursive)); +} + +/* + * inputs: + * zc_name name of filesystem + * + * outputs: + * zc_nvlist_src{_size} nvlist of snapshot holds + */ +static int +zfs_ioc_get_holds(zfs_cmd_t *zc) +{ + nvlist_t *nvp; + int error; + + if ((error = dsl_dataset_get_holds(zc->zc_name, &nvp)) == 0) { + error = put_nvlist(zc, nvp); + nvlist_free(nvp); + } + + return (error); +} + /* * pool create, destroy, and export don't log the history as part of * zfsdev_ioctl, but rather zfs_ioc_pool_create, and zfs_ioc_pool_export * do the logging of those commands. */ static zfs_ioc_vec_t zfs_ioc_vec[] = { - { zfs_ioc_pool_create, zfs_secpolicy_config, POOL_NAME, B_FALSE }, - { zfs_ioc_pool_destroy, zfs_secpolicy_config, POOL_NAME, B_FALSE }, - { zfs_ioc_pool_import, zfs_secpolicy_config, POOL_NAME, B_TRUE }, - { zfs_ioc_pool_export, zfs_secpolicy_config, POOL_NAME, B_FALSE }, - { zfs_ioc_pool_configs, zfs_secpolicy_none, NO_NAME, B_FALSE }, - { zfs_ioc_pool_stats, zfs_secpolicy_read, POOL_NAME, B_FALSE }, - { zfs_ioc_pool_tryimport, zfs_secpolicy_config, NO_NAME, B_FALSE }, - { zfs_ioc_pool_scrub, zfs_secpolicy_config, POOL_NAME, B_TRUE }, - { zfs_ioc_pool_freeze, zfs_secpolicy_config, NO_NAME, B_FALSE }, - { zfs_ioc_pool_upgrade, zfs_secpolicy_config, POOL_NAME, B_TRUE }, - { zfs_ioc_pool_get_history, zfs_secpolicy_config, POOL_NAME, B_FALSE }, - { zfs_ioc_vdev_add, zfs_secpolicy_config, POOL_NAME, B_TRUE }, - { zfs_ioc_vdev_remove, zfs_secpolicy_config, POOL_NAME, B_TRUE }, - { zfs_ioc_vdev_set_state, zfs_secpolicy_config, POOL_NAME, B_TRUE }, - { zfs_ioc_vdev_attach, zfs_secpolicy_config, POOL_NAME, B_TRUE }, - { zfs_ioc_vdev_detach, zfs_secpolicy_config, POOL_NAME, B_TRUE }, - { zfs_ioc_vdev_setpath, zfs_secpolicy_config, POOL_NAME, B_FALSE }, - { zfs_ioc_objset_stats, zfs_secpolicy_read, DATASET_NAME, B_FALSE }, - { zfs_ioc_objset_zplprops, zfs_secpolicy_read, DATASET_NAME, B_FALSE }, - { zfs_ioc_dataset_list_next, zfs_secpolicy_read, - DATASET_NAME, B_FALSE }, - { zfs_ioc_snapshot_list_next, zfs_secpolicy_read, - DATASET_NAME, B_FALSE }, - { zfs_ioc_set_prop, zfs_secpolicy_none, DATASET_NAME, B_TRUE }, - { zfs_ioc_create_minor, zfs_secpolicy_minor, DATASET_NAME, B_FALSE }, - { zfs_ioc_remove_minor, zfs_secpolicy_minor, DATASET_NAME, B_FALSE }, - { zfs_ioc_create, zfs_secpolicy_create, DATASET_NAME, B_TRUE }, - { zfs_ioc_destroy, zfs_secpolicy_destroy, DATASET_NAME, B_TRUE }, - { zfs_ioc_rollback, zfs_secpolicy_rollback, DATASET_NAME, B_TRUE }, - { zfs_ioc_rename, zfs_secpolicy_rename, DATASET_NAME, B_TRUE }, - { zfs_ioc_recv, zfs_secpolicy_receive, DATASET_NAME, B_TRUE }, - { zfs_ioc_send, zfs_secpolicy_send, DATASET_NAME, B_TRUE }, - { zfs_ioc_inject_fault, zfs_secpolicy_inject, NO_NAME, B_FALSE }, - { 
zfs_ioc_clear_fault, zfs_secpolicy_inject, NO_NAME, B_FALSE }, - { zfs_ioc_inject_list_next, zfs_secpolicy_inject, NO_NAME, B_FALSE }, - { zfs_ioc_error_log, zfs_secpolicy_inject, POOL_NAME, B_FALSE }, - { zfs_ioc_clear, zfs_secpolicy_config, POOL_NAME, B_TRUE }, - { zfs_ioc_promote, zfs_secpolicy_promote, DATASET_NAME, B_TRUE }, - { zfs_ioc_destroy_snaps, zfs_secpolicy_destroy, DATASET_NAME, B_TRUE }, - { zfs_ioc_snapshot, zfs_secpolicy_snapshot, DATASET_NAME, B_TRUE }, - { zfs_ioc_dsobj_to_dsname, zfs_secpolicy_config, POOL_NAME, B_FALSE }, - { zfs_ioc_obj_to_path, zfs_secpolicy_config, NO_NAME, B_FALSE }, - { zfs_ioc_pool_set_props, zfs_secpolicy_config, POOL_NAME, B_TRUE }, - { zfs_ioc_pool_get_props, zfs_secpolicy_read, POOL_NAME, B_FALSE }, - { zfs_ioc_set_fsacl, zfs_secpolicy_fsacl, DATASET_NAME, B_TRUE }, - { zfs_ioc_get_fsacl, zfs_secpolicy_read, DATASET_NAME, B_FALSE }, - { zfs_ioc_iscsi_perm_check, zfs_secpolicy_iscsi, - DATASET_NAME, B_FALSE }, - { zfs_ioc_share, zfs_secpolicy_share, DATASET_NAME, B_FALSE }, - { zfs_ioc_inherit_prop, zfs_secpolicy_inherit, DATASET_NAME, B_TRUE }, + { zfs_ioc_pool_create, zfs_secpolicy_config, POOL_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_pool_destroy, zfs_secpolicy_config, POOL_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_pool_import, zfs_secpolicy_config, POOL_NAME, B_TRUE, + B_FALSE }, + { zfs_ioc_pool_export, zfs_secpolicy_config, POOL_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_pool_configs, zfs_secpolicy_none, NO_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_pool_stats, zfs_secpolicy_read, POOL_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_pool_tryimport, zfs_secpolicy_config, NO_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_pool_scrub, zfs_secpolicy_config, POOL_NAME, B_TRUE, + B_TRUE }, + { zfs_ioc_pool_freeze, zfs_secpolicy_config, NO_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_pool_upgrade, zfs_secpolicy_config, POOL_NAME, B_TRUE, + B_TRUE }, + { zfs_ioc_pool_get_history, zfs_secpolicy_config, POOL_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_vdev_add, zfs_secpolicy_config, POOL_NAME, B_TRUE, + B_TRUE }, + { zfs_ioc_vdev_remove, zfs_secpolicy_config, POOL_NAME, B_TRUE, + B_TRUE }, + { zfs_ioc_vdev_set_state, zfs_secpolicy_config, POOL_NAME, B_TRUE, + B_FALSE }, + { zfs_ioc_vdev_attach, zfs_secpolicy_config, POOL_NAME, B_TRUE, + B_TRUE }, + { zfs_ioc_vdev_detach, zfs_secpolicy_config, POOL_NAME, B_TRUE, + B_TRUE }, + { zfs_ioc_vdev_setpath, zfs_secpolicy_config, POOL_NAME, B_FALSE, + B_TRUE }, + { zfs_ioc_vdev_setfru, zfs_secpolicy_config, POOL_NAME, B_FALSE, + B_TRUE }, + { zfs_ioc_objset_stats, zfs_secpolicy_read, DATASET_NAME, B_FALSE, + B_TRUE }, + { zfs_ioc_objset_zplprops, zfs_secpolicy_read, DATASET_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_dataset_list_next, zfs_secpolicy_read, DATASET_NAME, B_FALSE, + B_TRUE }, + { zfs_ioc_snapshot_list_next, zfs_secpolicy_read, DATASET_NAME, B_FALSE, + B_TRUE }, + { zfs_ioc_set_prop, zfs_secpolicy_none, DATASET_NAME, B_TRUE, B_TRUE }, + { zfs_ioc_create, zfs_secpolicy_create, DATASET_NAME, B_TRUE, B_TRUE }, + { zfs_ioc_destroy, zfs_secpolicy_destroy, DATASET_NAME, B_TRUE, + B_TRUE}, + { zfs_ioc_rollback, zfs_secpolicy_rollback, DATASET_NAME, B_TRUE, + B_TRUE }, + { zfs_ioc_rename, zfs_secpolicy_rename, DATASET_NAME, B_TRUE, B_TRUE }, + { zfs_ioc_recv, zfs_secpolicy_receive, DATASET_NAME, B_TRUE, B_TRUE }, + { zfs_ioc_send, zfs_secpolicy_send, DATASET_NAME, B_TRUE, B_FALSE }, + { zfs_ioc_inject_fault, zfs_secpolicy_inject, NO_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_clear_fault, zfs_secpolicy_inject, NO_NAME, B_FALSE, + B_FALSE }, + 
{ zfs_ioc_inject_list_next, zfs_secpolicy_inject, NO_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_error_log, zfs_secpolicy_inject, POOL_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_clear, zfs_secpolicy_config, POOL_NAME, B_TRUE, B_FALSE }, + { zfs_ioc_promote, zfs_secpolicy_promote, DATASET_NAME, B_TRUE, + B_TRUE }, + { zfs_ioc_destroy_snaps, zfs_secpolicy_destroy_snaps, DATASET_NAME, + B_TRUE, B_TRUE }, + { zfs_ioc_snapshot, zfs_secpolicy_snapshot, DATASET_NAME, B_TRUE, + B_TRUE }, + { zfs_ioc_dsobj_to_dsname, zfs_secpolicy_config, POOL_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_obj_to_path, zfs_secpolicy_config, DATASET_NAME, B_FALSE, + B_TRUE }, + { zfs_ioc_pool_set_props, zfs_secpolicy_config, POOL_NAME, B_TRUE, + B_TRUE }, + { zfs_ioc_pool_get_props, zfs_secpolicy_read, POOL_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_set_fsacl, zfs_secpolicy_fsacl, DATASET_NAME, B_TRUE, + B_TRUE }, + { zfs_ioc_get_fsacl, zfs_secpolicy_read, DATASET_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_iscsi_perm_check, zfs_secpolicy_iscsi, DATASET_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_share, zfs_secpolicy_share, DATASET_NAME, B_FALSE, B_FALSE }, + { zfs_ioc_inherit_prop, zfs_secpolicy_inherit, DATASET_NAME, B_TRUE, + B_TRUE }, + { zfs_ioc_smb_acl, zfs_secpolicy_smb_acl, DATASET_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_userspace_one, zfs_secpolicy_userspace_one, + DATASET_NAME, B_FALSE, B_FALSE }, + { zfs_ioc_userspace_many, zfs_secpolicy_userspace_many, + DATASET_NAME, B_FALSE, B_FALSE }, + { zfs_ioc_userspace_upgrade, zfs_secpolicy_userspace_upgrade, + DATASET_NAME, B_FALSE, B_TRUE }, + { zfs_ioc_hold, zfs_secpolicy_hold, DATASET_NAME, B_TRUE, B_TRUE }, + { zfs_ioc_release, zfs_secpolicy_release, DATASET_NAME, B_TRUE, + B_TRUE }, + { zfs_ioc_get_holds, zfs_secpolicy_read, DATASET_NAME, B_FALSE, + B_TRUE }, + { zfs_ioc_objset_recvd_props, zfs_secpolicy_read, DATASET_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_vdev_split, zfs_secpolicy_config, POOL_NAME, B_TRUE, + B_TRUE } }; +int +pool_status_check(const char *name, zfs_ioc_namecheck_t type) +{ + spa_t *spa; + int error; + + ASSERT(type == POOL_NAME || type == DATASET_NAME); + + error = spa_open(name, &spa, FTAG); + if (error == 0) { + if (spa_suspended(spa)) + error = EAGAIN; + spa_close(spa, FTAG); + } + return (error); +} + static int zfsdev_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) { @@ -2985,9 +4419,9 @@ zfsdev_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); - error = xcopyin((void *)arg, zc, sizeof (zfs_cmd_t)); + error = ddi_copyin((void *)arg, zc, sizeof (zfs_cmd_t), flag); - if (error == 0) + if ((error == 0) && !(flag & FKIOCTL)) error = zfs_ioc_vec[vec].zvec_secpolicy(zc, cr); /* @@ -2996,15 +4430,22 @@ zfsdev_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) */ if (error == 0) { zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; + zc->zc_iflags = flag & FKIOCTL; switch (zfs_ioc_vec[vec].zvec_namecheck) { case POOL_NAME: if (pool_namecheck(zc->zc_name, NULL, NULL) != 0) error = EINVAL; + if (zfs_ioc_vec[vec].zvec_pool_check) + error = pool_status_check(zc->zc_name, + zfs_ioc_vec[vec].zvec_namecheck); break; case DATASET_NAME: if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0) error = EINVAL; + if (zfs_ioc_vec[vec].zvec_pool_check) + error = pool_status_check(zc->zc_name, + zfs_ioc_vec[vec].zvec_namecheck); break; case NO_NAME: @@ -3015,10 +4456,10 @@ zfsdev_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) if (error == 0) 
error = zfs_ioc_vec[vec].zvec_func(zc); - rc = xcopyout(zc, (void *)arg, sizeof (zfs_cmd_t)); + rc = ddi_copyout(zc, (void *)arg, sizeof (zfs_cmd_t), flag); if (error == 0) { error = rc; - if (zfs_ioc_vec[vec].zvec_his_log == B_TRUE) + if (zfs_ioc_vec[vec].zvec_his_log) zfs_log_history(zc); } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_log.c b/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_log.c index 11cd4c264b573..b4e74dad1f44a 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_log.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_log.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -45,13 +45,25 @@ #include #include #include +#include /* - * All the functions in this file are used to construct the log entries - * to record transactions. They allocate * an intent log transaction - * structure (itx_t) and save within it all the information necessary to - * possibly replay the transaction. The itx is then assigned a sequence - * number and inserted in the in-memory list anchored in the zilog. + * These zfs_log_* functions must be called within a dmu tx, in one + * of 2 contexts depending on zilog->z_replay: + * + * Non replay mode + * --------------- + * We need to record the transaction so that if it is committed to + * the Intent Log then it can be replayed. An intent log transaction + * structure (itx_t) is allocated and all the information necessary to + * possibly replay the transaction is saved in it. The itx is then assigned + * a sequence number and inserted in the in-memory list anchored in the zilog. + * + * Replay mode + * ----------- + * We need to mark the intent log record as replayed in the log header. + * This is done in the same transaction as the replay so that they + * commit atomically. */ int @@ -155,6 +167,9 @@ zfs_log_xvattr(lr_attr_t *lrattr, xvattr_t *xvap) ZFS_TIME_ENCODE(&xoap->xoa_createtime, crtime); if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) bcopy(xoap->xoa_av_scanstamp, scanstamp, AV_SCANSTAMP_SZ); + if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) + *attrs |= (xoap->xoa_reparse == 0) ? 
0 : + XAT0_REPARSE; } static void * @@ -228,7 +243,7 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, size_t namesize = strlen(name) + 1; size_t fuidsz = 0; - if (zilog == NULL) + if (zil_replaying(zilog, tx)) return; /* @@ -331,7 +346,7 @@ zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, lr_remove_t *lr; size_t namesize = strlen(name) + 1; - if (zilog == NULL) + if (zil_replaying(zilog, tx)) return; itx = zil_itx_create(txtype, sizeof (*lr) + namesize); @@ -355,7 +370,7 @@ zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, lr_link_t *lr; size_t namesize = strlen(name) + 1; - if (zilog == NULL) + if (zil_replaying(zilog, tx)) return; itx = zil_itx_create(txtype, sizeof (*lr) + namesize); @@ -382,7 +397,7 @@ zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, size_t namesize = strlen(name) + 1; size_t linksize = strlen(link) + 1; - if (zilog == NULL) + if (zil_replaying(zilog, tx)) return; itx = zil_itx_create(txtype, sizeof (*lr) + namesize + linksize); @@ -416,7 +431,7 @@ zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, size_t snamesize = strlen(sname) + 1; size_t dnamesize = strlen(dname) + 1; - if (zilog == NULL) + if (zil_replaying(zilog, tx)) return; itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize); @@ -437,9 +452,6 @@ zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, */ ssize_t zfs_immediate_write_sz = 32768; -#define ZIL_MAX_LOG_DATA (SPA_MAXBLOCKSIZE - sizeof (zil_trailer_t) - \ - sizeof (lr_write_t)) - void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp, offset_t off, ssize_t resid, int ioflag) @@ -447,35 +459,17 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, itx_wr_state_t write_state; boolean_t slogging; uintptr_t fsync_cnt; + ssize_t immediate_write_sz; - if (zilog == NULL || zp->z_unlinked) + if (zil_replaying(zilog, tx) || zp->z_unlinked) return; - /* - * Writes are handled in three different ways: - * - * WR_INDIRECT: - * In this mode, if we need to commit the write later, then the block - * is immediately written into the file system (using dmu_sync), - * and a pointer to the block is put into the log record. - * When the txg commits the block is linked in. - * This saves additionally writing the data into the log record. - * There are a few requirements for this to occur: - * - write is greater than zfs_immediate_write_sz - * - not using slogs (as slogs are assumed to always be faster - * than writing into the main pool) - * - the write occupies only one block - * WR_COPIED: - * If we know we'll immediately be committing the - * transaction (FSYNC or FDSYNC), the we allocate a larger - * log record here for the data and copy the data in. - * WR_NEED_COPY: - * Otherwise we don't allocate a buffer, and *if* we need to - * flush the write later then a buffer is allocated and - * we retrieve the data using the dmu. - */ - slogging = spa_has_slogs(zilog->zl_spa); - if (resid > zfs_immediate_write_sz && !slogging && resid <= zp->z_blksz) + immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT) + ? 0 : zfs_immediate_write_sz; + + slogging = spa_has_slogs(zilog->zl_spa) && + (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY); + if (resid > immediate_write_sz && !slogging && resid <= zp->z_blksz) write_state = WR_INDIRECT; else if (ioflag & (FSYNC | FDSYNC)) write_state = WR_COPIED; @@ -503,9 +497,8 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, (write_state == WR_COPIED ? 
len : 0)); lr = (lr_write_t *)&itx->itx_lr; if (write_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os, - zp->z_id, off, len, lr + 1) != 0) { - kmem_free(itx, offsetof(itx_t, itx_lr) + - itx->itx_lr.lrc_reclen); + zp->z_id, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) { + zil_itx_destroy(itx); itx = zil_itx_create(txtype, sizeof (*lr)); lr = (lr_write_t *)&itx->itx_lr; write_state = WR_NEED_COPY; @@ -546,7 +539,7 @@ zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype, uint64_t seq; lr_truncate_t *lr; - if (zilog == NULL || zp->z_unlinked) + if (zil_replaying(zilog, tx) || zp->z_unlinked) return; itx = zil_itx_create(txtype, sizeof (*lr)); @@ -574,8 +567,7 @@ zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, size_t recsize = sizeof (lr_setattr_t); void *start; - - if (zilog == NULL || zp->z_unlinked) + if (zil_replaying(zilog, tx) || zp->z_unlinked) return; /* @@ -641,7 +633,7 @@ zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp, size_t txsize; size_t aclbytes = vsecp->vsa_aclentsz; - if (zilog == NULL || zp->z_unlinked) + if (zil_replaying(zilog, tx) || zp->z_unlinked) return; txtype = (zp->z_zfsvfs->z_version < ZPL_VERSION_FUID) ? diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_replay.c b/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_replay.c index 85b79703a7807..39daf968b2310 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_replay.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_replay.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -129,6 +127,8 @@ zfs_replay_xvattr(lr_attr_t *lrattr, xvattr_t *xvap) ZFS_TIME_DECODE(&xoap->xoa_createtime, crtime); if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) bcopy(scanstamp, xoap->xoa_av_scanstamp, AV_SCANSTAMP_SZ); + if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) + xoap->xoa_reparse = ((*attrs & XAT0_REPARSE) != 0); } static int @@ -275,9 +275,9 @@ zfs_replay_create_acl(zfsvfs_t *zfsvfs, uint64_t txtype; int error; + txtype = (lr->lr_common.lrc_txtype & ~TX_CI); if (byteswap) { byteswap_uint64_array(lracl, sizeof (*lracl)); - txtype = (int)lr->lr_common.lrc_txtype; if (txtype == TX_CREATE_ACL_ATTR || txtype == TX_MKDIR_ACL_ATTR) { lrattr = (lr_attr_t *)(caddr_t)(lracl + 1); @@ -318,7 +318,7 @@ zfs_replay_create_acl(zfsvfs_t *zfsvfs, if (lr->lr_common.lrc_txtype & TX_CI) vflg |= FIGNORECASE; - switch ((int)lr->lr_common.lrc_txtype) { + switch (txtype) { case TX_CREATE_ACL: aclstart = (caddr_t)(lracl + 1); fuidstart = (caddr_t)aclstart + @@ -391,7 +391,8 @@ zfs_replay_create_acl(zfsvfs_t *zfsvfs, VN_RELE(ZTOV(dzp)); - zfs_fuid_info_free(zfsvfs->z_fuid_replay); + if (zfsvfs->z_fuid_replay) + zfs_fuid_info_free(zfsvfs->z_fuid_replay); zfsvfs->z_fuid_replay = NULL; return (error); @@ -413,9 +414,9 @@ zfs_replay_create(zfsvfs_t *zfsvfs, lr_create_t *lr, boolean_t byteswap) uint64_t txtype; int error; + txtype = (lr->lr_common.lrc_txtype & ~TX_CI); if (byteswap) { byteswap_uint64_array(lr, sizeof (*lr)); - txtype = (int)lr->lr_common.lrc_txtype; if (txtype == TX_CREATE_ATTR || txtype == TX_MKDIR_ATTR) zfs_replay_swap_attrs((lr_attr_t *)(lr + 1)); } @@ -460,7 +461,7 @@ zfs_replay_create(zfsvfs_t *zfsvfs, lr_create_t *lr, boolean_t byteswap) lr->lr_uid, lr->lr_gid); } - switch ((int)lr->lr_common.lrc_txtype) { + switch (txtype) { case TX_CREATE_ATTR: lrattr = (lr_attr_t *)(caddr_t)(lr + 1); 
xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); @@ -498,7 +499,6 @@ zfs_replay_create(zfsvfs_t *zfsvfs, lr_create_t *lr, boolean_t byteswap) &vp, kcred, NULL, vflg, NULL); break; case TX_MKXATTR: - name = (char *)(lr + 1); error = zfs_make_xattrdir(dzp, &xva.xva_vattr, &vp, kcred); break; case TX_SYMLINK: @@ -625,6 +625,7 @@ zfs_replay_write(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap) znode_t *zp; int error; ssize_t resid; + uint64_t orig_eof, eod, offset, length; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); @@ -640,8 +641,64 @@ zfs_replay_write(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap) return (error); } - error = vn_rdwr(UIO_WRITE, ZTOV(zp), data, lr->lr_length, - lr->lr_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid); + offset = lr->lr_offset; + length = lr->lr_length; + eod = offset + length; /* end of data for this write */ + + orig_eof = zp->z_phys->zp_size; + + /* If it's a dmu_sync() block, write the whole block */ + if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { + uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr); + if (length < blocksize) { + offset -= offset % blocksize; + length = blocksize; + } + } + + error = vn_rdwr(UIO_WRITE, ZTOV(zp), data, length, offset, + UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid); + + /* + * This may be a write from a dmu_sync() for a whole block, + * and may extend beyond the current end of the file. + * We can't just replay what was written for this TX_WRITE as + * a future TX_WRITE2 may extend the eof and the data for that + * write needs to be there. So we write the whole block and + * reduce the eof. + */ + if (orig_eof < zp->z_phys->zp_size) /* file length grew ? */ + zp->z_phys->zp_size = eod; + + VN_RELE(ZTOV(zp)); + + return (error); +} + +/* + * TX_WRITE2 are only generated when dmu_sync() returns EALREADY + * meaning the pool block is already being synced. So now that we always write + * out full blocks, all we have to do is expand the eof if + * the file is grown. + */ +static int +zfs_replay_write2(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap) +{ + znode_t *zp; + int error; + uint64_t end; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) + return (error); + + end = lr->lr_offset + lr->lr_length; + if (end > zp->z_phys->zp_size) { + ASSERT3U(end - zp->z_phys->zp_size, <, zp->z_blksz); + zp->z_phys->zp_size = end; + } VN_RELE(ZTOV(zp)); @@ -658,16 +715,8 @@ zfs_replay_truncate(zfsvfs_t *zfsvfs, lr_truncate_t *lr, boolean_t byteswap) if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); - if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) { - /* - * As we can log truncates out of order, it's possible the - * file has been removed. In this case just drop the truncate - * and return success. - */ - if (error == ENOENT) - error = 0; + if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) return (error); - } bzero(&fl, sizeof (fl)); fl.l_type = F_WRLCK; @@ -701,16 +750,8 @@ zfs_replay_setattr(zfsvfs_t *zfsvfs, lr_setattr_t *lr, boolean_t byteswap) zfs_replay_swap_attrs((lr_attr_t *)(lr + 1)); } - if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) { - /* - * As we can log setattrs out of order, it's possible the - * file has been removed. In this case just drop the setattr - * and return success. 
- */ - if (error == ENOENT) - error = 0; + if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) return (error); - } zfs_init_vattr(vap, lr->lr_mask, lr->lr_mode, lr->lr_uid, lr->lr_gid, 0, lr->lr_foid); @@ -756,16 +797,8 @@ zfs_replay_acl_v0(zfsvfs_t *zfsvfs, lr_acl_v0_t *lr, boolean_t byteswap) zfs_oldace_byteswap(ace, lr->lr_aclcnt); } - if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) { - /* - * As we can log acls out of order, it's possible the - * file has been removed. In this case just drop the acl - * and return success. - */ - if (error == ENOENT) - error = 0; + if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) return (error); - } bzero(&vsa, sizeof (vsa)); vsa.vsa_mask = VSA_ACE | VSA_ACECNT; @@ -813,16 +846,8 @@ zfs_replay_acl(zfsvfs_t *zfsvfs, lr_acl_t *lr, boolean_t byteswap) } } - if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) { - /* - * As we can log acls out of order, it's possible the - * file has been removed. In this case just drop the acl - * and return success. - */ - if (error == ENOENT) - error = 0; + if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) return (error); - } bzero(&vsa, sizeof (vsa)); vsa.vsa_mask = VSA_ACE | VSA_ACECNT | VSA_ACE_ACLFLAGS; @@ -875,4 +900,5 @@ zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE] = { zfs_replay_create_acl, /* TX_MKDIR_ACL */ zfs_replay_create, /* TX_MKDIR_ATTR */ zfs_replay_create_acl, /* TX_MKDIR_ACL_ATTR */ + zfs_replay_write2, /* TX_WRITE2 */ }; diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_rlock.c b/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_rlock.c index f0a75b5fa0d71..4de8d8a2dfed9 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_rlock.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_rlock.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * This file contains the code to implement file range locking in * ZFS, although there isn't much specific to ZFS (all that comes to mind @@ -431,6 +429,8 @@ zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type) new = kmem_alloc(sizeof (rl_t), KM_SLEEP); new->r_zp = zp; new->r_off = off; + if (len + off < off) /* overflow */ + len = UINT64_MAX - off; new->r_len = len; new->r_cnt = 1; /* assume it's going to be in the tree */ new->r_type = type; diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_vfsops.c b/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_vfsops.c index 06b4dee4620bb..6759a812edefc 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_vfsops.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_vfsops.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -67,6 +67,8 @@ static major_t zfs_major; static minor_t zfs_minor; static kmutex_t zfs_dev_mtx; +extern int sys_shutdown; + static int zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr); static int zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr); static int zfs_mountroot(vfs_t *vfsp, enum whymountroot); @@ -145,12 +147,24 @@ zfs_sync(vfs_t *vfsp, short flag, cred_t *cr) * Sync a specific filesystem. 
*/ zfsvfs_t *zfsvfs = vfsp->vfs_data; + dsl_pool_t *dp; ZFS_ENTER(zfsvfs); + dp = dmu_objset_pool(zfsvfs->z_os); + + /* + * If the system is shutting down, then skip any + * filesystems which may exist on a suspended pool. + */ + if (sys_shutdown && spa_suspended(dp->dp_spa)) { + ZFS_EXIT(zfsvfs); + return (0); + } + if (zfsvfs->z_log != NULL) zil_commit(zfsvfs->z_log, UINT64_MAX, 0); else - txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); + txg_wait_synced(dp, 0); ZFS_EXIT(zfsvfs); } else { /* @@ -554,6 +568,371 @@ zfs_register_callbacks(vfs_t *vfsp) } +static void +uidacct(objset_t *os, boolean_t isgroup, uint64_t fuid, + int64_t delta, dmu_tx_t *tx) +{ + uint64_t used = 0; + char buf[32]; + int err; + uint64_t obj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT; + + if (delta == 0) + return; + + (void) snprintf(buf, sizeof (buf), "%llx", (longlong_t)fuid); + err = zap_lookup(os, obj, buf, 8, 1, &used); + ASSERT(err == 0 || err == ENOENT); + /* no underflow/overflow */ + ASSERT(delta > 0 || used >= -delta); + ASSERT(delta < 0 || used + delta > used); + used += delta; + if (used == 0) + err = zap_remove(os, obj, buf, tx); + else + err = zap_update(os, obj, buf, 8, 1, &used, tx); + ASSERT(err == 0); +} + +static int +zfs_space_delta_cb(dmu_object_type_t bonustype, void *bonus, + uint64_t *userp, uint64_t *groupp) +{ + znode_phys_t *znp = bonus; + + if (bonustype != DMU_OT_ZNODE) + return (ENOENT); + + *userp = znp->zp_uid; + *groupp = znp->zp_gid; + return (0); +} + +static void +fuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr, + char *domainbuf, int buflen, uid_t *ridp) +{ + uint64_t fuid; + const char *domain; + + fuid = strtonum(fuidstr, NULL); + + domain = zfs_fuid_find_by_idx(zfsvfs, FUID_INDEX(fuid)); + if (domain) + (void) strlcpy(domainbuf, domain, buflen); + else + domainbuf[0] = '\0'; + *ridp = FUID_RID(fuid); +} + +static uint64_t +zfs_userquota_prop_to_obj(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type) +{ + switch (type) { + case ZFS_PROP_USERUSED: + return (DMU_USERUSED_OBJECT); + case ZFS_PROP_GROUPUSED: + return (DMU_GROUPUSED_OBJECT); + case ZFS_PROP_USERQUOTA: + return (zfsvfs->z_userquota_obj); + case ZFS_PROP_GROUPQUOTA: + return (zfsvfs->z_groupquota_obj); + } + return (0); +} + +int +zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, + uint64_t *cookiep, void *vbuf, uint64_t *bufsizep) +{ + int error; + zap_cursor_t zc; + zap_attribute_t za; + zfs_useracct_t *buf = vbuf; + uint64_t obj; + + if (!dmu_objset_userspace_present(zfsvfs->z_os)) + return (ENOTSUP); + + obj = zfs_userquota_prop_to_obj(zfsvfs, type); + if (obj == 0) { + *bufsizep = 0; + return (0); + } + + for (zap_cursor_init_serialized(&zc, zfsvfs->z_os, obj, *cookiep); + (error = zap_cursor_retrieve(&zc, &za)) == 0; + zap_cursor_advance(&zc)) { + if ((uintptr_t)buf - (uintptr_t)vbuf + sizeof (zfs_useracct_t) > + *bufsizep) + break; + + fuidstr_to_sid(zfsvfs, za.za_name, + buf->zu_domain, sizeof (buf->zu_domain), &buf->zu_rid); + + buf->zu_space = za.za_first_integer; + buf++; + } + if (error == ENOENT) + error = 0; + + ASSERT3U((uintptr_t)buf - (uintptr_t)vbuf, <=, *bufsizep); + *bufsizep = (uintptr_t)buf - (uintptr_t)vbuf; + *cookiep = zap_cursor_serialize(&zc); + zap_cursor_fini(&zc); + return (error); +} + +/* + * buf must be big enough (eg, 32 bytes) + */ +static int +id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid, + char *buf, boolean_t addok) +{ + uint64_t fuid; + int domainid = 0; + + if (domain && domain[0]) { + domainid = zfs_fuid_find_by_domain(zfsvfs, 
domain, NULL, addok); + if (domainid == -1) + return (ENOENT); + } + fuid = FUID_ENCODE(domainid, rid); + (void) sprintf(buf, "%llx", (longlong_t)fuid); + return (0); +} + +int +zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, + const char *domain, uint64_t rid, uint64_t *valp) +{ + char buf[32]; + int err; + uint64_t obj; + + *valp = 0; + + if (!dmu_objset_userspace_present(zfsvfs->z_os)) + return (ENOTSUP); + + obj = zfs_userquota_prop_to_obj(zfsvfs, type); + if (obj == 0) + return (0); + + err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_FALSE); + if (err) + return (err); + + err = zap_lookup(zfsvfs->z_os, obj, buf, 8, 1, valp); + if (err == ENOENT) + err = 0; + return (err); +} + +int +zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, + const char *domain, uint64_t rid, uint64_t quota) +{ + char buf[32]; + int err; + dmu_tx_t *tx; + uint64_t *objp; + boolean_t fuid_dirtied; + + if (type != ZFS_PROP_USERQUOTA && type != ZFS_PROP_GROUPQUOTA) + return (EINVAL); + + if (zfsvfs->z_version < ZPL_VERSION_USERSPACE) + return (ENOTSUP); + + objp = (type == ZFS_PROP_USERQUOTA) ? &zfsvfs->z_userquota_obj : + &zfsvfs->z_groupquota_obj; + + err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_TRUE); + if (err) + return (err); + fuid_dirtied = zfsvfs->z_fuid_dirty; + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_zap(tx, *objp ? *objp : DMU_NEW_OBJECT, B_TRUE, NULL); + if (*objp == 0) { + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, + zfs_userquota_prop_prefixes[type]); + } + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) { + dmu_tx_abort(tx); + return (err); + } + + mutex_enter(&zfsvfs->z_lock); + if (*objp == 0) { + *objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA, + DMU_OT_NONE, 0, tx); + VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, + zfs_userquota_prop_prefixes[type], 8, 1, objp, tx)); + } + mutex_exit(&zfsvfs->z_lock); + + if (quota == 0) { + err = zap_remove(zfsvfs->z_os, *objp, buf, tx); + if (err == ENOENT) + err = 0; + } else { + err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, &quota, tx); + } + ASSERT(err == 0); + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + dmu_tx_commit(tx); + return (err); +} + +boolean_t +zfs_usergroup_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid) +{ + char buf[32]; + uint64_t used, quota, usedobj, quotaobj; + int err; + + usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT; + quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj; + + if (quotaobj == 0 || zfsvfs->z_replay) + return (B_FALSE); + + (void) sprintf(buf, "%llx", (longlong_t)fuid); + err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, &quota); + if (err != 0) + return (B_FALSE); + + err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used); + if (err != 0) + return (B_FALSE); + return (used >= quota); +} + +int +zfsvfs_create(const char *osname, zfsvfs_t **zfvp) +{ + objset_t *os; + zfsvfs_t *zfsvfs; + uint64_t zval; + int i, error; + + zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); + + /* + * We claim to always be readonly so we can open snapshots; + * other ZPL code will prevent us from writing to snapshots. + */ + error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, zfsvfs, &os); + if (error) { + kmem_free(zfsvfs, sizeof (zfsvfs_t)); + return (error); + } + + /* + * Initialize the zfs-specific filesystem structure. + * Should probably make this a kmem cache, shuffle fields, + * and just bzero up to z_hold_mtx[].
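+ * (Note: kmem_zalloc() already returns zeroed memory, so any field
+ * that is not explicitly initialized below starts out as 0/NULL.)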
+ */ + zfsvfs->z_vfs = NULL; + zfsvfs->z_parent = zfsvfs; + zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE; + zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; + zfsvfs->z_os = os; + + error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version); + if (error) { + goto out; + } else if (zfsvfs->z_version > ZPL_VERSION) { + (void) printf("Mismatched versions: File system " + "is version %llu on-disk format, which is " + "incompatible with this software version %lld!", + (u_longlong_t)zfsvfs->z_version, ZPL_VERSION); + error = ENOTSUP; + goto out; + } + + if ((error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &zval)) != 0) + goto out; + zfsvfs->z_norm = (int)zval; + + if ((error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &zval)) != 0) + goto out; + zfsvfs->z_utf8 = (zval != 0); + + if ((error = zfs_get_zplprop(os, ZFS_PROP_CASE, &zval)) != 0) + goto out; + zfsvfs->z_case = (uint_t)zval; + + /* + * Fold case on file systems that are always or sometimes case + * insensitive. + */ + if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE || + zfsvfs->z_case == ZFS_CASE_MIXED) + zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; + + zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, + &zfsvfs->z_root); + if (error) + goto out; + ASSERT(zfsvfs->z_root != 0); + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, + &zfsvfs->z_unlinkedobj); + if (error) + goto out; + + error = zap_lookup(os, MASTER_NODE_OBJ, + zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA], + 8, 1, &zfsvfs->z_userquota_obj); + if (error && error != ENOENT) + goto out; + + error = zap_lookup(os, MASTER_NODE_OBJ, + zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA], + 8, 1, &zfsvfs->z_groupquota_obj); + if (error && error != ENOENT) + goto out; + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, + &zfsvfs->z_fuid_obj); + if (error && error != ENOENT) + goto out; + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1, + &zfsvfs->z_shares_dir); + if (error && error != ENOENT) + goto out; + + mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), + offsetof(znode_t, z_link_node)); + rrw_init(&zfsvfs->z_teardown_lock); + rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL); + rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); + for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) + mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); + + *zfvp = zfsvfs; + return (0); + +out: + dmu_objset_disown(os, zfsvfs); + *zfvp = NULL; + kmem_free(zfsvfs, sizeof (zfsvfs_t)); + return (error); +} + static int zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) { @@ -566,9 +945,15 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) /* * Set the objset user_ptr to track its zfsvfs. */ - mutex_enter(&zfsvfs->z_os->os->os_user_ptr_lock); + mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); dmu_objset_set_user(zfsvfs->z_os, zfsvfs); - mutex_exit(&zfsvfs->z_os->os->os_user_ptr_lock); + mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); + + zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data); + if (zil_disable) { + zil_destroy(zfsvfs->z_log, B_FALSE); + zfsvfs->z_log = NULL; + } /* * If we are not mounting (ie: online recv), then we don't @@ -583,68 +968,106 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) * allow replays to succeed. 
*/ readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY; - zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; - - /* - * Parse and replay the intent log. - */ - zil_replay(zfsvfs->z_os, zfsvfs, &zfsvfs->z_assign, - zfs_replay_vector, zfs_unlinked_drain); + if (readonly != 0) + zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; + else + zfs_unlinked_drain(zfsvfs); - zfs_unlinked_drain(zfsvfs); + if (zfsvfs->z_log) { + /* + * Parse and replay the intent log. + * + * Because of ziltest, this must be done after + * zfs_unlinked_drain(). (Further note: ziltest + * doesn't use readonly mounts, where + * zfs_unlinked_drain() isn't called.) This is because + * ziltest causes spa_sync() to think it's committed, + * but actually it is not, so the intent log contains + * many txg's worth of changes. + * + * In particular, if object N is in the unlinked set in + * the last txg to actually sync, then it could be + * actually freed in a later txg and then reallocated + * in a yet later txg. This would write a "create + * object N" record to the intent log. Normally, this + * would be fine because the spa_sync() would have + * written out the fact that object N is free, before + * we could write the "create object N" intent log + * record. + * + * But when we are in ziltest mode, we advance the "open + * txg" without actually spa_sync()-ing the changes to + * disk. So we would see that object N is still + * allocated and in the unlinked set, and there is an + * intent log record saying to allocate it. + */ + zfsvfs->z_replay = B_TRUE; + zil_replay(zfsvfs->z_os, zfsvfs, zfs_replay_vector); + zfsvfs->z_replay = B_FALSE; + } zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */ } - if (!zil_disable) - zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data); - return (0); } -static void -zfs_freezfsvfs(zfsvfs_t *zfsvfs) +void +zfsvfs_free(zfsvfs_t *zfsvfs) { + int i; + extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */ + + /* + * This is a barrier to prevent the filesystem from going away in + * zfs_znode_move() until we can safely ensure that the filesystem is + * not unmounted. We consider the filesystem valid before the barrier + * and invalid after the barrier. + */ + rw_enter(&zfsvfs_lock, RW_READER); + rw_exit(&zfsvfs_lock); + + zfs_fuid_destroy(zfsvfs); + mutex_destroy(&zfsvfs->z_znodes_lock); - mutex_destroy(&zfsvfs->z_online_recv_lock); + mutex_destroy(&zfsvfs->z_lock); list_destroy(&zfsvfs->z_all_znodes); rrw_destroy(&zfsvfs->z_teardown_lock); rw_destroy(&zfsvfs->z_teardown_inactive_lock); rw_destroy(&zfsvfs->z_fuid_lock); + for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) + mutex_destroy(&zfsvfs->z_hold_mtx[i]); kmem_free(zfsvfs, sizeof (zfsvfs_t)); } +static void +zfs_set_fuid_feature(zfsvfs_t *zfsvfs) +{ + zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); + if (zfsvfs->z_use_fuids && zfsvfs->z_vfs) { + vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE); + } +} + static int zfs_domount(vfs_t *vfsp, char *osname) { dev_t mount_dev; - uint64_t recordsize, readonly; + uint64_t recordsize, fsid_guid; int error = 0; - int mode; zfsvfs_t *zfsvfs; - znode_t *zp = NULL; ASSERT(vfsp); ASSERT(osname); - /* - * Initialize the zfs-specific filesystem structure. - * Should probably make this a kmem cache, shuffle fields, - * and just bzero up to z_hold_mtx[]. 
- */ - zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); + error = zfsvfs_create(osname, &zfsvfs); + if (error) + return (error); zfsvfs->z_vfs = vfsp; - zfsvfs->z_parent = zfsvfs; - zfsvfs->z_assign = TXG_NOWAIT; - zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE; - zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; - - mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&zfsvfs->z_online_recv_lock, NULL, MUTEX_DEFAULT, NULL); - list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), - offsetof(znode_t, z_link_node)); - rrw_init(&zfsvfs->z_teardown_lock); - rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL); - rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); /* Initialize the generic filesystem structure. */ vfsp->vfs_bcount = 0; @@ -666,39 +1089,24 @@ zfs_domount(vfs_t *vfsp, char *osname) vfsp->vfs_flag |= VFS_NOTRUNC; vfsp->vfs_data = zfsvfs; - if (error = dsl_prop_get_integer(osname, "readonly", &readonly, NULL)) - goto out; - - mode = DS_MODE_OWNER; - if (readonly) - mode |= DS_MODE_READONLY; - - error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os); - if (error == EROFS) { - mode = DS_MODE_OWNER | DS_MODE_READONLY; - error = dmu_objset_open(osname, DMU_OST_ZFS, mode, - &zfsvfs->z_os); - } - - if (error) - goto out; - - if (error = zfs_init_fs(zfsvfs, &zp)) - goto out; - - /* The call to zfs_init_fs leaves the vnode held, release it here. */ - VN_RELE(ZTOV(zp)); + /* + * The fsid is 64 bits, composed of an 8-bit fs type, which + * separates our fsid from any other filesystem types, and a + * 56-bit objset unique ID. The objset unique ID is unique to + * all objsets open on this system, provided by unique_create(). + * The 8-bit fs type must be put in the low bits of fsid[1] + * because that's where other Solaris filesystems put it. + */ + fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os); + ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0); + vfsp->vfs_fsid.val[0] = fsid_guid; + vfsp->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) | + zfsfstype & 0xFF; /* * Set features for file system. 
*/ - zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); - if (zfsvfs->z_use_fuids) { - vfs_set_feature(vfsp, VFSFT_XVATTR); - vfs_set_feature(vfsp, VFSFT_SYSATTR_VIEWS); - vfs_set_feature(vfsp, VFSFT_ACEMASKONACCESS); - vfs_set_feature(vfsp, VFSFT_ACLONCREATE); - } + zfs_set_fuid_feature(zfsvfs); if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); @@ -707,17 +1115,21 @@ zfs_domount(vfs_t *vfsp, char *osname) vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); } + vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED); if (dmu_objset_is_snapshot(zfsvfs->z_os)) { uint64_t pval; - ASSERT(mode & DS_MODE_READONLY); atime_changed_cb(zfsvfs, B_FALSE); readonly_changed_cb(zfsvfs, B_TRUE); if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL)) goto out; xattr_changed_cb(zfsvfs, pval); zfsvfs->z_issnap = B_TRUE; + + mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); + dmu_objset_set_user(zfsvfs->z_os, zfsvfs); + mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); } else { error = zfsvfs_setup(zfsvfs, B_TRUE); } @@ -726,9 +1138,8 @@ zfs_domount(vfs_t *vfsp, char *osname) zfsctl_create(zfsvfs); out: if (error) { - if (zfsvfs->z_os) - dmu_objset_close(zfsvfs->z_os); - zfs_freezfsvfs(zfsvfs); + dmu_objset_disown(zfsvfs->z_os, zfsvfs); + zfsvfs_free(zfsvfs); } else { atomic_add_32(&zfs_active_fs_count, 1); } @@ -837,6 +1248,139 @@ zfs_parse_bootfs(char *bpath, char *outpath) return (error); } +/* + * zfs_check_global_label: + * Check that the hex label string is appropriate for the dataset + * being mounted into the global_zone proper. + * + * Return an error if the hex label string is not default or + * admin_low/admin_high. For admin_low labels, the corresponding + * dataset must be readonly. + */ +int +zfs_check_global_label(const char *dsname, const char *hexsl) +{ + if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0) + return (0); + if (strcasecmp(hexsl, ADMIN_HIGH) == 0) + return (0); + if (strcasecmp(hexsl, ADMIN_LOW) == 0) { + /* must be readonly */ + uint64_t rdonly; + + if (dsl_prop_get_integer(dsname, + zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL)) + return (EACCES); + return (rdonly ? 0 : EACCES); + } + return (EACCES); +} + +/* + * zfs_mount_label_policy: + * Determine whether the mount is allowed according to MAC check. + * by comparing (where appropriate) label of the dataset against + * the label of the zone being mounted into. If the dataset has + * no label, create one. + * + * Returns: + * 0 : access allowed + * >0 : error code, such as EACCES + */ +static int +zfs_mount_label_policy(vfs_t *vfsp, char *osname) +{ + int error, retv; + zone_t *mntzone = NULL; + ts_label_t *mnt_tsl; + bslabel_t *mnt_sl; + bslabel_t ds_sl; + char ds_hexsl[MAXNAMELEN]; + + retv = EACCES; /* assume the worst */ + + /* + * Start by getting the dataset label if it exists. + */ + error = dsl_prop_get(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL), + 1, sizeof (ds_hexsl), &ds_hexsl, NULL); + if (error) + return (EACCES); + + /* + * If labeling is NOT enabled, then disallow the mount of datasets + * which have a non-default label already. No other label checks + * are needed. + */ + if (!is_system_labeled()) { + if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) + return (0); + return (EACCES); + } + + /* + * Get the label of the mountpoint. If mounting into the global + * zone (i.e. 
mountpoint is not within an active zone and the + * zoned property is off), the label must be default or + * admin_low/admin_high only; no other checks are needed. + */ + mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE); + if (mntzone->zone_id == GLOBAL_ZONEID) { + uint64_t zoned; + + zone_rele(mntzone); + + if (dsl_prop_get_integer(osname, + zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL)) + return (EACCES); + if (!zoned) + return (zfs_check_global_label(osname, ds_hexsl)); + else + /* + * This is the case of a zone dataset being mounted + * initially, before the zone has been fully created; + * allow this mount into global zone. + */ + return (0); + } + + mnt_tsl = mntzone->zone_slabel; + ASSERT(mnt_tsl != NULL); + label_hold(mnt_tsl); + mnt_sl = label2bslabel(mnt_tsl); + + if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) { + /* + * The dataset doesn't have a real label, so fabricate one. + */ + char *str = NULL; + + if (l_to_str_internal(mnt_sl, &str) == 0 && + dsl_prop_set(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL), + ZPROP_SRC_LOCAL, 1, strlen(str) + 1, str) == 0) + retv = 0; + if (str != NULL) + kmem_free(str, strlen(str) + 1); + } else if (hexstr_to_label(ds_hexsl, &ds_sl) == 0) { + /* + * Now compare labels to complete the MAC check. If the + * labels are equal then allow access. If the mountpoint + * label dominates the dataset label, allow readonly access. + * Otherwise, access is denied. + */ + if (blequal(mnt_sl, &ds_sl)) + retv = 0; + else if (bldominates(mnt_sl, &ds_sl)) { + vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0); + retv = 0; + } + } + + label_rele(mnt_tsl); + zone_rele(mntzone); + return (retv); +} + static int zfs_mountroot(vfs_t *vfsp, enum whymountroot why) { @@ -1026,6 +1570,10 @@ zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) goto out; } + error = zfs_mount_label_policy(vfsp, osname); + if (error) + goto out; + /* * When doing a remount, we simply refresh our temporary properties * according to those options set in the current VFS options. @@ -1039,6 +1587,13 @@ zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) error = zfs_domount(vfsp, osname); + /* + * Add an extra VFS_HOLD on our parent vfs so that it can't + * disappear due to a forced unmount. + */ + if (error == 0 && ((zfsvfs_t *)vfsp->vfs_data)->z_issnap) + VFS_HOLD(mvp->v_vfsp); + out: pn_free(&spn); return (error); @@ -1288,14 +1843,14 @@ zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr) /* * Unset the objset user_ptr. */ - mutex_enter(&os->os->os_user_ptr_lock); + mutex_enter(&os->os_user_ptr_lock); dmu_objset_set_user(os, NULL); - mutex_exit(&os->os->os_user_ptr_lock); + mutex_exit(&os->os_user_ptr_lock); /* * Finally release the objset */ - dmu_objset_close(os); + dmu_objset_disown(os, zfsvfs); } /* @@ -1398,16 +1953,13 @@ zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp) * 'z_teardown_inactive_lock' write held. */ int -zfs_suspend_fs(zfsvfs_t *zfsvfs, char *name, int *mode) +zfs_suspend_fs(zfsvfs_t *zfsvfs) { int error; if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0) return (error); - - *mode = zfsvfs->z_os->os_mode; - dmu_objset_name(zfsvfs->z_os, name); - dmu_objset_close(zfsvfs->z_os); + dmu_objset_disown(zfsvfs->z_os, zfsvfs); return (0); } @@ -1416,14 +1968,15 @@ zfs_suspend_fs(zfsvfs_t *zfsvfs, char *name, int *mode) * Reopen zfsvfs_t::z_os and release VOPs. 
*/ int -zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname, int mode) +zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname) { int err; ASSERT(RRW_WRITE_HELD(&zfsvfs->z_teardown_lock)); ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)); - err = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os); + err = dmu_objset_own(osname, DMU_OST_ZFS, B_FALSE, zfsvfs, + &zfsvfs->z_os); if (err) { zfsvfs->z_os = NULL; } else { @@ -1465,13 +2018,15 @@ static void zfs_freevfs(vfs_t *vfsp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; - int i; - for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) - mutex_destroy(&zfsvfs->z_hold_mtx[i]); + /* + * If this is a snapshot, we have an extra VFS_HOLD on our parent + * from zfs_mount(). Release it here. + */ + if (zfsvfs->z_issnap) + VFS_RELE(zfsvfs->z_parent->z_vfs); - zfs_fuid_destroy(zfsvfs); - zfs_freezfsvfs(zfsvfs); + zfsvfs_free(zfsvfs); atomic_add_32(&zfs_active_fs_count, -1); } @@ -1530,6 +2085,8 @@ zfs_init(void) * Initialize znode cache, vnode ops, etc... */ zfs_znode_init(); + + dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb); } void @@ -1546,54 +2103,46 @@ zfs_busy(void) } int -zfs_set_version(const char *name, uint64_t newvers) +zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) { int error; - objset_t *os; + objset_t *os = zfsvfs->z_os; dmu_tx_t *tx; - uint64_t curvers; - - /* - * XXX for now, require that the filesystem be unmounted. Would - * be nice to find the zfsvfs_t and just update that if - * possible. - */ if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION) return (EINVAL); - error = dmu_objset_open(name, DMU_OST_ZFS, DS_MODE_OWNER, &os); - if (error) - return (error); - - error = zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, - 8, 1, &curvers); - if (error) - goto out; - if (newvers < curvers) { - error = EINVAL; - goto out; - } + if (newvers < zfsvfs->z_version) + return (EINVAL); tx = dmu_tx_create(os); - dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, 0, ZPL_VERSION_STR); + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); - goto out; + return (error); + } + error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, + 8, 1, &newvers, tx); + + if (error) { + dmu_tx_commit(tx); + return (error); } - error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, 8, 1, - &newvers, tx); spa_history_internal_log(LOG_DS_UPGRADE, dmu_objset_spa(os), tx, CRED(), - "oldver=%llu newver=%llu dataset = %llu", curvers, newvers, - dmu_objset_id(os)); + "oldver=%llu newver=%llu dataset = %llu", + zfsvfs->z_version, newvers, dmu_objset_id(os)); + dmu_tx_commit(tx); -out: - dmu_objset_close(os); - return (error); + zfsvfs->z_version = newvers; + + if (zfsvfs->z_version >= ZPL_VERSION_FUID) + zfs_set_fuid_feature(zfsvfs); + + return (0); } /* diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_vnops.c b/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_vnops.c index 8e0037e37da52..6883db5cf9a6b 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_vnops.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_vnops.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -73,7 +73,7 @@ #include #include #include -#include +#include #include /* @@ -101,13 +101,12 @@ * pushing cached pages (which acquires range locks) and syncing out * cached atime changes. 
Third, zfs_zinactive() may require a new tx, * which could deadlock the system if you were already holding one. + * If you must call VN_RELE() within a tx then use VN_RELE_ASYNC(). * * (3) All range locks must be grabbed before calling dmu_tx_assign(), * as they can span dmu_tx_assign() calls. * - * (4) Always pass zfsvfs->z_assign as the second argument to dmu_tx_assign(). - * In normal operation, this will be TXG_NOWAIT. During ZIL replay, - * it will be a specific txg. Either way, dmu_tx_assign() never blocks. + * (4) Always pass TXG_NOWAIT as the second argument to dmu_tx_assign(). * This is critical because we don't want to block while holding locks. * Note, in particular, that if a lock is sometimes acquired before * the tx assigns, and sometimes after (e.g. z_lock), then failing to @@ -124,6 +123,8 @@ * (5) If the operation succeeded, generate the intent log entry for it * before dropping locks. This ensures that the ordering of events * in the intent log matches the order in which they actually occurred. + * During ZIL replay the zfs_log_* functions will update the sequence + * number to indicate the zil transaction has replayed. * * (6) At the end of each vnode op, the DMU tx must always commit, * regardless of whether there were any errors. @@ -139,12 +140,12 @@ * rw_enter(...); // grab any other locks you need * tx = dmu_tx_create(...); // get DMU tx * dmu_tx_hold_*(); // hold each object you might modify - * error = dmu_tx_assign(tx, zfsvfs->z_assign); // try to assign + * error = dmu_tx_assign(tx, TXG_NOWAIT); // try to assign * if (error) { * rw_exit(...); // drop locks * zfs_dirent_unlock(dl); // unlock directory entry * VN_RELE(...); // release held vnodes - * if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + * if (error == ERESTART) { * dmu_tx_wait(tx); * dmu_tx_abort(tx); * goto top; @@ -207,6 +208,12 @@ zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; + /* + * Clean up any locks held by this process on the vp. + */ + cleanlocks(vp, ddi_get_pid(), 0); + cleanshares(vp, ddi_get_pid()); + ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); @@ -214,12 +221,6 @@ zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, if ((flag & (FSYNC | FDSYNC)) && (count == 1)) atomic_dec_32(&zp->z_sync_cnt); - /* - * Clean up any locks held by this process on the vp. - */ - cleanlocks(vp, ddi_get_pid(), 0); - cleanshares(vp, ddi_get_pid()); - if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && ZTOV(zp)->v_type == VREG && !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) && @@ -348,56 +349,29 @@ zfs_unmap_page(page_t *pp, caddr_t addr) * * On Write: If we find a memory mapped page, we write to *both* * the page and the dmu buffer. - * - * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when - * the file is memory mapped. 
*/ -static int -mappedwrite(vnode_t *vp, int nbytes, uio_t *uio, dmu_tx_t *tx) +static void +update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid) { - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - int64_t start, off; - int len = nbytes; - int error = 0; + int64_t off; - start = uio->uio_loffset; off = start & PAGEOFFSET; for (start &= PAGEMASK; len > 0; start += PAGESIZE) { page_t *pp; - uint64_t bytes = MIN(PAGESIZE - off, len); - uint64_t woff = uio->uio_loffset; + uint64_t nbytes = MIN(PAGESIZE - off, len); - /* - * We don't want a new page to "appear" in the middle of - * the file update (because it may not get the write - * update data), so we grab a lock to block - * zfs_getpage(). - */ - rw_enter(&zp->z_map_lock, RW_WRITER); if (pp = page_lookup(vp, start, SE_SHARED)) { caddr_t va; - rw_exit(&zp->z_map_lock); va = zfs_map_page(pp, S_WRITE); - error = uiomove(va+off, bytes, UIO_WRITE, uio); - if (error == 0) { - dmu_write(zfsvfs->z_os, zp->z_id, - woff, bytes, va+off, tx); - } + (void) dmu_read(os, oid, start+off, nbytes, va+off, + DMU_READ_PREFETCH); zfs_unmap_page(pp, va); page_unlock(pp); - } else { - error = dmu_write_uio(zfsvfs->z_os, zp->z_id, - uio, bytes, tx); - rw_exit(&zp->z_map_lock); } - len -= bytes; + len -= nbytes; off = 0; - if (error) - break; } - return (error); } /* @@ -473,6 +447,7 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) ssize_t n, nbytes; int error; rl_t *rl; + xuio_t *xuio = NULL; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); @@ -533,6 +508,35 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) ASSERT(uio->uio_loffset < zp->z_phys->zp_size); n = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset); + if ((uio->uio_extflg == UIO_XUIO) && + (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) { + int nblk; + int blksz = zp->z_blksz; + uint64_t offset = uio->uio_loffset; + + xuio = (xuio_t *)uio; + if ((ISP2(blksz))) { + nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset, + blksz)) / blksz; + } else { + ASSERT(offset + n <= blksz); + nblk = 1; + } + (void) dmu_xuio_init(xuio, nblk); + + if (vn_has_cached_data(vp)) { + /* + * For simplicity, we always allocate a full buffer + * even if we only expect to read a portion of a block. + */ + while (--nblk >= 0) { + (void) dmu_xuio_add(xuio, + dmu_request_arcbuf(zp->z_dbuf, blksz), + 0, blksz); + } + } + } + while (n > 0) { nbytes = MIN(n, zfs_read_chunk_size - P2PHASE(uio->uio_loffset, zfs_read_chunk_size)); @@ -550,7 +554,6 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) n -= nbytes; } - out: zfs_range_unlock(rl); @@ -595,6 +598,13 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) int max_blksz = zfsvfs->z_max_blksz; uint64_t pflags; int error; + arc_buf_t *abuf; + iovec_t *aiov; + xuio_t *xuio = NULL; + int i_iov = 0; + int iovcnt = uio->uio_iovcnt; + iovec_t *iovp = uio->uio_iov; + int write_eof; /* * Fasttrack empty write @@ -622,45 +632,60 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) zilog = zfsvfs->z_log; + /* + * Validate file offset + */ + woff = ioflag & FAPPEND ? zp->z_phys->zp_size : uio->uio_loffset; + if (woff < 0) { + ZFS_EXIT(zfsvfs); + return (EINVAL); + } + + /* + * Check for mandatory locks before calling zfs_range_lock() + * in order to prevent a deadlock with locks set via fcntl(). 
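+	 * Otherwise we could take the range lock first while the holder of
+	 * a conflicting fcntl() lock is itself blocked in zfs_read() or
+	 * zfs_write() waiting on that same range lock, leaving neither
+	 * thread able to make progress.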
+ */ + if (MANDMODE((mode_t)zp->z_phys->zp_mode) && + (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + /* * Pre-fault the pages to ensure slow (eg NFS) pages * don't hold up txg. + * Skip this if uio contains loaned arc_buf. */ - uio_prefaultpages(n, uio); + if ((uio->uio_extflg == UIO_XUIO) && + (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) + xuio = (xuio_t *)uio; + else + uio_prefaultpages(n, uio); /* * If in append mode, set the io offset pointer to eof. */ if (ioflag & FAPPEND) { /* - * Range lock for a file append: - * The value for the start of range will be determined by - * zfs_range_lock() (to guarantee append semantics). - * If this write will cause the block size to increase, - * zfs_range_lock() will lock the entire file, so we must - * later reduce the range after we grow the block size. + * Obtain an appending range lock to guarantee file append + * semantics. We reset the write offset once we have the lock. */ rl = zfs_range_lock(zp, 0, n, RL_APPEND); + woff = rl->r_off; if (rl->r_len == UINT64_MAX) { - /* overlocked, zp_size can't change */ - woff = uio->uio_loffset = zp->z_phys->zp_size; - } else { - woff = uio->uio_loffset = rl->r_off; + /* + * We overlocked the file because this write will cause + * the file block size to increase. + * Note that zp_size cannot change with this lock held. + */ + woff = zp->z_phys->zp_size; } + uio->uio_loffset = woff; } else { - woff = uio->uio_loffset; - /* - * Validate file offset - */ - if (woff < 0) { - ZFS_EXIT(zfsvfs); - return (EINVAL); - } - /* - * If we need to grow the block size then zfs_range_lock() - * will lock a wider range than we request here. - * Later after growing the block size we reduce the range. + * Note that if the file block size will change as a result of + * this write, then this range lock will lock the entire file + * so that we can re-write the block safely. */ rl = zfs_range_lock(zp, woff, n, RL_WRITER); } @@ -674,15 +699,9 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) if ((woff + n) > limit || woff > (limit - n)) n = limit - woff; - /* - * Check for mandatory locks - */ - if (MANDMODE((mode_t)zp->z_phys->zp_mode) && - (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) { - zfs_range_unlock(rl); - ZFS_EXIT(zfsvfs); - return (error); - } + /* Will this write extend the file length? */ + write_eof = (woff + n > zp->z_phys->zp_size); + end_size = MAX(zp->z_phys->zp_size, woff + n); /* @@ -691,22 +710,70 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) * and allows us to do more fine-grained space accounting. */ while (n > 0) { + abuf = NULL; + woff = uio->uio_loffset; +again: + if (zfs_usergroup_overquota(zfsvfs, + B_FALSE, zp->z_phys->zp_uid) || + zfs_usergroup_overquota(zfsvfs, + B_TRUE, zp->z_phys->zp_gid)) { + if (abuf != NULL) + dmu_return_arcbuf(abuf); + error = EDQUOT; + break; + } + + if (xuio && abuf == NULL) { + ASSERT(i_iov < iovcnt); + aiov = &iovp[i_iov]; + abuf = dmu_xuio_arcbuf(xuio, i_iov); + dmu_xuio_clear(xuio, i_iov); + DTRACE_PROBE3(zfs_cp_write, int, i_iov, + iovec_t *, aiov, arc_buf_t *, abuf); + ASSERT((aiov->iov_base == abuf->b_data) || + ((char *)aiov->iov_base - (char *)abuf->b_data + + aiov->iov_len == arc_buf_size(abuf))); + i_iov++; + } else if (abuf == NULL && n >= max_blksz && + woff >= zp->z_phys->zp_size && + P2PHASE(woff, max_blksz) == 0 && + zp->z_blksz == max_blksz) { + /* + * This write covers a full block. 
"Borrow" a buffer + * from the dmu so that we can fill it before we enter + * a transaction. This avoids the possibility of + * holding up the transaction if the data copy hangs + * up on a pagefault (e.g., from an NFS server mapping). + */ + size_t cbytes; + + abuf = dmu_request_arcbuf(zp->z_dbuf, max_blksz); + ASSERT(abuf != NULL); + ASSERT(arc_buf_size(abuf) == max_blksz); + if (error = uiocopy(abuf->b_data, max_blksz, + UIO_WRITE, uio, &cbytes)) { + dmu_return_arcbuf(abuf); + break; + } + ASSERT(cbytes == max_blksz); + } + /* * Start a transaction. */ - woff = uio->uio_loffset; tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_bonus(tx, zp->z_id); dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz)); - error = dmu_tx_assign(tx, zfsvfs->z_assign); + error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { - if (error == ERESTART && - zfsvfs->z_assign == TXG_NOWAIT) { + if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); - continue; + goto again; } dmu_tx_abort(tx); + if (abuf != NULL) + dmu_return_arcbuf(abuf); break; } @@ -734,18 +801,39 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) * Perhaps we should use SPA_MAXBLOCKSIZE chunks? */ nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); - rw_enter(&zp->z_map_lock, RW_READER); - tx_bytes = uio->uio_resid; - if (vn_has_cached_data(vp)) { - rw_exit(&zp->z_map_lock); - error = mappedwrite(vp, nbytes, uio, tx); + if (abuf == NULL) { + tx_bytes = uio->uio_resid; + error = dmu_write_uio(zfsvfs->z_os, zp->z_id, uio, + nbytes, tx); + tx_bytes -= uio->uio_resid; } else { - error = dmu_write_uio(zfsvfs->z_os, zp->z_id, - uio, nbytes, tx); - rw_exit(&zp->z_map_lock); + tx_bytes = nbytes; + ASSERT(xuio == NULL || tx_bytes == aiov->iov_len); + /* + * If this is not a full block write, but we are + * extending the file past EOF and this data starts + * block-aligned, use assign_arcbuf(). Otherwise, + * write via dmu_write(). + */ + if (tx_bytes < max_blksz && (!write_eof || + aiov->iov_base != abuf->b_data)) { + ASSERT(xuio); + dmu_write(zfsvfs->z_os, zp->z_id, woff, + aiov->iov_len, aiov->iov_base, tx); + dmu_return_arcbuf(abuf); + xuio_stat_wbuf_copied(); + } else { + ASSERT(xuio || tx_bytes == max_blksz); + dmu_assign_arcbuf(zp->z_dbuf, woff, abuf, tx); + } + ASSERT(tx_bytes <= uio->uio_resid); + uioskip(uio, tx_bytes); + } + if (tx_bytes && vn_has_cached_data(vp)) { + update_pages(vp, woff, + tx_bytes, zfsvfs->z_os, zp->z_id); } - tx_bytes -= uio->uio_resid; /* * If we made no progress, we're done. If we made even @@ -807,7 +895,7 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) * If we're in replay mode, or we made no progress, return error. * Otherwise, it's at least a partial write, so it's successful. */ - if (zfsvfs->z_assign >= TXG_INITIAL || uio->uio_resid == start_resid) { + if (zfsvfs->z_replay || uio->uio_resid == start_resid) { ZFS_EXIT(zfsvfs); return (error); } @@ -820,19 +908,32 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) } void -zfs_get_done(dmu_buf_t *db, void *vzgd) +zfs_get_done(zgd_t *zgd, int error) { - zgd_t *zgd = (zgd_t *)vzgd; - rl_t *rl = zgd->zgd_rl; - vnode_t *vp = ZTOV(rl->r_zp); + znode_t *zp = zgd->zgd_private; + objset_t *os = zp->z_zfsvfs->z_os; + + if (zgd->zgd_db) + dmu_buf_rele(zgd->zgd_db, zgd); + + zfs_range_unlock(zgd->zgd_rl); + + /* + * Release the vnode asynchronously as we currently have the + * txg stopped from syncing. 
+ */ + VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os))); + + if (error == 0 && zgd->zgd_bp) + zil_add_block(zgd->zgd_zilog, zgd->zgd_bp); - dmu_buf_rele(db, vzgd); - zfs_range_unlock(rl); - VN_RELE(vp); - zil_add_block(zgd->zgd_zilog, zgd->zgd_bp); kmem_free(zgd, sizeof (zgd_t)); } +#ifdef DEBUG +static int zil_fault_io = 0; +#endif + /* * Get data to generate a TX_WRITE intent log record. */ @@ -842,26 +943,36 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) zfsvfs_t *zfsvfs = arg; objset_t *os = zfsvfs->z_os; znode_t *zp; - uint64_t off = lr->lr_offset; + uint64_t object = lr->lr_foid; + uint64_t offset = lr->lr_offset; + uint64_t size = lr->lr_length; + blkptr_t *bp = &lr->lr_blkptr; dmu_buf_t *db; - rl_t *rl; zgd_t *zgd; - int dlen = lr->lr_length; /* length of user data */ int error = 0; - ASSERT(zio); - ASSERT(dlen != 0); + ASSERT(zio != NULL); + ASSERT(size != 0); /* * Nothing to do if the file has been removed */ - if (zfs_zget(zfsvfs, lr->lr_foid, &zp) != 0) + if (zfs_zget(zfsvfs, object, &zp) != 0) return (ENOENT); if (zp->z_unlinked) { - VN_RELE(ZTOV(zp)); + /* + * Release the vnode asynchronously as we currently have the + * txg stopped from syncing. + */ + VN_RELE_ASYNC(ZTOV(zp), + dsl_pool_vnrele_taskq(dmu_objset_pool(os))); return (ENOENT); } + zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP); + zgd->zgd_zilog = zfsvfs->z_log; + zgd->zgd_private = zp; + /* * Write records come in two flavors: immediate and indirect. * For small writes it's cheaper to store the data with the @@ -870,16 +981,16 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) * we don't have to write the data twice. */ if (buf != NULL) { /* immediate write */ - rl = zfs_range_lock(zp, off, dlen, RL_READER); + zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER); /* test for truncation needs to be done while range locked */ - if (off >= zp->z_phys->zp_size) { + if (offset >= zp->z_phys->zp_size) { error = ENOENT; - goto out; + } else { + error = dmu_read(os, object, offset, size, buf, + DMU_READ_NO_PREFETCH); } - VERIFY(0 == dmu_read(os, lr->lr_foid, off, dlen, buf)); + ASSERT(error == 0 || error == ENOENT); } else { /* indirect write */ - uint64_t boff; /* block starting offset */ - /* * Have to lock the whole block to ensure when it's * written out and it's checksum is being calculated @@ -887,50 +998,58 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) * blocksize after we get the lock in case it's changed! */ for (;;) { - if (ISP2(zp->z_blksz)) { - boff = P2ALIGN_TYPED(off, zp->z_blksz, - uint64_t); - } else { - boff = 0; - } - dlen = zp->z_blksz; - rl = zfs_range_lock(zp, boff, dlen, RL_READER); - if (zp->z_blksz == dlen) + uint64_t blkoff; + size = zp->z_blksz; + blkoff = ISP2(size) ? 
P2PHASE(offset, size) : offset; + offset -= blkoff; + zgd->zgd_rl = zfs_range_lock(zp, offset, size, + RL_READER); + if (zp->z_blksz == size) break; - zfs_range_unlock(rl); + offset += blkoff; + zfs_range_unlock(zgd->zgd_rl); } /* test for truncation needs to be done while range locked */ - if (off >= zp->z_phys->zp_size) { + if (lr->lr_offset >= zp->z_phys->zp_size) error = ENOENT; - goto out; +#ifdef DEBUG + if (zil_fault_io) { + error = EIO; + zil_fault_io = 0; } - zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP); - zgd->zgd_rl = rl; - zgd->zgd_zilog = zfsvfs->z_log; - zgd->zgd_bp = &lr->lr_blkptr; - VERIFY(0 == dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db)); - ASSERT(boff == db->db_offset); - lr->lr_blkoff = off - boff; - error = dmu_sync(zio, db, &lr->lr_blkptr, - lr->lr_common.lrc_txg, zfs_get_done, zgd); - ASSERT((error && error != EINPROGRESS) || - lr->lr_length <= zp->z_blksz); +#endif if (error == 0) - zil_add_block(zfsvfs->z_log, &lr->lr_blkptr); - /* - * If we get EINPROGRESS, then we need to wait for a - * write IO initiated by dmu_sync() to complete before - * we can release this dbuf. We will finish everything - * up in the zfs_get_done() callback. - */ - if (error == EINPROGRESS) - return (0); - dmu_buf_rele(db, zgd); - kmem_free(zgd, sizeof (zgd_t)); + error = dmu_buf_hold(os, object, offset, zgd, &db); + + if (error == 0) { + zgd->zgd_db = db; + zgd->zgd_bp = bp; + + ASSERT(db->db_offset == offset); + ASSERT(db->db_size == size); + + error = dmu_sync(zio, lr->lr_common.lrc_txg, + zfs_get_done, zgd); + ASSERT(error || lr->lr_length <= zp->z_blksz); + + /* + * On success, we need to wait for the write I/O + * initiated by dmu_sync() to complete before we can + * release this dbuf. We will finish everything up + * in the zfs_get_done() callback. + */ + if (error == 0) + return (0); + + if (error == EALREADY) { + lr->lr_common.lrc_txtype = TX_WRITE2; + error = 0; + } + } } -out: - zfs_range_unlock(rl); - VN_RELE(ZTOV(zp)); + + zfs_get_done(zgd, error); + return (error); } @@ -955,6 +1074,27 @@ zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr, return (error); } +/* + * If vnode is for a device return a specfs vnode instead. + */ +static int +specvp_check(vnode_t **vpp, cred_t *cr) +{ + int error = 0; + + if (IS_DEVVP(*vpp)) { + struct vnode *svp; + + svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); + VN_RELE(*vpp); + if (svp == NULL) + error = ENOSYS; + *vpp = svp; + } + return (error); +} + + /* * Lookup an entry in a directory, or an extended attribute directory. * If it exists, return a held vnode reference for it. @@ -985,7 +1125,46 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, { znode_t *zdp = VTOZ(dvp); zfsvfs_t *zfsvfs = zdp->z_zfsvfs; - int error; + int error = 0; + + /* fast path */ + if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) { + + if (dvp->v_type != VDIR) { + return (ENOTDIR); + } else if (zdp->z_dbuf == NULL) { + return (EIO); + } + + if (nm[0] == 0 || (nm[0] == '.' 
&& nm[1] == '\0')) { + error = zfs_fastaccesschk_execute(zdp, cr); + if (!error) { + *vpp = dvp; + VN_HOLD(*vpp); + return (0); + } + return (error); + } else { + vnode_t *tvp = dnlc_lookup(dvp, nm); + + if (tvp) { + error = zfs_fastaccesschk_execute(zdp, cr); + if (error) { + VN_RELE(tvp); + return (error); + } + if (tvp == DNLC_NO_VNODE) { + VN_RELE(tvp); + return (ENOENT); + } else { + *vpp = tvp; + return (specvp_check(vpp, cr)); + } + } + } + } + + DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zdp); @@ -1050,21 +1229,8 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, } error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp); - if (error == 0) { - /* - * Convert device special files - */ - if (IS_DEVVP(*vpp)) { - vnode_t *svp; - - svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); - VN_RELE(*vpp); - if (svp == NULL) - error = ENOSYS; - else - *vpp = svp; - } - } + if (error == 0) + error = specvp_check(vpp, cr); ZFS_EXIT(zfsvfs); return (error); @@ -1108,11 +1274,11 @@ zfs_create(vnode_t *dvp, char *name, vattr_t *vap, vcexcl_t excl, zfs_dirlock_t *dl; dmu_tx_t *tx; int error; - zfs_acl_t *aclp = NULL; - zfs_fuid_info_t *fuidp = NULL; ksid_t *ksid; uid_t uid; gid_t gid = crgetgid(cr); + zfs_acl_ids_t acl_ids; + boolean_t fuid_dirtied; /* * If we have an ephemeral id, ACL, or XVATTR then @@ -1175,21 +1341,9 @@ zfs_create(vnode_t *dvp, char *name, vattr_t *vap, vcexcl_t excl, if (strcmp(name, "..") == 0) error = EISDIR; ZFS_EXIT(zfsvfs); - if (aclp) - zfs_acl_free(aclp); - return (error); - } - } - if (vsecp && aclp == NULL) { - error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, &aclp); - if (error) { - ZFS_EXIT(zfsvfs); - if (dl) - zfs_dirent_unlock(dl); return (error); } } - if (zp == NULL) { uint64_t txtype; @@ -1211,52 +1365,52 @@ zfs_create(vnode_t *dvp, char *name, vattr_t *vap, vcexcl_t excl, goto out; } + if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp, + &acl_ids)) != 0) + goto out; + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { + zfs_acl_ids_free(&acl_ids); + error = EDQUOT; + goto out; + } + tx = dmu_tx_create(os); dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - if ((aclp && aclp->z_has_fuids) || IS_EPHEMERAL(uid) || - IS_EPHEMERAL(gid)) { - if (zfsvfs->z_fuid_obj == 0) { - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, - FALSE, NULL); - } else { - dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); - dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - } - } + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); dmu_tx_hold_bonus(tx, dzp->z_id); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); - if ((dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) || aclp) { + if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE); } - error = dmu_tx_assign(tx, zfsvfs->z_assign); + error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { + zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); - if (error == ERESTART && - zfsvfs->z_assign == TXG_NOWAIT) { + if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); - if (aclp) - zfs_acl_free(aclp); return (error); } - zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, aclp, &fuidp); + zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids); + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + (void) zfs_link_create(dl, zp, 
tx, ZNEW); + txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); if (flag & FIGNORECASE) txtype |= TX_CI; zfs_log_create(zilog, tx, txtype, dzp, zp, name, - vsecp, fuidp, vap); - if (fuidp) - zfs_fuid_info_free(fuidp); + vsecp, acl_ids.z_fuidp, vap); + zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); } else { int aflags = (flag & FAPPEND) ? V_APPEND : 0; @@ -1313,22 +1467,8 @@ zfs_create(vnode_t *dvp, char *name, vattr_t *vap, vcexcl_t excl, VN_RELE(ZTOV(zp)); } else { *vpp = ZTOV(zp); - /* - * If vnode is for a device return a specfs vnode instead. - */ - if (IS_DEVVP(*vpp)) { - struct vnode *svp; - - svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); - VN_RELE(*vpp); - if (svp == NULL) { - error = ENOSYS; - } - *vpp = svp; - } + error = specvp_check(vpp, cr); } - if (aclp) - zfs_acl_free(aclp); ZFS_EXIT(zfsvfs); return (error); @@ -1449,11 +1589,11 @@ zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct, /* charge as an update -- would be nice not to charge at all */ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); - error = dmu_tx_assign(tx, zfsvfs->z_assign); + error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); VN_RELE(vp); - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; @@ -1563,12 +1703,12 @@ zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr, uint64_t txtype; dmu_tx_t *tx; int error; - zfs_acl_t *aclp = NULL; - zfs_fuid_info_t *fuidp = NULL; int zf = ZNEW; ksid_t *ksid; uid_t uid; gid_t gid = crgetgid(cr); + zfs_acl_ids_t acl_ids; + boolean_t fuid_dirtied; ASSERT(vap->va_type == VDIR); @@ -1629,59 +1769,52 @@ zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr, return (error); } - if (vsecp && aclp == NULL) { - error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, &aclp); - if (error) { - zfs_dirent_unlock(dl); - ZFS_EXIT(zfsvfs); - return (error); - } + if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp, + &acl_ids)) != 0) { + zfs_dirent_unlock(dl); + ZFS_EXIT(zfsvfs); + return (error); + } + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { + zfs_acl_ids_free(&acl_ids); + zfs_dirent_unlock(dl); + ZFS_EXIT(zfsvfs); + return (EDQUOT); } + /* * Add a new entry to the directory. */ tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); - if ((aclp && aclp->z_has_fuids) || IS_EPHEMERAL(uid) || - IS_EPHEMERAL(gid)) { - if (zfsvfs->z_fuid_obj == 0) { - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); - } else { - dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); - dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - } - } - if ((dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) || aclp) + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE); - error = dmu_tx_assign(tx, zfsvfs->z_assign); + error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { + zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); - if (aclp) - zfs_acl_free(aclp); return (error); } /* * Create new node. 
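+	 * The znode is created with the ACL ids computed above; if the
+	 * FUID tables were dirtied, they are synced in the same
+	 * transaction.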
*/ - zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, aclp, &fuidp); - - if (aclp) - zfs_acl_free(aclp); + zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids); + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); /* * Now put new name in parent dir. */ @@ -1692,10 +1825,10 @@ zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr, txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap); if (flags & FIGNORECASE) txtype |= TX_CI; - zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, fuidp, vap); + zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, + acl_ids.z_fuidp, vap); - if (fuidp) - zfs_fuid_info_free(fuidp); + zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); zfs_dirent_unlock(dl); @@ -1789,13 +1922,13 @@ zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr, dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_bonus(tx, zp->z_id); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); - error = dmu_tx_assign(tx, zfsvfs->z_assign); + error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { rw_exit(&zp->z_parent_lock); rw_exit(&zp->z_name_lock); zfs_dirent_unlock(dl); VN_RELE(vp); - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; @@ -2004,6 +2137,21 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, } } + if (flags & V_RDDIR_ACCFILTER) { + /* + * If we have no access at all, don't include + * this entry in the returned information + */ + znode_t *ezp; + if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0) + goto skip_entry; + if (!zfs_has_access(ezp, cr)) { + VN_RELE(ZTOV(ezp)); + goto skip_entry; + } + VN_RELE(ZTOV(ezp)); + } + if (flags & V_RDDIR_ENTFLAGS) reclen = EDIRENT_RECLEN(strlen(zap.za_name)); else @@ -2055,6 +2203,7 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, if (prefetch) dmu_prefetch(os, objnum, 0, 0); + skip_entry: /* * Move to the next entry, fill in the previous offset. */ @@ -2155,8 +2304,6 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, ZFS_VERIFY_ZP(zp); pzp = zp->z_phys; - mutex_enter(&zp->z_lock); - /* * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES. * Also, if we are the owner don't bother, since owner should @@ -2166,7 +2313,6 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, (pzp->zp_uid != crgetuid(cr))) { if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0, skipaclchk, cr)) { - mutex_exit(&zp->z_lock); ZFS_EXIT(zfsvfs); return (error); } @@ -2177,6 +2323,7 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, * than to determine whether we were asked the question. 
*/ + mutex_enter(&zp->z_lock); vap->va_type = vp->v_type; vap->va_mode = pzp->zp_mode & MODEMASK; zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid); @@ -2292,6 +2439,12 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, ZFS_TIME_DECODE(&xoap->xoa_createtime, pzp->zp_crtime); XVA_SET_RTN(xvap, XAT_CREATETIME); } + + if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { + xoap->xoa_reparse = + ((pzp->zp_flags & ZFS_REPARSE) != 0); + XVA_SET_RTN(xvap, XAT_REPARSE); + } } ZFS_TIME_DECODE(&vap->va_atime, pzp->zp_atime); @@ -2342,10 +2495,12 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, zilog_t *zilog; dmu_tx_t *tx; vattr_t oldva; + xvattr_t tmpxvattr; uint_t mask = vap->va_mask; uint_t saved_mask; int trim_mask = 0; uint64_t new_mode; + uint64_t new_uid, new_gid; znode_t *attrzp; int need_policy = FALSE; int err; @@ -2354,6 +2509,7 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, xoptattr_t *xoap; zfs_acl_t *aclp = NULL; boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; + boolean_t fuid_dirtied = B_FALSE; if (mask == 0) return (0); @@ -2396,6 +2552,8 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, */ xoap = xva_getxoptattr(xvap); + xva_init(&tmpxvattr); + /* * Immutable files can only alter immutable bit and atime */ @@ -2428,6 +2586,7 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, top: attrzp = NULL; + /* Can this be moved to before the top label? */ if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { ZFS_EXIT(zfsvfs); return (EROFS); @@ -2518,45 +2677,101 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, oldva.va_mode = pzp->zp_mode; zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); if (mask & AT_XVATTR) { - if ((need_policy == FALSE) && - (XVA_ISSET_REQ(xvap, XAT_APPENDONLY) && - xoap->xoa_appendonly != - ((pzp->zp_flags & ZFS_APPENDONLY) != 0)) || - (XVA_ISSET_REQ(xvap, XAT_NOUNLINK) && - xoap->xoa_nounlink != - ((pzp->zp_flags & ZFS_NOUNLINK) != 0)) || - (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE) && - xoap->xoa_immutable != - ((pzp->zp_flags & ZFS_IMMUTABLE) != 0)) || - (XVA_ISSET_REQ(xvap, XAT_NODUMP) && - xoap->xoa_nodump != - ((pzp->zp_flags & ZFS_NODUMP) != 0)) || - (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED) && - xoap->xoa_av_modified != - ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0)) || - ((XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED) && - ((vp->v_type != VREG && xoap->xoa_av_quarantined) || - xoap->xoa_av_quarantined != - ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0)))) || - (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) || - (XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { - need_policy = TRUE; + /* + * Update xvattr mask to include only those attributes + * that are actually changing. + * + * the bits will be restored prior to actually setting + * the attributes so the caller thinks they were set. 
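+		 * Bits that turn out to be unchanged are cleared from the
+		 * request and remembered in tmpxvattr, then restored in the
+		 * "restore trimmed off masks" block below so the returned
+		 * mask is complete.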
+ */ + if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { + if (xoap->xoa_appendonly != + ((pzp->zp_flags & ZFS_APPENDONLY) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_APPENDONLY); + XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY); + } } - } - - mutex_exit(&zp->z_lock); - if (mask & AT_MODE) { - if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) { - err = secpolicy_setid_setsticky_clear(vp, vap, - &oldva, cr); - if (err) { - ZFS_EXIT(zfsvfs); - return (err); + if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { + if (xoap->xoa_nounlink != + ((pzp->zp_flags & ZFS_NOUNLINK) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_NOUNLINK); + XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK); } - trim_mask |= AT_MODE; - } else { - need_policy = TRUE; + } + + if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { + if (xoap->xoa_immutable != + ((pzp->zp_flags & ZFS_IMMUTABLE) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_IMMUTABLE); + XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { + if (xoap->xoa_nodump != + ((pzp->zp_flags & ZFS_NODUMP) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_NODUMP); + XVA_SET_REQ(&tmpxvattr, XAT_NODUMP); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { + if (xoap->xoa_av_modified != + ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); + XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { + if ((vp->v_type != VREG && + xoap->xoa_av_quarantined) || + xoap->xoa_av_quarantined != + ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); + XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { + mutex_exit(&zp->z_lock); + ZFS_EXIT(zfsvfs); + return (EPERM); + } + + if (need_policy == FALSE && + (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || + XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { + need_policy = TRUE; + } + } + + mutex_exit(&zp->z_lock); + + if (mask & AT_MODE) { + if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) { + err = secpolicy_setid_setsticky_clear(vp, vap, + &oldva, cr); + if (err) { + ZFS_EXIT(zfsvfs); + return (err); + } + trim_mask |= AT_MODE; + } else { + need_policy = TRUE; } } @@ -2592,30 +2807,14 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_bonus(tx, zp->z_id); - if (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) || - ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid))) { - if (zfsvfs->z_fuid_obj == 0) { - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); - } else { - dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); - dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - } - } if (mask & AT_MODE) { uint64_t pmode = pzp->zp_mode; new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); - if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)) { - dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); - return (err); - } + if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)) + goto out; if (pzp->zp_acl.z_acl_extern_obj) { /* Are we upgrading ACL from old V0 format to new V1 */ if (zfsvfs->z_version <= ZPL_VERSION_FUID && @@ -2637,36 +2836,53 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, } } - if ((mask & (AT_UID | AT_GID)) && pzp->zp_xattr != 0) { - err = 
zfs_zget(zp->z_zfsvfs, pzp->zp_xattr, &attrzp); - if (err) { - dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); - if (aclp) - zfs_acl_free(aclp); - return (err); + if (mask & (AT_UID | AT_GID)) { + if (pzp->zp_xattr) { + err = zfs_zget(zp->z_zfsvfs, pzp->zp_xattr, &attrzp); + if (err) + goto out; + dmu_tx_hold_bonus(tx, attrzp->z_id); + } + if (mask & AT_UID) { + new_uid = zfs_fuid_create(zfsvfs, + (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); + if (new_uid != pzp->zp_uid && + zfs_usergroup_overquota(zfsvfs, B_FALSE, new_uid)) { + err = EDQUOT; + goto out; + } } - dmu_tx_hold_bonus(tx, attrzp->z_id); - } - - err = dmu_tx_assign(tx, zfsvfs->z_assign); - if (err) { - if (attrzp) - VN_RELE(ZTOV(attrzp)); - if (aclp) { - zfs_acl_free(aclp); - aclp = NULL; + if (mask & AT_GID) { + new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid, + cr, ZFS_GROUP, &fuidp); + if (new_gid != pzp->zp_gid && + zfs_usergroup_overquota(zfsvfs, B_TRUE, new_gid)) { + err = EDQUOT; + goto out; + } } + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) { + if (zfsvfs->z_fuid_obj == 0) { + dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + FUID_SIZE_ESTIMATE(zfsvfs)); + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, + FALSE, NULL); + } else { + dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); + dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, + FUID_SIZE_ESTIMATE(zfsvfs)); + } + } + } - if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + err = dmu_tx_assign(tx, TXG_NOWAIT); + if (err) { + if (err == ERESTART) dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } - dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); - return (err); + goto out; } dmu_buf_will_dirty(zp->z_dbuf, tx); @@ -2684,8 +2900,10 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, if (mask & AT_MODE) { mutex_enter(&zp->z_acl_lock); zp->z_phys->zp_mode = new_mode; - err = zfs_aclset_common(zp, aclp, cr, &fuidp, tx); + err = zfs_aclset_common(zp, aclp, cr, tx); ASSERT3U(err, ==, 0); + zp->z_acl_cached = aclp; + aclp = NULL; mutex_exit(&zp->z_acl_lock); } @@ -2693,25 +2911,17 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, mutex_enter(&attrzp->z_lock); if (mask & AT_UID) { - pzp->zp_uid = zfs_fuid_create(zfsvfs, - vap->va_uid, cr, ZFS_OWNER, tx, &fuidp); - if (attrzp) { - attrzp->z_phys->zp_uid = zfs_fuid_create(zfsvfs, - vap->va_uid, cr, ZFS_OWNER, tx, &fuidp); - } + pzp->zp_uid = new_uid; + if (attrzp) + attrzp->z_phys->zp_uid = new_uid; } if (mask & AT_GID) { - pzp->zp_gid = zfs_fuid_create(zfsvfs, vap->va_gid, - cr, ZFS_GROUP, tx, &fuidp); + pzp->zp_gid = new_gid; if (attrzp) - attrzp->z_phys->zp_gid = zfs_fuid_create(zfsvfs, - vap->va_gid, cr, ZFS_GROUP, tx, &fuidp); + attrzp->z_phys->zp_gid = new_gid; } - if (aclp) - zfs_acl_free(aclp); - if (attrzp) mutex_exit(&attrzp->z_lock); @@ -2732,6 +2942,31 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, */ if (xoap && (mask & AT_XVATTR)) { + + /* + * restore trimmed off masks + * so that return masks can be set for caller. 
+ */ + + if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) { + XVA_SET_REQ(xvap, XAT_APPENDONLY); + } + if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) { + XVA_SET_REQ(xvap, XAT_NOUNLINK); + } + if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) { + XVA_SET_REQ(xvap, XAT_IMMUTABLE); + } + if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) { + XVA_SET_REQ(xvap, XAT_NODUMP); + } + if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) { + XVA_SET_REQ(xvap, XAT_AV_MODIFIED); + } + if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) { + XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); + } + if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { size_t len; dmu_object_info_t doi; @@ -2748,17 +2983,33 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, zfs_xvattr_set(zp, xvap); } + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + if (mask != 0) zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); - if (fuidp) - zfs_fuid_info_free(fuidp); mutex_exit(&zp->z_lock); +out: if (attrzp) VN_RELE(ZTOV(attrzp)); - dmu_tx_commit(tx); + if (aclp) + zfs_acl_free(aclp); + + if (fuidp) { + zfs_fuid_info_free(fuidp); + fuidp = NULL; + } + + if (err) + dmu_tx_abort(tx); + else + dmu_tx_commit(tx); + + if (err == ERESTART) + goto top; ZFS_EXIT(zfsvfs); return (err); @@ -2998,6 +3249,15 @@ zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr, } } + /* + * If the source and destination directories are the same, we should + * grab the z_name_lock of that directory only once. + */ + if (sdzp == tdzp) { + zflg |= ZHAVELOCK; + rw_enter(&sdzp->z_name_lock, RW_READER); + } + if (cmp < 0) { serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS | zflg, NULL, NULL); @@ -3020,6 +3280,10 @@ zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr, if (tzp) VN_RELE(ZTOV(tzp)); } + + if (sdzp == tdzp) + rw_exit(&sdzp->z_name_lock); + if (strcmp(snm, "..") == 0) serr = EINVAL; ZFS_EXIT(zfsvfs); @@ -3028,6 +3292,10 @@ zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr, if (terr) { zfs_dirent_unlock(sdl); VN_RELE(ZTOV(szp)); + + if (sdzp == tdzp) + rw_exit(&sdzp->z_name_lock); + if (strcmp(tnm, "..") == 0) terr = EINVAL; ZFS_EXIT(zfsvfs); @@ -3104,16 +3372,20 @@ zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr, if (tzp) dmu_tx_hold_bonus(tx, tzp->z_id); /* parent changes */ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); - error = dmu_tx_assign(tx, zfsvfs->z_assign); + error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { if (zl != NULL) zfs_rename_unlock(&zl); zfs_dirent_unlock(sdl); zfs_dirent_unlock(tdl); + + if (sdzp == tdzp) + rw_exit(&sdzp->z_name_lock); + VN_RELE(ZTOV(szp)); if (tzp) VN_RELE(ZTOV(tzp)); - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; @@ -3151,6 +3423,10 @@ zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr, zfs_dirent_unlock(sdl); zfs_dirent_unlock(tdl); + if (sdzp == tdzp) + rw_exit(&sdzp->z_name_lock); + + VN_RELE(ZTOV(szp)); if (tzp) VN_RELE(ZTOV(tzp)); @@ -3189,7 +3465,8 @@ zfs_symlink(vnode_t *dvp, char *name, vattr_t *vap, char *link, cred_t *cr, int len = strlen(link); int error; int zflg = ZNEW; - zfs_fuid_info_t *fuidp = NULL; + zfs_acl_ids_t acl_ids; + boolean_t fuid_dirtied; ASSERT(vap->va_type == VLNK); @@ -3224,28 +3501,27 @@ zfs_symlink(vnode_t *dvp, char *name, vattr_t *vap, char *link, cred_t *cr, return (error); } + VERIFY(0 == zfs_acl_ids_create(dzp, 0, vap, cr, NULL, &acl_ids)); + if (zfs_acl_ids_overquota(zfsvfs, 
&acl_ids)) { + zfs_acl_ids_free(&acl_ids); + zfs_dirent_unlock(dl); + ZFS_EXIT(zfsvfs); + return (EDQUOT); + } tx = dmu_tx_create(zfsvfs->z_os); + fuid_dirtied = zfsvfs->z_fuid_dirty; dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); dmu_tx_hold_bonus(tx, dzp->z_id); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); - if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) + if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE); - if (IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr))) { - if (zfsvfs->z_fuid_obj == 0) { - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); - } else { - dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); - dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - } - } - error = dmu_tx_assign(tx, zfsvfs->z_assign); + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { + zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; @@ -3263,13 +3539,16 @@ zfs_symlink(vnode_t *dvp, char *name, vattr_t *vap, char *link, cred_t *cr, * otherwise, store it just like any other file data. */ if (sizeof (znode_phys_t) + len <= dmu_bonus_max()) { - zfs_mknode(dzp, vap, tx, cr, 0, &zp, len, NULL, &fuidp); + zfs_mknode(dzp, vap, tx, cr, 0, &zp, len, &acl_ids); if (len != 0) bcopy(link, zp->z_phys + 1, len); } else { dmu_buf_t *dbp; - zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, NULL, &fuidp); + zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids); + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); /* * Nothing can access the znode yet so no locking needed * for growing the znode's blocksize. @@ -3290,15 +3569,14 @@ zfs_symlink(vnode_t *dvp, char *name, vattr_t *vap, char *link, cred_t *cr, * Insert the new object into the directory. */ (void) zfs_link_create(dl, zp, tx, ZNEW); -out: if (error == 0) { uint64_t txtype = TX_SYMLINK; if (flags & FIGNORECASE) txtype |= TX_CI; zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); } - if (fuidp) - zfs_fuid_info_free(fuidp); + + zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); @@ -3462,10 +3740,10 @@ zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr, tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_bonus(tx, szp->z_id); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); - error = dmu_tx_assign(tx, zfsvfs->z_assign); + error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; @@ -3534,9 +3812,7 @@ zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; - zilog_t *zilog = zfsvfs->z_log; dmu_tx_t *tx; - rl_t *rl; u_offset_t off, koff; size_t len, klen; uint64_t filesz; @@ -3547,30 +3823,22 @@ zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, len = PAGESIZE; /* * If our blocksize is bigger than the page size, try to kluster - * muiltiple pages so that we write a full block (thus avoiding + * multiple pages so that we write a full block (thus avoiding * a read-modify-write). */ if (off < filesz && zp->z_blksz > PAGESIZE) { - if (!ISP2(zp->z_blksz)) { - /* Only one block in the file. 
 */
-			klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE);
-			koff = 0;
-		} else {
-			klen = zp->z_blksz;
-			koff = P2ALIGN(off, (u_offset_t)klen);
-		}
+		klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE);
+		koff = ISP2(klen) ? P2ALIGN(off, (u_offset_t)klen) : 0;
 		ASSERT(koff <= filesz);
 		if (koff + klen > filesz)
 			klen = P2ROUNDUP(filesz - koff, (uint64_t)PAGESIZE);
 		pp = pvn_write_kluster(vp, pp, &off, &len, koff, klen, flags);
 	}
 	ASSERT3U(btop(len), ==, btopr(len));
-top:
-	rl = zfs_range_lock(zp, off, len, RL_WRITER);
+
 	/*
 	 * Can't push pages past end-of-file.
 	 */
-	filesz = zp->z_phys->zp_size;
 	if (off >= filesz) {
 		/* ignore all pages */
 		err = 0;
@@ -3586,16 +3854,20 @@ zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
 		len = filesz - off;
 	}
 
+	if (zfs_usergroup_overquota(zfsvfs, B_FALSE, zp->z_phys->zp_uid) ||
+	    zfs_usergroup_overquota(zfsvfs, B_TRUE, zp->z_phys->zp_gid)) {
+		err = EDQUOT;
+		goto out;
+	}
+top:
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_write(tx, zp->z_id, off, len);
 	dmu_tx_hold_bonus(tx, zp->z_id);
-	err = dmu_tx_assign(tx, zfsvfs->z_assign);
+	err = dmu_tx_assign(tx, TXG_NOWAIT);
 	if (err != 0) {
-		if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
-			zfs_range_unlock(rl);
+		if (err == ERESTART) {
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
-			err = 0;
 			goto top;
 		}
 		dmu_tx_abort(tx);
@@ -3613,12 +3885,11 @@ zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
 	if (err == 0) {
 		zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
-		zfs_log_write(zilog, tx, TX_WRITE, zp, off, len, 0);
-		dmu_tx_commit(tx);
+		zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
 	}
+	dmu_tx_commit(tx);
 
 out:
-	zfs_range_unlock(rl);
 	pvn_write_done(pp, (err ? B_ERROR : 0) | flags);
 	if (offp)
 		*offp = off;
@@ -3655,31 +3926,50 @@ zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
 	page_t		*pp;
 	size_t		io_len;
 	u_offset_t	io_off;
-	uint64_t	filesz;
+	uint_t		blksz;
+	rl_t		*rl;
 	int		error = 0;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
-	if (len == 0) {
+	/*
+	 * Align this request to the file block size in case we kluster.
+	 * XXX - this can result in pretty aggressive locking, which can
+	 * impact simultaneous read/write access. One option might be
+	 * to break up long requests (len == 0) into block-by-block
+	 * operations to get narrower locking.
+	 */
+	blksz = zp->z_blksz;
+	if (ISP2(blksz))
+		io_off = P2ALIGN_TYPED(off, blksz, u_offset_t);
+	else
+		io_off = 0;
+	if (len > 0 && ISP2(blksz))
+		io_len = P2ROUNDUP_TYPED(len + (off - io_off), blksz, size_t);
+	else
+		io_len = 0;
+
+	if (io_len == 0) {
 		/*
-		 * Search the entire vp list for pages >= off.
+		 * Search the entire vp list for pages >= io_off.
 		 */
-		error = pvn_vplist_dirty(vp, (u_offset_t)off, zfs_putapage,
-		    flags, cr);
+		rl = zfs_range_lock(zp, io_off, UINT64_MAX, RL_WRITER);
+		error = pvn_vplist_dirty(vp, io_off, zfs_putapage, flags, cr);
 		goto out;
 	}
+	rl = zfs_range_lock(zp, io_off, io_len, RL_WRITER);
 
-	filesz = zp->z_phys->zp_size; /* get consistent copy of zp_size */
-	if (off > filesz) {
+	if (off > zp->z_phys->zp_size) {
 		/* past end of file */
+		zfs_range_unlock(rl);
 		ZFS_EXIT(zfsvfs);
 		return (0);
 	}
-	len = MIN(len, filesz - off);
+	len = MIN(io_len, P2ROUNDUP(zp->z_phys->zp_size, PAGESIZE) - io_off);
 
-	for (io_off = off; io_off < off + len; io_off += io_len) {
+	for (off = io_off; io_off < off + len; io_off += io_len) {
 		if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
 			pp = page_lookup(vp, io_off, (flags & (B_INVAL | B_FREE)) ?
SE_EXCL : SE_SHARED); @@ -3702,6 +3992,7 @@ zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr, } } out: + zfs_range_unlock(rl); if ((flags & B_ASYNC) == 0) zil_commit(zfsvfs->z_log, UINT64_MAX, zp->z_id); ZFS_EXIT(zfsvfs); @@ -3728,7 +4019,10 @@ zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) } mutex_enter(&zp->z_lock); - vp->v_count = 0; /* count arrives as 1 */ + mutex_enter(&vp->v_lock); + ASSERT(vp->v_count == 1); + vp->v_count = 0; + mutex_exit(&vp->v_lock); mutex_exit(&zp->z_lock); rw_exit(&zfsvfs->z_teardown_inactive_lock); zfs_znode_free(zp); @@ -3795,7 +4089,6 @@ zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset, { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; - int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); @@ -3810,15 +4103,16 @@ zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset, ZFS_EXIT(zfsvfs); return (EAGAIN); } - error = fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct); ZFS_EXIT(zfsvfs); - return (error); + return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct)); } /* * If we can't find a page in the cache, we will create a new page * and fill it with file data. For efficiency, we may try to fill - * multiple pages at once (klustering). + * multiple pages at once (klustering) to fill up the supplied page + * list. Note that the pages to be filled are held with an exclusive + * lock to prevent access by other threads while they are being filled. */ static int zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg, @@ -3827,57 +4121,28 @@ zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg, znode_t *zp = VTOZ(vp); page_t *pp, *cur_pp; objset_t *os = zp->z_zfsvfs->z_os; - caddr_t va; u_offset_t io_off, total; - uint64_t oid = zp->z_id; size_t io_len; - uint64_t filesz; int err; - /* - * If we are only asking for a single page don't bother klustering. - */ - filesz = zp->z_phys->zp_size; /* get consistent copy of zp_size */ - if (off >= filesz) - return (EFAULT); if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) { + /* + * We only have a single page, don't bother klustering + */ io_off = off; io_len = PAGESIZE; - pp = page_create_va(vp, io_off, io_len, PG_WAIT, seg, addr); + pp = page_create_va(vp, io_off, io_len, + PG_EXCL | PG_WAIT, seg, addr); } else { /* - * Try to fill a kluster of pages (a blocks worth). + * Try to find enough pages to fill the page list */ - size_t klen; - u_offset_t koff; - - if (!ISP2(zp->z_blksz)) { - /* Only one block in the file. */ - klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE); - koff = 0; - } else { - /* - * It would be ideal to align our offset to the - * blocksize but doing so has resulted in some - * strange application crashes. For now, we - * leave the offset as is and only adjust the - * length if we are off the end of the file. - */ - koff = off; - klen = plsz; - } - ASSERT(koff <= filesz); - if (koff + klen > filesz) - klen = P2ROUNDUP(filesz, (uint64_t)PAGESIZE) - koff; - ASSERT3U(off, >=, koff); - ASSERT3U(off, <, koff + klen); pp = pvn_read_kluster(vp, off, seg, addr, &io_off, - &io_len, koff, klen, 0); + &io_len, off, plsz, 0); } if (pp == NULL) { /* - * Some other thread entered the page before us. - * Return to zfs_getpage to retry the lookup. + * The page already exists, nothing to do here. 
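/*
 * Illustrative sketch (not part of the original change): the rewritten
 * zfs_putpage() above widens the request [off, off + len) outward to the
 * file block size before taking the range lock, so zfs_putapage() can
 * kluster whole blocks. The stand-alone program below only mirrors that
 * arithmetic; MY_P2ALIGN/MY_P2ROUNDUP/MY_ISP2 are local stand-ins for the
 * kernel macros and the sample numbers are made up.
 */
#include <stdio.h>
#include <stdint.h>

#define	MY_ISP2(x)		(((x) & ((x) - 1)) == 0)
#define	MY_P2ALIGN(x, a)	((x) & -(uint64_t)(a))		/* round down */
#define	MY_P2ROUNDUP(x, a)	((((x) - 1) | ((a) - 1)) + 1)	/* round up */

int
main(void)
{
	uint64_t off = 150000;		/* byte offset of the dirty range */
	uint64_t len = 10000;		/* length of the dirty range */
	uint64_t blksz = 131072;	/* file block size (power of two) */
	uint64_t io_off, io_len;

	if (MY_ISP2(blksz)) {
		io_off = MY_P2ALIGN(off, blksz);
		io_len = MY_P2ROUNDUP(len + (off - io_off), blksz);
	} else {
		io_off = 0;	/* non-power-of-2 block: lock from offset 0 */
		io_len = 0;	/* 0 means "walk the whole vnode list" */
	}
	(void) printf("lock [%llu, %llu)\n",
	    (unsigned long long)io_off,
	    (unsigned long long)(io_off + io_len));
	return (0);
}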
*/ *pl = NULL; return (0); @@ -3888,9 +4153,12 @@ zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg, */ cur_pp = pp; for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) { + caddr_t va; + ASSERT3U(io_off, ==, cur_pp->p_offset); va = zfs_map_page(cur_pp, S_WRITE); - err = dmu_read(os, oid, io_off, PAGESIZE, va); + err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va, + DMU_READ_PREFETCH); zfs_unmap_page(cur_pp, va); if (err) { /* On error, toss the entire kluster */ @@ -3902,15 +4170,14 @@ zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg, } cur_pp = cur_pp->p_next; } -out: + /* - * Fill in the page list array from the kluster. If - * there are too many pages in the kluster, return - * as many pages as possible starting from the desired - * offset `off'. + * Fill in the page list array from the kluster starting + * from the desired offset `off'. * NOTE: the page list will always be null terminated. */ pvn_plist_init(pp, pl, plsz, off, io_len, rw); + ASSERT(pl == NULL || (*pl)->p_offset == off); return (0); } @@ -3918,10 +4185,10 @@ zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg, /* * Return pointers to the pages for the file region [off, off + len] * in the pl array. If plsz is greater than len, this function may - * also return page pointers from before or after the specified - * region (i.e. some region [off', off' + plsz]). These additional - * pages are only returned if they are already in the cache, or were - * created as part of a klustered read. + * also return page pointers from after the specified region + * (i.e. the region [off, off + plsz]). These additional pages are + * only returned if they are already in the cache, or were created as + * part of a klustered read. * * IN: vp - vnode of file to get data from. * off - position in file to get data from. @@ -3950,9 +4217,17 @@ zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp, { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; - page_t *pp, **pl0 = pl; - int need_unlock = 0, err = 0; - offset_t orig_off; + page_t **pl0 = pl; + int err = 0; + + /* we do our own caching, faultahead is unnecessary */ + if (pl == NULL) + return (0); + else if (len > plsz) + len = plsz; + else + len = P2ROUNDUP(len, PAGESIZE); + ASSERT(plsz >= len); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); @@ -3960,104 +4235,51 @@ zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp, if (protp) *protp = PROT_ALL; - /* no faultahead (for now) */ - if (pl == NULL) { - ZFS_EXIT(zfsvfs); - return (0); - } - - /* can't fault past EOF */ - if (off >= zp->z_phys->zp_size) { - ZFS_EXIT(zfsvfs); - return (EFAULT); - } - orig_off = off; - - /* - * If we already own the lock, then we must be page faulting - * in the middle of a write to this file (i.e., we are writing - * to this file using data from a mapped region of the file). - */ - if (rw_owner(&zp->z_map_lock) != curthread) { - rw_enter(&zp->z_map_lock, RW_WRITER); - need_unlock = TRUE; - } - /* - * Loop through the requested range [off, off + len] looking + * Loop through the requested range [off, off + len) looking * for pages. If we don't find a page, we will need to create * a new page and fill it with data from the file. 
*/ while (len > 0) { - if (plsz < PAGESIZE) - break; - if (pp = page_lookup(vp, off, SE_SHARED)) { - *pl++ = pp; + if (*pl = page_lookup(vp, off, SE_SHARED)) + *(pl+1) = NULL; + else if (err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw)) + goto out; + while (*pl) { + ASSERT3U((*pl)->p_offset, ==, off); off += PAGESIZE; addr += PAGESIZE; - len -= PAGESIZE; - plsz -= PAGESIZE; - } else { - err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw); - if (err) - goto out; - /* - * klustering may have changed our region - * to be block aligned. - */ - if (((pp = *pl) != 0) && (off != pp->p_offset)) { - int delta = off - pp->p_offset; - len += delta; - off -= delta; - addr -= delta; - } - while (*pl) { - pl++; - off += PAGESIZE; - addr += PAGESIZE; - plsz -= PAGESIZE; - if (len > PAGESIZE) - len -= PAGESIZE; - else - len = 0; + if (len > 0) { + ASSERT3U(len, >=, PAGESIZE); + len -= PAGESIZE; } + ASSERT3U(plsz, >=, PAGESIZE); + plsz -= PAGESIZE; + pl++; } } /* * Fill out the page array with any pages already in the cache. */ - while (plsz > 0) { - pp = page_lookup_nowait(vp, off, SE_SHARED); - if (pp == NULL) - break; - *pl++ = pp; - off += PAGESIZE; - plsz -= PAGESIZE; + while (plsz > 0 && + (*pl++ = page_lookup_nowait(vp, off, SE_SHARED))) { + off += PAGESIZE; + plsz -= PAGESIZE; } - - ZFS_ACCESSTIME_STAMP(zfsvfs, zp); out: - /* - * We can't grab the range lock for the page as reader which would - * stop truncation as this leads to deadlock. So we need to recheck - * the file size. - */ - if (orig_off >= zp->z_phys->zp_size) - err = EFAULT; if (err) { /* * Release any pages we have previously locked. */ while (pl > pl0) page_unlock(*--pl); + } else { + ZFS_ACCESSTIME_STAMP(zfsvfs, zp); } *pl = NULL; - if (need_unlock) - rw_exit(&zp->z_map_lock); - ZFS_EXIT(zfsvfs); return (err); } @@ -4360,6 +4582,11 @@ zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, (vp->v_type == VREG || vp->v_type == VDIR); return (0); + case _PC_ACCESS_FILTERING: + *valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) && + vp->v_type == VDIR; + return (0); + case _PC_ACL_ENABLED: *valp = _ACL_ACE_ENABLED; return (0); @@ -4368,6 +4595,11 @@ zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, *valp = (ulong_t)SPA_MINBLOCKSIZE; return (0); + case _PC_TIMESTAMP_RESOLUTION: + /* nanosecond timestamp resolution */ + *valp = 1L; + return (0); + default: return (fs_pathconf(vp, cmd, valp, cr, ct)); } @@ -4408,6 +4640,161 @@ zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, return (error); } +/* + * Tunable, both must be a power of 2. 
+ * + * zcr_blksz_min: the smallest read we may consider to loan out an arcbuf + * zcr_blksz_max: if set to less than the file block size, allow loaning out of + * an arcbuf for a partial block read + */ +int zcr_blksz_min = (1 << 10); /* 1K */ +int zcr_blksz_max = (1 << 17); /* 128K */ + +/*ARGSUSED*/ +static int +zfs_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr, + caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int max_blksz = zfsvfs->z_max_blksz; + uio_t *uio = &xuio->xu_uio; + ssize_t size = uio->uio_resid; + offset_t offset = uio->uio_loffset; + int blksz; + int fullblk, i; + arc_buf_t *abuf; + ssize_t maxsize; + int preamble, postamble; + + if (xuio->xu_type != UIOTYPE_ZEROCOPY) + return (EINVAL); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + switch (ioflag) { + case UIO_WRITE: + /* + * Loan out an arc_buf for write if write size is bigger than + * max_blksz, and the file's block size is also max_blksz. + */ + blksz = max_blksz; + if (size < blksz || zp->z_blksz != blksz) { + ZFS_EXIT(zfsvfs); + return (EINVAL); + } + /* + * Caller requests buffers for write before knowing where the + * write offset might be (e.g. NFS TCP write). + */ + if (offset == -1) { + preamble = 0; + } else { + preamble = P2PHASE(offset, blksz); + if (preamble) { + preamble = blksz - preamble; + size -= preamble; + } + } + + postamble = P2PHASE(size, blksz); + size -= postamble; + + fullblk = size / blksz; + (void) dmu_xuio_init(xuio, + (preamble != 0) + fullblk + (postamble != 0)); + DTRACE_PROBE3(zfs_reqzcbuf_align, int, preamble, + int, postamble, int, + (preamble != 0) + fullblk + (postamble != 0)); + + /* + * Have to fix iov base/len for partial buffers. They + * currently represent full arc_buf's. + */ + if (preamble) { + /* data begins in the middle of the arc_buf */ + abuf = dmu_request_arcbuf(zp->z_dbuf, blksz); + ASSERT(abuf); + (void) dmu_xuio_add(xuio, abuf, + blksz - preamble, preamble); + } + + for (i = 0; i < fullblk; i++) { + abuf = dmu_request_arcbuf(zp->z_dbuf, blksz); + ASSERT(abuf); + (void) dmu_xuio_add(xuio, abuf, 0, blksz); + } + + if (postamble) { + /* data ends in the middle of the arc_buf */ + abuf = dmu_request_arcbuf(zp->z_dbuf, blksz); + ASSERT(abuf); + (void) dmu_xuio_add(xuio, abuf, 0, postamble); + } + break; + case UIO_READ: + /* + * Loan out an arc_buf for read if the read size is larger than + * the current file block size. Block alignment is not + * considered. Partial arc_buf will be loaned out for read. + */ + blksz = zp->z_blksz; + if (blksz < zcr_blksz_min) + blksz = zcr_blksz_min; + if (blksz > zcr_blksz_max) + blksz = zcr_blksz_max; + /* avoid potential complexity of dealing with it */ + if (blksz > max_blksz) { + ZFS_EXIT(zfsvfs); + return (EINVAL); + } + + maxsize = zp->z_phys->zp_size - uio->uio_loffset; + if (size > maxsize) + size = maxsize; + + if (size < blksz || vn_has_cached_data(vp)) { + ZFS_EXIT(zfsvfs); + return (EINVAL); + } + break; + default: + ZFS_EXIT(zfsvfs); + return (EINVAL); + } + + uio->uio_extflg = UIO_XUIO; + XUIO_XUZC_RW(xuio) = ioflag; + ZFS_EXIT(zfsvfs); + return (0); +} + +/*ARGSUSED*/ +static int +zfs_retzcbuf(vnode_t *vp, xuio_t *xuio, cred_t *cr, caller_context_t *ct) +{ + int i; + arc_buf_t *abuf; + int ioflag = XUIO_XUZC_RW(xuio); + + ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY); + + i = dmu_xuio_cnt(xuio); + while (i-- > 0) { + abuf = dmu_xuio_arcbuf(xuio, i); + /* + * if abuf == NULL, it must be a write buffer + * that has been returned in zfs_write(). 
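/*
 * Illustrative sketch (not part of the original change): the UIO_WRITE case
 * of zfs_reqzcbuf() above carves an unaligned request into an optional
 * "preamble" (up to the next block boundary), some number of full blocks,
 * and an optional "postamble" tail, loaning one arc_buf per piece. The
 * stand-alone program below reproduces only that arithmetic; MY_P2PHASE is
 * a local stand-in for the kernel macro and the offset/size values are
 * made up.
 */
#include <stdio.h>
#include <stdint.h>

#define	MY_P2PHASE(x, a)	((x) & ((a) - 1))	/* offset within block */

int
main(void)
{
	int64_t offset = 100 * 1024 + 512;	/* write offset */
	int64_t size = 300 * 1024;		/* write length */
	int64_t blksz = 128 * 1024;		/* file/max block size */
	int64_t preamble, postamble, fullblk;

	/* offset == -1 means "offset not known yet", so no preamble */
	preamble = (offset == -1) ? 0 : MY_P2PHASE(offset, blksz);
	if (preamble != 0) {
		preamble = blksz - preamble;	/* bytes to next boundary */
		size -= preamble;
	}
	postamble = MY_P2PHASE(size, blksz);	/* unaligned tail */
	size -= postamble;
	fullblk = size / blksz;

	(void) printf("preamble %lld, full blocks %lld, postamble %lld\n",
	    (long long)preamble, (long long)fullblk, (long long)postamble);
	return (0);
}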
+ */ + if (abuf) + dmu_return_arcbuf(abuf); + ASSERT(abuf || ioflag == UIO_WRITE); + } + + dmu_xuio_fini(xuio); + return (0); +} + /* * Predeclare these here so that the compiler assumes that * this is an "old style" function declaration that does @@ -4491,6 +4878,8 @@ const fs_operation_def_t zfs_fvnodeops_template[] = { VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr }, VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr }, VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, + VOPNAME_REQZCBUF, { .vop_reqzcbuf = zfs_reqzcbuf }, + VOPNAME_RETZCBUF, { .vop_retzcbuf = zfs_retzcbuf }, NULL, NULL }; @@ -4511,6 +4900,22 @@ const fs_operation_def_t zfs_symvnodeops_template[] = { NULL, NULL }; +/* + * special share hidden files vnode operations template + */ +vnodeops_t *zfs_sharevnodeops; +const fs_operation_def_t zfs_sharevnodeops_template[] = { + VOPNAME_GETATTR, { .vop_getattr = zfs_getattr }, + VOPNAME_ACCESS, { .vop_access = zfs_access }, + VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive }, + VOPNAME_FID, { .vop_fid = zfs_fid }, + VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf }, + VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr }, + VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr }, + VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, + NULL, NULL +}; + /* * Extended attribute directory vnode operations template * This template is identical to the directory vnodes diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_znode.c b/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_znode.c index 25751ae5f8541..1ff237e163cce 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_znode.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_znode.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -87,6 +87,12 @@ * (such as VFS logic) that will not compile easily in userland. */ #ifdef _KERNEL +/* + * Needed to close a small window in zfs_znode_move() that allows the zfsvfs to + * be freed before it can be safely accessed. 
+ */ +krwlock_t zfsvfs_lock; + static kmem_cache_t *znode_cache = NULL; /*ARGSUSED*/ @@ -117,7 +123,6 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) list_link_init(&zp->z_link_node); mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL); - rw_init(&zp->z_map_lock, NULL, RW_DEFAULT, NULL); rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL); rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL); mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL); @@ -128,6 +133,7 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) zp->z_dbuf = NULL; zp->z_dirlocks = NULL; + zp->z_acl_cached = NULL; return (0); } @@ -142,7 +148,6 @@ zfs_znode_cache_destructor(void *buf, void *arg) vn_free(ZTOV(zp)); ASSERT(!list_link_active(&zp->z_link_node)); mutex_destroy(&zp->z_lock); - rw_destroy(&zp->z_map_lock); rw_destroy(&zp->z_parent_lock); rw_destroy(&zp->z_name_lock); mutex_destroy(&zp->z_acl_lock); @@ -151,13 +156,15 @@ zfs_znode_cache_destructor(void *buf, void *arg) ASSERT(zp->z_dbuf == NULL); ASSERT(zp->z_dirlocks == NULL); + ASSERT(zp->z_acl_cached == NULL); } #ifdef ZNODE_STATS static struct { uint64_t zms_zfsvfs_invalid; + uint64_t zms_zfsvfs_recheck1; uint64_t zms_zfsvfs_unmounted; - uint64_t zms_zfsvfs_recheck_invalid; + uint64_t zms_zfsvfs_recheck2; uint64_t zms_obj_held; uint64_t zms_vnode_locked; uint64_t zms_not_only_dnlc; @@ -194,6 +201,15 @@ zfs_znode_move_impl(znode_t *ozp, znode_t *nzp) nzp->z_phys = ozp->z_phys; nzp->z_dbuf = ozp->z_dbuf; + /* + * Since this is just an idle znode and kmem is already dealing with + * memory pressure, release any cached ACL. + */ + if (ozp->z_acl_cached) { + zfs_acl_free(ozp->z_acl_cached); + ozp->z_acl_cached = NULL; + } + /* Update back pointers. */ (void) dmu_buf_update_user(nzp->z_dbuf, ozp, nzp, &nzp->z_phys, znode_evict_error); @@ -208,17 +224,6 @@ zfs_znode_move_impl(znode_t *ozp, znode_t *nzp) POINTER_INVALIDATE(&ozp->z_zfsvfs); } -/* - * Wrapper function for ZFS_ENTER that returns 0 if successful and otherwise - * returns a non-zero error code. - */ -static int -zfs_enter(zfsvfs_t *zfsvfs) -{ - ZFS_ENTER(zfsvfs); - return (0); -} - /*ARGSUSED*/ static kmem_cbrc_t zfs_znode_move(void *buf, void *newbuf, size_t size, void *arg) @@ -242,12 +247,32 @@ zfs_znode_move(void *buf, void *newbuf, size_t size, void *arg) } /* - * Ensure that the filesystem is not unmounted during the move. + * Close a small window in which it's possible that the filesystem could + * be unmounted and freed, and zfsvfs, though valid in the previous + * statement, could point to unrelated memory by the time we try to + * prevent the filesystem from being unmounted. + */ + rw_enter(&zfsvfs_lock, RW_WRITER); + if (zfsvfs != ozp->z_zfsvfs) { + rw_exit(&zfsvfs_lock); + ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck1); + return (KMEM_CBRC_DONT_KNOW); + } + + /* + * If the znode is still valid, then so is the file system. We know that + * no valid file system can be freed while we hold zfsvfs_lock, so we + * can safely ensure that the filesystem is not and will not be + * unmounted. The next statement is equivalent to ZFS_ENTER(). 
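/*
 * Illustrative sketch (not part of the original change): the comment above
 * describes re-validating a lock-free back pointer under a global lock
 * before pinning the object it refers to. The userland analogue below
 * shows the same shape with pthreads; the fs/node types and names are
 * invented for the example and do not exist in ZFS.
 */
#include <pthread.h>
#include <stddef.h>

struct fs {
	pthread_rwlock_t	fs_teardown;	/* analogue of z_teardown_lock */
	int			fs_unmounted;
};

struct node {
	struct fs	*n_fs;			/* analogue of z_zfsvfs */
};

/* No fs may be freed while this is held (analogue of zfsvfs_lock). */
static pthread_rwlock_t fs_global_lock = PTHREAD_RWLOCK_INITIALIZER;

/* Pin the filesystem a node belongs to, or return NULL if it is going away. */
struct fs *
node_pin_fs(struct node *n, struct fs *guess)
{
	struct fs *fs;

	(void) pthread_rwlock_wrlock(&fs_global_lock);
	fs = n->n_fs;
	if (fs != guess) {		/* pointer went stale: bail out */
		(void) pthread_rwlock_unlock(&fs_global_lock);
		return (NULL);
	}
	(void) pthread_rwlock_rdlock(&fs->fs_teardown);
	if (fs->fs_unmounted) {		/* fs is being torn down: bail out */
		(void) pthread_rwlock_unlock(&fs->fs_teardown);
		(void) pthread_rwlock_unlock(&fs_global_lock);
		return (NULL);
	}
	(void) pthread_rwlock_unlock(&fs_global_lock);
	return (fs);			/* caller later drops fs_teardown */
}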
*/ - if (zfs_enter(zfsvfs) != 0) { /* ZFS_ENTER */ + rrw_enter(&zfsvfs->z_teardown_lock, RW_READER, FTAG); + if (zfsvfs->z_unmounted) { + ZFS_EXIT(zfsvfs); + rw_exit(&zfsvfs_lock); ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_unmounted); return (KMEM_CBRC_DONT_KNOW); } + rw_exit(&zfsvfs_lock); mutex_enter(&zfsvfs->z_znodes_lock); /* @@ -257,7 +282,7 @@ zfs_znode_move(void *buf, void *newbuf, size_t size, void *arg) if (zfsvfs != ozp->z_zfsvfs) { mutex_exit(&zfsvfs->z_znodes_lock); ZFS_EXIT(zfsvfs); - ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck_invalid); + ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck2); return (KMEM_CBRC_DONT_KNOW); } @@ -313,6 +338,7 @@ zfs_znode_init(void) /* * Initialize zcache */ + rw_init(&zfsvfs_lock, NULL, RW_DEFAULT, NULL); ASSERT(znode_cache == NULL); znode_cache = kmem_cache_create("zfs_znode_cache", sizeof (znode_t), 0, zfs_znode_cache_constructor, @@ -334,6 +360,7 @@ zfs_znode_fini(void) if (znode_cache) kmem_cache_destroy(znode_cache); znode_cache = NULL; + rw_destroy(&zfsvfs_lock); } struct vnodeops *zfs_dvnodeops; @@ -341,6 +368,7 @@ struct vnodeops *zfs_fvnodeops; struct vnodeops *zfs_symvnodeops; struct vnodeops *zfs_xdvnodeops; struct vnodeops *zfs_evnodeops; +struct vnodeops *zfs_sharevnodeops; void zfs_remove_op_tables() @@ -365,12 +393,15 @@ zfs_remove_op_tables() vn_freevnodeops(zfs_xdvnodeops); if (zfs_evnodeops) vn_freevnodeops(zfs_evnodeops); + if (zfs_sharevnodeops) + vn_freevnodeops(zfs_sharevnodeops); zfs_dvnodeops = NULL; zfs_fvnodeops = NULL; zfs_symvnodeops = NULL; zfs_xdvnodeops = NULL; zfs_evnodeops = NULL; + zfs_sharevnodeops = NULL; } extern const fs_operation_def_t zfs_dvnodeops_template[]; @@ -378,6 +409,7 @@ extern const fs_operation_def_t zfs_fvnodeops_template[]; extern const fs_operation_def_t zfs_xdvnodeops_template[]; extern const fs_operation_def_t zfs_symvnodeops_template[]; extern const fs_operation_def_t zfs_evnodeops_template[]; +extern const fs_operation_def_t zfs_sharevnodeops_template[]; int zfs_create_op_tables() @@ -414,103 +446,58 @@ zfs_create_op_tables() error = vn_make_ops(MNTTYPE_ZFS, zfs_evnodeops_template, &zfs_evnodeops); + if (error) + return (error); + + error = vn_make_ops(MNTTYPE_ZFS, zfs_sharevnodeops_template, + &zfs_sharevnodeops); return (error); } -/* - * zfs_init_fs - Initialize the zfsvfs struct and the file system - * incore "master" object. Verify version compatibility. - */ int -zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp) +zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx) { - extern int zfsfstype; - - objset_t *os = zfsvfs->z_os; - int i, error; - uint64_t fsid_guid; - uint64_t zval; - - *zpp = NULL; - - error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version); - if (error) { - return (error); - } else if (zfsvfs->z_version > ZPL_VERSION) { - (void) printf("Mismatched versions: File system " - "is version %llu on-disk format, which is " - "incompatible with this software version %lld!", - (u_longlong_t)zfsvfs->z_version, ZPL_VERSION); - return (ENOTSUP); - } - - if ((error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &zval)) != 0) - return (error); - zfsvfs->z_norm = (int)zval; - if ((error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &zval)) != 0) - return (error); - zfsvfs->z_utf8 = (zval != 0); - if ((error = zfs_get_zplprop(os, ZFS_PROP_CASE, &zval)) != 0) - return (error); - zfsvfs->z_case = (uint_t)zval; - /* - * Fold case on file systems that are always or sometimes case - * insensitive. 
- */ - if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE || - zfsvfs->z_case == ZFS_CASE_MIXED) - zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; + zfs_acl_ids_t acl_ids; + vattr_t vattr; + znode_t *sharezp; + vnode_t *vp; + znode_t *zp; + int error; - /* - * The fsid is 64 bits, composed of an 8-bit fs type, which - * separates our fsid from any other filesystem types, and a - * 56-bit objset unique ID. The objset unique ID is unique to - * all objsets open on this system, provided by unique_create(). - * The 8-bit fs type must be put in the low bits of fsid[1] - * because that's where other Solaris filesystems put it. - */ - fsid_guid = dmu_objset_fsid_guid(os); - ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0); - zfsvfs->z_vfs->vfs_fsid.val[0] = fsid_guid; - zfsvfs->z_vfs->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) | - zfsfstype & 0xFF; - - error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, - &zfsvfs->z_root); - if (error) - return (error); - ASSERT(zfsvfs->z_root != 0); + vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE; + vattr.va_type = VDIR; + vattr.va_mode = S_IFDIR|0555; + vattr.va_uid = crgetuid(kcred); + vattr.va_gid = crgetgid(kcred); - error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, - &zfsvfs->z_unlinkedobj); - if (error) - return (error); + sharezp = kmem_cache_alloc(znode_cache, KM_SLEEP); + sharezp->z_unlinked = 0; + sharezp->z_atime_dirty = 0; + sharezp->z_zfsvfs = zfsvfs; - /* - * Initialize zget mutex's - */ - for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) - mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); + vp = ZTOV(sharezp); + vn_reinit(vp); + vp->v_type = VDIR; - error = zfs_zget(zfsvfs, zfsvfs->z_root, zpp); - if (error) { - /* - * On error, we destroy the mutexes here since it's not - * possible for the caller to determine if the mutexes were - * initialized properly. 
- */ - for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) - mutex_destroy(&zfsvfs->z_hold_mtx[i]); - return (error); - } - ASSERT3U((*zpp)->z_id, ==, zfsvfs->z_root); - error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, - &zfsvfs->z_fuid_obj); - if (error == ENOENT) - error = 0; + VERIFY(0 == zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr, + kcred, NULL, &acl_ids)); + zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE, + &zp, 0, &acl_ids); + ASSERT3P(zp, ==, sharezp); + ASSERT(!vn_in_dnlc(ZTOV(sharezp))); /* not valid to move */ + POINTER_INVALIDATE(&sharezp->z_zfsvfs); + error = zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, + ZFS_SHARES_DIR, 8, 1, &sharezp->z_id, tx); + zfsvfs->z_shares_dir = sharezp->z_id; + + zfs_acl_ids_free(&acl_ids); + ZTOV(sharezp)->v_count = 0; + dmu_buf_rele(sharezp->z_dbuf, NULL); + sharezp->z_dbuf = NULL; + kmem_cache_free(znode_cache, sharezp); - return (0); + return (error); } /* @@ -581,6 +568,7 @@ zfs_znode_dmu_init(zfsvfs_t *zfsvfs, znode_t *zp, dmu_buf_t *db) mutex_enter(&zp->z_lock); ASSERT(zp->z_dbuf == NULL); + ASSERT(zp->z_acl_cached == NULL); zp->z_dbuf = db; nzp = dmu_buf_set_user_ie(db, zp, &zp->z_phys, znode_evict_error); @@ -678,7 +666,10 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz) break; case VREG: vp->v_flag |= VMODSORT; - vn_setops(vp, zfs_fvnodeops); + if (zp->z_phys->zp_parent == zfsvfs->z_shares_dir) + vn_setops(vp, zfs_sharevnodeops); + else + vn_setops(vp, zfs_fvnodeops); break; case VLNK: vn_setops(vp, zfs_symvnodeops); @@ -712,7 +703,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz) * flag - flags: * IS_ROOT_NODE - new object will be root * IS_XATTR - new object is an attribute - * IS_REPLAY - intent log replay * bonuslen - length of bonus buffer * setaclp - File/Dir initial ACL * fuidp - Tracks fuid allocation. @@ -722,8 +712,7 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz) */ void zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, - uint_t flag, znode_t **zpp, int bonuslen, zfs_acl_t *setaclp, - zfs_fuid_info_t **fuidp) + uint_t flag, znode_t **zpp, int bonuslen, zfs_acl_ids_t *acl_ids) { dmu_buf_t *db; znode_phys_t *pzp; @@ -734,9 +723,8 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE)); - if (zfsvfs->z_assign >= TXG_INITIAL) { /* ZIL replay */ + if (zfsvfs->z_replay) { obj = vap->va_nodeid; - flag |= IS_REPLAY; now = vap->va_ctime; /* see zfs_replay_create() */ gen = vap->va_nblocks; /* ditto */ } else { @@ -755,7 +743,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, * assertions below. 
*/ if (vap->va_type == VDIR) { - if (flag & IS_REPLAY) { + if (zfsvfs->z_replay) { err = zap_create_claim_norm(zfsvfs->z_os, obj, zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); @@ -766,7 +754,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); } } else { - if (flag & IS_REPLAY) { + if (zfsvfs->z_replay) { err = dmu_object_claim(zfsvfs->z_os, obj, DMU_OT_PLAIN_FILE_CONTENTS, 0, DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); @@ -777,6 +765,8 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); } } + + ZFS_OBJ_HOLD_ENTER(zfsvfs, obj); VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, obj, NULL, &db)); dmu_buf_will_dirty(db, tx); @@ -835,12 +825,11 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, } else { ZFS_TIME_ENCODE(&now, pzp->zp_mtime); } - - pzp->zp_mode = MAKEIMODE(vap->va_type, vap->va_mode); + pzp->zp_uid = acl_ids->z_fuid; + pzp->zp_gid = acl_ids->z_fgid; + pzp->zp_mode = acl_ids->z_mode; if (!(flag & IS_ROOT_NODE)) { - ZFS_OBJ_HOLD_ENTER(zfsvfs, obj); *zpp = zfs_znode_alloc(zfsvfs, db, 0); - ZFS_OBJ_HOLD_EXIT(zfsvfs, obj); } else { /* * If we are creating the root node, the "parent" we @@ -848,7 +837,11 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, */ *zpp = dzp; } - zfs_perm_init(*zpp, dzp, flag, vap, tx, cr, setaclp, fuidp); + VERIFY(0 == zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx)); + if (vap->va_mask & AT_XVATTR) + zfs_xvattr_set(*zpp, (xvattr_t *)vap); + + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj); } void @@ -914,6 +907,10 @@ zfs_xvattr_set(znode_t *zp, xvattr_t *xvap) zp->z_phys->zp_flags |= ZFS_BONUS_SCANSTAMP; XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP); } + if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { + ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse); + XVA_SET_RTN(xvap, XAT_REPARSE); + } } int @@ -968,11 +965,25 @@ zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp) /* * Not found create new znode/vnode + * but only if file exists. + * + * There is a small window where zfs_vget() could + * find this object while a file create is still in + * progress. Since a gen number can never be zero + * we will check that to determine if its an allocated + * file. 
*/ - zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size); + + if (((znode_phys_t *)db->db_data)->zp_gen != 0) { + zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size); + *zpp = zp; + err = 0; + } else { + dmu_buf_rele(db, NULL); + err = ENOENT; + } ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); - *zpp = zp; - return (0); + return (err); } int @@ -1006,6 +1017,13 @@ zfs_rezget(znode_t *zp) return (EIO); } + mutex_enter(&zp->z_acl_lock); + if (zp->z_acl_cached) { + zfs_acl_free(zp->z_acl_cached); + zp->z_acl_cached = NULL; + } + mutex_exit(&zp->z_acl_lock); + zfs_znode_dmu_init(zfsvfs, zp, db); zp->z_unlinked = (zp->z_phys->zp_links == 0); zp->z_blksz = doi.doi_data_block_size; @@ -1098,6 +1116,11 @@ zfs_znode_free(znode_t *zp) list_remove(&zfsvfs->z_all_znodes, zp); mutex_exit(&zfsvfs->z_znodes_lock); + if (zp->z_acl_cached) { + zfs_acl_free(zp->z_acl_cached); + zp->z_acl_cached = NULL; + } + kmem_cache_free(znode_cache, zp); VFS_RELE(zfsvfs->z_vfs); @@ -1254,9 +1277,9 @@ zfs_extend(znode_t *zp, uint64_t end) newblksz = 0; } - error = dmu_tx_assign(tx, zfsvfs->z_assign); + error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; @@ -1358,9 +1381,9 @@ zfs_trunc(znode_t *zp, uint64_t end) top: tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_bonus(tx, zp->z_id); - error = dmu_tx_assign(tx, zfsvfs->z_assign); + error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; @@ -1375,15 +1398,12 @@ zfs_trunc(znode_t *zp, uint64_t end) dmu_tx_commit(tx); - zfs_range_unlock(rl); - /* * Clear any mapped pages in the truncated region. This has to * happen outside of the transaction to avoid the possibility of * a deadlock with someone trying to push a page that we are * about to invalidate. */ - rw_enter(&zp->z_map_lock, RW_WRITER); if (vn_has_cached_data(vp)) { page_t *pp; uint64_t start = end & PAGEMASK; @@ -1401,7 +1421,8 @@ zfs_trunc(znode_t *zp, uint64_t end) B_INVAL | B_TRUNC, NULL); ASSERT(error == 0); } - rw_exit(&zp->z_map_lock); + + zfs_range_unlock(rl); return (0); } @@ -1456,9 +1477,9 @@ zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log) log: tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_bonus(tx, zp->z_id); - error = dmu_tx_assign(tx, zfsvfs->z_assign); + error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto log; @@ -1478,15 +1499,17 @@ void zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) { zfsvfs_t zfsvfs; - uint64_t moid, doid, version; + uint64_t moid, obj, version; uint64_t sense = ZFS_CASE_SENSITIVE; uint64_t norm = 0; nvpair_t *elem; int error; + int i; znode_t *rootzp = NULL; vnode_t *vp; vattr_t vattr; znode_t *zp; + zfs_acl_ids_t acl_ids; /* * First attempt to create master node. @@ -1503,12 +1526,12 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) /* * Set starting attributes. 
*/ - if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID) + if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_USERSPACE) version = ZPL_VERSION; + else if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID) + version = ZPL_VERSION_USERSPACE - 1; else version = ZPL_VERSION_FUID - 1; - error = zap_update(os, moid, ZPL_VERSION_STR, - 8, 1, &version, tx); elem = NULL; while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) { /* For the moment we expect all zpl props to be uint64_ts */ @@ -1519,9 +1542,8 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) VERIFY(nvpair_value_uint64(elem, &val) == 0); name = nvpair_name(elem); if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) { - version = val; - error = zap_update(os, moid, ZPL_VERSION_STR, - 8, 1, &version, tx); + if (val < version) + version = val; } else { error = zap_update(os, moid, name, 8, 1, &val, tx); } @@ -1532,13 +1554,14 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) sense = val; } ASSERT(version != 0); + error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx); /* * Create a delete queue. */ - doid = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx); + obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx); - error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &doid, tx); + error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx); ASSERT(error == 0); /* @@ -1562,7 +1585,6 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) bzero(&zfsvfs, sizeof (zfsvfs_t)); zfsvfs.z_os = os; - zfsvfs.z_assign = TXG_NOWAIT; zfsvfs.z_parent = &zfsvfs; zfsvfs.z_version = version; zfsvfs.z_use_fuids = USE_FUIDS(version, os); @@ -1578,19 +1600,36 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) list_create(&zfsvfs.z_all_znodes, sizeof (znode_t), offsetof(znode_t, z_link_node)); + for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) + mutex_init(&zfsvfs.z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); + ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs)); rootzp->z_zfsvfs = &zfsvfs; - zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, 0, NULL, NULL); + VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr, + cr, NULL, &acl_ids)); + zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, 0, &acl_ids); ASSERT3P(zp, ==, rootzp); ASSERT(!vn_in_dnlc(ZTOV(rootzp))); /* not valid to move */ error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx); ASSERT(error == 0); + zfs_acl_ids_free(&acl_ids); POINTER_INVALIDATE(&rootzp->z_zfsvfs); ZTOV(rootzp)->v_count = 0; dmu_buf_rele(rootzp->z_dbuf, NULL); rootzp->z_dbuf = NULL; kmem_cache_free(znode_cache, rootzp); + + /* + * Create shares directory + */ + + error = zfs_create_share_dir(&zfsvfs, tx); + + ASSERT(error == 0); + + for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) + mutex_destroy(&zfsvfs.z_hold_mtx[i]); } #endif /* _KERNEL */ diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/zil.c b/external/cddl/osnet/dist/uts/common/fs/zfs/zil.c index 043cdb12f33a5..d5459465b9eea 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/zil.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/zil.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -76,11 +76,17 @@ boolean_t zfs_nocacheflush = B_FALSE; static kmem_cache_t *zil_lwb_cache; +static boolean_t zil_empty(zilog_t *zilog); + +#define LWB_EMPTY(lwb) ((BP_GET_LSIZE(&lwb->lwb_blk) - \ + sizeof (zil_chain_t)) == (lwb->lwb_sz - lwb->lwb_nused)) + + static int -zil_dva_compare(const void *x1, const void *x2) +zil_bp_compare(const void *x1, const void *x2) { - const dva_t *dva1 = x1; - const dva_t *dva2 = x2; + const dva_t *dva1 = &((zil_bp_node_t *)x1)->zn_dva; + const dva_t *dva2 = &((zil_bp_node_t *)x2)->zn_dva; if (DVA_GET_VDEV(dva1) < DVA_GET_VDEV(dva2)) return (-1); @@ -96,34 +102,37 @@ zil_dva_compare(const void *x1, const void *x2) } static void -zil_dva_tree_init(avl_tree_t *t) +zil_bp_tree_init(zilog_t *zilog) { - avl_create(t, zil_dva_compare, sizeof (zil_dva_node_t), - offsetof(zil_dva_node_t, zn_node)); + avl_create(&zilog->zl_bp_tree, zil_bp_compare, + sizeof (zil_bp_node_t), offsetof(zil_bp_node_t, zn_node)); } static void -zil_dva_tree_fini(avl_tree_t *t) +zil_bp_tree_fini(zilog_t *zilog) { - zil_dva_node_t *zn; + avl_tree_t *t = &zilog->zl_bp_tree; + zil_bp_node_t *zn; void *cookie = NULL; while ((zn = avl_destroy_nodes(t, &cookie)) != NULL) - kmem_free(zn, sizeof (zil_dva_node_t)); + kmem_free(zn, sizeof (zil_bp_node_t)); avl_destroy(t); } -static int -zil_dva_tree_add(avl_tree_t *t, dva_t *dva) +int +zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp) { - zil_dva_node_t *zn; + avl_tree_t *t = &zilog->zl_bp_tree; + const dva_t *dva = BP_IDENTITY(bp); + zil_bp_node_t *zn; avl_index_t where; if (avl_find(t, dva, &where) != NULL) return (EEXIST); - zn = kmem_alloc(sizeof (zil_dva_node_t), KM_SLEEP); + zn = kmem_alloc(sizeof (zil_bp_node_t), KM_SLEEP); zn->zn_dva = *dva; avl_insert(t, zn, where); @@ -148,35 +157,31 @@ zil_init_log_chain(zilog_t *zilog, blkptr_t *bp) } /* - * Read a log block, make sure it's valid, and byteswap it if necessary. + * Read a log block and make sure it's valid. */ static int -zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, arc_buf_t **abufpp) +zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst, + char **end) { - blkptr_t blk = *bp; - zbookmark_t zb; + enum zio_flag zio_flags = ZIO_FLAG_CANFAIL; uint32_t aflags = ARC_WAIT; + arc_buf_t *abuf = NULL; + zbookmark_t zb; int error; - zb.zb_objset = bp->blk_cksum.zc_word[ZIL_ZC_OBJSET]; - zb.zb_object = 0; - zb.zb_level = -1; - zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ]; + if (zilog->zl_header->zh_claim_txg == 0) + zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB; - *abufpp = NULL; + if (!(zilog->zl_header->zh_flags & ZIL_CLAIM_LR_SEQ_VALID)) + zio_flags |= ZIO_FLAG_SPECULATIVE; - /* - * We shouldn't be doing any scrubbing while we're doing log - * replay, it's OK to not lock. 
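/*
 * Illustrative sketch (not part of the original change): zil_bp_tree_add()
 * above keys its AVL tree on a block's first DVA so that a log block seen
 * twice during claim or free is reported as EEXIST. The comparator below
 * mirrors that (vdev, offset) ordering with an invented sketch_dva struct
 * and made-up sample values.
 */
#include <stdio.h>
#include <stdint.h>

struct sketch_dva {
	uint64_t	d_vdev;		/* which top-level vdev */
	uint64_t	d_offset;	/* byte offset within that vdev */
};

static int
sketch_dva_compare(const struct sketch_dva *d1, const struct sketch_dva *d2)
{
	if (d1->d_vdev < d2->d_vdev)
		return (-1);
	if (d1->d_vdev > d2->d_vdev)
		return (1);
	if (d1->d_offset < d2->d_offset)
		return (-1);
	if (d1->d_offset > d2->d_offset)
		return (1);
	return (0);			/* same block: caller reports EEXIST */
}

int
main(void)
{
	struct sketch_dva a = { 0, 4096 };
	struct sketch_dva b = { 1, 0 };

	(void) printf("a vs b: %d, a vs a: %d\n",
	    sketch_dva_compare(&a, &b), sketch_dva_compare(&a, &a));
	return (0);
}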
- */ - error = arc_read_nolock(NULL, zilog->zl_spa, &blk, - arc_getbuf_func, abufpp, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | - ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB, &aflags, &zb); + SET_BOOKMARK(&zb, bp->blk_cksum.zc_word[ZIL_ZC_OBJSET], + ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); + + error = arc_read_nolock(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf, + ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); if (error == 0) { - char *data = (*abufpp)->b_data; - uint64_t blksz = BP_GET_LSIZE(bp); - zil_trailer_t *ztp = (zil_trailer_t *)(data + blksz) - 1; zio_cksum_t cksum = bp->blk_cksum; /* @@ -189,43 +194,102 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, arc_buf_t **abufpp) */ cksum.zc_word[ZIL_ZC_SEQ]++; - if (bcmp(&cksum, &ztp->zit_next_blk.blk_cksum, - sizeof (cksum)) || BP_IS_HOLE(&ztp->zit_next_blk) || - (ztp->zit_nused > (blksz - sizeof (zil_trailer_t)))) { - error = ECKSUM; - } + if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) { + zil_chain_t *zilc = abuf->b_data; + char *lr = (char *)(zilc + 1); + uint64_t len = zilc->zc_nused - sizeof (zil_chain_t); - if (error) { - VERIFY(arc_buf_remove_ref(*abufpp, abufpp) == 1); - *abufpp = NULL; + if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum, + sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) { + error = ECKSUM; + } else { + bcopy(lr, dst, len); + *end = (char *)dst + len; + *nbp = zilc->zc_next_blk; + } + } else { + char *lr = abuf->b_data; + uint64_t size = BP_GET_LSIZE(bp); + zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1; + + if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum, + sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) || + (zilc->zc_nused > (size - sizeof (*zilc)))) { + error = ECKSUM; + } else { + bcopy(lr, dst, zilc->zc_nused); + *end = (char *)dst + zilc->zc_nused; + *nbp = zilc->zc_next_blk; + } } + + VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1); + } + + return (error); +} + +/* + * Read a TX_WRITE log data block. + */ +static int +zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf) +{ + enum zio_flag zio_flags = ZIO_FLAG_CANFAIL; + const blkptr_t *bp = &lr->lr_blkptr; + uint32_t aflags = ARC_WAIT; + arc_buf_t *abuf = NULL; + zbookmark_t zb; + int error; + + if (BP_IS_HOLE(bp)) { + if (wbuf != NULL) + bzero(wbuf, MAX(BP_GET_LSIZE(bp), lr->lr_length)); + return (0); } - dprintf("error %d on %llu:%llu\n", error, zb.zb_objset, zb.zb_blkid); + if (zilog->zl_header->zh_claim_txg == 0) + zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB; + + SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid, + ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); + + error = arc_read_nolock(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf, + ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); + + if (error == 0) { + if (wbuf != NULL) + bcopy(abuf->b_data, wbuf, arc_buf_size(abuf)); + (void) arc_buf_remove_ref(abuf, &abuf); + } return (error); } /* * Parse the intent log, and call parse_func for each valid record within. - * Return the highest sequence number. */ -uint64_t +int zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg) { const zil_header_t *zh = zilog->zl_header; - uint64_t claim_seq = zh->zh_claim_seq; - uint64_t seq = 0; - uint64_t max_seq = 0; - blkptr_t blk = zh->zh_log; - arc_buf_t *abuf; + boolean_t claimed = !!zh->zh_claim_txg; + uint64_t claim_blk_seq = claimed ? zh->zh_claim_blk_seq : UINT64_MAX; + uint64_t claim_lr_seq = claimed ? 
zh->zh_claim_lr_seq : UINT64_MAX; + uint64_t max_blk_seq = 0; + uint64_t max_lr_seq = 0; + uint64_t blk_count = 0; + uint64_t lr_count = 0; + blkptr_t blk, next_blk; char *lrbuf, *lrp; - zil_trailer_t *ztp; - int reclen, error; + int error = 0; - if (BP_IS_HOLE(&blk)) - return (max_seq); + /* + * Old logs didn't record the maximum zh_claim_lr_seq. + */ + if (!(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID)) + claim_lr_seq = UINT64_MAX; /* * Starting at the block pointed to by zh_log we read the log chain. @@ -236,105 +300,156 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, * If the log has been claimed, stop if we encounter a sequence * number greater than the highest claimed sequence number. */ - zil_dva_tree_init(&zilog->zl_dva_tree); - for (;;) { - seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ]; - - if (claim_seq != 0 && seq > claim_seq) - break; + lrbuf = zio_buf_alloc(SPA_MAXBLOCKSIZE); + zil_bp_tree_init(zilog); - ASSERT(max_seq < seq); - max_seq = seq; + for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) { + uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ]; + int reclen; + char *end; - error = zil_read_log_block(zilog, &blk, &abuf); + if (blk_seq > claim_blk_seq) + break; + if ((error = parse_blk_func(zilog, &blk, arg, txg)) != 0) + break; + ASSERT3U(max_blk_seq, <, blk_seq); + max_blk_seq = blk_seq; + blk_count++; - if (parse_blk_func != NULL) - parse_blk_func(zilog, &blk, arg, txg); + if (max_lr_seq == claim_lr_seq && max_blk_seq == claim_blk_seq) + break; + error = zil_read_log_block(zilog, &blk, &next_blk, lrbuf, &end); if (error) break; - lrbuf = abuf->b_data; - ztp = (zil_trailer_t *)(lrbuf + BP_GET_LSIZE(&blk)) - 1; - blk = ztp->zit_next_blk; - - if (parse_lr_func == NULL) { - VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1); - continue; - } - - for (lrp = lrbuf; lrp < lrbuf + ztp->zit_nused; lrp += reclen) { + for (lrp = lrbuf; lrp < end; lrp += reclen) { lr_t *lr = (lr_t *)lrp; reclen = lr->lrc_reclen; ASSERT3U(reclen, >=, sizeof (lr_t)); - parse_lr_func(zilog, lr, arg, txg); + if (lr->lrc_seq > claim_lr_seq) + goto done; + if ((error = parse_lr_func(zilog, lr, arg, txg)) != 0) + goto done; + ASSERT3U(max_lr_seq, <, lr->lrc_seq); + max_lr_seq = lr->lrc_seq; + lr_count++; } - VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1); } - zil_dva_tree_fini(&zilog->zl_dva_tree); +done: + zilog->zl_parse_error = error; + zilog->zl_parse_blk_seq = max_blk_seq; + zilog->zl_parse_lr_seq = max_lr_seq; + zilog->zl_parse_blk_count = blk_count; + zilog->zl_parse_lr_count = lr_count; + + ASSERT(!claimed || !(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID) || + (max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq)); - return (max_seq); + zil_bp_tree_fini(zilog); + zio_buf_free(lrbuf, SPA_MAXBLOCKSIZE); + + return (error); } -/* ARGSUSED */ -static void +static int zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg) { - spa_t *spa = zilog->zl_spa; - int err; - /* * Claim log block if not already committed and not already claimed. + * If tx == NULL, just verify that the block is claimable. */ - if (bp->blk_birth >= first_txg && - zil_dva_tree_add(&zilog->zl_dva_tree, BP_IDENTITY(bp)) == 0) { - err = zio_wait(zio_claim(NULL, spa, first_txg, bp, NULL, NULL, - ZIO_FLAG_MUSTSUCCEED)); - ASSERT(err == 0); - } + if (bp->blk_birth < first_txg || zil_bp_tree_add(zilog, bp) != 0) + return (0); + + return (zio_wait(zio_claim(NULL, zilog->zl_spa, + tx == NULL ? 
0 : first_txg, bp, spa_claim_notify, NULL, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB))); } -static void +static int zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg) { - if (lrc->lrc_txtype == TX_WRITE) { - lr_write_t *lr = (lr_write_t *)lrc; - zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg); - } + lr_write_t *lr = (lr_write_t *)lrc; + int error; + + if (lrc->lrc_txtype != TX_WRITE) + return (0); + + /* + * If the block is not readable, don't claim it. This can happen + * in normal operation when a log block is written to disk before + * some of the dmu_sync() blocks it points to. In this case, the + * transaction cannot have been committed to anyone (we would have + * waited for all writes to be stable first), so it is semantically + * correct to declare this the end of the log. + */ + if (lr->lr_blkptr.blk_birth >= first_txg && + (error = zil_read_log_data(zilog, lr, NULL)) != 0) + return (error); + return (zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg)); } /* ARGSUSED */ -static void +static int zil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg) { - zio_free_blk(zilog->zl_spa, bp, dmu_tx_get_txg(tx)); + zio_free_zil(zilog->zl_spa, dmu_tx_get_txg(tx), bp); + + return (0); } -static void +static int zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg) { + lr_write_t *lr = (lr_write_t *)lrc; + blkptr_t *bp = &lr->lr_blkptr; + /* * If we previously claimed it, we need to free it. */ - if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE) { - lr_write_t *lr = (lr_write_t *)lrc; - blkptr_t *bp = &lr->lr_blkptr; - if (bp->blk_birth >= claim_txg && - !zil_dva_tree_add(&zilog->zl_dva_tree, BP_IDENTITY(bp))) { - (void) arc_free(NULL, zilog->zl_spa, - dmu_tx_get_txg(tx), bp, NULL, NULL, ARC_WAIT); - } + if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE && + bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0) + zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp); + + return (0); +} + +static lwb_t * +zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, uint64_t txg) +{ + lwb_t *lwb; + + lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP); + lwb->lwb_zilog = zilog; + lwb->lwb_blk = *bp; + lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp)); + lwb->lwb_max_txg = txg; + lwb->lwb_zio = NULL; + lwb->lwb_tx = NULL; + if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) { + lwb->lwb_nused = sizeof (zil_chain_t); + lwb->lwb_sz = BP_GET_LSIZE(bp); + } else { + lwb->lwb_nused = 0; + lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t); } + + mutex_enter(&zilog->zl_lock); + list_insert_tail(&zilog->zl_lwb_list, lwb); + mutex_exit(&zilog->zl_lock); + + return (lwb); } /* * Create an on-disk intent log. */ -static void +static lwb_t * zil_create(zilog_t *zilog) { const zil_header_t *zh = zilog->zl_header; - lwb_t *lwb; + lwb_t *lwb = NULL; uint64_t txg = 0; dmu_tx_t *tx = NULL; blkptr_t blk; @@ -351,22 +466,23 @@ zil_create(zilog_t *zilog) blk = zh->zh_log; /* - * If we don't already have an initial log block or we have one - * but it's the wrong endianness then allocate one. 
+	 * Allocate an initial log block if:
+	 *    - there isn't one already
+	 *    - the existing block is the wrong endianness
 	 */
 	if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) {
 		tx = dmu_tx_create(zilog->zl_os);
-		(void) dmu_tx_assign(tx, TXG_WAIT);
+		VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0);
 		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
 		txg = dmu_tx_get_txg(tx);
 
 		if (!BP_IS_HOLE(&blk)) {
-			zio_free_blk(zilog->zl_spa, &blk, txg);
+			zio_free_zil(zilog->zl_spa, txg, &blk);
 			BP_ZERO(&blk);
 		}
 
-		error = zio_alloc_blk(zilog->zl_spa, ZIL_MIN_BLKSZ, &blk,
-		    NULL, txg);
+		error = zio_alloc_zil(zilog->zl_spa, txg, &blk, NULL,
+		    ZIL_MIN_BLKSZ, zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
 
 		if (error == 0)
 			zil_init_log_chain(zilog, &blk);
@@ -375,20 +491,8 @@ zil_create(zilog_t *zilog)
 	/*
 	 * Allocate a log write buffer (lwb) for the first log block.
 	 */
-	if (error == 0) {
-		lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
-		lwb->lwb_zilog = zilog;
-		lwb->lwb_blk = blk;
-		lwb->lwb_nused = 0;
-		lwb->lwb_sz = BP_GET_LSIZE(&lwb->lwb_blk);
-		lwb->lwb_buf = zio_buf_alloc(lwb->lwb_sz);
-		lwb->lwb_max_txg = txg;
-		lwb->lwb_zio = NULL;
-
-		mutex_enter(&zilog->zl_lock);
-		list_insert_tail(&zilog->zl_lwb_list, lwb);
-		mutex_exit(&zilog->zl_lock);
-	}
+	if (error == 0)
+		lwb = zil_alloc_lwb(zilog, &blk, txg);
 
 	/*
 	 * If we just allocated the first log block, commit our transaction
@@ -401,6 +505,8 @@ zil_create(zilog_t *zilog)
 	}
 
 	ASSERT(bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0);
+
+	return (lwb);
 }
 
 /*
@@ -425,26 +531,18 @@ zil_destroy(zilog_t *zilog, boolean_t keep_first)
 	 */
 	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
 
+	zilog->zl_old_header = *zh;	/* debugging aid */
+
 	if (BP_IS_HOLE(&zh->zh_log))
 		return;
 
 	tx = dmu_tx_create(zilog->zl_os);
-	(void) dmu_tx_assign(tx, TXG_WAIT);
+	VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0);
 	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
 	txg = dmu_tx_get_txg(tx);
 
 	mutex_enter(&zilog->zl_lock);
 
-	/*
-	 * It is possible for the ZIL to get the previously mounted zilog
-	 * structure of the same dataset if quickly remounted and the dbuf
-	 * eviction has not completed. In this case we can see a non
-	 * empty lwb list and keep_first will be set. We fix this by
-	 * clearing the keep_first. This will be slower but it's very rare.
-	 */
-	if (!list_is_empty(&zilog->zl_lwb_list) && keep_first)
-		keep_first = B_FALSE;
-
 	ASSERT3U(zilog->zl_destroy_txg, <, txg);
 	zilog->zl_destroy_txg = txg;
 	zilog->zl_keep_first = keep_first;
@@ -456,53 +554,20 @@ zil_destroy(zilog_t *zilog, boolean_t keep_first)
 			list_remove(&zilog->zl_lwb_list, lwb);
 			if (lwb->lwb_buf != NULL)
 				zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
-			zio_free_blk(zilog->zl_spa, &lwb->lwb_blk, txg);
+			zio_free_zil(zilog->zl_spa, txg, &lwb->lwb_blk);
 			kmem_cache_free(zil_lwb_cache, lwb);
 		}
-	} else {
-		if (!keep_first) {
-			(void) zil_parse(zilog, zil_free_log_block,
-			    zil_free_log_record, tx, zh->zh_claim_txg);
-		}
+	} else if (!keep_first) {
+		(void) zil_parse(zilog, zil_free_log_block,
+		    zil_free_log_record, tx, zh->zh_claim_txg);
 	}
 	mutex_exit(&zilog->zl_lock);
 
 	dmu_tx_commit(tx);
 }
 
-/*
- * zil_rollback_destroy() is only called by the rollback code.
- * We already have a syncing tx. Rollback has exclusive access to the
- * dataset, so we don't have to worry about concurrent zil access.
- * The actual freeing of any log blocks occurs in zil_sync() later in
- * this txg syncing phase.
- */ -void -zil_rollback_destroy(zilog_t *zilog, dmu_tx_t *tx) -{ - const zil_header_t *zh = zilog->zl_header; - uint64_t txg; - - if (BP_IS_HOLE(&zh->zh_log)) - return; - - txg = dmu_tx_get_txg(tx); - ASSERT3U(zilog->zl_destroy_txg, <, txg); - zilog->zl_destroy_txg = txg; - zilog->zl_keep_first = B_FALSE; - - /* - * Ensure there's no outstanding ZIL IO. No lwbs or just the - * unused one that allocated in advance is ok. - */ - ASSERT(zilog->zl_lwb_list.list_head.list_next == - zilog->zl_lwb_list.list_head.list_prev); - (void) zil_parse(zilog, zil_free_log_block, zil_free_log_record, - tx, zh->zh_claim_txg); -} - int -zil_claim(char *osname, void *txarg) +zil_claim(const char *osname, void *txarg) { dmu_tx_t *tx = txarg; uint64_t first_txg = dmu_tx_get_txg(tx); @@ -511,7 +576,7 @@ zil_claim(char *osname, void *txarg) objset_t *os; int error; - error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_USER, &os); + error = dmu_objset_hold(osname, FTAG, &os); if (error) { cmn_err(CE_WARN, "can't open objset for %s", osname); return (0); @@ -520,6 +585,15 @@ zil_claim(char *osname, void *txarg) zilog = dmu_objset_zil(os); zh = zil_header_in_syncing_context(zilog); + if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR) { + if (!BP_IS_HOLE(&zh->zh_log)) + zio_free_zil(zilog->zl_spa, first_txg, &zh->zh_log); + BP_ZERO(&zh->zh_log); + dsl_dataset_dirty(dmu_objset_ds(os), tx); + dmu_objset_rele(os, FTAG); + return (0); + } + /* * Claim all log blocks if we haven't already done so, and remember * the highest claimed sequence number. This ensures that if we can @@ -529,14 +603,19 @@ zil_claim(char *osname, void *txarg) */ ASSERT3U(zh->zh_claim_txg, <=, first_txg); if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) { - zh->zh_claim_txg = first_txg; - zh->zh_claim_seq = zil_parse(zilog, zil_claim_log_block, + (void) zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx, first_txg); + zh->zh_claim_txg = first_txg; + zh->zh_claim_blk_seq = zilog->zl_parse_blk_seq; + zh->zh_claim_lr_seq = zilog->zl_parse_lr_seq; + if (zilog->zl_parse_lr_count || zilog->zl_parse_blk_count > 1) + zh->zh_flags |= ZIL_REPLAY_NEEDED; + zh->zh_flags |= ZIL_CLAIM_LR_SEQ_VALID; dsl_dataset_dirty(dmu_objset_ds(os), tx); } ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1)); - dmu_objset_close(os); + dmu_objset_rele(os, FTAG); return (0); } @@ -545,76 +624,36 @@ zil_claim(char *osname, void *txarg) * Checksum errors are ok as they indicate the end of the chain. * Any other error (no device or read failure) returns an error. 
*/ -/* ARGSUSED */ int -zil_check_log_chain(char *osname, void *txarg) +zil_check_log_chain(const char *osname, void *tx) { zilog_t *zilog; - zil_header_t *zh; - blkptr_t blk; - arc_buf_t *abuf; objset_t *os; - char *lrbuf; - zil_trailer_t *ztp; int error; - error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_USER, &os); + ASSERT(tx == NULL); + + error = dmu_objset_hold(osname, FTAG, &os); if (error) { cmn_err(CE_WARN, "can't open objset for %s", osname); return (0); } zilog = dmu_objset_zil(os); - zh = zil_header_in_syncing_context(zilog); - blk = zh->zh_log; - if (BP_IS_HOLE(&blk)) { - dmu_objset_close(os); - return (0); /* no chain */ - } - - for (;;) { - error = zil_read_log_block(zilog, &blk, &abuf); - if (error) - break; - lrbuf = abuf->b_data; - ztp = (zil_trailer_t *)(lrbuf + BP_GET_LSIZE(&blk)) - 1; - blk = ztp->zit_next_blk; - VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1); - } - dmu_objset_close(os); - if (error == ECKSUM) - return (0); /* normal end of chain */ - return (error); -} -/* - * Clear a log chain - */ -/* ARGSUSED */ -int -zil_clear_log_chain(char *osname, void *txarg) -{ - zilog_t *zilog; - zil_header_t *zh; - objset_t *os; - dmu_tx_t *tx; - int error; + /* + * Because tx == NULL, zil_claim_log_block() will not actually claim + * any blocks, but just determine whether it is possible to do so. + * In addition to checking the log chain, zil_claim_log_block() + * will invoke zio_claim() with a done func of spa_claim_notify(), + * which will update spa_max_claim_txg. See spa_load() for details. + */ + error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx, + zilog->zl_header->zh_claim_txg ? -1ULL : spa_first_txg(os->os_spa)); - error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_USER, &os); - if (error) { - cmn_err(CE_WARN, "can't open objset for %s", osname); - return (0); - } + dmu_objset_rele(os, FTAG); - zilog = dmu_objset_zil(os); - tx = dmu_tx_create(zilog->zl_os); - (void) dmu_tx_assign(tx, TXG_WAIT); - zh = zil_header_in_syncing_context(zilog); - BP_ZERO(&zh->zh_log); - dsl_dataset_dirty(dmu_objset_ds(os), tx); - dmu_tx_commit(tx); - dmu_objset_close(os); - return (0); + return ((error == ECKSUM || error == ENOENT) ? 0 : error); } static int @@ -632,7 +671,7 @@ zil_vdev_compare(const void *x1, const void *x2) } void -zil_add_block(zilog_t *zilog, blkptr_t *bp) +zil_add_block(zilog_t *zilog, const blkptr_t *bp) { avl_tree_t *t = &zilog->zl_vdev_tree; avl_index_t where; @@ -708,9 +747,9 @@ zil_lwb_write_done(zio_t *zio) { lwb_t *lwb = zio->io_private; zilog_t *zilog = lwb->lwb_zilog; + dmu_tx_t *tx = lwb->lwb_tx; ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); - ASSERT(BP_GET_CHECKSUM(zio->io_bp) == ZIO_CHECKSUM_ZILOG); ASSERT(BP_GET_TYPE(zio->io_bp) == DMU_OT_INTENT_LOG); ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER); @@ -719,18 +758,25 @@ zil_lwb_write_done(zio_t *zio) ASSERT(zio->io_bp->blk_fill == 0); /* - * Now that we've written this log block, we have a stable pointer - * to the next block in the chain, so it's OK to let the txg in - * which we allocated the next block sync. + * Ensure the lwb buffer pointer is cleared before releasing + * the txg. If we have had an allocation failure and + * the txg is waiting to sync then we want zil_sync() + * to remove the lwb so that it's not picked up as the next new + * one in zil_commit_writer(). zil_sync() will only remove + * the lwb if lwb_buf is null.
*/ - txg_rele_to_sync(&lwb->lwb_txgh); - zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); mutex_enter(&zilog->zl_lock); lwb->lwb_buf = NULL; - if (zio->io_error) - zilog->zl_log_error = B_TRUE; + lwb->lwb_tx = NULL; mutex_exit(&zilog->zl_lock); + + /* + * Now that we've written this log block, we have a stable pointer + * to the next block in the chain, so it's OK to let the txg in + * which we allocated the next block sync. + */ + dmu_tx_commit(tx); } /* @@ -741,10 +787,9 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb) { zbookmark_t zb; - zb.zb_objset = lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET]; - zb.zb_object = 0; - zb.zb_level = -1; - zb.zb_blkid = lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]; + SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET], + ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, + lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]); if (zilog->zl_root_zio == NULL) { zilog->zl_root_zio = zio_root(zilog->zl_spa, NULL, NULL, @@ -752,12 +797,35 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb) } if (lwb->lwb_zio == NULL) { lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa, - 0, &lwb->lwb_blk, lwb->lwb_buf, - lwb->lwb_sz, zil_lwb_write_done, lwb, - ZIO_PRIORITY_LOG_WRITE, ZIO_FLAG_CANFAIL, &zb); + 0, &lwb->lwb_blk, lwb->lwb_buf, BP_GET_LSIZE(&lwb->lwb_blk), + zil_lwb_write_done, lwb, ZIO_PRIORITY_LOG_WRITE, + ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE, &zb); } } +/* + * Define a limited set of intent log block sizes. + * These must be a multiple of 4KB. Note only the amount used (again + * aligned to 4KB) actually gets written. However, we can't always just + * allocate SPA_MAXBLOCKSIZE as the slog space could be exhausted. + */ +uint64_t zil_block_buckets[] = { + 4096, /* non TX_WRITE */ + 8192+4096, /* data base */ + 32*1024 + 4096, /* NFS writes */ + UINT64_MAX +}; + +/* + * Use the slog as long as the logbias is 'latency' and the current commit size + * is less than the limit or the total list size is less than 2X the limit. + * Limit checking is disabled by setting zil_slog_limit to UINT64_MAX. + */ +uint64_t zil_slog_limit = 1024 * 1024; +#define USE_SLOG(zilog) (((zilog)->zl_logbias == ZFS_LOGBIAS_LATENCY) && \ + (((zilog)->zl_cur_used < zil_slog_limit) || \ + ((zilog)->zl_itx_list_sz < (zil_slog_limit << 1)))) + /* * Start a log block write and advance to the next log block. * Calls are serialized. @@ -765,105 +833,105 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb) static lwb_t * zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb) { - lwb_t *nlwb; - zil_trailer_t *ztp = (zil_trailer_t *)(lwb->lwb_buf + lwb->lwb_sz) - 1; + lwb_t *nlwb = NULL; + zil_chain_t *zilc; spa_t *spa = zilog->zl_spa; - blkptr_t *bp = &ztp->zit_next_blk; + blkptr_t *bp; + dmu_tx_t *tx; uint64_t txg; uint64_t zil_blksz; - int error; + int i, error; + + if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) { + zilc = (zil_chain_t *)lwb->lwb_buf; + bp = &zilc->zc_next_blk; + } else { + zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz); + bp = &zilc->zc_next_blk; + } - ASSERT(lwb->lwb_nused <= ZIL_BLK_DATA_SZ(lwb)); + ASSERT(lwb->lwb_nused <= lwb->lwb_sz); /* * Allocate the next block and save its address in this block * before writing it in order to establish the log chain. * Note that if the allocation of nlwb synced before we wrote * the block that points at it (lwb), we'd leak it if we crashed. - * Therefore, we don't do txg_rele_to_sync() until zil_lwb_write_done(). + * Therefore, we don't do dmu_tx_commit() until zil_lwb_write_done(). 
+ * We dirty the dataset to ensure that zil_sync() will be called + * to clean up in the event of allocation failure or I/O failure. */ - txg = txg_hold_open(zilog->zl_dmu_pool, &lwb->lwb_txgh); - txg_rele_to_quiesce(&lwb->lwb_txgh); + tx = dmu_tx_create(zilog->zl_os); + VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0); + dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); + txg = dmu_tx_get_txg(tx); + + lwb->lwb_tx = tx; /* - * Pick a ZIL blocksize. We request a size that is the - * maximum of the previous used size, the current used size and - * the amount waiting in the queue. + * Log blocks are pre-allocated. Here we select the size of the next + * block, based on size used in the last block. + * - first find the smallest bucket that will fit the block from a + * limited set of block sizes. This is because it's faster to write + * blocks allocated from the same metaslab as they are adjacent or + * close. + * - next find the maximum from the new suggested size and an array of + * previous sizes. This lessens a picket fence effect of wrongly + * guessing the size if we have a stream of say 2k, 64k, 2k, 64k + * requests. + * + * Note we only write what is used, but we can't just allocate + * the maximum block size because we can exhaust the available + * pool log space. */ - zil_blksz = MAX(zilog->zl_prev_used, - zilog->zl_cur_used + sizeof (*ztp)); - zil_blksz = MAX(zil_blksz, zilog->zl_itx_list_sz + sizeof (*ztp)); - zil_blksz = P2ROUNDUP_TYPED(zil_blksz, ZIL_MIN_BLKSZ, uint64_t); - if (zil_blksz > ZIL_MAX_BLKSZ) - zil_blksz = ZIL_MAX_BLKSZ; + zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t); + for (i = 0; zil_blksz > zil_block_buckets[i]; i++) + continue; + zil_blksz = zil_block_buckets[i]; + if (zil_blksz == UINT64_MAX) + zil_blksz = SPA_MAXBLOCKSIZE; + zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz; + for (i = 0; i < ZIL_PREV_BLKS; i++) + zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]); + zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1); BP_ZERO(bp); /* pass the old blkptr in order to spread log blocks across devs */ - error = zio_alloc_blk(spa, zil_blksz, bp, &lwb->lwb_blk, txg); - if (error) { - dmu_tx_t *tx = dmu_tx_create_assigned(zilog->zl_dmu_pool, txg); + error = zio_alloc_zil(spa, txg, bp, &lwb->lwb_blk, zil_blksz, + USE_SLOG(zilog)); + if (!error) { + ASSERT3U(bp->blk_birth, ==, txg); + bp->blk_cksum = lwb->lwb_blk.blk_cksum; + bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++; /* - * We dirty the dataset to ensure that zil_sync() will - * be called to remove this lwb from our zl_lwb_list. - * Failing to do so, may leave an lwb with a NULL lwb_buf - * hanging around on the zl_lwb_list. + * Allocate a new log write buffer (lwb). */ - dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); - dmu_tx_commit(tx); + nlwb = zil_alloc_lwb(zilog, bp, txg); - /* - * Since we've just experienced an allocation failure so we - * terminate the current lwb and send it on its way.
- */ - ztp->zit_pad = 0; - ztp->zit_nused = lwb->lwb_nused; - ztp->zit_bt.zbt_cksum = lwb->lwb_blk.blk_cksum; - zio_nowait(lwb->lwb_zio); - - /* - * By returning NULL the caller will call tx_wait_synced() - */ - return (NULL); + /* Record the block for later vdev flushing */ + zil_add_block(zilog, &lwb->lwb_blk); } - ASSERT3U(bp->blk_birth, ==, txg); - ztp->zit_pad = 0; - ztp->zit_nused = lwb->lwb_nused; - ztp->zit_bt.zbt_cksum = lwb->lwb_blk.blk_cksum; - bp->blk_cksum = lwb->lwb_blk.blk_cksum; - bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++; + if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) { + uint64_t len; - /* - * Allocate a new log write buffer (lwb). - */ - nlwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP); - - nlwb->lwb_zilog = zilog; - nlwb->lwb_blk = *bp; - nlwb->lwb_nused = 0; - nlwb->lwb_sz = BP_GET_LSIZE(&nlwb->lwb_blk); - nlwb->lwb_buf = zio_buf_alloc(nlwb->lwb_sz); - nlwb->lwb_max_txg = txg; - nlwb->lwb_zio = NULL; + /* For Slim ZIL only write what is used. */ + len = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, uint64_t); + ASSERT3U(len, <=, lwb->lwb_sz); + zio_shrink(lwb->lwb_zio, len); - /* - * Put new lwb at the end of the log chain - */ - mutex_enter(&zilog->zl_lock); - list_insert_tail(&zilog->zl_lwb_list, nlwb); - mutex_exit(&zilog->zl_lock); + } + zilc->zc_pad = 0; + zilc->zc_nused = lwb->lwb_nused; + zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum; - /* Record the block for later vdev flushing */ - zil_add_block(zilog, &lwb->lwb_blk); + zio_nowait(lwb->lwb_zio); /* Kick off the write for the old log block */ /* - * kick off the write for the old log block + * If there was an allocation failure then nlwb will be null which + * forces a txg_wait_synced(). */ - dprintf_bp(&lwb->lwb_blk, "lwb %p txg %llu: ", lwb, txg); - ASSERT(lwb->lwb_zio); - zio_nowait(lwb->lwb_zio); - return (nlwb); } @@ -871,20 +939,20 @@ static lwb_t * zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) { lr_t *lrc = &itx->itx_lr; /* common log record */ - lr_write_t *lr = (lr_write_t *)lrc; + lr_write_t *lrw = (lr_write_t *)lrc; + char *lr_buf; uint64_t txg = lrc->lrc_txg; uint64_t reclen = lrc->lrc_reclen; - uint64_t dlen; + uint64_t dlen = 0; if (lwb == NULL) return (NULL); + ASSERT(lwb->lwb_buf != NULL); if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) dlen = P2ROUNDUP_TYPED( - lr->lr_length, sizeof (uint64_t), uint64_t); - else - dlen = 0; + lrw->lr_length, sizeof (uint64_t), uint64_t); zilog->zl_cur_used += (reclen + dlen); @@ -893,24 +961,22 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) /* * If this record won't fit in the current log block, start a new one. */ - if (lwb->lwb_nused + reclen + dlen > ZIL_BLK_DATA_SZ(lwb)) { + if (lwb->lwb_nused + reclen + dlen > lwb->lwb_sz) { lwb = zil_lwb_write_start(zilog, lwb); if (lwb == NULL) return (NULL); zil_lwb_write_init(zilog, lwb); - ASSERT(lwb->lwb_nused == 0); - if (reclen + dlen > ZIL_BLK_DATA_SZ(lwb)) { + ASSERT(LWB_EMPTY(lwb)); + if (lwb->lwb_nused + reclen + dlen > lwb->lwb_sz) { txg_wait_synced(zilog->zl_dmu_pool, txg); return (lwb); } } - /* - * Update the lrc_seq, to be log record sequence number. See zil.h - * Then copy the record to the log buffer. - */ - lrc->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */ - bcopy(lrc, lwb->lwb_buf + lwb->lwb_nused, reclen); + lr_buf = lwb->lwb_buf + lwb->lwb_nused; + bcopy(lrc, lr_buf, reclen); + lrc = (lr_t *)lr_buf; + lrw = (lr_write_t *)lrc; /* * If it's a write, fetch the data or get its blkptr as appropriate. 
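The block-size selection added in the hunk above is easy to lose in diff form, so here is a minimal standalone sketch of just that step: take the smallest bucket that fits the bytes queued so far, then widen it to the largest of the recent picks to damp the 2k/64k/2k/64k "picket fence" effect the comment describes. The helper name pick_zil_blksz(), the history depth of 16, and the 128K stand-in for SPA_MAXBLOCKSIZE are illustrative assumptions, not part of the patch.

/*
 * Reading aid only: the next-block sizing step from zil_lwb_write_start(),
 * restated as a self-contained helper.
 */
#include <stdint.h>

#define SKETCH_ZIL_PREV_BLKS	16	/* history depth (assumed) */

static const uint64_t sketch_zil_block_buckets[] = {
	4096,			/* non TX_WRITE */
	8192 + 4096,		/* data base */
	32 * 1024 + 4096,	/* NFS writes */
	UINT64_MAX		/* sentinel */
};

static uint64_t
pick_zil_blksz(uint64_t bytes_needed, uint64_t *prev_blks, unsigned *rotor)
{
	uint64_t blksz = bytes_needed;	/* zl_cur_used + sizeof (zil_chain_t) */
	int i;

	/* Smallest bucket that fits; the UINT64_MAX sentinel ends the scan. */
	for (i = 0; blksz > sketch_zil_block_buckets[i]; i++)
		continue;
	blksz = sketch_zil_block_buckets[i];
	if (blksz == UINT64_MAX)
		blksz = 128 * 1024;	/* stand-in for SPA_MAXBLOCKSIZE */

	/* Remember this pick and take the maximum over the recent picks. */
	prev_blks[*rotor] = blksz;
	for (i = 0; i < SKETCH_ZIL_PREV_BLKS; i++)
		if (prev_blks[i] > blksz)
			blksz = prev_blks[i];
	*rotor = (*rotor + 1) & (SKETCH_ZIL_PREV_BLKS - 1);

	return (blksz);
}

Fed an alternating 2k/64k stream, this converges on the larger bucket instead of flip-flopping, while the ZILOG2 path above still shrinks the write to the 4K-aligned portion actually used, so the larger allocation does not inflate the amount written.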
@@ -922,18 +988,20 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) char *dbuf; int error; - /* alignment is guaranteed */ - lr = (lr_write_t *)(lwb->lwb_buf + lwb->lwb_nused); if (dlen) { ASSERT(itx->itx_wr_state == WR_NEED_COPY); - dbuf = lwb->lwb_buf + lwb->lwb_nused + reclen; - lr->lr_common.lrc_reclen += dlen; + dbuf = lr_buf + reclen; + lrw->lr_common.lrc_reclen += dlen; } else { ASSERT(itx->itx_wr_state == WR_INDIRECT); dbuf = NULL; } error = zilog->zl_get_data( - itx->itx_private, lr, dbuf, lwb->lwb_zio); + itx->itx_private, lrw, dbuf, lwb->lwb_zio); + if (error == EIO) { + txg_wait_synced(zilog->zl_dmu_pool, txg); + return (lwb); + } if (error) { ASSERT(error == ENOENT || error == EEXIST || error == EALREADY); @@ -942,9 +1010,16 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) } } + /* + * We're actually making an entry, so update lrc_seq to be the + * log record sequence number. Note that this is generally not + * equal to the itx sequence number because not all transactions + * are synchronous, and sometimes spa_sync() gets there first. + */ + lrc->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */ lwb->lwb_nused += reclen + dlen; lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg); - ASSERT3U(lwb->lwb_nused, <=, ZIL_BLK_DATA_SZ(lwb)); + ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz); ASSERT3U(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)), ==, 0); return (lwb); @@ -966,12 +1041,19 @@ zil_itx_create(uint64_t txtype, size_t lrsize) return (itx); } +void +zil_itx_destroy(itx_t *itx) +{ + kmem_free(itx, offsetof(itx_t, itx_lr) + itx->itx_lr.lrc_reclen); +} + uint64_t zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx) { uint64_t seq; ASSERT(itx->itx_lr.lrc_seq == 0); + ASSERT(!zilog->zl_replay); mutex_enter(&zilog->zl_lock); list_insert_tail(&zilog->zl_itx_list, itx); @@ -1020,8 +1102,7 @@ zil_itx_clean(zilog_t *zilog) /* destroy sync'd log transactions */ while ((itx = list_head(&clean_list)) != NULL) { list_remove(&clean_list, itx); - kmem_free(itx, offsetof(itx_t, itx_lr) - + itx->itx_lr.lrc_reclen); + zil_itx_destroy(itx); } list_destroy(&clean_list); } @@ -1040,7 +1121,7 @@ zil_clean(zilog_t *zilog) if ((itx != NULL) && (itx->itx_lr.lrc_txg <= spa_last_synced_txg(zilog->zl_spa))) { (void) taskq_dispatch(zilog->zl_clean_taskq, - (void (*)(void *))zil_itx_clean, zilog, TQ_NOSLEEP); + (task_func_t *)zil_itx_clean, zilog, TQ_NOSLEEP); } mutex_exit(&zilog->zl_lock); } @@ -1050,9 +1131,10 @@ zil_commit_writer(zilog_t *zilog, uint64_t seq, uint64_t foid) { uint64_t txg; uint64_t commit_seq = 0; - itx_t *itx, *itx_next = (itx_t *)-1; + itx_t *itx, *itx_next; lwb_t *lwb; spa_t *spa; + int error = 0; zilog->zl_writer = B_TRUE; ASSERT(zilog->zl_root_zio == NULL); @@ -1072,77 +1154,64 @@ zil_commit_writer(zilog_t *zilog, uint64_t seq, uint64_t foid) return; } mutex_exit(&zilog->zl_lock); - zil_create(zilog); + lwb = zil_create(zilog); mutex_enter(&zilog->zl_lock); - lwb = list_tail(&zilog->zl_lwb_list); } } + ASSERT(lwb == NULL || lwb->lwb_zio == NULL); /* Loop through in-memory log transactions filling log blocks. */ DTRACE_PROBE1(zil__cw1, zilog_t *, zilog); - for (;;) { + + for (itx = list_head(&zilog->zl_itx_list); itx; itx = itx_next) { /* - * Find the next itx to push: - * Push all transactions related to specified foid and all - * other transactions except TX_WRITE, TX_TRUNCATE, - * TX_SETATTR and TX_ACL for all other files. + * Save the next pointer. 
Even though we drop zl_lock below, + * all threads that can remove itx list entries (other writers + * and zil_itx_clean()) can't do so until they have zl_writer. */ - if (itx_next != (itx_t *)-1) - itx = itx_next; - else - itx = list_head(&zilog->zl_itx_list); - for (; itx != NULL; itx = list_next(&zilog->zl_itx_list, itx)) { - if (foid == 0) /* push all foids? */ - break; - if (itx->itx_sync) /* push all O_[D]SYNC */ - break; - switch (itx->itx_lr.lrc_txtype) { - case TX_SETATTR: - case TX_WRITE: - case TX_TRUNCATE: - case TX_ACL: - /* lr_foid is same offset for these records */ - if (((lr_write_t *)&itx->itx_lr)->lr_foid - != foid) { - continue; /* skip this record */ - } - } - break; - } - if (itx == NULL) - break; + itx_next = list_next(&zilog->zl_itx_list, itx); + + /* + * Determine whether to push this itx. + * Push all transactions related to specified foid and + * all other transactions except those that can be logged + * out of order (TX_WRITE, TX_TRUNCATE, TX_SETATTR, TX_ACL) + * for all other files. + * + * If foid == 0 (meaning "push all foids") or + * itx->itx_sync is set (meaning O_[D]SYNC), push regardless. + */ + if (foid != 0 && !itx->itx_sync && + TX_OOO(itx->itx_lr.lrc_txtype) && + ((lr_ooo_t *)&itx->itx_lr)->lr_foid != foid) + continue; /* skip this record */ if ((itx->itx_lr.lrc_seq > seq) && - ((lwb == NULL) || (lwb->lwb_nused == 0) || - (lwb->lwb_nused + itx->itx_sod > ZIL_BLK_DATA_SZ(lwb)))) { + ((lwb == NULL) || (LWB_EMPTY(lwb)) || + (lwb->lwb_nused + itx->itx_sod > lwb->lwb_sz))) break; - } - /* - * Save the next pointer. Even though we soon drop - * zl_lock all threads that may change the list - * (another writer or zil_itx_clean) can't do so until - * they have zl_writer. - */ - itx_next = list_next(&zilog->zl_itx_list, itx); list_remove(&zilog->zl_itx_list, itx); zilog->zl_itx_list_sz -= itx->itx_sod; + mutex_exit(&zilog->zl_lock); + txg = itx->itx_lr.lrc_txg; ASSERT(txg); if (txg > spa_last_synced_txg(spa) || txg > spa_freeze_txg(spa)) lwb = zil_lwb_commit(zilog, itx, lwb); - kmem_free(itx, offsetof(itx_t, itx_lr) - + itx->itx_lr.lrc_reclen); + + zil_itx_destroy(itx); + mutex_enter(&zilog->zl_lock); } DTRACE_PROBE1(zil__cw2, zilog_t *, zilog); /* determine commit sequence number */ itx = list_head(&zilog->zl_itx_list); if (itx) - commit_seq = itx->itx_lr.lrc_seq; + commit_seq = itx->itx_lr.lrc_seq - 1; else commit_seq = zilog->zl_itx_seq; mutex_exit(&zilog->zl_lock); @@ -1159,22 +1228,28 @@ zil_commit_writer(zilog_t *zilog, uint64_t seq, uint64_t foid) */ if (zilog->zl_root_zio) { DTRACE_PROBE1(zil__cw3, zilog_t *, zilog); - (void) zio_wait(zilog->zl_root_zio); + error = zio_wait(zilog->zl_root_zio); zilog->zl_root_zio = NULL; DTRACE_PROBE1(zil__cw4, zilog_t *, zilog); zil_flush_vdevs(zilog); } - if (zilog->zl_log_error || lwb == NULL) { - zilog->zl_log_error = 0; + if (error || lwb == NULL) txg_wait_synced(zilog->zl_dmu_pool, 0); - } mutex_enter(&zilog->zl_lock); zilog->zl_writer = B_FALSE; ASSERT3U(commit_seq, >=, zilog->zl_commit_seq); zilog->zl_commit_seq = commit_seq; + + /* + * Remember the highest committed log sequence number for ztest. + * We only update this value when all the log writes succeeded, + * because ztest wants to ASSERT that it got the whole log chain. 
+ */ + if (error == 0 && lwb != NULL) + zilog->zl_commit_lr_seq = zilog->zl_lr_seq; } /* @@ -1194,7 +1269,7 @@ zil_commit(zilog_t *zilog, uint64_t seq, uint64_t foid) while (zilog->zl_writer) { cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock); - if (seq < zilog->zl_commit_seq) { + if (seq <= zilog->zl_commit_seq) { mutex_exit(&zilog->zl_lock); return; } @@ -1205,6 +1280,33 @@ zil_commit(zilog_t *zilog, uint64_t seq, uint64_t foid) mutex_exit(&zilog->zl_lock); } +/* + * Report whether all transactions are committed. + */ +static boolean_t +zil_is_committed(zilog_t *zilog) +{ + lwb_t *lwb; + boolean_t committed; + + mutex_enter(&zilog->zl_lock); + + while (zilog->zl_writer) + cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock); + + if (!list_is_empty(&zilog->zl_itx_list)) + committed = B_FALSE; /* unpushed transactions */ + else if ((lwb = list_head(&zilog->zl_lwb_list)) == NULL) + committed = B_TRUE; /* intent log never used */ + else if (list_next(&zilog->zl_lwb_list, lwb) != NULL) + committed = B_FALSE; /* zil_sync() not done yet */ + else + committed = B_TRUE; /* everything synced */ + + mutex_exit(&zilog->zl_lock); + return (committed); +} + /* * Called in syncing context to free committed log blocks and update log header. */ @@ -1214,22 +1316,33 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx) zil_header_t *zh = zil_header_in_syncing_context(zilog); uint64_t txg = dmu_tx_get_txg(tx); spa_t *spa = zilog->zl_spa; + uint64_t *replayed_seq = &zilog->zl_replayed_seq[txg & TXG_MASK]; lwb_t *lwb; + /* + * We don't zero out zl_destroy_txg, so make sure we don't try + * to destroy it twice. + */ + if (spa_sync_pass(spa) != 1) + return; + mutex_enter(&zilog->zl_lock); ASSERT(zilog->zl_stop_sync == 0); - zh->zh_replay_seq = zilog->zl_replay_seq[txg & TXG_MASK]; + if (*replayed_seq != 0) { + ASSERT(zh->zh_replay_seq < *replayed_seq); + zh->zh_replay_seq = *replayed_seq; + *replayed_seq = 0; + } if (zilog->zl_destroy_txg == txg) { blkptr_t blk = zh->zh_log; ASSERT(list_head(&zilog->zl_lwb_list) == NULL); - ASSERT(spa_sync_pass(spa) == 1); bzero(zh, sizeof (zil_header_t)); - bzero(zilog->zl_replay_seq, sizeof (zilog->zl_replay_seq)); + bzero(zilog->zl_replayed_seq, sizeof (zilog->zl_replayed_seq)); if (zilog->zl_keep_first) { /* @@ -1245,17 +1358,12 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx) } } - for (;;) { - lwb = list_head(&zilog->zl_lwb_list); - if (lwb == NULL) { - mutex_exit(&zilog->zl_lock); - return; - } + while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { zh->zh_log = lwb->lwb_blk; if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg) break; list_remove(&zilog->zl_lwb_list, lwb); - zio_free_blk(spa, &lwb->lwb_blk, txg); + zio_free_zil(spa, txg, &lwb->lwb_blk); kmem_cache_free(zil_lwb_cache, lwb); /* @@ -1283,6 +1391,12 @@ zil_fini(void) kmem_cache_destroy(zil_lwb_cache); } +void +zil_set_logbias(zilog_t *zilog, uint64_t logbias) +{ + zilog->zl_logbias = logbias; +} + zilog_t * zil_alloc(objset_t *os, zil_header_t *zh_phys) { @@ -1295,6 +1409,7 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys) zilog->zl_spa = dmu_objset_spa(os); zilog->zl_dmu_pool = dmu_objset_pool(os); zilog->zl_destroy_txg = TXG_INITIAL - 1; + zilog->zl_logbias = dmu_objset_logbias(os); mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL); @@ -1343,25 +1458,6 @@ zil_free(zilog_t *zilog) kmem_free(zilog, sizeof (zilog_t)); } -/* - * return true if the initial log block is not valid - */ -static boolean_t -zil_empty(zilog_t *zilog) -{ - const zil_header_t *zh = zilog->zl_header; - arc_buf_t *abuf = NULL; - - if 
(BP_IS_HOLE(&zh->zh_log)) - return (B_TRUE); - - if (zil_read_log_block(zilog, &zh->zh_log, &abuf) != 0) - return (B_TRUE); - - VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1); - return (B_FALSE); -} - /* * Open an intent log. */ @@ -1390,7 +1486,7 @@ zil_close(zilog_t *zilog) if (!zil_is_committed(zilog)) { uint64_t txg; dmu_tx_t *tx = dmu_tx_create(zilog->zl_os); - (void) dmu_tx_assign(tx, TXG_WAIT); + VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0); dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); txg = dmu_tx_get_txg(tx); dmu_tx_commit(tx); @@ -1417,7 +1513,7 @@ zil_suspend(zilog_t *zilog) const zil_header_t *zh = zilog->zl_header; mutex_enter(&zilog->zl_lock); - if (zh->zh_claim_txg != 0) { /* unplayed log */ + if (zh->zh_flags & ZIL_REPLAY_NEEDED) { /* unplayed log */ mutex_exit(&zilog->zl_lock); return (EBUSY); } @@ -1464,278 +1560,191 @@ zil_resume(zilog_t *zilog) } typedef struct zil_replay_arg { - objset_t *zr_os; zil_replay_func_t **zr_replay; - zil_replay_cleaner_t *zr_replay_cleaner; void *zr_arg; - uint64_t *zr_txgp; boolean_t zr_byteswap; - char *zr_lrbuf; + char *zr_lr; } zil_replay_arg_t; -static void +static int +zil_replay_error(zilog_t *zilog, lr_t *lr, int error) +{ + char name[MAXNAMELEN]; + + zilog->zl_replaying_seq--; /* didn't actually replay this one */ + + dmu_objset_name(zilog->zl_os, name); + + cmn_err(CE_WARN, "ZFS replay transaction error %d, " + "dataset %s, seq 0x%llx, txtype %llu %s\n", error, name, + (u_longlong_t)lr->lrc_seq, + (u_longlong_t)(lr->lrc_txtype & ~TX_CI), + (lr->lrc_txtype & TX_CI) ? "CI" : ""); + + return (error); +} + +static int zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) { zil_replay_arg_t *zr = zra; const zil_header_t *zh = zilog->zl_header; uint64_t reclen = lr->lrc_reclen; uint64_t txtype = lr->lrc_txtype; - char *name; - int pass, error, sunk; - - if (zilog->zl_stop_replay) - return; + int error = 0; - if (lr->lrc_txg < claim_txg) /* already committed */ - return; + zilog->zl_replaying_seq = lr->lrc_seq; if (lr->lrc_seq <= zh->zh_replay_seq) /* already replayed */ - return; + return (0); + + if (lr->lrc_txg < claim_txg) /* already committed */ + return (0); /* Strip case-insensitive bit, still present in log record */ txtype &= ~TX_CI; + if (txtype == 0 || txtype >= TX_MAX_TYPE) + return (zil_replay_error(zilog, lr, EINVAL)); + /* - * Make a copy of the data so we can revise and extend it. + * If this record type can be logged out of order, the object + * (lr_foid) may no longer exist. That's legitimate, not an error. */ - bcopy(lr, zr->zr_lrbuf, reclen); + if (TX_OOO(txtype)) { + error = dmu_object_info(zilog->zl_os, + ((lr_ooo_t *)lr)->lr_foid, NULL); + if (error == ENOENT || error == EEXIST) + return (0); + } /* - * The log block containing this lr may have been byteswapped - * so that we can easily examine common fields like lrc_txtype. - * However, the log is a mix of different data types, and only the - * replay vectors know how to byteswap their records. Therefore, if - * the lr was byteswapped, undo it before invoking the replay vector. + * Make a copy of the data so we can revise and extend it. */ - if (zr->zr_byteswap) - byteswap_uint64_array(zr->zr_lrbuf, reclen); + bcopy(lr, zr->zr_lr, reclen); /* * If this is a TX_WRITE with a blkptr, suck in the data. 
*/ if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) { - lr_write_t *lrw = (lr_write_t *)lr; - blkptr_t *wbp = &lrw->lr_blkptr; - uint64_t wlen = lrw->lr_length; - char *wbuf = zr->zr_lrbuf + reclen; - - if (BP_IS_HOLE(wbp)) { /* compressed to a hole */ - bzero(wbuf, wlen); - } else { - /* - * A subsequent write may have overwritten this block, - * in which case wbp may have been been freed and - * reallocated, and our read of wbp may fail with a - * checksum error. We can safely ignore this because - * the later write will provide the correct data. - */ - zbookmark_t zb; - - zb.zb_objset = dmu_objset_id(zilog->zl_os); - zb.zb_object = lrw->lr_foid; - zb.zb_level = -1; - zb.zb_blkid = lrw->lr_offset / BP_GET_LSIZE(wbp); - - (void) zio_wait(zio_read(NULL, zilog->zl_spa, - wbp, wbuf, BP_GET_LSIZE(wbp), NULL, NULL, - ZIO_PRIORITY_SYNC_READ, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &zb)); - (void) memmove(wbuf, wbuf + lrw->lr_blkoff, wlen); - } + error = zil_read_log_data(zilog, (lr_write_t *)lr, + zr->zr_lr + reclen); + if (error) + return (zil_replay_error(zilog, lr, error)); } /* - * Replay of large truncates can end up needing additional txs - * and a different txg. If they are nested within the replay tx - * as below then a hang is possible. So we do the truncate here - * and redo the truncate later (a no-op) and update the sequence - * number whilst in the replay tx. Fortunately, it's safe to repeat - * a truncate if we crash and the truncate commits. A create over - * an existing file will also come in as a TX_TRUNCATE record. - * - * Note, remove of large files and renames over large files is - * handled by putting the deleted object on a stable list - * and if necessary force deleting the object outside of the replay - * transaction using the zr_replay_cleaner. + * The log block containing this lr may have been byteswapped + * so that we can easily examine common fields like lrc_txtype. + * However, the log is a mix of different record types, and only the + * replay vectors know how to byteswap their records. Therefore, if + * the lr was byteswapped, undo it before invoking the replay vector. */ - if (txtype == TX_TRUNCATE) { - *zr->zr_txgp = TXG_NOWAIT; - error = zr->zr_replay[TX_TRUNCATE](zr->zr_arg, zr->zr_lrbuf, - zr->zr_byteswap); - if (error) - goto bad; - zr->zr_byteswap = 0; /* only byteswap once */ - } + if (zr->zr_byteswap) + byteswap_uint64_array(zr->zr_lr, reclen); /* * We must now do two things atomically: replay this log record, - * and update the log header to reflect the fact that we did so. - * We use the DMU's ability to assign into a specific txg to do this. + * and update the log header sequence number to reflect the fact that + * we did so. At the end of each replay function the sequence number + * is updated if we are in replay mode. */ - for (pass = 1, sunk = B_FALSE; /* CONSTANTCONDITION */; pass++) { - uint64_t replay_txg; - dmu_tx_t *replay_tx; - - replay_tx = dmu_tx_create(zr->zr_os); - error = dmu_tx_assign(replay_tx, TXG_WAIT); - if (error) { - dmu_tx_abort(replay_tx); - break; - } - - replay_txg = dmu_tx_get_txg(replay_tx); - - if (txtype == 0 || txtype >= TX_MAX_TYPE) { - error = EINVAL; - } else { - /* - * On the first pass, arrange for the replay vector - * to fail its dmu_tx_assign(). That's the only way - * to ensure that those code paths remain well tested. - * - * Only byteswap (if needed) on the 1st pass. 
- */ - *zr->zr_txgp = replay_txg - (pass == 1); - error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lrbuf, - zr->zr_byteswap && pass == 1); - *zr->zr_txgp = TXG_NOWAIT; - } - - if (error == 0) { - dsl_dataset_dirty(dmu_objset_ds(zr->zr_os), replay_tx); - zilog->zl_replay_seq[replay_txg & TXG_MASK] = - lr->lrc_seq; - } - - dmu_tx_commit(replay_tx); - - if (!error) - return; - + error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, zr->zr_byteswap); + if (error) { /* * The DMU's dnode layer doesn't see removes until the txg * commits, so a subsequent claim can spuriously fail with - * EEXIST. So if we receive any error other than ERESTART - * we try syncing out any removes then retrying the - * transaction. + * EEXIST. So if we receive any error we try syncing out + * any removes then retry the transaction. Note that we + * specify B_FALSE for byteswap now, so we don't do it twice. */ - if (error != ERESTART && !sunk) { - if (zr->zr_replay_cleaner) - zr->zr_replay_cleaner(zr->zr_arg); - txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0); - sunk = B_TRUE; - continue; /* retry */ - } - - if (error != ERESTART) - break; - - if (pass != 1) - txg_wait_open(spa_get_dsl(zilog->zl_spa), - replay_txg + 1); - - dprintf("pass %d, retrying\n", pass); + txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0); + error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, B_FALSE); + if (error) + return (zil_replay_error(zilog, lr, error)); } - -bad: - ASSERT(error && error != ERESTART); - name = kmem_alloc(MAXNAMELEN, KM_SLEEP); - dmu_objset_name(zr->zr_os, name); - cmn_err(CE_WARN, "ZFS replay transaction error %d, " - "dataset %s, seq 0x%llx, txtype %llu %s\n", - error, name, (u_longlong_t)lr->lrc_seq, (u_longlong_t)txtype, - (lr->lrc_txtype & TX_CI) ? "CI" : ""); - zilog->zl_stop_replay = 1; - kmem_free(name, MAXNAMELEN); + return (0); } /* ARGSUSED */ -static void +static int zil_incr_blks(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) { zilog->zl_replay_blks++; + + return (0); } /* * If this dataset has a non-empty intent log, replay it and destroy it. */ void -zil_replay(objset_t *os, void *arg, uint64_t *txgp, - zil_replay_func_t *replay_func[TX_MAX_TYPE], - zil_replay_cleaner_t *replay_cleaner) +zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE]) { zilog_t *zilog = dmu_objset_zil(os); const zil_header_t *zh = zilog->zl_header; zil_replay_arg_t zr; - if (zil_empty(zilog)) { + if ((zh->zh_flags & ZIL_REPLAY_NEEDED) == 0) { zil_destroy(zilog, B_TRUE); return; } - zr.zr_os = os; zr.zr_replay = replay_func; - zr.zr_replay_cleaner = replay_cleaner; zr.zr_arg = arg; - zr.zr_txgp = txgp; zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log); - zr.zr_lrbuf = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP); + zr.zr_lr = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP); /* * Wait for in-progress removes to sync before starting replay. 
*/ txg_wait_synced(zilog->zl_dmu_pool, 0); - zilog->zl_stop_replay = 0; - zilog->zl_replay_time = lbolt; + zilog->zl_replay = B_TRUE; + zilog->zl_replay_time = ddi_get_lbolt(); ASSERT(zilog->zl_replay_blks == 0); (void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr, zh->zh_claim_txg); - kmem_free(zr.zr_lrbuf, 2 * SPA_MAXBLOCKSIZE); + kmem_free(zr.zr_lr, 2 * SPA_MAXBLOCKSIZE); zil_destroy(zilog, B_FALSE); txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); + zilog->zl_replay = B_FALSE; } -/* - * Report whether all transactions are committed - */ -int -zil_is_committed(zilog_t *zilog) +boolean_t +zil_replaying(zilog_t *zilog, dmu_tx_t *tx) { - lwb_t *lwb; - int ret; - - mutex_enter(&zilog->zl_lock); - while (zilog->zl_writer) - cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock); + if (zilog == NULL) + return (B_TRUE); - /* recent unpushed intent log transactions? */ - if (!list_is_empty(&zilog->zl_itx_list)) { - ret = B_FALSE; - goto out; + if (zilog->zl_replay) { + dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); + zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] = + zilog->zl_replaying_seq; + return (B_TRUE); } - /* intent log never used? */ - lwb = list_head(&zilog->zl_lwb_list); - if (lwb == NULL) { - ret = B_TRUE; - goto out; - } + return (B_FALSE); +} - /* - * more than 1 log buffer means zil_sync() hasn't yet freed - * entries after a txg has committed - */ - if (list_next(&zilog->zl_lwb_list, lwb)) { - ret = B_FALSE; - goto out; - } +/* ARGSUSED */ +int +zil_vdev_offline(const char *osname, void *arg) +{ + objset_t *os; + zilog_t *zilog; + int error; - ASSERT(zil_empty(zilog)); - ret = B_TRUE; -out: - cv_broadcast(&zilog->zl_cv_writer); - mutex_exit(&zilog->zl_lock); - return (ret); + error = dmu_objset_hold(osname, FTAG, &os); + if (error) + return (error); + + zilog = dmu_objset_zil(os); + if (zil_suspend(zilog) != 0) + error = EEXIST; + else + zil_resume(zilog); + dmu_objset_rele(os, FTAG); + return (error); } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/zio.c b/external/cddl/osnet/dist/uts/common/fs/zfs/zio.c index d347920ea6bb7..4e481b16b7786 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/zio.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/zio.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -32,6 +32,9 @@ #include #include #include +#include +#include +#include /* * ========================================================================== @@ -42,11 +45,12 @@ uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = { 0, /* ZIO_PRIORITY_NOW */ 0, /* ZIO_PRIORITY_SYNC_READ */ 0, /* ZIO_PRIORITY_SYNC_WRITE */ - 6, /* ZIO_PRIORITY_ASYNC_READ */ - 4, /* ZIO_PRIORITY_ASYNC_WRITE */ - 4, /* ZIO_PRIORITY_FREE */ - 0, /* ZIO_PRIORITY_CACHE_FILL */ 0, /* ZIO_PRIORITY_LOG_WRITE */ + 1, /* ZIO_PRIORITY_CACHE_FILL */ + 1, /* ZIO_PRIORITY_AGG */ + 4, /* ZIO_PRIORITY_FREE */ + 4, /* ZIO_PRIORITY_ASYNC_WRITE */ + 6, /* ZIO_PRIORITY_ASYNC_READ */ 10, /* ZIO_PRIORITY_RESILVER */ 20, /* ZIO_PRIORITY_SCRUB */ }; @@ -57,11 +61,9 @@ uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = { * ========================================================================== */ char *zio_type_name[ZIO_TYPES] = { - "null", "read", "write", "free", "claim", "ioctl" }; - -#define SYNC_PASS_DEFERRED_FREE 1 /* defer frees after this pass */ -#define SYNC_PASS_DONT_COMPRESS 4 /* don't compress after this pass */ -#define SYNC_PASS_REWRITE 1 /* rewrite new bps after this pass */ + "zio_null", "zio_read", "zio_write", "zio_free", "zio_claim", + "zio_ioctl" +}; /* * ========================================================================== @@ -69,6 +71,7 @@ char *zio_type_name[ZIO_TYPES] = { * ========================================================================== */ kmem_cache_t *zio_cache; +kmem_cache_t *zio_link_cache; kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; @@ -80,8 +83,15 @@ extern vmem_t *zio_alloc_arena; * An allocating zio is one that either currently has the DVA allocate * stage set or will have it later in its lifetime. */ -#define IO_IS_ALLOCATING(zio) \ - ((zio)->io_orig_pipeline & (1U << ZIO_STAGE_DVA_ALLOCATE)) +#define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE) + +boolean_t zio_requeue_io_start_cut_in_line = B_TRUE; + +#ifdef ZFS_DEBUG +int zio_buf_debug_limit = 16384; +#else +int zio_buf_debug_limit = 0; +#endif void zio_init(void) @@ -92,8 +102,10 @@ zio_init(void) #ifdef _KERNEL data_alloc_arena = zio_alloc_arena; #endif - zio_cache = kmem_cache_create("zio_cache", sizeof (zio_t), 0, - NULL, NULL, NULL, NULL, NULL, 0); + zio_cache = kmem_cache_create("zio_cache", + sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + zio_link_cache = kmem_cache_create("zio_link_cache", + sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0); /* * For small buffers, we want a cache for each multiple of @@ -121,12 +133,13 @@ zio_init(void) char name[36]; (void) sprintf(name, "zio_buf_%lu", (ulong_t)size); zio_buf_cache[c] = kmem_cache_create(name, size, - align, NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG); + align, NULL, NULL, NULL, NULL, NULL, + size > zio_buf_debug_limit ? KMC_NODEBUG : 0); (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size); zio_data_buf_cache[c] = kmem_cache_create(name, size, align, NULL, NULL, NULL, NULL, data_alloc_arena, - KMC_NODEBUG); + size > zio_buf_debug_limit ? 
KMC_NODEBUG : 0); } } @@ -164,6 +177,7 @@ zio_fini(void) zio_data_buf_cache[c] = NULL; } + kmem_cache_destroy(zio_link_cache); kmem_cache_destroy(zio_cache); zio_inject_fini(); @@ -260,7 +274,8 @@ zio_pop_transforms(zio_t *zio) zt->zt_transform(zio, zt->zt_orig_data, zt->zt_orig_size); - zio_buf_free(zio->io_data, zt->zt_bufsize); + if (zt->zt_bufsize != 0) + zio_buf_free(zio->io_data, zt->zt_bufsize); zio->io_data = zt->zt_orig_data; zio->io_size = zt->zt_orig_size; @@ -289,7 +304,7 @@ zio_decompress(zio_t *zio, void *data, uint64_t size) { if (zio->io_error == 0 && zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), - zio->io_data, zio->io_size, data, size) != 0) + zio->io_data, data, zio->io_size, size) != 0) zio->io_error = EIO; } @@ -298,41 +313,108 @@ zio_decompress(zio_t *zio, void *data, uint64_t size) * I/O parent/child relationships and pipeline interlocks * ========================================================================== */ +/* + * NOTE - Callers to zio_walk_parents() and zio_walk_children must + * continue calling these functions until they return NULL. + * Otherwise, the next caller will pick up the list walk in + * some indeterminate state. (Otherwise every caller would + * have to pass in a cookie to keep the state represented by + * io_walk_link, which gets annoying.) + */ +zio_t * +zio_walk_parents(zio_t *cio) +{ + zio_link_t *zl = cio->io_walk_link; + list_t *pl = &cio->io_parent_list; -static void -zio_add_child(zio_t *pio, zio_t *zio) + zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl); + cio->io_walk_link = zl; + + if (zl == NULL) + return (NULL); + + ASSERT(zl->zl_child == cio); + return (zl->zl_parent); +} + +zio_t * +zio_walk_children(zio_t *pio) { + zio_link_t *zl = pio->io_walk_link; + list_t *cl = &pio->io_child_list; + + zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl); + pio->io_walk_link = zl; + + if (zl == NULL) + return (NULL); + + ASSERT(zl->zl_parent == pio); + return (zl->zl_child); +} + +zio_t * +zio_unique_parent(zio_t *cio) +{ + zio_t *pio = zio_walk_parents(cio); + + VERIFY(zio_walk_parents(cio) == NULL); + return (pio); +} + +void +zio_add_child(zio_t *pio, zio_t *cio) +{ + zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); + + /* + * Logical I/Os can have logical, gang, or vdev children. + * Gang I/Os can have gang or vdev children. + * Vdev I/Os can only have vdev children. + * The following ASSERT captures all of these constraints. 
+ */ + ASSERT(cio->io_child_type <= pio->io_child_type); + + zl->zl_parent = pio; + zl->zl_child = cio; + + mutex_enter(&cio->io_lock); mutex_enter(&pio->io_lock); - if (zio->io_stage < ZIO_STAGE_READY) - pio->io_children[zio->io_child_type][ZIO_WAIT_READY]++; - if (zio->io_stage < ZIO_STAGE_DONE) - pio->io_children[zio->io_child_type][ZIO_WAIT_DONE]++; - zio->io_sibling_prev = NULL; - zio->io_sibling_next = pio->io_child; - if (pio->io_child != NULL) - pio->io_child->io_sibling_prev = zio; - pio->io_child = zio; - zio->io_parent = pio; + + ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); + + for (int w = 0; w < ZIO_WAIT_TYPES; w++) + pio->io_children[cio->io_child_type][w] += !cio->io_state[w]; + + list_insert_head(&pio->io_child_list, zl); + list_insert_head(&cio->io_parent_list, zl); + + pio->io_child_count++; + cio->io_parent_count++; + mutex_exit(&pio->io_lock); + mutex_exit(&cio->io_lock); } static void -zio_remove_child(zio_t *pio, zio_t *zio) +zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl) { - zio_t *next, *prev; - - ASSERT(zio->io_parent == pio); + ASSERT(zl->zl_parent == pio); + ASSERT(zl->zl_child == cio); + mutex_enter(&cio->io_lock); mutex_enter(&pio->io_lock); - next = zio->io_sibling_next; - prev = zio->io_sibling_prev; - if (next != NULL) - next->io_sibling_prev = prev; - if (prev != NULL) - prev->io_sibling_next = next; - if (pio->io_child == zio) - pio->io_child = next; + + list_remove(&pio->io_child_list, zl); + list_remove(&cio->io_parent_list, zl); + + pio->io_child_count--; + cio->io_parent_count--; + mutex_exit(&pio->io_lock); + mutex_exit(&cio->io_lock); + + kmem_cache_free(zio_link_cache, zl); } static boolean_t @@ -344,7 +426,7 @@ zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait) mutex_enter(&zio->io_lock); ASSERT(zio->io_stall == NULL); if (*countp != 0) { - zio->io_stage--; + zio->io_stage >>= 1; zio->io_stall = countp; waiting = B_TRUE; } @@ -386,10 +468,11 @@ zio_inherit_child_errors(zio_t *zio, enum zio_child c) * ========================================================================== */ static zio_t * -zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, +zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, void *data, uint64_t size, zio_done_func_t *done, void *private, - zio_type_t type, int priority, int flags, vdev_t *vd, uint64_t offset, - const zbookmark_t *zb, uint8_t stage, uint32_t pipeline) + zio_type_t type, int priority, enum zio_flag flags, + vdev_t *vd, uint64_t offset, const zbookmark_t *zb, + enum zio_stage stage, enum zio_stage pipeline) { zio_t *zio; @@ -407,53 +490,58 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); + list_create(&zio->io_parent_list, sizeof (zio_link_t), + offsetof(zio_link_t, zl_parent_node)); + list_create(&zio->io_child_list, sizeof (zio_link_t), + offsetof(zio_link_t, zl_child_node)); + if (vd != NULL) zio->io_child_type = ZIO_CHILD_VDEV; else if (flags & ZIO_FLAG_GANG_CHILD) zio->io_child_type = ZIO_CHILD_GANG; + else if (flags & ZIO_FLAG_DDT_CHILD) + zio->io_child_type = ZIO_CHILD_DDT; else zio->io_child_type = ZIO_CHILD_LOGICAL; if (bp != NULL) { - zio->io_bp = bp; + zio->io_bp = (blkptr_t *)bp; zio->io_bp_copy = *bp; zio->io_bp_orig = *bp; - if (type != ZIO_TYPE_WRITE) + if (type != ZIO_TYPE_WRITE || + zio->io_child_type == ZIO_CHILD_DDT) zio->io_bp = &zio->io_bp_copy; /* so caller can free */ - if (zio->io_child_type == 
ZIO_CHILD_LOGICAL) { - if (BP_IS_GANG(bp)) - pipeline |= ZIO_GANG_STAGES; + if (zio->io_child_type == ZIO_CHILD_LOGICAL) zio->io_logical = zio; - } + if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp)) + pipeline |= ZIO_GANG_STAGES; } zio->io_spa = spa; zio->io_txg = txg; - zio->io_data = data; - zio->io_size = size; zio->io_done = done; zio->io_private = private; zio->io_type = type; zio->io_priority = priority; zio->io_vd = vd; zio->io_offset = offset; + zio->io_orig_data = zio->io_data = data; + zio->io_orig_size = zio->io_size = size; zio->io_orig_flags = zio->io_flags = flags; zio->io_orig_stage = zio->io_stage = stage; zio->io_orig_pipeline = zio->io_pipeline = pipeline; + zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY); + zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE); + if (zb != NULL) zio->io_bookmark = *zb; if (pio != NULL) { - /* - * Logical I/Os can have logical, gang, or vdev children. - * Gang I/Os can have gang or vdev children. - * Vdev I/Os can only have vdev children. - * The following ASSERT captures all of these constraints. - */ - ASSERT(zio->io_child_type <= pio->io_child_type); if (zio->io_logical == NULL) zio->io_logical = pio->io_logical; + if (zio->io_child_type == ZIO_CHILD_GANG) + zio->io_gang_leader = pio->io_gang_leader; zio_add_child(pio, zio); } @@ -463,70 +551,53 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, static void zio_destroy(zio_t *zio) { - spa_t *spa = zio->io_spa; - uint8_t async_root = zio->io_async_root; - + list_destroy(&zio->io_parent_list); + list_destroy(&zio->io_child_list); mutex_destroy(&zio->io_lock); cv_destroy(&zio->io_cv); kmem_cache_free(zio_cache, zio); - - if (async_root) { - mutex_enter(&spa->spa_async_root_lock); - if (--spa->spa_async_root_count == 0) - cv_broadcast(&spa->spa_async_root_cv); - mutex_exit(&spa->spa_async_root_lock); - } } zio_t * -zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private, - int flags) +zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, + void *private, enum zio_flag flags) { zio_t *zio; zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, - ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL, + ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE); return (zio); } zio_t * -zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags) +zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags) { - return (zio_null(NULL, spa, done, private, flags)); + return (zio_null(NULL, spa, NULL, done, private, flags)); } zio_t * zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, void *data, uint64_t size, zio_done_func_t *done, void *private, - int priority, int flags, const zbookmark_t *zb) + int priority, enum zio_flag flags, const zbookmark_t *zb) { zio_t *zio; - zio = zio_create(pio, spa, bp->blk_birth, (blkptr_t *)bp, + zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp, data, size, done, private, ZIO_TYPE_READ, priority, flags, NULL, 0, zb, - ZIO_STAGE_OPEN, ZIO_READ_PIPELINE); + ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? 
+ ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE); return (zio); } -void -zio_skip_write(zio_t *zio) -{ - ASSERT(zio->io_type == ZIO_TYPE_WRITE); - ASSERT(zio->io_stage == ZIO_STAGE_READY); - ASSERT(!BP_IS_GANG(zio->io_bp)); - - zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; -} - zio_t * zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - void *data, uint64_t size, zio_prop_t *zp, + void *data, uint64_t size, const zio_prop_t *zp, zio_done_func_t *ready, zio_done_func_t *done, void *private, - int priority, int flags, const zbookmark_t *zb) + int priority, enum zio_flag flags, const zbookmark_t *zb) { zio_t *zio; @@ -536,13 +607,15 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, zp->zp_compress < ZIO_COMPRESS_FUNCTIONS && zp->zp_type < DMU_OT_NUMTYPES && zp->zp_level < 32 && - zp->zp_ndvas > 0 && - zp->zp_ndvas <= spa_max_replication(spa)); - ASSERT(ready != NULL); + zp->zp_copies > 0 && + zp->zp_copies <= spa_max_replication(spa) && + zp->zp_dedup <= 1 && + zp->zp_dedup_verify <= 1); zio = zio_create(pio, spa, txg, bp, data, size, done, private, ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, - ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE); + ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? + ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE); zio->io_ready = ready; zio->io_prop = *zp; @@ -553,7 +626,7 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, zio_t * zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, uint64_t size, zio_done_func_t *done, void *private, int priority, - int flags, zbookmark_t *zb) + enum zio_flag flags, zbookmark_t *zb) { zio_t *zio; @@ -564,33 +637,44 @@ zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, return (zio); } +void +zio_write_override(zio_t *zio, blkptr_t *bp, int copies) +{ + ASSERT(zio->io_type == ZIO_TYPE_WRITE); + ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); + ASSERT(zio->io_stage == ZIO_STAGE_OPEN); + ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa)); + + zio->io_prop.zp_copies = copies; + zio->io_bp_override = bp; +} + +void +zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) +{ + bplist_enqueue_deferred(&spa->spa_free_bplist[txg & TXG_MASK], bp); +} + zio_t * -zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - zio_done_func_t *done, void *private, int flags) +zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, + enum zio_flag flags) { zio_t *zio; ASSERT(!BP_IS_HOLE(bp)); - - if (bp->blk_fill == BLK_FILL_ALREADY_FREED) - return (zio_null(pio, spa, NULL, NULL, flags)); - - if (txg == spa->spa_syncing_txg && - spa_sync_pass(spa) > SYNC_PASS_DEFERRED_FREE) { - bplist_enqueue_deferred(&spa->spa_sync_bplist, bp); - return (zio_null(pio, spa, NULL, NULL, flags)); - } + ASSERT(spa_syncing_txg(spa) == txg); + ASSERT(spa_sync_pass(spa) <= SYNC_PASS_DEFERRED_FREE); zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), - done, private, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags, + NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags, NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE); return (zio); } zio_t * -zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - zio_done_func_t *done, void *private, int flags) +zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, + zio_done_func_t *done, void *private, enum zio_flag flags) { zio_t *zio; @@ -604,9 +688,11 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, * * All claims *must* be resolved in the first txg -- before the SPA * starts allocating blocks -- so 
that nothing is allocated twice. + * If txg == 0 we just verify that the block is claimable. */ ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa)); - ASSERT3U(spa_first_txg(spa), <=, txg); + ASSERT(txg == spa_first_txg(spa) || txg == 0); + ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(1M) */ zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags, @@ -617,7 +703,7 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, zio_t * zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, - zio_done_func_t *done, void *private, int priority, int flags) + zio_done_func_t *done, void *private, int priority, enum zio_flag flags) { zio_t *zio; int c; @@ -629,7 +715,7 @@ zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, zio->io_cmd = cmd; } else { - zio = zio_null(pio, spa, NULL, NULL, flags); + zio = zio_null(pio, spa, NULL, NULL, NULL, flags); for (c = 0; c < vd->vdev_children; c++) zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, @@ -642,7 +728,7 @@ zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, zio_t * zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, void *data, int checksum, zio_done_func_t *done, void *private, - int priority, int flags, boolean_t labels) + int priority, enum zio_flag flags, boolean_t labels) { zio_t *zio; @@ -663,7 +749,7 @@ zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, zio_t * zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, void *data, int checksum, zio_done_func_t *done, void *private, - int priority, int flags, boolean_t labels) + int priority, enum zio_flag flags, boolean_t labels) { zio_t *zio; @@ -678,9 +764,9 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, zio->io_prop.zp_checksum = checksum; - if (zio_checksum_table[checksum].ci_zbt) { + if (zio_checksum_table[checksum].ci_eck) { /* - * zbt checksums are necessarily destructive -- they modify + * zec checksums are necessarily destructive -- they modify * the end of the write buffer to hold the verifier/checksum. * Therefore, we must make a local copy in case the data is * being written to multiple places in parallel. @@ -698,10 +784,10 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, */ zio_t * zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, - void *data, uint64_t size, int type, int priority, int flags, + void *data, uint64_t size, int type, int priority, enum zio_flag flags, zio_done_func_t *done, void *private) { - uint32_t pipeline = ZIO_VDEV_CHILD_PIPELINE; + enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE; zio_t *zio; ASSERT(vd->vdev_parent == @@ -714,26 +800,33 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, * detection as close to the leaves as possible and * eliminates redundant checksums in the interior nodes. */ - pipeline |= 1U << ZIO_STAGE_CHECKSUM_VERIFY; - pio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY); + pipeline |= ZIO_STAGE_CHECKSUM_VERIFY; + pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; } if (vd->vdev_children == 0) offset += VDEV_LABEL_START_SIZE; + flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE; + + /* + * If we've decided to do a repair, the write is not speculative -- + * even if the original read was. 
+ */ + if (flags & ZIO_FLAG_IO_REPAIR) + flags &= ~ZIO_FLAG_SPECULATIVE; + zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, - done, private, type, priority, - (pio->io_flags & ZIO_FLAG_VDEV_INHERIT) | - ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | flags, - vd, offset, &pio->io_bookmark, - ZIO_STAGE_VDEV_IO_START - 1, pipeline); + done, private, type, priority, flags, vd, offset, &pio->io_bookmark, + ZIO_STAGE_VDEV_IO_START >> 1, pipeline); return (zio); } zio_t * zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size, - int type, int priority, int flags, zio_done_func_t *done, void *private) + int type, int priority, enum zio_flag flags, + zio_done_func_t *done, void *private) { zio_t *zio; @@ -743,7 +836,7 @@ zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size, data, size, done, private, type, priority, flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY, vd, offset, NULL, - ZIO_STAGE_VDEV_IO_START - 1, ZIO_VDEV_CHILD_PIPELINE); + ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE); return (zio); } @@ -756,6 +849,23 @@ zio_flush(zio_t *zio, vdev_t *vd) ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY)); } +void +zio_shrink(zio_t *zio, uint64_t size) +{ + ASSERT(zio->io_executor == NULL); + ASSERT(zio->io_orig_size == zio->io_size); + ASSERT(size <= zio->io_size); + + /* + * We don't shrink for raidz because of problems with the + * reconstruction when reading back less than the block size. + * Note, BP_IS_RAIDZ() assumes no compression. + */ + ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); + if (!BP_IS_RAIDZ(zio->io_bp)) + zio->io_orig_size = zio->io_size = size; +} + /* * ========================================================================== * Prepare to read and write logical blocks @@ -767,29 +877,36 @@ zio_read_bp_init(zio_t *zio) { blkptr_t *bp = zio->io_bp; - if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && zio->io_logical == zio) { - uint64_t csize = BP_GET_PSIZE(bp); - void *cbuf = zio_buf_alloc(csize); + if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && + zio->io_child_type == ZIO_CHILD_LOGICAL && + !(zio->io_flags & ZIO_FLAG_RAW)) { + uint64_t psize = BP_GET_PSIZE(bp); + void *cbuf = zio_buf_alloc(psize); - zio_push_transform(zio, cbuf, csize, csize, zio_decompress); + zio_push_transform(zio, cbuf, psize, psize, zio_decompress); } if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0) zio->io_flags |= ZIO_FLAG_DONT_CACHE; + if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP) + zio->io_flags |= ZIO_FLAG_DONT_CACHE; + + if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL) + zio->io_pipeline = ZIO_DDT_READ_PIPELINE; + return (ZIO_PIPELINE_CONTINUE); } static int zio_write_bp_init(zio_t *zio) { + spa_t *spa = zio->io_spa; zio_prop_t *zp = &zio->io_prop; - int compress = zp->zp_compress; + enum zio_compress compress = zp->zp_compress; blkptr_t *bp = zio->io_bp; - void *cbuf; uint64_t lsize = zio->io_size; - uint64_t csize = lsize; - uint64_t cbufsize = 0; + uint64_t psize = lsize; int pass = 1; /* @@ -803,7 +920,29 @@ zio_write_bp_init(zio_t *zio) if (!IO_IS_ALLOCATING(zio)) return (ZIO_PIPELINE_CONTINUE); - ASSERT(compress != ZIO_COMPRESS_INHERIT); + ASSERT(zio->io_child_type != ZIO_CHILD_DDT); + + if (zio->io_bp_override) { + ASSERT(bp->blk_birth != zio->io_txg); + ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0); + + *bp = *zio->io_bp_override; + zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; + + if (BP_IS_HOLE(bp) || !zp->zp_dedup) + return (ZIO_PIPELINE_CONTINUE); + + 
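The last hunk reworks how zio_taskq_dispatch() picks a queue. As a reading aid, the sketch below restates the decision order with simplified stand-in types: config writers and probes, and writes destined for aux vdevs, fall back to the otherwise-unused ZIO_TYPE_NULL queues, and ZIO_PRIORITY_NOW I/Os are bumped to the high-priority taskq when one exists (with cut-in-line requests using TQ_FRONT). The select_taskq() helper and the enum names are assumptions for illustration; only the branch order mirrors the patched code.

/*
 * Reading aid only: the taskq-selection policy of the patched
 * zio_taskq_dispatch(), restated with stand-in types.
 */
#include <stdbool.h>

enum sketch_zio_type { SKETCH_TYPE_NULL, SKETCH_TYPE_READ, SKETCH_TYPE_WRITE };
enum sketch_taskq { SKETCH_Q_ISSUE, SKETCH_Q_ISSUE_HIGH,
	SKETCH_Q_INTERRUPT, SKETCH_Q_INTERRUPT_HIGH };

struct sketch_zio {
	enum sketch_zio_type type;
	bool config_writer_or_probe;	/* ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE */
	bool aux_vdev_write;		/* write to a spare/l2arc (aux) vdev */
	bool priority_now;		/* ZIO_PRIORITY_NOW */
};

static enum sketch_taskq
select_taskq(const struct sketch_zio *zio, enum sketch_taskq q,
    bool high_queue_exists, enum sketch_zio_type *type_out)
{
	enum sketch_zio_type t = zio->type;

	/* Config writers and probes may block the normal queues on the
	 * config lock, so borrow the otherwise-unused NULL-type taskq. */
	if (zio->config_writer_or_probe)
		t = SKETCH_TYPE_NULL;

	/* Writes to auxiliary vdevs likewise use the NULL-type taskq. */
	if (t == SKETCH_TYPE_WRITE && zio->aux_vdev_write)
		t = SKETCH_TYPE_NULL;

	/* ZIO_PRIORITY_NOW moves up to the matching high-priority queue. */
	if (zio->priority_now && high_queue_exists)
		q = (enum sketch_taskq)(q + 1);

	*type_out = t;
	return (q);
}

The high_queue_exists parameter stands in for the patch's check that spa_zio_taskq[t][q + 1] is non-NULL before bumping the queue index.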
ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup || + zp->zp_dedup_verify); + + if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) { + BP_SET_DEDUP(bp, 1); + zio->io_pipeline |= ZIO_STAGE_DDT_WRITE; + return (ZIO_PIPELINE_CONTINUE); + } + zio->io_bp_override = NULL; + BP_ZERO(bp); + } if (bp->blk_birth == zio->io_txg) { /* @@ -815,28 +954,29 @@ zio_write_bp_init(zio_t *zio) * convergence take longer. Therefore, after the first * few passes, stop compressing to ensure convergence. */ - pass = spa_sync_pass(zio->io_spa); - ASSERT(pass > 1); + pass = spa_sync_pass(spa); + + ASSERT(zio->io_txg == spa_syncing_txg(spa)); + ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); + ASSERT(!BP_GET_DEDUP(bp)); if (pass > SYNC_PASS_DONT_COMPRESS) compress = ZIO_COMPRESS_OFF; - /* - * Only MOS (objset 0) data should need to be rewritten. - */ - ASSERT(zio->io_logical->io_bookmark.zb_objset == 0); - /* Make sure someone doesn't change their mind on overwrites */ - ASSERT(MIN(zp->zp_ndvas + BP_IS_GANG(bp), - spa_max_replication(zio->io_spa)) == BP_GET_NDVAS(bp)); + ASSERT(MIN(zp->zp_copies + BP_IS_GANG(bp), + spa_max_replication(spa)) == BP_GET_NDVAS(bp)); } if (compress != ZIO_COMPRESS_OFF) { - if (!zio_compress_data(compress, zio->io_data, zio->io_size, - &cbuf, &csize, &cbufsize)) { + void *cbuf = zio_buf_alloc(lsize); + psize = zio_compress_data(compress, zio->io_data, cbuf, lsize); + if (psize == 0 || psize == lsize) { compress = ZIO_COMPRESS_OFF; - } else if (csize != 0) { - zio_push_transform(zio, cbuf, csize, cbufsize, NULL); + zio_buf_free(cbuf, lsize); + } else { + ASSERT(psize < lsize); + zio_push_transform(zio, cbuf, psize, lsize, NULL); } } @@ -848,10 +988,10 @@ zio_write_bp_init(zio_t *zio) * spa_sync() to allocate new blocks, but force rewrites after that. * There should only be a handful of blocks after pass 1 in any case. 
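The write path above now consumes a different zio_compress_data() contract: the function returns the physical size, where 0 means the block was all zeroes and a value equal to the input size means compression did not save enough to keep. A minimal self-contained sketch of that caller-side interpretation; fake_compress and decide_write are illustrative placeholders, not ZFS interfaces.

#include <stdio.h>
#include <string.h>

enum write_kind { WRITE_HOLE, WRITE_UNCOMPRESSED, WRITE_COMPRESSED };

/* placeholder compressor: detects all-zero input, otherwise reports "no gain" */
static size_t
fake_compress(const void *src, void *dst, size_t s_len)
{
	const unsigned char *p = src;
	size_t i;

	for (i = 0; i < s_len; i++)
		if (p[i] != 0)
			break;
	if (i == s_len)
		return (0);		/* all zeroes: caller writes a hole */
	memcpy(dst, src, s_len);
	return (s_len);			/* no savings: caller stores the data as-is */
}

static enum write_kind
decide_write(const void *src, void *dst, size_t lsize)
{
	size_t psize = fake_compress(src, dst, lsize);

	if (psize == 0)
		return (WRITE_HOLE);		/* becomes the interlock pipeline above */
	if (psize == lsize)
		return (WRITE_UNCOMPRESSED);
	return (WRITE_COMPRESSED);		/* psize < lsize: dst is the physical data */
}

int
main(void)
{
	unsigned char src[512] = { 0 }, dst[512];

	printf("%d\n", decide_write(src, dst, sizeof (src)));	/* 0: WRITE_HOLE */
	return (0);
}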
*/ - if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == csize && + if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize && pass > SYNC_PASS_REWRITE) { - ASSERT(csize != 0); - uint32_t gang_stages = zio->io_pipeline & ZIO_GANG_STAGES; + ASSERT(psize != 0); + enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES; zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages; zio->io_flags |= ZIO_FLAG_IO_REWRITE; } else { @@ -859,17 +999,38 @@ zio_write_bp_init(zio_t *zio) zio->io_pipeline = ZIO_WRITE_PIPELINE; } - if (csize == 0) { + if (psize == 0) { zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; } else { ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER); BP_SET_LSIZE(bp, lsize); - BP_SET_PSIZE(bp, csize); + BP_SET_PSIZE(bp, psize); BP_SET_COMPRESS(bp, compress); BP_SET_CHECKSUM(bp, zp->zp_checksum); BP_SET_TYPE(bp, zp->zp_type); BP_SET_LEVEL(bp, zp->zp_level); + BP_SET_DEDUP(bp, zp->zp_dedup); BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); + if (zp->zp_dedup) { + ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); + ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); + zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE; + } + } + + return (ZIO_PIPELINE_CONTINUE); +} + +static int +zio_free_bp_init(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + + if (zio->io_child_type == ZIO_CHILD_LOGICAL) { + if (BP_GET_DEDUP(bp)) + zio->io_pipeline = ZIO_DDT_FREE_PIPELINE; + else + arc_free(zio->io_spa, bp); } return (ZIO_PIPELINE_CONTINUE); @@ -882,16 +1043,18 @@ zio_write_bp_init(zio_t *zio) */ static void -zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q) +zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q, boolean_t cutinline) { + spa_t *spa = zio->io_spa; zio_type_t t = zio->io_type; + int flags = TQ_SLEEP | (cutinline ? TQ_FRONT : 0); /* - * If we're a config writer, the normal issue and interrupt threads - * may all be blocked waiting for the config lock. In this case, - * select the otherwise-unused taskq for ZIO_TYPE_NULL. + * If we're a config writer or a probe, the normal issue and + * interrupt threads may all be blocked waiting for the config lock. + * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL. */ - if (zio->io_flags & ZIO_FLAG_CONFIG_WRITER) + if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE)) t = ZIO_TYPE_NULL; /* @@ -900,8 +1063,16 @@ zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q) if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux) t = ZIO_TYPE_NULL; - (void) taskq_dispatch(zio->io_spa->spa_zio_taskq[t][q], - (task_func_t *)zio_execute, zio, TQ_SLEEP); + /* + * If this is a high priority I/O, then use the high priority taskq. + */ + if (zio->io_priority == ZIO_PRIORITY_NOW && + spa->spa_zio_taskq[t][q + 1] != NULL) + q++; + + ASSERT3U(q, <, ZIO_TASKQ_TYPES); + (void) taskq_dispatch(spa->spa_zio_taskq[t][q], + (task_func_t *)zio_execute, zio, flags); } static boolean_t @@ -920,7 +1091,7 @@ zio_taskq_member(zio_t *zio, enum zio_taskq_type q) static int zio_issue_async(zio_t *zio) { - zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE); + zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); return (ZIO_PIPELINE_STOP); } @@ -928,7 +1099,7 @@ zio_issue_async(zio_t *zio) void zio_interrupt(zio_t *zio) { - zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT); + zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE); } /* @@ -944,7 +1115,7 @@ zio_interrupt(zio_t *zio) * There's no locking on io_stage because there's no legitimate way * for multiple threads to be attempting to process the same I/O. 
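The pipeline rework that follows turns stage numbers into one-hot bit flags, so a pipeline is a bitmask, zio_execute() advances by shifting the current stage left until it lands on a bit present in the mask, and dispatch goes through zio_pipeline[highbit(stage) - 1]. That is also why "requeue at the stage before X" is spelled X >> 1 throughout this change. A small standalone sketch of the same walk, using invented stage names rather than the real zio stages:

#include <stdio.h>

/* invented one-hot stage bits in the style of the new enum zio_stage */
enum stage {
	STAGE_OPEN  = 1 << 0,
	STAGE_PREP  = 1 << 1,
	STAGE_ISSUE = 1 << 2,
	STAGE_DONE  = 1 << 3
};

static const char *stage_name[] = { "open", "prep", "issue", "done" };

/* 1-based index of the highest set bit, the way highbit() is used above */
static int
highbit(unsigned int v)
{
	int h = 0;

	while (v != 0) {
		h++;
		v >>= 1;
	}
	return (h);
}

int
main(void)
{
	unsigned int pipeline = STAGE_OPEN | STAGE_ISSUE | STAGE_DONE;
	unsigned int stage = STAGE_OPEN;

	while (stage < STAGE_DONE) {
		do {
			stage <<= 1;			/* advance one stage bit... */
		} while ((stage & pipeline) == 0);	/* ...until it is in this pipeline */

		/* dispatch by bit position, like zio_pipeline[highbit(stage) - 1](zio) */
		printf("running stage %s\n", stage_name[highbit(stage) - 1]);
	}
	return (0);
}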
*/ -static zio_pipe_stage_t *zio_pipeline[ZIO_STAGES]; +static zio_pipe_stage_t *zio_pipeline[]; void zio_execute(zio_t *zio) @@ -952,32 +1123,39 @@ zio_execute(zio_t *zio) zio->io_executor = curthread; while (zio->io_stage < ZIO_STAGE_DONE) { - uint32_t pipeline = zio->io_pipeline; - zio_stage_t stage = zio->io_stage; + enum zio_stage pipeline = zio->io_pipeline; + enum zio_stage stage = zio->io_stage; int rv; ASSERT(!MUTEX_HELD(&zio->io_lock)); + ASSERT(ISP2(stage)); + ASSERT(zio->io_stall == NULL); - while (((1U << ++stage) & pipeline) == 0) - continue; + do { + stage <<= 1; + } while ((stage & pipeline) == 0); ASSERT(stage <= ZIO_STAGE_DONE); - ASSERT(zio->io_stall == NULL); /* * If we are in interrupt context and this pipeline stage * will grab a config lock that is held across I/O, - * issue async to avoid deadlock. + * or may wait for an I/O that needs an interrupt thread + * to complete, issue async to avoid deadlock. + * + * For VDEV_IO_START, we cut in line so that the io will + * be sent to disk promptly. */ - if (((1U << stage) & ZIO_CONFIG_LOCK_BLOCKING_STAGES) && - zio->io_vd == NULL && + if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL && zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) { - zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE); + boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ? + zio_requeue_io_start_cut_in_line : B_FALSE; + zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut); return; } zio->io_stage = stage; - rv = zio_pipeline[stage](zio); + rv = zio_pipeline[highbit(stage) - 1](zio); if (rv == ZIO_PIPELINE_STOP) return; @@ -1019,17 +1197,16 @@ zio_nowait(zio_t *zio) { ASSERT(zio->io_executor == NULL); - if (zio->io_parent == NULL && zio->io_child_type == ZIO_CHILD_LOGICAL) { + if (zio->io_child_type == ZIO_CHILD_LOGICAL && + zio_unique_parent(zio) == NULL) { /* * This is a logical async I/O with no parent to wait for it. - * Attach it to the pool's global async root zio so that - * spa_unload() has a way of waiting for async I/O to finish. + * We add it to the spa_async_root_zio "Godfather" I/O which + * will ensure they complete prior to unloading the pool. */ spa_t *spa = zio->io_spa; - zio->io_async_root = B_TRUE; - mutex_enter(&spa->spa_async_root_lock); - spa->spa_async_root_count++; - mutex_exit(&spa->spa_async_root_lock); + + zio_add_child(spa->spa_async_zio_root, zio); } zio_execute(zio); @@ -1044,50 +1221,49 @@ zio_nowait(zio_t *zio) static void zio_reexecute(zio_t *pio) { - zio_t *zio, *zio_next; + zio_t *cio, *cio_next; + + ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL); + ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN); + ASSERT(pio->io_gang_leader == NULL); + ASSERT(pio->io_gang_tree == NULL); pio->io_flags = pio->io_orig_flags; pio->io_stage = pio->io_orig_stage; pio->io_pipeline = pio->io_orig_pipeline; pio->io_reexecute = 0; pio->io_error = 0; + for (int w = 0; w < ZIO_WAIT_TYPES; w++) + pio->io_state[w] = 0; for (int c = 0; c < ZIO_CHILD_TYPES; c++) pio->io_child_error[c] = 0; - if (IO_IS_ALLOCATING(pio)) { - /* - * Remember the failed bp so that the io_ready() callback - * can update its accounting upon reexecution. The block - * was already freed in zio_done(); we indicate this with - * a fill count of -1 so that zio_free() knows to skip it. - */ - blkptr_t *bp = pio->io_bp; - ASSERT(bp->blk_birth == 0 || bp->blk_birth == pio->io_txg); - bp->blk_fill = BLK_FILL_ALREADY_FREED; - pio->io_bp_orig = *bp; - BP_ZERO(bp); - } + if (IO_IS_ALLOCATING(pio)) + BP_ZERO(pio->io_bp); /* * As we reexecute pio's children, new children could be created. 
- * New children go to the head of the io_child list, however, + * New children go to the head of pio's io_child_list, however, * so we will (correctly) not reexecute them. The key is that - * the remainder of the io_child list, from 'zio_next' onward, - * cannot be affected by any side effects of reexecuting 'zio'. + * the remainder of pio's io_child_list, from 'cio_next' onward, + * cannot be affected by any side effects of reexecuting 'cio'. */ - for (zio = pio->io_child; zio != NULL; zio = zio_next) { - zio_next = zio->io_sibling_next; + for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) { + cio_next = zio_walk_children(pio); mutex_enter(&pio->io_lock); - pio->io_children[zio->io_child_type][ZIO_WAIT_READY]++; - pio->io_children[zio->io_child_type][ZIO_WAIT_DONE]++; + for (int w = 0; w < ZIO_WAIT_TYPES; w++) + pio->io_children[cio->io_child_type][w]++; mutex_exit(&pio->io_lock); - zio_reexecute(zio); + zio_reexecute(cio); } /* * Now that all children have been reexecuted, execute the parent. + * We don't reexecute "The Godfather" I/O here as it's the + * responsibility of the caller to wait on him. */ - zio_execute(pio); + if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) + zio_execute(pio); } void @@ -1103,14 +1279,17 @@ zio_suspend(spa_t *spa, zio_t *zio) mutex_enter(&spa->spa_suspend_lock); if (spa->spa_suspend_zio_root == NULL) - spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL, 0); + spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | + ZIO_FLAG_GODFATHER); spa->spa_suspended = B_TRUE; if (zio != NULL) { + ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); ASSERT(zio != spa->spa_suspend_zio_root); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); - ASSERT(zio->io_parent == NULL); + ASSERT(zio_unique_parent(zio) == NULL); ASSERT(zio->io_stage == ZIO_STAGE_DONE); zio_add_child(spa->spa_suspend_zio_root, zio); } @@ -1118,10 +1297,10 @@ zio_suspend(spa_t *spa, zio_t *zio) mutex_exit(&spa->spa_suspend_lock); } -void +int zio_resume(spa_t *spa) { - zio_t *pio, *zio; + zio_t *pio; /* * Reexecute all previously suspended i/o. @@ -1134,17 +1313,10 @@ zio_resume(spa_t *spa) mutex_exit(&spa->spa_suspend_lock); if (pio == NULL) - return; - - while ((zio = pio->io_child) != NULL) { - zio_remove_child(pio, zio); - zio->io_parent = NULL; - zio_reexecute(zio); - } - - ASSERT(pio->io_children[ZIO_CHILD_LOGICAL][ZIO_WAIT_DONE] == 0); + return (0); - (void) zio_wait(pio); + zio_reexecute(pio); + return (zio_wait(pio)); } void @@ -1251,10 +1423,16 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) * (Presently, nothing actually uses interior data checksums; * this is just good hygiene.) */ - if (gn != pio->io_logical->io_gang_tree) { + if (gn != pio->io_gang_leader->io_gang_tree) { zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), data, BP_GET_PSIZE(bp)); } + /* + * If we are here to damage data for testing purposes, + * leave the GBH alone so that we can detect the damage. 
+ */ + if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE) + zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; } else { zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority, @@ -1268,8 +1446,8 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) zio_t * zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) { - return (zio_free(pio, pio->io_spa, pio->io_txg, bp, - NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); + return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp, + ZIO_GANG_CHILD_FLAGS(pio))); } /* ARGSUSED */ @@ -1333,27 +1511,27 @@ zio_gang_tree_free(zio_gang_node_t **gnpp) } static void -zio_gang_tree_assemble(zio_t *lio, blkptr_t *bp, zio_gang_node_t **gnpp) +zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp) { zio_gang_node_t *gn = zio_gang_node_alloc(gnpp); - ASSERT(lio->io_logical == lio); + ASSERT(gio->io_gang_leader == gio); ASSERT(BP_IS_GANG(bp)); - zio_nowait(zio_read(lio, lio->io_spa, bp, gn->gn_gbh, + zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh, SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn, - lio->io_priority, ZIO_GANG_CHILD_FLAGS(lio), &lio->io_bookmark)); + gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); } static void zio_gang_tree_assemble_done(zio_t *zio) { - zio_t *lio = zio->io_logical; + zio_t *gio = zio->io_gang_leader; zio_gang_node_t *gn = zio->io_private; blkptr_t *bp = zio->io_bp; - ASSERT(zio->io_parent == lio); - ASSERT(zio->io_child == NULL); + ASSERT(gio == zio_unique_parent(zio)); + ASSERT(zio->io_child_count == 0); if (zio->io_error) return; @@ -1363,34 +1541,34 @@ zio_gang_tree_assemble_done(zio_t *zio) ASSERT(zio->io_data == gn->gn_gbh); ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); - ASSERT(gn->gn_gbh->zg_tail.zbt_magic == ZBT_MAGIC); + ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; if (!BP_IS_GANG(gbp)) continue; - zio_gang_tree_assemble(lio, gbp, &gn->gn_child[g]); + zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]); } } static void zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data) { - zio_t *lio = pio->io_logical; + zio_t *gio = pio->io_gang_leader; zio_t *zio; ASSERT(BP_IS_GANG(bp) == !!gn); - ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(lio->io_bp)); - ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == lio->io_gang_tree); + ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp)); + ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree); /* * If you're a gang header, your data is in gn->gn_gbh. * If you're a gang member, your data is in 'data' and gn == NULL. 
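For orientation on the walk used here: each gang header carries an array of block pointers, any of which may itself be a gang header, and zio_gang_tree_issue() recurses over gn_child[] to reach every member block. A toy depth-first walk over a hand-built two-level tree; the structures are simplified stand-ins, not zio_gang_node_t:

#include <stdio.h>

#define	NPTRS	3	/* stands in for SPA_GBH_NBLKPTRS */

struct gang_node {
	struct gang_node *child[NPTRS];	/* non-NULL where that pointer is itself a gang header */
	int member_id[NPTRS];		/* stands in for the member block pointers */
};

/* depth-first issue over the tree, in the spirit of zio_gang_tree_issue() */
static void
gang_issue(const struct gang_node *gn, void (*func)(int))
{
	for (int g = 0; g < NPTRS; g++) {
		if (gn->child[g] != NULL)
			gang_issue(gn->child[g], func);
		else
			func(gn->member_id[g]);
	}
}

static void
read_member(int id)
{
	printf("reading gang member %d\n", id);
}

int
main(void)
{
	struct gang_node inner = { { NULL, NULL, NULL }, { 4, 5, 6 } };
	struct gang_node root = { { NULL, &inner, NULL }, { 1, 0, 3 } };

	gang_issue(&root, read_member);
	return (0);
}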
*/ - zio = zio_gang_issue_func[lio->io_type](pio, bp, gn, data); + zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data); if (gn != NULL) { - ASSERT(gn->gn_gbh->zg_tail.zbt_magic == ZBT_MAGIC); + ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; @@ -1401,8 +1579,8 @@ zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data) } } - if (gn == lio->io_gang_tree) - ASSERT3P((char *)lio->io_data + lio->io_size, ==, data); + if (gn == gio->io_gang_tree) + ASSERT3P((char *)gio->io_data + gio->io_size, ==, data); if (zio != pio) zio_nowait(zio); @@ -1413,7 +1591,10 @@ zio_gang_assemble(zio_t *zio) { blkptr_t *bp = zio->io_bp; - ASSERT(BP_IS_GANG(bp) && zio == zio->io_logical); + ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL); + ASSERT(zio->io_child_type > ZIO_CHILD_GANG); + + zio->io_gang_leader = zio; zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree); @@ -1423,18 +1604,18 @@ zio_gang_assemble(zio_t *zio) static int zio_gang_issue(zio_t *zio) { - zio_t *lio = zio->io_logical; blkptr_t *bp = zio->io_bp; if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)) return (ZIO_PIPELINE_STOP); - ASSERT(BP_IS_GANG(bp) && zio == lio); + ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio); + ASSERT(zio->io_child_type > ZIO_CHILD_GANG); if (zio->io_child_error[ZIO_CHILD_GANG] == 0) - zio_gang_tree_issue(lio, lio->io_gang_tree, bp, lio->io_data); + zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data); else - zio_gang_tree_free(&lio->io_gang_tree); + zio_gang_tree_free(&zio->io_gang_tree); zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; @@ -1444,8 +1625,8 @@ zio_gang_issue(zio_t *zio) static void zio_write_gang_member_ready(zio_t *zio) { - zio_t *pio = zio->io_parent; - zio_t *lio = zio->io_logical; + zio_t *pio = zio_unique_parent(zio); + zio_t *gio = zio->io_gang_leader; dva_t *cdva = zio->io_bp->blk_dva; dva_t *pdva = pio->io_bp->blk_dva; uint64_t asize; @@ -1456,9 +1637,9 @@ zio_write_gang_member_ready(zio_t *zio) ASSERT(BP_IS_HOLE(&zio->io_bp_orig)); ASSERT(zio->io_child_type == ZIO_CHILD_GANG); - ASSERT3U(zio->io_prop.zp_ndvas, ==, lio->io_prop.zp_ndvas); - ASSERT3U(zio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(zio->io_bp)); - ASSERT3U(pio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(pio->io_bp)); + ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies); + ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp)); + ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp)); ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); mutex_enter(&pio->io_lock); @@ -1476,28 +1657,28 @@ zio_write_gang_block(zio_t *pio) { spa_t *spa = pio->io_spa; blkptr_t *bp = pio->io_bp; - zio_t *lio = pio->io_logical; + zio_t *gio = pio->io_gang_leader; zio_t *zio; zio_gang_node_t *gn, **gnpp; zio_gbh_phys_t *gbh; uint64_t txg = pio->io_txg; uint64_t resid = pio->io_size; uint64_t lsize; - int ndvas = lio->io_prop.zp_ndvas; - int gbh_ndvas = MIN(ndvas + 1, spa_max_replication(spa)); + int copies = gio->io_prop.zp_copies; + int gbh_copies = MIN(copies + 1, spa_max_replication(spa)); zio_prop_t zp; int error; - error = metaslab_alloc(spa, spa->spa_normal_class, SPA_GANGBLOCKSIZE, - bp, gbh_ndvas, txg, pio == lio ? NULL : lio->io_bp, + error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE, + bp, gbh_copies, txg, pio == gio ? 
NULL : gio->io_bp, METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER); if (error) { pio->io_error = error; return (ZIO_PIPELINE_CONTINUE); } - if (pio == lio) { - gnpp = &lio->io_gang_tree; + if (pio == gio) { + gnpp = &gio->io_gang_tree; } else { gnpp = pio->io_private; ASSERT(pio->io_ready == zio_write_gang_member_ready); @@ -1521,11 +1702,13 @@ zio_write_gang_block(zio_t *pio) SPA_MINBLOCKSIZE); ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid); - zp.zp_checksum = lio->io_prop.zp_checksum; + zp.zp_checksum = gio->io_prop.zp_checksum; zp.zp_compress = ZIO_COMPRESS_OFF; zp.zp_type = DMU_OT_NONE; zp.zp_level = 0; - zp.zp_ndvas = lio->io_prop.zp_ndvas; + zp.zp_copies = gio->io_prop.zp_copies; + zp.zp_dedup = 0; + zp.zp_dedup_verify = 0; zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g], (char *)pio->io_data + (pio->io_size - resid), lsize, &zp, @@ -1546,26 +1729,397 @@ zio_write_gang_block(zio_t *pio) /* * ========================================================================== - * Allocate and free blocks + * Dedup * ========================================================================== */ +static void +zio_ddt_child_read_done(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + ddt_entry_t *dde = zio->io_private; + ddt_phys_t *ddp; + zio_t *pio = zio_unique_parent(zio); + + mutex_enter(&pio->io_lock); + ddp = ddt_phys_select(dde, bp); + if (zio->io_error == 0) + ddt_phys_clear(ddp); /* this ddp doesn't need repair */ + if (zio->io_error == 0 && dde->dde_repair_data == NULL) + dde->dde_repair_data = zio->io_data; + else + zio_buf_free(zio->io_data, zio->io_size); + mutex_exit(&pio->io_lock); +} + +static int +zio_ddt_read_start(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + + ASSERT(BP_GET_DEDUP(bp)); + ASSERT(BP_GET_PSIZE(bp) == zio->io_size); + ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); + + if (zio->io_child_error[ZIO_CHILD_DDT]) { + ddt_t *ddt = ddt_select(zio->io_spa, bp); + ddt_entry_t *dde = ddt_repair_start(ddt, bp); + ddt_phys_t *ddp = dde->dde_phys; + ddt_phys_t *ddp_self = ddt_phys_select(dde, bp); + blkptr_t blk; + + ASSERT(zio->io_vsd == NULL); + zio->io_vsd = dde; + + if (ddp_self == NULL) + return (ZIO_PIPELINE_CONTINUE); + + for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { + if (ddp->ddp_phys_birth == 0 || ddp == ddp_self) + continue; + ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, + &blk); + zio_nowait(zio_read(zio, zio->io_spa, &blk, + zio_buf_alloc(zio->io_size), zio->io_size, + zio_ddt_child_read_done, dde, zio->io_priority, + ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE, + &zio->io_bookmark)); + } + return (ZIO_PIPELINE_CONTINUE); + } + + zio_nowait(zio_read(zio, zio->io_spa, bp, + zio->io_data, zio->io_size, NULL, NULL, zio->io_priority, + ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark)); + + return (ZIO_PIPELINE_CONTINUE); +} + +static int +zio_ddt_read_done(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + + if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)) + return (ZIO_PIPELINE_STOP); + + ASSERT(BP_GET_DEDUP(bp)); + ASSERT(BP_GET_PSIZE(bp) == zio->io_size); + ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); + + if (zio->io_child_error[ZIO_CHILD_DDT]) { + ddt_t *ddt = ddt_select(zio->io_spa, bp); + ddt_entry_t *dde = zio->io_vsd; + if (ddt == NULL) { + ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE); + return (ZIO_PIPELINE_CONTINUE); + } + if (dde == NULL) { + zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1; + zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); + return (ZIO_PIPELINE_STOP); + } + if (dde->dde_repair_data != NULL) { + 
bcopy(dde->dde_repair_data, zio->io_data, zio->io_size); + zio->io_child_error[ZIO_CHILD_DDT] = 0; + } + ddt_repair_done(ddt, dde); + zio->io_vsd = NULL; + } + + ASSERT(zio->io_vsd == NULL); + + return (ZIO_PIPELINE_CONTINUE); +} + +static boolean_t +zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) +{ + spa_t *spa = zio->io_spa; + + /* + * Note: we compare the original data, not the transformed data, + * because when zio->io_bp is an override bp, we will not have + * pushed the I/O transforms. That's an important optimization + * because otherwise we'd compress/encrypt all dmu_sync() data twice. + */ + for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { + zio_t *lio = dde->dde_lead_zio[p]; + + if (lio != NULL) { + return (lio->io_orig_size != zio->io_orig_size || + bcmp(zio->io_orig_data, lio->io_orig_data, + zio->io_orig_size) != 0); + } + } + + for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { + ddt_phys_t *ddp = &dde->dde_phys[p]; + + if (ddp->ddp_phys_birth != 0) { + arc_buf_t *abuf = NULL; + uint32_t aflags = ARC_WAIT; + blkptr_t blk = *zio->io_bp; + int error; + + ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); + + ddt_exit(ddt); + + error = arc_read_nolock(NULL, spa, &blk, + arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, + &aflags, &zio->io_bookmark); + + if (error == 0) { + if (arc_buf_size(abuf) != zio->io_orig_size || + bcmp(abuf->b_data, zio->io_orig_data, + zio->io_orig_size) != 0) + error = EEXIST; + VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1); + } + + ddt_enter(ddt); + return (error != 0); + } + } + + return (B_FALSE); +} + +static void +zio_ddt_child_write_ready(zio_t *zio) +{ + int p = zio->io_prop.zp_copies; + ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); + ddt_entry_t *dde = zio->io_private; + ddt_phys_t *ddp = &dde->dde_phys[p]; + zio_t *pio; + + if (zio->io_error) + return; + + ddt_enter(ddt); + + ASSERT(dde->dde_lead_zio[p] == zio); + + ddt_phys_fill(ddp, zio->io_bp); + + while ((pio = zio_walk_parents(zio)) != NULL) + ddt_bp_fill(ddp, pio->io_bp, zio->io_txg); + + ddt_exit(ddt); +} + +static void +zio_ddt_child_write_done(zio_t *zio) +{ + int p = zio->io_prop.zp_copies; + ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); + ddt_entry_t *dde = zio->io_private; + ddt_phys_t *ddp = &dde->dde_phys[p]; + + ddt_enter(ddt); + + ASSERT(ddp->ddp_refcnt == 0); + ASSERT(dde->dde_lead_zio[p] == zio); + dde->dde_lead_zio[p] = NULL; + + if (zio->io_error == 0) { + while (zio_walk_parents(zio) != NULL) + ddt_phys_addref(ddp); + } else { + ddt_phys_clear(ddp); + } + + ddt_exit(ddt); +} + +static void +zio_ddt_ditto_write_done(zio_t *zio) +{ + int p = DDT_PHYS_DITTO; + zio_prop_t *zp = &zio->io_prop; + blkptr_t *bp = zio->io_bp; + ddt_t *ddt = ddt_select(zio->io_spa, bp); + ddt_entry_t *dde = zio->io_private; + ddt_phys_t *ddp = &dde->dde_phys[p]; + ddt_key_t *ddk = &dde->dde_key; + + ddt_enter(ddt); + + ASSERT(ddp->ddp_refcnt == 0); + ASSERT(dde->dde_lead_zio[p] == zio); + dde->dde_lead_zio[p] = NULL; + + if (zio->io_error == 0) { + ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum)); + ASSERT(zp->zp_copies < SPA_DVAS_PER_BP); + ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp)); + if (ddp->ddp_phys_birth != 0) + ddt_phys_free(ddt, ddk, ddp, zio->io_txg); + ddt_phys_fill(ddp, bp); + } + + ddt_exit(ddt); +} +static int +zio_ddt_write(zio_t *zio) +{ + spa_t *spa = zio->io_spa; + blkptr_t *bp = zio->io_bp; + uint64_t txg = zio->io_txg; + zio_prop_t *zp = &zio->io_prop; + int p = zp->zp_copies; + int 
ditto_copies; + zio_t *cio = NULL; + zio_t *dio = NULL; + ddt_t *ddt = ddt_select(spa, bp); + ddt_entry_t *dde; + ddt_phys_t *ddp; + + ASSERT(BP_GET_DEDUP(bp)); + ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum); + ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override); + + ddt_enter(ddt); + dde = ddt_lookup(ddt, bp, B_TRUE); + ddp = &dde->dde_phys[p]; + + if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { + /* + * If we're using a weak checksum, upgrade to a strong checksum + * and try again. If we're already using a strong checksum, + * we can't resolve it, so just convert to an ordinary write. + * (And automatically e-mail a paper to Nature?) + */ + if (!zio_checksum_table[zp->zp_checksum].ci_dedup) { + zp->zp_checksum = spa_dedup_checksum(spa); + zio_pop_transforms(zio); + zio->io_stage = ZIO_STAGE_OPEN; + BP_ZERO(bp); + } else { + zp->zp_dedup = 0; + } + zio->io_pipeline = ZIO_WRITE_PIPELINE; + ddt_exit(ddt); + return (ZIO_PIPELINE_CONTINUE); + } + + ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp); + ASSERT(ditto_copies < SPA_DVAS_PER_BP); + + if (ditto_copies > ddt_ditto_copies_present(dde) && + dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) { + zio_prop_t czp = *zp; + + czp.zp_copies = ditto_copies; + + /* + * If we arrived here with an override bp, we won't have run + * the transform stack, so we won't have the data we need to + * generate a child i/o. So, toss the override bp and restart. + * This is safe, because using the override bp is just an + * optimization; and it's rare, so the cost doesn't matter. + */ + if (zio->io_bp_override) { + zio_pop_transforms(zio); + zio->io_stage = ZIO_STAGE_OPEN; + zio->io_pipeline = ZIO_WRITE_PIPELINE; + zio->io_bp_override = NULL; + BP_ZERO(bp); + ddt_exit(ddt); + return (ZIO_PIPELINE_CONTINUE); + } + + dio = zio_write(zio, spa, txg, bp, zio->io_orig_data, + zio->io_orig_size, &czp, NULL, + zio_ddt_ditto_write_done, dde, zio->io_priority, + ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); + + zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL); + dde->dde_lead_zio[DDT_PHYS_DITTO] = dio; + } + + if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) { + if (ddp->ddp_phys_birth != 0) + ddt_bp_fill(ddp, bp, txg); + if (dde->dde_lead_zio[p] != NULL) + zio_add_child(zio, dde->dde_lead_zio[p]); + else + ddt_phys_addref(ddp); + } else if (zio->io_bp_override) { + ASSERT(bp->blk_birth == txg); + ASSERT(BP_EQUAL(bp, zio->io_bp_override)); + ddt_phys_fill(ddp, bp); + ddt_phys_addref(ddp); + } else { + cio = zio_write(zio, spa, txg, bp, zio->io_orig_data, + zio->io_orig_size, zp, zio_ddt_child_write_ready, + zio_ddt_child_write_done, dde, zio->io_priority, + ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); + + zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL); + dde->dde_lead_zio[p] = cio; + } + + ddt_exit(ddt); + + if (cio) + zio_nowait(cio); + if (dio) + zio_nowait(dio); + + return (ZIO_PIPELINE_CONTINUE); +} + +static int +zio_ddt_free(zio_t *zio) +{ + spa_t *spa = zio->io_spa; + blkptr_t *bp = zio->io_bp; + ddt_t *ddt = ddt_select(spa, bp); + ddt_entry_t *dde; + ddt_phys_t *ddp; + + ASSERT(BP_GET_DEDUP(bp)); + ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); + + ddt_enter(ddt); + dde = ddt_lookup(ddt, bp, B_TRUE); + ddp = ddt_phys_select(dde, bp); + ddt_phys_decref(ddp); + ddt_exit(ddt); + + return (ZIO_PIPELINE_CONTINUE); +} + +/* + * ========================================================================== + * Allocate and free blocks + * ========================================================================== + */ 
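Summarizing how zio_ddt_write() above resolves: a dedup-verify collision drops back to an ordinary write (after upgrading a weak checksum to the pool's dedup checksum when possible), an entry already on disk or already being written is merely referenced, and only the first writer issues a real child write. A simplified decision sketch with booleans standing in for the ddt_entry_t state; the locking and ditto-copy handling are omitted:

#include <stdio.h>
#include <stdbool.h>

enum ddt_action {
	DDT_FALL_BACK_TO_WRITE,		/* verify found different data behind the same checksum */
	DDT_REFERENCE_EXISTING,		/* block already on disk: bump the refcount, fill the bp */
	DDT_WAIT_FOR_LEAD_IO,		/* another zio is writing it: become a child of that zio */
	DDT_ISSUE_CHILD_WRITE		/* first writer: do the real write, later writers reference it */
};

static enum ddt_action
ddt_write_action(bool verify_collision, bool phys_on_disk, bool lead_io_in_flight)
{
	if (verify_collision)
		return (DDT_FALL_BACK_TO_WRITE);
	if (phys_on_disk)
		return (DDT_REFERENCE_EXISTING);
	if (lead_io_in_flight)
		return (DDT_WAIT_FOR_LEAD_IO);
	return (DDT_ISSUE_CHILD_WRITE);
}

int
main(void)
{
	printf("%d\n", ddt_write_action(false, false, true));	/* prints 2 */
	return (0);
}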
static int zio_dva_allocate(zio_t *zio) { spa_t *spa = zio->io_spa; - metaslab_class_t *mc = spa->spa_normal_class; + metaslab_class_t *mc = spa_normal_class(spa); blkptr_t *bp = zio->io_bp; int error; + if (zio->io_gang_leader == NULL) { + ASSERT(zio->io_child_type > ZIO_CHILD_GANG); + zio->io_gang_leader = zio; + } + ASSERT(BP_IS_HOLE(bp)); ASSERT3U(BP_GET_NDVAS(bp), ==, 0); - ASSERT3U(zio->io_prop.zp_ndvas, >, 0); - ASSERT3U(zio->io_prop.zp_ndvas, <=, spa_max_replication(spa)); + ASSERT3U(zio->io_prop.zp_copies, >, 0); + ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); error = metaslab_alloc(spa, mc, zio->io_size, bp, - zio->io_prop.zp_ndvas, zio->io_txg, NULL, 0); + zio->io_prop.zp_copies, zio->io_txg, NULL, 0); if (error) { if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) @@ -1604,36 +2158,11 @@ zio_dva_claim(zio_t *zio) static void zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) { - spa_t *spa = zio->io_spa; - boolean_t now = !(zio->io_flags & ZIO_FLAG_IO_REWRITE); - ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); - - if (zio->io_bp == bp && !now) { - /* - * This is a rewrite for sync-to-convergence. - * We can't do a metaslab_free(NOW) because bp wasn't allocated - * during this sync pass, which means that metaslab_sync() - * already committed the allocation. - */ - ASSERT(DVA_EQUAL(BP_IDENTITY(bp), - BP_IDENTITY(&zio->io_bp_orig))); - ASSERT(spa_sync_pass(spa) > 1); - - if (BP_IS_GANG(bp) && gn == NULL) { - /* - * This is a gang leader whose gang header(s) we - * couldn't read now, so defer the free until later. - * The block should still be intact because without - * the headers, we'd never even start the rewrite. - */ - bplist_enqueue_deferred(&spa->spa_sync_bplist, bp); - return; - } - } + ASSERT(zio->io_bp_override == NULL); if (!BP_IS_HOLE(bp)) - metaslab_free(spa, bp, bp->blk_birth, now); + metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE); if (gn != NULL) { for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { @@ -1647,25 +2176,31 @@ zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) * Try to allocate an intent log block. Return 0 on success, errno on failure. */ int -zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, blkptr_t *old_bp, - uint64_t txg) +zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp, + uint64_t size, boolean_t use_slog) { - int error; + int error = 1; + + ASSERT(txg > spa_syncing_txg(spa)); - error = metaslab_alloc(spa, spa->spa_log_class, size, - new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID); + if (use_slog) + error = metaslab_alloc(spa, spa_log_class(spa), size, + new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID); if (error) - error = metaslab_alloc(spa, spa->spa_normal_class, size, + error = metaslab_alloc(spa, spa_normal_class(spa), size, new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID); if (error == 0) { BP_SET_LSIZE(new_bp, size); BP_SET_PSIZE(new_bp, size); BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); - BP_SET_CHECKSUM(new_bp, ZIO_CHECKSUM_ZILOG); + BP_SET_CHECKSUM(new_bp, + spa_version(spa) >= SPA_VERSION_SLIM_ZIL + ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG); BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); BP_SET_LEVEL(new_bp, 0); + BP_SET_DEDUP(new_bp, 0); BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); } @@ -1673,15 +2208,15 @@ zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, blkptr_t *old_bp, } /* - * Free an intent log block. We know it can't be a gang block, so there's - * nothing to do except metaslab_free() it. 
+ * Free an intent log block. */ void -zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg) +zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp) { + ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG); ASSERT(!BP_IS_GANG(bp)); - metaslab_free(spa, bp, txg, B_FALSE); + zio_free(spa, txg, bp); } /* @@ -1689,72 +2224,6 @@ zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg) * Read and write to physical devices * ========================================================================== */ - -static void -zio_vdev_io_probe_done(zio_t *zio) -{ - zio_t *dio; - vdev_t *vd = zio->io_private; - - mutex_enter(&vd->vdev_probe_lock); - ASSERT(vd->vdev_probe_zio == zio); - vd->vdev_probe_zio = NULL; - mutex_exit(&vd->vdev_probe_lock); - - while ((dio = zio->io_delegate_list) != NULL) { - zio->io_delegate_list = dio->io_delegate_next; - dio->io_delegate_next = NULL; - if (!vdev_accessible(vd, dio)) - dio->io_error = ENXIO; - zio_execute(dio); - } -} - -/* - * Probe the device to determine whether I/O failure is specific to this - * zio (e.g. a bad sector) or affects the entire vdev (e.g. unplugged). - */ -static int -zio_vdev_io_probe(zio_t *zio) -{ - vdev_t *vd = zio->io_vd; - zio_t *pio = NULL; - boolean_t created_pio = B_FALSE; - - /* - * Don't probe the probe. - */ - if (zio->io_flags & ZIO_FLAG_PROBE) - return (ZIO_PIPELINE_CONTINUE); - - /* - * To prevent 'probe storms' when a device fails, we create - * just one probe i/o at a time. All zios that want to probe - * this vdev will join the probe zio's io_delegate_list. - */ - mutex_enter(&vd->vdev_probe_lock); - - if ((pio = vd->vdev_probe_zio) == NULL) { - vd->vdev_probe_zio = pio = zio_root(zio->io_spa, - zio_vdev_io_probe_done, vd, ZIO_FLAG_CANFAIL); - created_pio = B_TRUE; - vd->vdev_probe_wanted = B_TRUE; - spa_async_request(zio->io_spa, SPA_ASYNC_PROBE); - } - - zio->io_delegate_next = pio->io_delegate_list; - pio->io_delegate_list = zio; - - mutex_exit(&vd->vdev_probe_lock); - - if (created_pio) { - zio_nowait(vdev_probe(vd, pio)); - zio_nowait(pio); - } - - return (ZIO_PIPELINE_STOP); -} - static int zio_vdev_io_start(zio_t *zio) { @@ -1790,13 +2259,35 @@ zio_vdev_io_start(zio_t *zio) ASSERT(P2PHASE(zio->io_offset, align) == 0); ASSERT(P2PHASE(zio->io_size, align) == 0); - ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE)); + ASSERT(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa)); + + /* + * If this is a repair I/O, and there's no self-healing involved -- + * that is, we're just resilvering what we expect to resilver -- + * then don't do the I/O unless zio's txg is actually in vd's DTL. + * This prevents spurious resilvering with nested replication. + * For example, given a mirror of mirrors, (A+B)+(C+D), if only + * A is out of date, we'll read from C+D, then use the data to + * resilver A+B -- but we don't actually want to resilver B, just A. + * The top-level mirror has no way to know this, so instead we just + * discard unnecessary repairs as we work our way down the vdev tree. + * The same logic applies to any form of nested replication: + * ditto + mirror, RAID-Z + replacing, etc. This covers them all. 
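The DTL test above is the entire mechanism for discarding unneeded nested repairs: a repair write is accepted only when this vdev's dirty-time log actually covers the txg being repaired. A toy version with the DTL reduced to an array of inclusive txg ranges (real DTLs are space maps):

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

/* a toy DTL: inclusive txg ranges during which this vdev missed writes */
struct txg_range {
	uint64_t start;
	uint64_t end;
};

static bool
dtl_contains(const struct txg_range *dtl, int n, uint64_t txg)
{
	for (int i = 0; i < n; i++)
		if (txg >= dtl[i].start && txg <= dtl[i].end)
			return (true);
	return (false);
}

int
main(void)
{
	struct txg_range dtl[] = { { 100, 120 }, { 300, 310 } };
	uint64_t repair_txg = 200;

	/* only take the repair write if this vdev actually missed that txg */
	if (!dtl_contains(dtl, 2, repair_txg))
		printf("txg %llu not in DTL: bypass the repair\n",
		    (unsigned long long)repair_txg);
	return (0);
}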
+ */ + if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && + !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && + zio->io_txg != 0 && /* not a delegated i/o */ + !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { + ASSERT(zio->io_type == ZIO_TYPE_WRITE); + zio_vdev_io_bypass(zio); + return (ZIO_PIPELINE_CONTINUE); + } if (vd->vdev_ops->vdev_op_leaf && (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) { if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0) - return (ZIO_PIPELINE_STOP); + return (ZIO_PIPELINE_CONTINUE); if ((zio = vdev_queue_io(zio)) == NULL) return (ZIO_PIPELINE_STOP); @@ -1806,7 +2297,6 @@ zio_vdev_io_start(zio_t *zio) zio_interrupt(zio); return (ZIO_PIPELINE_STOP); } - } return (vd->vdev_ops->vdev_op_io_start(zio)); @@ -1832,7 +2322,8 @@ zio_vdev_io_done(zio_t *zio) vdev_cache_write(zio); if (zio_injection_enabled && zio->io_error == 0) - zio->io_error = zio_handle_device_injection(vd, EIO); + zio->io_error = zio_handle_device_injection(vd, + zio, EIO); if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_label_injection(zio, EIO); @@ -1849,11 +2340,37 @@ zio_vdev_io_done(zio_t *zio) ops->vdev_op_io_done(zio); if (unexpected_error) - return (zio_vdev_io_probe(zio)); + VERIFY(vdev_probe(vd, zio) == NULL); return (ZIO_PIPELINE_CONTINUE); } +/* + * For non-raidz ZIOs, we can just copy aside the bad data read from the + * disk, and use that to finish the checksum ereport later. + */ +static void +zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr, + const void *good_buf) +{ + /* no processing needed */ + zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE); +} + +/*ARGSUSED*/ +void +zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored) +{ + void *buf = zio_buf_alloc(zio->io_size); + + bcopy(zio->io_data, buf, zio->io_size); + + zcr->zcr_cbinfo = zio->io_size; + zcr->zcr_cbdata = buf; + zcr->zcr_finish = zio_vsd_default_cksum_finish; + zcr->zcr_free = zio_buf_free; +} + static int zio_vdev_io_assess(zio_t *zio) { @@ -1866,7 +2383,7 @@ zio_vdev_io_assess(zio_t *zio) spa_config_exit(zio->io_spa, SCL_ZIO, zio); if (zio->io_vsd != NULL) { - zio->io_vsd_free(zio); + zio->io_vsd_ops->vsd_free(zio); zio->io_vsd = NULL; } @@ -1875,6 +2392,9 @@ zio_vdev_io_assess(zio_t *zio) /* * If the I/O failed, determine whether we should attempt to retry it. + * + * On retry, we cut in line in the issue queue, since we don't want + * compression/checksumming/etc. work to prevent our (cheap) IO reissue. 
*/ if (zio->io_error && vd == NULL && !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) { @@ -1883,8 +2403,9 @@ zio_vdev_io_assess(zio_t *zio) zio->io_error = 0; zio->io_flags |= ZIO_FLAG_IO_RETRY | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE; - zio->io_stage = ZIO_STAGE_VDEV_IO_START - 1; - zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE); + zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1; + zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, + zio_requeue_io_start_cut_in_line); return (ZIO_PIPELINE_STOP); } @@ -1916,7 +2437,7 @@ zio_vdev_io_reissue(zio_t *zio) ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); ASSERT(zio->io_error == 0); - zio->io_stage--; + zio->io_stage >>= 1; } void @@ -1924,7 +2445,7 @@ zio_vdev_io_redone(zio_t *zio) { ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); - zio->io_stage--; + zio->io_stage >>= 1; } void @@ -1934,7 +2455,7 @@ zio_vdev_io_bypass(zio_t *zio) ASSERT(zio->io_error == 0); zio->io_flags |= ZIO_FLAG_IO_BYPASS; - zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS - 1; + zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1; } /* @@ -1976,9 +2497,12 @@ zio_checksum_generate(zio_t *zio) static int zio_checksum_verify(zio_t *zio) { + zio_bad_cksum_t info; blkptr_t *bp = zio->io_bp; int error; + ASSERT(zio->io_vd != NULL); + if (bp == NULL) { /* * This is zio_read_phys(). @@ -1990,11 +2514,12 @@ zio_checksum_verify(zio_t *zio) ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL); } - if ((error = zio_checksum_error(zio)) != 0) { + if ((error = zio_checksum_error(zio, &info)) != 0) { zio->io_error = error; if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { - zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, - zio->io_spa, zio->io_vd, zio, 0, 0); + zfs_ereport_start_checksum(zio->io_spa, + zio->io_vd, zio, zio->io_offset, + zio->io_size, NULL, &info); } } @@ -2007,7 +2532,7 @@ zio_checksum_verify(zio_t *zio) void zio_checksum_verified(zio_t *zio) { - zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY); + zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; } /* @@ -2045,13 +2570,13 @@ static int zio_ready(zio_t *zio) { blkptr_t *bp = zio->io_bp; - zio_t *pio = zio->io_parent; + zio_t *pio, *pio_next; - if (zio->io_ready) { - if (BP_IS_GANG(bp) && - zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY)) - return (ZIO_PIPELINE_STOP); + if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) || + zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY)) + return (ZIO_PIPELINE_STOP); + if (zio->io_ready) { ASSERT(IO_IS_ALLOCATING(zio)); ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); @@ -2065,8 +2590,35 @@ zio_ready(zio_t *zio) if (zio->io_error) zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; - if (pio != NULL) + mutex_enter(&zio->io_lock); + zio->io_state[ZIO_WAIT_READY] = 1; + pio = zio_walk_parents(zio); + mutex_exit(&zio->io_lock); + + /* + * As we notify zio's parents, new parents could be added. + * New parents go to the head of zio's io_parent_list, however, + * so we will (correctly) not notify them. The remainder of zio's + * io_parent_list, from 'pio_next' onward, cannot change because + * all parents must wait for us to be done before they can be done. 
+ */ + for (; pio != NULL; pio = pio_next) { + pio_next = zio_walk_parents(zio); zio_notify_parent(pio, zio, ZIO_WAIT_READY); + } + + if (zio->io_flags & ZIO_FLAG_NODATA) { + if (BP_IS_GANG(bp)) { + zio->io_flags &= ~ZIO_FLAG_NODATA; + } else { + ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE); + zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; + } + } + + if (zio_injection_enabled && + zio->io_spa->spa_syncing_txg == zio->io_txg) + zio_handle_ignored_writes(zio); return (ZIO_PIPELINE_CONTINUE); } @@ -2075,18 +2627,19 @@ static int zio_done(zio_t *zio) { spa_t *spa = zio->io_spa; - zio_t *pio = zio->io_parent; zio_t *lio = zio->io_logical; blkptr_t *bp = zio->io_bp; vdev_t *vd = zio->io_vd; uint64_t psize = zio->io_size; + zio_t *pio, *pio_next; /* - * If our of children haven't all completed, + * If our children haven't all completed, * wait for them and then repeat this pipeline stage. */ if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) || zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) || + zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) || zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)) return (ZIO_PIPELINE_STOP); @@ -2097,23 +2650,51 @@ zio_done(zio_t *zio) if (bp != NULL) { ASSERT(bp->blk_pad[0] == 0); ASSERT(bp->blk_pad[1] == 0); - ASSERT(bp->blk_pad[2] == 0); ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 || - (pio != NULL && bp == pio->io_bp)); + (bp == zio_unique_parent(zio)->io_bp)); if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && + zio->io_bp_override == NULL && !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { ASSERT(!BP_SHOULD_BYTESWAP(bp)); - ASSERT3U(zio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(bp)); + ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp)); ASSERT(BP_COUNT_GANG(bp) == 0 || (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp))); } } /* - * If there were child vdev or gang errors, they apply to us now. + * If there were child vdev/gang/ddt errors, they apply to us now. */ zio_inherit_child_errors(zio, ZIO_CHILD_VDEV); zio_inherit_child_errors(zio, ZIO_CHILD_GANG); + zio_inherit_child_errors(zio, ZIO_CHILD_DDT); + + /* + * If the I/O on the transformed data was successful, generate any + * checksum reports now while we still have the transformed data. + */ + if (zio->io_error == 0) { + while (zio->io_cksum_report != NULL) { + zio_cksum_report_t *zcr = zio->io_cksum_report; + uint64_t align = zcr->zcr_align; + uint64_t asize = P2ROUNDUP(psize, align); + char *abuf = zio->io_data; + + if (asize != psize) { + abuf = zio_buf_alloc(asize); + bcopy(zio->io_data, abuf, psize); + bzero(abuf + psize, asize - psize); + } + + zio->io_cksum_report = zcr->zcr_next; + zcr->zcr_next = NULL; + zcr->zcr_finish(zcr, abuf); + zfs_ereport_free_checksum(zcr); + + if (asize != psize) + zio_buf_free(abuf, asize); + } + } zio_pop_transforms(zio); /* note: may set zio->io_error */ @@ -2129,8 +2710,9 @@ zio_done(zio_t *zio) if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0); - if ((zio->io_error == EIO || - !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) && zio == lio) { + if ((zio->io_error == EIO || !(zio->io_flags & + (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) && + zio == lio) { /* * For logical I/O requests, tell the SPA to log the * error and generate a logical data ereport. @@ -2147,21 +2729,33 @@ zio_done(zio_t *zio) * propagate all the way to the root via zio_notify_parent(). 
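The checksum-report code a few lines above pads the transformed data out to the report's alignment before calling zcr_finish(), so the callback always receives a whole, aligned buffer with a zeroed tail. A standalone demonstration of that rounding and padding, using made-up sizes:

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#define	P2ROUNDUP(x, align)	(-(-(x) & -(align)))	/* round x up to a multiple of align (power of two) */

int
main(void)
{
	uint64_t psize = 1000, align = 512;
	uint64_t asize = P2ROUNDUP(psize, align);	/* 1024 */
	char *abuf = malloc(asize);

	if (abuf == NULL)
		return (1);
	memset(abuf, 0xab, psize);			/* stand-in for the transformed I/O data */
	memset(abuf + psize, 0, asize - psize);		/* zero the pad, as bzero() does above */
	printf("psize=%llu asize=%llu\n",
	    (unsigned long long)psize, (unsigned long long)asize);
	free(abuf);
	return (0);
}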
*/ ASSERT(vd == NULL && bp != NULL); + ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); - if (IO_IS_ALLOCATING(zio)) + if (IO_IS_ALLOCATING(zio) && + !(zio->io_flags & ZIO_FLAG_CANFAIL)) { if (zio->io_error != ENOSPC) zio->io_reexecute |= ZIO_REEXECUTE_NOW; else zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; + } if ((zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_FREE) && zio->io_error == ENXIO && + spa_load_state(spa) == SPA_LOAD_NONE && spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE) zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute) zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; + + /* + * Here is a possibly good place to attempt to do + * either combinatorial reconstruction or error correction + * based on checksums. It also might be a good place + * to send out preliminary ereports before we suspend + * processing. + */ } /* @@ -2172,6 +2766,20 @@ zio_done(zio_t *zio) */ zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); + if ((zio->io_error || zio->io_reexecute) && + IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio && + !(zio->io_flags & ZIO_FLAG_IO_REWRITE)) + zio_dva_unallocate(zio, zio->io_gang_tree, bp); + + zio_gang_tree_free(&zio->io_gang_tree); + + /* + * Godfather I/Os should never suspend. + */ + if ((zio->io_flags & ZIO_FLAG_GODFATHER) && + (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) + zio->io_reexecute = 0; + if (zio->io_reexecute) { /* * This is a logical I/O that wants to reexecute. @@ -2188,17 +2796,37 @@ zio_done(zio_t *zio) */ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); - if (IO_IS_ALLOCATING(zio)) - zio_dva_unallocate(zio, zio->io_gang_tree, bp); + zio->io_gang_leader = NULL; - zio_gang_tree_free(&zio->io_gang_tree); + mutex_enter(&zio->io_lock); + zio->io_state[ZIO_WAIT_DONE] = 1; + mutex_exit(&zio->io_lock); + + /* + * "The Godfather" I/O monitors its children but is + * not a true parent to them. It will track them through + * the pipeline but severs its ties whenever they get into + * trouble (e.g. suspended). This allows "The Godfather" + * I/O to return status without blocking. + */ + for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { + zio_link_t *zl = zio->io_walk_link; + pio_next = zio_walk_parents(zio); + + if ((pio->io_flags & ZIO_FLAG_GODFATHER) && + (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) { + zio_remove_child(pio, zio, zl); + zio_notify_parent(pio, zio, ZIO_WAIT_DONE); + } + } - if (pio != NULL) { + if ((pio = zio_unique_parent(zio)) != NULL) { /* * We're not a root i/o, so there's nothing to do * but notify our parent. Don't propagate errors * upward since we haven't permanently failed yet. */ + ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE; zio_notify_parent(pio, zio, ZIO_WAIT_DONE); } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { @@ -2219,20 +2847,37 @@ zio_done(zio_t *zio) return (ZIO_PIPELINE_STOP); } - ASSERT(zio->io_child == NULL); + ASSERT(zio->io_child_count == 0); ASSERT(zio->io_reexecute == 0); ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); + /* + * Report any checksum errors, since the I/O is complete. 
+ */ + while (zio->io_cksum_report != NULL) { + zio_cksum_report_t *zcr = zio->io_cksum_report; + zio->io_cksum_report = zcr->zcr_next; + zcr->zcr_next = NULL; + zcr->zcr_finish(zcr, NULL); + zfs_ereport_free_checksum(zcr); + } + + /* + * It is the responsibility of the done callback to ensure that this + * particular zio is no longer discoverable for adoption, and as + * such, cannot acquire any new parents. + */ if (zio->io_done) zio->io_done(zio); - zio_gang_tree_free(&zio->io_gang_tree); - - ASSERT(zio->io_delegate_list == NULL); - ASSERT(zio->io_delegate_next == NULL); + mutex_enter(&zio->io_lock); + zio->io_state[ZIO_WAIT_DONE] = 1; + mutex_exit(&zio->io_lock); - if (pio != NULL) { - zio_remove_child(pio, zio); + for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { + zio_link_t *zl = zio->io_walk_link; + pio_next = zio_walk_parents(zio); + zio_remove_child(pio, zio, zl); zio_notify_parent(pio, zio, ZIO_WAIT_DONE); } @@ -2253,12 +2898,17 @@ zio_done(zio_t *zio) * I/O pipeline definition * ========================================================================== */ -static zio_pipe_stage_t *zio_pipeline[ZIO_STAGES] = { +static zio_pipe_stage_t *zio_pipeline[] = { NULL, - zio_issue_async, zio_read_bp_init, + zio_free_bp_init, + zio_issue_async, zio_write_bp_init, zio_checksum_generate, + zio_ddt_read_start, + zio_ddt_read_done, + zio_ddt_write, + zio_ddt_free, zio_gang_assemble, zio_gang_issue, zio_dva_allocate, diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/zio_checksum.c b/external/cddl/osnet/dist/uts/common/fs/zfs/zio_checksum.c index bf7fe733fe0c8..699e5c87605ee 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/zio_checksum.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/zio_checksum.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -27,6 +27,7 @@ #include #include #include +#include /* * Checksum vectors. @@ -49,13 +50,13 @@ * we want the ability to take advantage of that hardware. * * Of course, we don't want a checksum upgrade to invalidate existing - * data, so we store the checksum *function* in five bits of the DVA. - * This gives us room for up to 32 different checksum functions. + * data, so we store the checksum *function* in eight bits of the bp. + * This gives us room for up to 256 different checksum functions. * * When writing a block, we always checksum it with the latest-and-greatest * checksum function of the appropriate strength. When reading a block, * we compare the expected checksum against the actual checksum, which we - * compute via the checksum function specified in the DVA encoding. + * compute via the checksum function specified by BP_GET_CHECKSUM(bp). 
*/ /*ARGSUSED*/ @@ -66,19 +67,20 @@ zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp) } zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { - {{NULL, NULL}, 0, 0, "inherit"}, - {{NULL, NULL}, 0, 0, "on"}, - {{zio_checksum_off, zio_checksum_off}, 0, 0, "off"}, - {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, "label"}, - {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, "gang_header"}, - {{fletcher_2_native, fletcher_2_byteswap}, 0, 1, "zilog"}, - {{fletcher_2_native, fletcher_2_byteswap}, 0, 0, "fletcher2"}, - {{fletcher_4_native, fletcher_4_byteswap}, 1, 0, "fletcher4"}, - {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 0, "SHA256"}, + {{NULL, NULL}, 0, 0, 0, "inherit"}, + {{NULL, NULL}, 0, 0, 0, "on"}, + {{zio_checksum_off, zio_checksum_off}, 0, 0, 0, "off"}, + {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, 0, "label"}, + {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, 0, "gang_header"}, + {{fletcher_2_native, fletcher_2_byteswap}, 0, 1, 0, "zilog"}, + {{fletcher_2_native, fletcher_2_byteswap}, 0, 0, 0, "fletcher2"}, + {{fletcher_4_native, fletcher_4_byteswap}, 1, 0, 0, "fletcher4"}, + {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 0, 1, "sha256"}, + {{fletcher_4_native, fletcher_4_byteswap}, 0, 1, 0, "zilog2"}, }; -uint8_t -zio_checksum_select(uint8_t child, uint8_t parent) +enum zio_checksum +zio_checksum_select(enum zio_checksum child, enum zio_checksum parent) { ASSERT(child < ZIO_CHECKSUM_FUNCTIONS); ASSERT(parent < ZIO_CHECKSUM_FUNCTIONS); @@ -93,6 +95,29 @@ zio_checksum_select(uint8_t child, uint8_t parent) return (child); } +enum zio_checksum +zio_checksum_dedup_select(spa_t *spa, enum zio_checksum child, + enum zio_checksum parent) +{ + ASSERT((child & ZIO_CHECKSUM_MASK) < ZIO_CHECKSUM_FUNCTIONS); + ASSERT((parent & ZIO_CHECKSUM_MASK) < ZIO_CHECKSUM_FUNCTIONS); + ASSERT(parent != ZIO_CHECKSUM_INHERIT && parent != ZIO_CHECKSUM_ON); + + if (child == ZIO_CHECKSUM_INHERIT) + return (parent); + + if (child == ZIO_CHECKSUM_ON) + return (spa_dedup_checksum(spa)); + + if (child == (ZIO_CHECKSUM_ON | ZIO_CHECKSUM_VERIFY)) + return (spa_dedup_checksum(spa) | ZIO_CHECKSUM_VERIFY); + + ASSERT(zio_checksum_table[child & ZIO_CHECKSUM_MASK].ci_dedup || + (child & ZIO_CHECKSUM_VERIFY) || child == ZIO_CHECKSUM_OFF); + + return (child); +} + /* * Set the external verifier for a gang block based on , * a tuple which is guaranteed to be unique for the life of the pool. 
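zio_checksum_dedup_select() above maps the dedup property onto a concrete checksum: inherit defers to the parent, on selects the pool-wide dedup checksum, on plus verify carries the verify bit through, and an explicit setting is passed along (it must be dedup-capable, verified, or off). A compact restatement with invented enum values and verify-flag encoding, not the real ones:

#include <stdio.h>

enum cksum { CK_INHERIT, CK_ON, CK_OFF, CK_FLETCHER4, CK_SHA256 };
#define	CK_VERIFY	0x100	/* invented stand-in for ZIO_CHECKSUM_VERIFY */
#define	CK_MASK		0xff

static int pool_dedup_checksum = CK_SHA256;	/* stand-in for spa_dedup_checksum(spa) */

static int
dedup_checksum_select(int child, int parent)
{
	if (child == CK_INHERIT)
		return (parent);
	if (child == CK_ON)
		return (pool_dedup_checksum);
	if (child == (CK_ON | CK_VERIFY))
		return (pool_dedup_checksum | CK_VERIFY);
	return (child);		/* explicit algorithm, possibly with the verify bit */
}

int
main(void)
{
	int prop = dedup_checksum_select(CK_ON | CK_VERIFY, CK_SHA256);

	printf("algorithm=%d verify=%d\n", prop & CK_MASK, (prop & CK_VERIFY) != 0);
	return (0);
}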
@@ -101,7 +126,7 @@ static void zio_checksum_gang_verifier(zio_cksum_t *zcp, blkptr_t *bp) { dva_t *dva = BP_IDENTITY(bp); - uint64_t txg = bp->blk_birth; + uint64_t txg = BP_PHYSICAL_BIRTH(bp); ASSERT(BP_IS_GANG(bp)); @@ -128,47 +153,79 @@ zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, { blkptr_t *bp = zio->io_bp; uint64_t offset = zio->io_offset; - zio_block_tail_t *zbt = (zio_block_tail_t *)((char *)data + size) - 1; zio_checksum_info_t *ci = &zio_checksum_table[checksum]; - zio_cksum_t zbt_cksum; + zio_cksum_t cksum; ASSERT((uint_t)checksum < ZIO_CHECKSUM_FUNCTIONS); ASSERT(ci->ci_func[0] != NULL); - if (ci->ci_zbt) { + if (ci->ci_eck) { + zio_eck_t *eck; + + if (checksum == ZIO_CHECKSUM_ZILOG2) { + zil_chain_t *zilc = data; + + size = P2ROUNDUP_TYPED(zilc->zc_nused, ZIL_MIN_BLKSZ, + uint64_t); + eck = &zilc->zc_eck; + } else { + eck = (zio_eck_t *)((char *)data + size) - 1; + } if (checksum == ZIO_CHECKSUM_GANG_HEADER) - zio_checksum_gang_verifier(&zbt->zbt_cksum, bp); + zio_checksum_gang_verifier(&eck->zec_cksum, bp); else if (checksum == ZIO_CHECKSUM_LABEL) - zio_checksum_label_verifier(&zbt->zbt_cksum, offset); + zio_checksum_label_verifier(&eck->zec_cksum, offset); else - bp->blk_cksum = zbt->zbt_cksum; - zbt->zbt_magic = ZBT_MAGIC; - ci->ci_func[0](data, size, &zbt_cksum); - zbt->zbt_cksum = zbt_cksum; + bp->blk_cksum = eck->zec_cksum; + eck->zec_magic = ZEC_MAGIC; + ci->ci_func[0](data, size, &cksum); + eck->zec_cksum = cksum; } else { ci->ci_func[0](data, size, &bp->blk_cksum); } } int -zio_checksum_error(zio_t *zio) +zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info) { blkptr_t *bp = zio->io_bp; uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum : (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); int byteswap; - void *data = zio->io_data; + int error; uint64_t size = (bp == NULL ? zio->io_size : (BP_IS_GANG(bp) ? 
SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp))); uint64_t offset = zio->io_offset; - zio_block_tail_t *zbt = (zio_block_tail_t *)((char *)data + size) - 1; + void *data = zio->io_data; zio_checksum_info_t *ci = &zio_checksum_table[checksum]; zio_cksum_t actual_cksum, expected_cksum, verifier; if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL) return (EINVAL); - if (ci->ci_zbt) { + if (ci->ci_eck) { + zio_eck_t *eck; + + if (checksum == ZIO_CHECKSUM_ZILOG2) { + zil_chain_t *zilc = data; + uint64_t nused; + + eck = &zilc->zc_eck; + if (eck->zec_magic == ZEC_MAGIC) + nused = zilc->zc_nused; + else if (eck->zec_magic == BSWAP_64(ZEC_MAGIC)) + nused = BSWAP_64(zilc->zc_nused); + else + return (ECKSUM); + + if (nused > size) + return (ECKSUM); + + size = P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t); + } else { + eck = (zio_eck_t *)((char *)data + size) - 1; + } + if (checksum == ZIO_CHECKSUM_GANG_HEADER) zio_checksum_gang_verifier(&verifier, bp); else if (checksum == ZIO_CHECKSUM_LABEL) @@ -176,15 +233,15 @@ zio_checksum_error(zio_t *zio) else verifier = bp->blk_cksum; - byteswap = (zbt->zbt_magic == BSWAP_64(ZBT_MAGIC)); + byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC)); if (byteswap) byteswap_uint64_array(&verifier, sizeof (zio_cksum_t)); - expected_cksum = zbt->zbt_cksum; - zbt->zbt_cksum = verifier; + expected_cksum = eck->zec_cksum; + eck->zec_cksum = verifier; ci->ci_func[byteswap](data, size, &actual_cksum); - zbt->zbt_cksum = expected_cksum; + eck->zec_cksum = expected_cksum; if (byteswap) byteswap_uint64_array(&expected_cksum, @@ -196,11 +253,22 @@ zio_checksum_error(zio_t *zio) ci->ci_func[byteswap](data, size, &actual_cksum); } + info->zbc_expected = expected_cksum; + info->zbc_actual = actual_cksum; + info->zbc_checksum_name = ci->ci_name; + info->zbc_byteswapped = byteswap; + info->zbc_injected = 0; + info->zbc_has_cksum = 1; + if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) return (ECKSUM); - if (zio_injection_enabled && !zio->io_error) - return (zio_handle_fault_injection(zio, ECKSUM)); + if (zio_injection_enabled && !zio->io_error && + (error = zio_handle_fault_injection(zio, ECKSUM)) != 0) { + + info->zbc_injected = 1; + return (error); + } return (0); } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/zio_compress.c b/external/cddl/osnet/dist/uts/common/fs/zfs/zio_compress.c index c563be4eb9557..f148977c44680 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/zio_compress.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/zio_compress.c @@ -20,12 +20,10 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -51,10 +49,11 @@ zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = { {gzip_compress, gzip_decompress, 7, "gzip-7"}, {gzip_compress, gzip_decompress, 8, "gzip-8"}, {gzip_compress, gzip_decompress, 9, "gzip-9"}, + {zle_compress, zle_decompress, 64, "zle"}, }; -uint8_t -zio_compress_select(uint8_t child, uint8_t parent) +enum zio_compress +zio_compress_select(enum zio_compress child, enum zio_compress parent) { ASSERT(child < ZIO_COMPRESS_FUNCTIONS); ASSERT(parent < ZIO_COMPRESS_FUNCTIONS); @@ -69,80 +68,65 @@ zio_compress_select(uint8_t child, uint8_t parent) return (child); } -int -zio_compress_data(int cpfunc, void *src, uint64_t srcsize, void **destp, - uint64_t *destsizep, uint64_t *destbufsizep) +size_t +zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len) { uint64_t *word, *word_end; - uint64_t ciosize, gapsize, destbufsize; - zio_compress_info_t *ci = &zio_compress_table[cpfunc]; - char *dest; - uint_t allzero; + size_t c_len, d_len, r_len; + zio_compress_info_t *ci = &zio_compress_table[c]; - ASSERT((uint_t)cpfunc < ZIO_COMPRESS_FUNCTIONS); - ASSERT((uint_t)cpfunc == ZIO_COMPRESS_EMPTY || ci->ci_compress != NULL); + ASSERT((uint_t)c < ZIO_COMPRESS_FUNCTIONS); + ASSERT((uint_t)c == ZIO_COMPRESS_EMPTY || ci->ci_compress != NULL); /* * If the data is all zeroes, we don't even need to allocate - * a block for it. We indicate this by setting *destsizep = 0. + * a block for it. We indicate this by returning zero size. */ - allzero = 1; - word = src; - word_end = (uint64_t *)(uintptr_t)((uintptr_t)word + srcsize); - while (word < word_end) { - if (*word++ != 0) { - allzero = 0; + word_end = (uint64_t *)((char *)src + s_len); + for (word = src; word < word_end; word++) + if (*word != 0) break; - } - } - if (allzero) { - *destp = NULL; - *destsizep = 0; - *destbufsizep = 0; - return (1); - } - if (cpfunc == ZIO_COMPRESS_EMPTY) + if (word == word_end) return (0); + if (c == ZIO_COMPRESS_EMPTY) + return (s_len); + /* Compress at least 12.5% */ - destbufsize = P2ALIGN(srcsize - (srcsize >> 3), SPA_MINBLOCKSIZE); - if (destbufsize == 0) - return (0); - dest = zio_buf_alloc(destbufsize); + d_len = P2ALIGN(s_len - (s_len >> 3), (size_t)SPA_MINBLOCKSIZE); + if (d_len == 0) + return (s_len); - ciosize = ci->ci_compress(src, dest, (size_t)srcsize, - (size_t)destbufsize, ci->ci_level); - if (ciosize > destbufsize) { - zio_buf_free(dest, destbufsize); - return (0); - } + c_len = ci->ci_compress(src, dst, s_len, d_len, ci->ci_level); - /* Cool. We compressed at least as much as we were hoping to. */ + if (c_len > d_len) + return (s_len); - /* For security, make sure we don't write random heap crap to disk */ - gapsize = P2ROUNDUP(ciosize, SPA_MINBLOCKSIZE) - ciosize; - if (gapsize != 0) { - bzero(dest + ciosize, gapsize); - ciosize += gapsize; + /* + * Cool. We compressed at least as much as we were hoping to. + * For both security and repeatability, pad out the last sector. 
+ */ + r_len = P2ROUNDUP(c_len, (size_t)SPA_MINBLOCKSIZE); + if (r_len > c_len) { + bzero((char *)dst + c_len, r_len - c_len); + c_len = r_len; } - ASSERT3U(ciosize, <=, destbufsize); - ASSERT(P2PHASE(ciosize, SPA_MINBLOCKSIZE) == 0); - *destp = dest; - *destsizep = ciosize; - *destbufsizep = destbufsize; + ASSERT3U(c_len, <=, d_len); + ASSERT(P2PHASE(c_len, (size_t)SPA_MINBLOCKSIZE) == 0); - return (1); + return (c_len); } int -zio_decompress_data(int cpfunc, void *src, uint64_t srcsize, - void *dest, uint64_t destsize) +zio_decompress_data(enum zio_compress c, void *src, void *dst, + size_t s_len, size_t d_len) { - zio_compress_info_t *ci = &zio_compress_table[cpfunc]; + zio_compress_info_t *ci = &zio_compress_table[c]; - ASSERT((uint_t)cpfunc < ZIO_COMPRESS_FUNCTIONS); + if ((uint_t)c >= ZIO_COMPRESS_FUNCTIONS || ci->ci_decompress == NULL) + return (EINVAL); - return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level)); + return (ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level)); } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/zio_inject.c b/external/cddl/osnet/dist/uts/common/fs/zfs/zio_inject.c index b3469fdd5c243..fa040ea4b31a1 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/zio_inject.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/zio_inject.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -43,8 +43,8 @@ #include #include #include -#include #include +#include #include uint32_t zio_injection_enabled; @@ -70,8 +70,9 @@ zio_match_handler(zbookmark_t *zb, uint64_t type, /* * Check for a match against the MOS, which is based on type */ - if (zb->zb_objset == 0 && record->zi_objset == 0 && - record->zi_object == 0) { + if (zb->zb_objset == DMU_META_OBJSET && + record->zi_objset == DMU_META_OBJSET && + record->zi_object == DMU_META_DNODE_OBJECT) { if (record->zi_type == DMU_OT_NONE || type == record->zi_type) return (record->zi_freq == 0 || @@ -95,6 +96,31 @@ zio_match_handler(zbookmark_t *zb, uint64_t type, return (B_FALSE); } +/* + * Panic the system when a config change happens in the function + * specified by tag. + */ +void +zio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type) +{ + inject_handler_t *handler; + + rw_enter(&inject_lock, RW_READER); + + for (handler = list_head(&inject_handlers); handler != NULL; + handler = list_next(&inject_handlers, handler)) { + + if (spa != handler->zi_spa) + continue; + + if (handler->zi_record.zi_type == type && + strcmp(tag, handler->zi_record.zi_func) == 0) + panic("Panic requested in function %s\n", tag); + } + + rw_exit(&inject_lock); +} + /* * Determine if the I/O in question should return failure. Returns the errno * to be returned to the caller. 
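Before the zio_inject.c hunks that follow, a worked example of the new zio_compress_data() return convention shown above (figures assume SPA_MINBLOCKSIZE == 512):

/*
 * s_len = 4096 (one 4K block, not all zeroes, c != ZIO_COMPRESS_EMPTY):
 *   d_len = P2ALIGN(4096 - (4096 >> 3), 512) = 3584
 * so the compressor must save at least 12.5% or the function returns
 * s_len and the caller stores the block uncompressed.  A c_len of 2000
 * is zero-padded up to P2ROUNDUP(2000, 512) = 2048 and that padded
 * length is returned; a return of 0 still means the block was all
 * zeroes and needs no allocation at all.
 */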
@@ -126,8 +152,10 @@ zio_handle_fault_injection(zio_t *zio, int error) if (zio->io_spa != handler->zi_spa) continue; - /* Ignore device errors */ - if (handler->zi_record.zi_guid != 0) + /* Ignore device errors and panic injection */ + if (handler->zi_record.zi_guid != 0 || + handler->zi_record.zi_func[0] != '\0' || + handler->zi_record.zi_duration != 0) continue; /* If this handler matches, return EIO */ @@ -159,7 +187,7 @@ zio_handle_label_injection(zio_t *zio, int error) int label; int ret = 0; - if (offset + zio->io_size > VDEV_LABEL_START_SIZE && + if (offset >= VDEV_LABEL_START_SIZE && offset < vd->vdev_psize - VDEV_LABEL_END_SIZE) return (0); @@ -170,8 +198,10 @@ zio_handle_label_injection(zio_t *zio, int error) uint64_t start = handler->zi_record.zi_start; uint64_t end = handler->zi_record.zi_end; - /* Ignore device only faults */ - if (handler->zi_record.zi_start == 0) + /* Ignore device only faults or panic injection */ + if (handler->zi_record.zi_start == 0 || + handler->zi_record.zi_func[0] != '\0' || + handler->zi_record.zi_duration != 0) continue; /* @@ -195,21 +225,50 @@ zio_handle_label_injection(zio_t *zio, int error) int -zio_handle_device_injection(vdev_t *vd, int error) +zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error) { inject_handler_t *handler; int ret = 0; + /* + * We skip over faults in the labels unless it's during + * device open (i.e. zio == NULL). + */ + if (zio != NULL) { + uint64_t offset = zio->io_offset; + + if (offset < VDEV_LABEL_START_SIZE || + offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE) + return (0); + } + rw_enter(&inject_lock, RW_READER); for (handler = list_head(&inject_handlers); handler != NULL; handler = list_next(&inject_handlers, handler)) { - /* Ignore label specific faults */ - if (handler->zi_record.zi_start != 0) + /* + * Ignore label specific faults, panic injection + * or fake writes + */ + if (handler->zi_record.zi_start != 0 || + handler->zi_record.zi_func[0] != '\0' || + handler->zi_record.zi_duration != 0) continue; if (vd->vdev_guid == handler->zi_record.zi_guid) { + if (handler->zi_record.zi_failfast && + (zio == NULL || (zio->io_flags & + (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))) { + continue; + } + + /* Handle type specific I/O failures */ + if (zio != NULL && + handler->zi_record.zi_iotype != ZIO_TYPES && + handler->zi_record.zi_iotype != zio->io_type) + continue; + if (handler->zi_record.zi_error == error) { /* * For a failed open, pretend like the device @@ -233,6 +292,84 @@ zio_handle_device_injection(vdev_t *vd, int error) return (ret); } +/* + * Simulate hardware that ignores cache flushes. For requested number + * of seconds nix the actual writing to disk. 
+ */ +void +zio_handle_ignored_writes(zio_t *zio) +{ + inject_handler_t *handler; + + rw_enter(&inject_lock, RW_READER); + + for (handler = list_head(&inject_handlers); handler != NULL; + handler = list_next(&inject_handlers, handler)) { + + /* Ignore errors not destined for this pool */ + if (zio->io_spa != handler->zi_spa) + continue; + + if (handler->zi_record.zi_duration == 0) + continue; + + /* + * Positive duration implies # of seconds, negative + * a number of txgs + */ + if (handler->zi_record.zi_timer == 0) { + if (handler->zi_record.zi_duration > 0) + handler->zi_record.zi_timer = ddi_get_lbolt64(); + else + handler->zi_record.zi_timer = zio->io_txg; + } + + /* Have a "problem" writing 60% of the time */ + if (spa_get_random(100) < 60) + zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; + break; + } + + rw_exit(&inject_lock); +} + +void +spa_handle_ignored_writes(spa_t *spa) +{ + inject_handler_t *handler; + + if (zio_injection_enabled == 0) + return; + + rw_enter(&inject_lock, RW_READER); + + for (handler = list_head(&inject_handlers); handler != NULL; + handler = list_next(&inject_handlers, handler)) { + + /* Ignore errors not destined for this pool */ + if (spa != handler->zi_spa) + continue; + + if (handler->zi_record.zi_duration == 0) + continue; + + if (handler->zi_record.zi_duration > 0) { + VERIFY(handler->zi_record.zi_timer == 0 || + handler->zi_record.zi_timer + + handler->zi_record.zi_duration * hz > + ddi_get_lbolt64()); + } else { + /* duration is negative so the subtraction here adds */ + VERIFY(handler->zi_record.zi_timer == 0 || + handler->zi_record.zi_timer - + handler->zi_record.zi_duration >= + spa_syncing_txg(spa)); + } + } + + rw_exit(&inject_lock); +} + /* * Create a new handler for the given record. We add it to the list, adding * a reference to the spa_t in the process. We increment zio_injection_enabled, diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/zle.c b/external/cddl/osnet/dist/uts/common/fs/zfs/zle.c new file mode 100644 index 0000000000000..13c5673fbe267 --- /dev/null +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/zle.c @@ -0,0 +1,86 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Zero-length encoding. This is a fast and simple algorithm to eliminate + * runs of zeroes. Each chunk of compressed data begins with a length byte, b. + * If b < n (where n is the compression parameter) then the next b + 1 bytes + * are literal values. If b >= n then the next (256 - b + 1) bytes are zero. 
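A worked example of the zle.c encoding added here. Note that, reading zle_decompress() below, a length byte b >= n actually expands to b - n + 1 zero bytes; the zio_compress_table entry above passes n = 64 as the level:

/*
 * n = 64 (the ci_level passed from zio_compress_table):
 *
 *   input  (14 bytes): aa bb cc 00 00 00 00 00 00 00 00 00 00 dd
 *   output ( 7 bytes): 02 aa bb cc 49 00 dd
 *
 *   02 -> 0x02 < 64: the next 0x02 + 1 = 3 bytes are literals (aa bb cc)
 *   49 -> 0x49 (73) >= 64: emit 73 - 64 + 1 = 10 zero bytes
 *   00 -> 0x00 < 64: the next 1 byte is a literal (dd)
 */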
+ */ +#include +#include + +size_t +zle_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) +{ + uchar_t *src = s_start; + uchar_t *dst = d_start; + uchar_t *s_end = src + s_len; + uchar_t *d_end = dst + d_len; + + while (src < s_end && dst < d_end - 1) { + uchar_t *first = src; + uchar_t *len = dst++; + if (src[0] == 0) { + uchar_t *last = src + (256 - n); + while (src < MIN(last, s_end) && src[0] == 0) + src++; + *len = src - first - 1 + n; + } else { + uchar_t *last = src + n; + if (d_end - dst < n) + break; + while (src < MIN(last, s_end) - 1 && (src[0] | src[1])) + *dst++ = *src++; + if (src[0]) + *dst++ = *src++; + *len = src - first - 1; + } + } + return (src == s_end ? dst - (uchar_t *)d_start : s_len); +} + +int +zle_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) +{ + uchar_t *src = s_start; + uchar_t *dst = d_start; + uchar_t *s_end = src + s_len; + uchar_t *d_end = dst + d_len; + + while (src < s_end && dst < d_end) { + int len = 1 + *src++; + if (len <= n) { + while (len-- != 0) + *dst++ = *src++; + } else { + len -= n; + while (len-- != 0) + *dst++ = 0; + } + } + return (dst == d_end ? 0 : -1); +} diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/zvol.c b/external/cddl/osnet/dist/uts/common/fs/zfs/zvol.c index 4e993060ceb27..2b4a0b2bdb93b 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/zvol.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/zvol.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -32,7 +32,7 @@ * /dev/zvol/dsk// * /dev/zvol/rdsk// * - * These links are created by the ZFS-specific devfsadm link generator. + * These links are created by the /dev filesystem (sdev_zvolops.c). * Volumes are persistent through reboot. No user command needs to be * run before opening and using a device. */ @@ -75,10 +75,12 @@ #include #include #include +#include #include "zfs_namecheck.h" static void *zvol_state; +static char *zvol_tag = "zvol_tag"; #define ZVOL_DUMPSIZE "dumpsize" @@ -106,14 +108,12 @@ typedef struct zvol_state { uint64_t zv_volblocksize; /* volume block size */ minor_t zv_minor; /* minor number */ uint8_t zv_min_bs; /* minimum addressable block shift */ - uint8_t zv_flags; /* readonly; dumpified */ + uint8_t zv_flags; /* readonly, dumpified, etc. */ objset_t *zv_objset; /* objset handle */ - uint32_t zv_mode; /* DS_MODE_* flags at open time */ uint32_t zv_open_count[OTYPCNT]; /* open counts */ uint32_t zv_total_opens; /* total open count */ zilog_t *zv_zilog; /* ZIL handle */ list_t zv_extents; /* List of extents for dump */ - uint64_t zv_txg_assign; /* txg to assign during ZIL replay */ znode_t zv_znode; /* for range locking */ } zvol_state_t; @@ -123,27 +123,30 @@ typedef struct zvol_state { #define ZVOL_RDONLY 0x1 #define ZVOL_DUMPIFIED 0x2 #define ZVOL_EXCL 0x4 +#define ZVOL_WCE 0x8 /* * zvol maximum transfer in one DMU tx. 
*/ int zvol_maxphys = DMU_MAX_ACCESS/2; -extern int zfs_set_prop_nvlist(const char *, nvlist_t *); +extern int zfs_set_prop_nvlist(const char *, zprop_source_t, + nvlist_t *, nvlist_t **); +static int zvol_remove_zv(zvol_state_t *); static int zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio); static int zvol_dumpify(zvol_state_t *zv); static int zvol_dump_fini(zvol_state_t *zv); static int zvol_dump_init(zvol_state_t *zv, boolean_t resize); static void -zvol_size_changed(zvol_state_t *zv, major_t maj) +zvol_size_changed(uint64_t volsize, major_t maj, minor_t min) { - dev_t dev = makedevice(maj, zv->zv_minor); + dev_t dev = makedevice(maj, min); VERIFY(ddi_prop_update_int64(dev, zfs_dip, - "Size", zv->zv_volsize) == DDI_SUCCESS); + "Size", volsize) == DDI_SUCCESS); VERIFY(ddi_prop_update_int64(dev, zfs_dip, - "Nblocks", lbtodb(zv->zv_volsize)) == DDI_SUCCESS); + "Nblocks", lbtodb(volsize)) == DDI_SUCCESS); /* Notify specfs to invalidate the cached size */ spec_size_invalidate(dev, VBLK); @@ -177,17 +180,6 @@ zvol_check_volblocksize(uint64_t volblocksize) return (0); } -static void -zvol_readonly_changed_cb(void *arg, uint64_t newval) -{ - zvol_state_t *zv = arg; - - if (newval) - zv->zv_flags |= ZVOL_RDONLY; - else - zv->zv_flags &= ~ZVOL_RDONLY; -} - int zvol_get_stats(objset_t *os, nvlist_t *nv) { @@ -195,7 +187,6 @@ zvol_get_stats(objset_t *os, nvlist_t *nv) dmu_object_info_t doi; uint64_t val; - error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val); if (error) return (error); @@ -256,8 +247,8 @@ struct maparg { /*ARGSUSED*/ static int -zvol_map_block(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb, - const dnode_phys_t *dnp, void *arg) +zvol_map_block(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, + const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) { struct maparg *ma = arg; zvol_extent_t *ze; @@ -309,6 +300,7 @@ zvol_free_extents(zvol_state_t *zv) static int zvol_get_lbas(zvol_state_t *zv) { + objset_t *os = zv->zv_objset; struct maparg ma; int err; @@ -316,7 +308,9 @@ zvol_get_lbas(zvol_state_t *zv) ma.ma_blks = 0; zvol_free_extents(zv); - err = traverse_dataset(dmu_objset_ds(zv->zv_objset), 0, + /* commit any in-flight changes before traversing the dataset */ + txg_wait_synced(dmu_objset_pool(os), 0); + err = traverse_dataset(dmu_objset_ds(os), 0, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, zvol_map_block, &ma); if (err || ma.ma_blks != (zv->zv_volsize / zv->zv_volblocksize)) { zvol_free_extents(zv); @@ -371,21 +365,32 @@ zvol_replay_write(zvol_state_t *zv, lr_write_t *lr, boolean_t byteswap) { objset_t *os = zv->zv_objset; char *data = (char *)(lr + 1); /* data follows lr_write_t */ - uint64_t off = lr->lr_offset; - uint64_t len = lr->lr_length; + uint64_t offset, length; dmu_tx_t *tx; int error; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); + offset = lr->lr_offset; + length = lr->lr_length; + + /* If it's a dmu_sync() block, write the whole block */ + if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { + uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr); + if (length < blocksize) { + offset -= offset % blocksize; + length = blocksize; + } + } + tx = dmu_tx_create(os); - dmu_tx_hold_write(tx, ZVOL_OBJ, off, len); - error = dmu_tx_assign(tx, zv->zv_txg_assign); + dmu_tx_hold_write(tx, ZVOL_OBJ, offset, length); + error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); } else { - dmu_write(os, ZVOL_OBJ, off, len, data, tx); + dmu_write(os, ZVOL_OBJ, offset, length, data, tx); dmu_tx_commit(tx); } @@ -417,137 +422,99 @@ 
zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = { zvol_replay_err, /* TX_TRUNCATE */ zvol_replay_err, /* TX_SETATTR */ zvol_replay_err, /* TX_ACL */ + zvol_replay_err, /* TX_CREATE_ACL */ + zvol_replay_err, /* TX_CREATE_ATTR */ + zvol_replay_err, /* TX_CREATE_ACL_ATTR */ + zvol_replay_err, /* TX_MKDIR_ACL */ + zvol_replay_err, /* TX_MKDIR_ATTR */ + zvol_replay_err, /* TX_MKDIR_ACL_ATTR */ + zvol_replay_err, /* TX_WRITE2 */ }; +int +zvol_name2minor(const char *name, minor_t *minor) +{ + zvol_state_t *zv; + + mutex_enter(&zvol_state_lock); + zv = zvol_minor_lookup(name); + if (minor && zv) + *minor = zv->zv_minor; + mutex_exit(&zvol_state_lock); + return (zv ? 0 : -1); +} + /* * Create a minor node (plus a whole lot more) for the specified volume. */ int -zvol_create_minor(const char *name, major_t maj) +zvol_create_minor(const char *name) { zvol_state_t *zv; objset_t *os; dmu_object_info_t doi; - uint64_t volsize; minor_t minor = 0; - struct pathname linkpath; - int ds_mode = DS_MODE_OWNER; - vnode_t *vp = NULL; - char *devpath; - size_t devpathlen = strlen(ZVOL_FULL_DEV_DIR) + strlen(name) + 1; char chrbuf[30], blkbuf[30]; int error; mutex_enter(&zvol_state_lock); - if ((zv = zvol_minor_lookup(name)) != NULL) { + if (zvol_minor_lookup(name) != NULL) { mutex_exit(&zvol_state_lock); return (EEXIST); } - if (strchr(name, '@') != 0) - ds_mode |= DS_MODE_READONLY; - - error = dmu_objset_open(name, DMU_OST_ZVOL, ds_mode, &os); + /* lie and say we're read-only */ + error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, zvol_tag, &os); if (error) { mutex_exit(&zvol_state_lock); return (error); } - error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); - - if (error) { - dmu_objset_close(os); - mutex_exit(&zvol_state_lock); - return (error); - } - - /* - * If there's an existing /dev/zvol symlink, try to use the - * same minor number we used last time. - */ - devpath = kmem_alloc(devpathlen, KM_SLEEP); - - (void) sprintf(devpath, "%s%s", ZVOL_FULL_DEV_DIR, name); - - error = lookupname(devpath, UIO_SYSSPACE, NO_FOLLOW, NULL, &vp); - - kmem_free(devpath, devpathlen); - - if (error == 0 && vp->v_type != VLNK) - error = EINVAL; - - if (error == 0) { - pn_alloc(&linkpath); - error = pn_getsymlink(vp, &linkpath, kcred); - if (error == 0) { - char *ms = strstr(linkpath.pn_path, ZVOL_PSEUDO_DEV); - if (ms != NULL) { - ms += strlen(ZVOL_PSEUDO_DEV); - minor = stoi(&ms); - } - } - pn_free(&linkpath); - } - - if (vp != NULL) - VN_RELE(vp); - - /* - * If we found a minor but it's already in use, we must pick a new one. 
- */ - if (minor != 0 && ddi_get_soft_state(zvol_state, minor) != NULL) - minor = 0; - - if (minor == 0) - minor = zvol_minor_alloc(); - - if (minor == 0) { - dmu_objset_close(os); + if ((minor = zvol_minor_alloc()) == 0) { + dmu_objset_disown(os, zvol_tag); mutex_exit(&zvol_state_lock); return (ENXIO); } if (ddi_soft_state_zalloc(zvol_state, minor) != DDI_SUCCESS) { - dmu_objset_close(os); + dmu_objset_disown(os, zvol_tag); mutex_exit(&zvol_state_lock); return (EAGAIN); } - (void) ddi_prop_update_string(minor, zfs_dip, ZVOL_PROP_NAME, (char *)name); - (void) sprintf(chrbuf, "%uc,raw", minor); + (void) snprintf(chrbuf, sizeof (chrbuf), "%u,raw", minor); if (ddi_create_minor_node(zfs_dip, chrbuf, S_IFCHR, minor, DDI_PSEUDO, 0) == DDI_FAILURE) { ddi_soft_state_free(zvol_state, minor); - dmu_objset_close(os); + dmu_objset_disown(os, zvol_tag); mutex_exit(&zvol_state_lock); return (EAGAIN); } - (void) sprintf(blkbuf, "%uc", minor); + (void) snprintf(blkbuf, sizeof (blkbuf), "%u", minor); if (ddi_create_minor_node(zfs_dip, blkbuf, S_IFBLK, minor, DDI_PSEUDO, 0) == DDI_FAILURE) { ddi_remove_minor_node(zfs_dip, chrbuf); ddi_soft_state_free(zvol_state, minor); - dmu_objset_close(os); + dmu_objset_disown(os, zvol_tag); mutex_exit(&zvol_state_lock); return (EAGAIN); } zv = ddi_get_soft_state(zvol_state, minor); - (void) strcpy(zv->zv_name, name); + (void) strlcpy(zv->zv_name, name, MAXPATHLEN); zv->zv_min_bs = DEV_BSHIFT; zv->zv_minor = minor; - zv->zv_volsize = volsize; zv->zv_objset = os; - zv->zv_mode = ds_mode; - zv->zv_zilog = zil_open(os, zvol_get_data); + if (dmu_objset_is_snapshot(os)) + zv->zv_flags |= ZVOL_RDONLY; mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL); avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare, sizeof (rl_t), offsetof(rl_t, r_node)); @@ -558,12 +525,9 @@ zvol_create_minor(const char *name, major_t maj) ASSERT(error == 0); zv->zv_volblocksize = doi.doi_data_block_size; - zil_replay(os, zv, &zv->zv_txg_assign, zvol_replay_vector, NULL); - zvol_size_changed(zv, maj); - - /* XXX this should handle the possible i/o error */ - VERIFY(dsl_prop_register(dmu_objset_ds(zv->zv_objset), - "readonly", zvol_readonly_changed_cb, zv) == 0); + zil_replay(os, zv, zvol_replay_vector); + dmu_objset_disown(os, zvol_tag); + zv->zv_objset = NULL; zvol_minors++; @@ -575,47 +539,88 @@ zvol_create_minor(const char *name, major_t maj) /* * Remove minor node for the specified volume. 
*/ +static int +zvol_remove_zv(zvol_state_t *zv) +{ + char nmbuf[20]; + + ASSERT(MUTEX_HELD(&zvol_state_lock)); + if (zv->zv_total_opens != 0) + return (EBUSY); + + (void) snprintf(nmbuf, sizeof (nmbuf), "%u,raw", zv->zv_minor); + ddi_remove_minor_node(zfs_dip, nmbuf); + + (void) snprintf(nmbuf, sizeof (nmbuf), "%u", zv->zv_minor); + ddi_remove_minor_node(zfs_dip, nmbuf); + + avl_destroy(&zv->zv_znode.z_range_avl); + mutex_destroy(&zv->zv_znode.z_range_lock); + + ddi_soft_state_free(zvol_state, zv->zv_minor); + + zvol_minors--; + return (0); +} + int zvol_remove_minor(const char *name) { zvol_state_t *zv; - char namebuf[30]; + int rc; mutex_enter(&zvol_state_lock); - if ((zv = zvol_minor_lookup(name)) == NULL) { mutex_exit(&zvol_state_lock); return (ENXIO); } + rc = zvol_remove_zv(zv); + mutex_exit(&zvol_state_lock); + return (rc); +} - if (zv->zv_total_opens != 0) { - mutex_exit(&zvol_state_lock); - return (EBUSY); - } +int +zvol_first_open(zvol_state_t *zv) +{ + objset_t *os; + uint64_t volsize; + int error; + uint64_t readonly; - (void) sprintf(namebuf, "%uc,raw", zv->zv_minor); - ddi_remove_minor_node(zfs_dip, namebuf); + /* lie and say we're read-only */ + error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, B_TRUE, + zvol_tag, &os); + if (error) + return (error); - (void) sprintf(namebuf, "%uc", zv->zv_minor); - ddi_remove_minor_node(zfs_dip, namebuf); + error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); + if (error) { + ASSERT(error == 0); + dmu_objset_disown(os, zvol_tag); + return (error); + } + zv->zv_objset = os; + zv->zv_volsize = volsize; + zv->zv_zilog = zil_open(os, zvol_get_data); + zvol_size_changed(zv->zv_volsize, ddi_driver_major(zfs_dip), + zv->zv_minor); - VERIFY(dsl_prop_unregister(dmu_objset_ds(zv->zv_objset), - "readonly", zvol_readonly_changed_cb, zv) == 0); + VERIFY(dsl_prop_get_integer(zv->zv_name, "readonly", &readonly, + NULL) == 0); + if (readonly || dmu_objset_is_snapshot(os)) + zv->zv_flags |= ZVOL_RDONLY; + else + zv->zv_flags &= ~ZVOL_RDONLY; + return (error); +} +void +zvol_last_close(zvol_state_t *zv) +{ zil_close(zv->zv_zilog); zv->zv_zilog = NULL; - dmu_objset_close(zv->zv_objset); + dmu_objset_disown(zv->zv_objset, zvol_tag); zv->zv_objset = NULL; - avl_destroy(&zv->zv_znode.z_range_avl); - mutex_destroy(&zv->zv_znode.z_range_lock); - - ddi_soft_state_free(zvol_state, zv->zv_minor); - - zvol_minors--; - - mutex_exit(&zvol_state_lock); - - return (0); } int @@ -658,14 +663,14 @@ zvol_prealloc(zvol_state_t *zv) } int -zvol_update_volsize(zvol_state_t *zv, major_t maj, uint64_t volsize) +zvol_update_volsize(objset_t *os, uint64_t volsize) { dmu_tx_t *tx; int error; ASSERT(MUTEX_HELD(&zvol_state_lock)); - tx = dmu_tx_create(zv->zv_objset); + tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { @@ -673,127 +678,117 @@ zvol_update_volsize(zvol_state_t *zv, major_t maj, uint64_t volsize) return (error); } - error = zap_update(zv->zv_objset, ZVOL_ZAP_OBJ, "size", 8, 1, + error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx); dmu_tx_commit(tx); if (error == 0) - error = dmu_free_long_range(zv->zv_objset, + error = dmu_free_long_range(os, ZVOL_OBJ, volsize, DMU_OBJECT_END); + return (error); +} - /* - * If we are using a faked-up state (zv_minor == 0) then don't - * try to update the in-core zvol state. 
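Read together with the zvol_open()/zvol_close() hunks further below, the zvol_first_open()/zvol_last_close() pair above changes the dataset ownership model: zvol_create_minor() now disowns the objset after ZIL replay, and the dataset is held only while the minor is actually open. A condensed sketch of the resulting flow (not a verbatim copy of the hunks):

/* zvol_open(): the first opener brings the dataset online */
if (zv->zv_total_opens == 0)
	err = zvol_first_open(zv);	/* dmu_objset_own() + zil_open() */

/* zvol_close(): the last closer releases it again */
zv->zv_total_opens--;
if (zv->zv_total_opens == 0)
	zvol_last_close(zv);		/* zil_close() + dmu_objset_disown() */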
- */ - if (error == 0 && zv->zv_minor) { - zv->zv_volsize = volsize; - zvol_size_changed(zv, maj); +void +zvol_remove_minors(const char *name) +{ + zvol_state_t *zv; + char *namebuf; + minor_t minor; + + namebuf = kmem_zalloc(strlen(name) + 2, KM_SLEEP); + (void) strncpy(namebuf, name, strlen(name)); + (void) strcat(namebuf, "/"); + mutex_enter(&zvol_state_lock); + for (minor = 1; minor <= ZVOL_MAX_MINOR; minor++) { + + zv = ddi_get_soft_state(zvol_state, minor); + if (zv == NULL) + continue; + if (strncmp(namebuf, zv->zv_name, strlen(namebuf)) == 0) + (void) zvol_remove_zv(zv); } - return (error); + kmem_free(namebuf, strlen(name) + 2); + + mutex_exit(&zvol_state_lock); } int zvol_set_volsize(const char *name, major_t maj, uint64_t volsize) { - zvol_state_t *zv; + zvol_state_t *zv = NULL; + objset_t *os; int error; dmu_object_info_t doi; uint64_t old_volsize = 0ULL; - zvol_state_t state = { 0 }; + uint64_t readonly; mutex_enter(&zvol_state_lock); - - if ((zv = zvol_minor_lookup(name)) == NULL) { - /* - * If we are doing a "zfs clone -o volsize=", then the - * minor node won't exist yet. - */ - error = dmu_objset_open(name, DMU_OST_ZVOL, DS_MODE_OWNER, - &state.zv_objset); - if (error != 0) - goto out; - zv = &state; + zv = zvol_minor_lookup(name); + if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) { + mutex_exit(&zvol_state_lock); + return (error); } - old_volsize = zv->zv_volsize; - if ((error = dmu_object_info(zv->zv_objset, ZVOL_OBJ, &doi)) != 0 || + if ((error = dmu_object_info(os, ZVOL_OBJ, &doi)) != 0 || (error = zvol_check_volsize(volsize, doi.doi_data_block_size)) != 0) goto out; - if (zv->zv_flags & ZVOL_RDONLY || (zv->zv_mode & DS_MODE_READONLY)) { + VERIFY(dsl_prop_get_integer(name, "readonly", &readonly, + NULL) == 0); + if (readonly) { error = EROFS; goto out; } - error = zvol_update_volsize(zv, maj, volsize); - + error = zvol_update_volsize(os, volsize); /* * Reinitialize the dump area to the new size. If we - * failed to resize the dump area then restore the it back to - * it's original size. + * failed to resize the dump area then restore it back to + * its original size. */ - if (error == 0 && zv->zv_flags & ZVOL_DUMPIFIED) { - if ((error = zvol_dumpify(zv)) != 0 || - (error = dumpvp_resize()) != 0) { - (void) zvol_update_volsize(zv, maj, old_volsize); - error = zvol_dumpify(zv); + if (zv && error == 0) { + if (zv->zv_flags & ZVOL_DUMPIFIED) { + old_volsize = zv->zv_volsize; + zv->zv_volsize = volsize; + if ((error = zvol_dumpify(zv)) != 0 || + (error = dumpvp_resize()) != 0) { + (void) zvol_update_volsize(os, old_volsize); + zv->zv_volsize = old_volsize; + error = zvol_dumpify(zv); + } + } + if (error == 0) { + zv->zv_volsize = volsize; + zvol_size_changed(volsize, maj, zv->zv_minor); } } -out: - if (state.zv_objset) - dmu_objset_close(state.zv_objset); + /* + * Generate a LUN expansion event. 
+ */ + if (zv && error == 0) { + sysevent_id_t eid; + nvlist_t *attr; + char *physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP); - mutex_exit(&zvol_state_lock); + (void) snprintf(physpath, MAXPATHLEN, "%s%u", ZVOL_PSEUDO_DEV, + zv->zv_minor); - return (error); -} + VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0); -int -zvol_set_volblocksize(const char *name, uint64_t volblocksize) -{ - zvol_state_t *zv; - dmu_tx_t *tx; - int error; - boolean_t needlock; + (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS, + ESC_DEV_DLE, attr, &eid, DDI_SLEEP); - /* - * The lock may already be held if we are being called from - * zvol_dump_init(). - */ - needlock = !MUTEX_HELD(&zvol_state_lock); - if (needlock) - mutex_enter(&zvol_state_lock); - - if ((zv = zvol_minor_lookup(name)) == NULL) { - if (needlock) - mutex_exit(&zvol_state_lock); - return (ENXIO); - } - if (zv->zv_flags & ZVOL_RDONLY || (zv->zv_mode & DS_MODE_READONLY)) { - if (needlock) - mutex_exit(&zvol_state_lock); - return (EROFS); + nvlist_free(attr); + kmem_free(physpath, MAXPATHLEN); } - tx = dmu_tx_create(zv->zv_objset); - dmu_tx_hold_bonus(tx, ZVOL_OBJ); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - } else { - error = dmu_object_set_blocksize(zv->zv_objset, ZVOL_OBJ, - volblocksize, 0, tx); - if (error == ENOTSUP) - error = EBUSY; - dmu_tx_commit(tx); - if (error == 0) - zv->zv_volblocksize = volblocksize; - } +out: + dmu_objset_rele(os, FTAG); - if (needlock) - mutex_exit(&zvol_state_lock); + mutex_exit(&zvol_state_lock); return (error); } @@ -804,6 +799,7 @@ zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr) { minor_t minor = getminor(*devp); zvol_state_t *zv; + int err = 0; if (minor == 0) /* This is the control device */ return (0); @@ -816,21 +812,24 @@ zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr) return (ENXIO); } - ASSERT(zv->zv_objset != NULL); - - if ((flag & FWRITE) && - (zv->zv_flags & ZVOL_RDONLY || (zv->zv_mode & DS_MODE_READONLY))) { + if (zv->zv_total_opens == 0) + err = zvol_first_open(zv); + if (err) { mutex_exit(&zvol_state_lock); - return (EROFS); + return (err); + } + if ((flag & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) { + err = EROFS; + goto out; } if (zv->zv_flags & ZVOL_EXCL) { - mutex_exit(&zvol_state_lock); - return (EBUSY); + err = EBUSY; + goto out; } if (flag & FEXCL) { if (zv->zv_total_opens != 0) { - mutex_exit(&zvol_state_lock); - return (EBUSY); + err = EBUSY; + goto out; } zv->zv_flags |= ZVOL_EXCL; } @@ -839,10 +838,14 @@ zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr) zv->zv_open_count[otyp]++; zv->zv_total_opens++; } - mutex_exit(&zvol_state_lock); - return (0); + return (err); +out: + if (zv->zv_total_opens == 0) + zvol_last_close(zv); + mutex_exit(&zvol_state_lock); + return (err); } /*ARGSUSED*/ @@ -851,6 +854,7 @@ zvol_close(dev_t dev, int flag, int otyp, cred_t *cr) { minor_t minor = getminor(dev); zvol_state_t *zv; + int error = 0; if (minor == 0) /* This is the control device */ return (0); @@ -881,20 +885,24 @@ zvol_close(dev_t dev, int flag, int otyp, cred_t *cr) zv->zv_open_count[otyp]--; zv->zv_total_opens--; - mutex_exit(&zvol_state_lock); + if (zv->zv_total_opens == 0) + zvol_last_close(zv); - return (0); + mutex_exit(&zvol_state_lock); + return (error); } static void -zvol_get_done(dmu_buf_t *db, void *vzgd) +zvol_get_done(zgd_t *zgd, int error) { - zgd_t *zgd = (zgd_t *)vzgd; - rl_t *rl = zgd->zgd_rl; + if (zgd->zgd_db) + dmu_buf_rele(zgd->zgd_db, zgd); + + 
zfs_range_unlock(zgd->zgd_rl); + + if (error == 0 && zgd->zgd_bp) + zil_add_block(zgd->zgd_zilog, zgd->zgd_bp); - dmu_buf_rele(db, vzgd); - zfs_range_unlock(rl); - zil_add_block(zgd->zgd_zilog, zgd->zgd_bp); kmem_free(zgd, sizeof (zgd_t)); } @@ -906,15 +914,20 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) { zvol_state_t *zv = arg; objset_t *os = zv->zv_objset; + uint64_t object = ZVOL_OBJ; + uint64_t offset = lr->lr_offset; + uint64_t size = lr->lr_length; /* length of user data */ + blkptr_t *bp = &lr->lr_blkptr; dmu_buf_t *db; - rl_t *rl; zgd_t *zgd; - uint64_t boff; /* block starting offset */ - int dlen = lr->lr_length; /* length of user data */ int error; - ASSERT(zio); - ASSERT(dlen != 0); + ASSERT(zio != NULL); + ASSERT(size != 0); + + zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP); + zgd->zgd_zilog = zv->zv_zilog; + zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_READER); /* * Write records come in two flavors: immediate and indirect. @@ -923,39 +936,30 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) * sync the data and get a pointer to it (indirect) so that * we don't have to write the data twice. */ - if (buf != NULL) /* immediate write */ - return (dmu_read(os, ZVOL_OBJ, lr->lr_offset, dlen, buf)); + if (buf != NULL) { /* immediate write */ + error = dmu_read(os, object, offset, size, buf, + DMU_READ_NO_PREFETCH); + } else { + size = zv->zv_volblocksize; + offset = P2ALIGN(offset, size); + error = dmu_buf_hold(os, object, offset, zgd, &db); + if (error == 0) { + zgd->zgd_db = db; + zgd->zgd_bp = bp; - zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP); - zgd->zgd_zilog = zv->zv_zilog; - zgd->zgd_bp = &lr->lr_blkptr; + ASSERT(db->db_offset == offset); + ASSERT(db->db_size == size); - /* - * Lock the range of the block to ensure that when the data is - * written out and its checksum is being calculated that no other - * thread can change the block. - */ - boff = P2ALIGN_TYPED(lr->lr_offset, zv->zv_volblocksize, uint64_t); - rl = zfs_range_lock(&zv->zv_znode, boff, zv->zv_volblocksize, - RL_READER); - zgd->zgd_rl = rl; + error = dmu_sync(zio, lr->lr_common.lrc_txg, + zvol_get_done, zgd); + + if (error == 0) + return (0); + } + } + + zvol_get_done(zgd, error); - VERIFY(0 == dmu_buf_hold(os, ZVOL_OBJ, lr->lr_offset, zgd, &db)); - error = dmu_sync(zio, db, &lr->lr_blkptr, - lr->lr_common.lrc_txg, zvol_get_done, zgd); - if (error == 0) - zil_add_block(zv->zv_zilog, &lr->lr_blkptr); - /* - * If we get EINPROGRESS, then we need to wait for a - * write IO initiated by dmu_sync() to complete before - * we can release this dbuf. We will finish everything - * up in the zvol_get_done() callback. - */ - if (error == EINPROGRESS) - return (0); - dmu_buf_rele(db, zgd); - zfs_range_unlock(rl); - kmem_free(zgd, sizeof (zgd_t)); return (error); } @@ -968,28 +972,75 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) ssize_t zvol_immediate_write_sz = 32768; static void -zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t len) +zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t resid, + boolean_t sync) { uint32_t blocksize = zv->zv_volblocksize; - lr_write_t *lr; + zilog_t *zilog = zv->zv_zilog; + boolean_t slogging; + ssize_t immediate_write_sz; - while (len) { - ssize_t nbytes = MIN(len, blocksize - P2PHASE(off, blocksize)); - itx_t *itx = zil_itx_create(TX_WRITE, sizeof (*lr)); + if (zil_disable) + return; - itx->itx_wr_state = - len > zvol_immediate_write_sz ? 
WR_INDIRECT : WR_NEED_COPY; - itx->itx_private = zv; + if (zil_replaying(zilog, tx)) + return; + + immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT) + ? 0 : zvol_immediate_write_sz; + + slogging = spa_has_slogs(zilog->zl_spa) && + (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY); + + while (resid) { + itx_t *itx; + lr_write_t *lr; + ssize_t len; + itx_wr_state_t write_state; + + /* + * Unlike zfs_log_write() we can be called with + * upto DMU_MAX_ACCESS/2 (5MB) writes. + */ + if (blocksize > immediate_write_sz && !slogging && + resid >= blocksize && off % blocksize == 0) { + write_state = WR_INDIRECT; /* uses dmu_sync */ + len = blocksize; + } else if (sync) { + write_state = WR_COPIED; + len = MIN(ZIL_MAX_LOG_DATA, resid); + } else { + write_state = WR_NEED_COPY; + len = MIN(ZIL_MAX_LOG_DATA, resid); + } + + itx = zil_itx_create(TX_WRITE, sizeof (*lr) + + (write_state == WR_COPIED ? len : 0)); lr = (lr_write_t *)&itx->itx_lr; + if (write_state == WR_COPIED && dmu_read(zv->zv_objset, + ZVOL_OBJ, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) { + zil_itx_destroy(itx); + itx = zil_itx_create(TX_WRITE, sizeof (*lr)); + lr = (lr_write_t *)&itx->itx_lr; + write_state = WR_NEED_COPY; + } + + itx->itx_wr_state = write_state; + if (write_state == WR_NEED_COPY) + itx->itx_sod += len; lr->lr_foid = ZVOL_OBJ; lr->lr_offset = off; - lr->lr_length = nbytes; - lr->lr_blkoff = off - P2ALIGN_TYPED(off, blocksize, uint64_t); + lr->lr_length = len; + lr->lr_blkoff = 0; BP_ZERO(&lr->lr_blkptr); - (void) zil_itx_assign(zv->zv_zilog, itx, tx); - len -= nbytes; - off += nbytes; + itx->itx_private = zv; + itx->itx_sync = sync; + + (void) zil_itx_assign(zilog, itx, tx); + + off += len; + resid -= len; } } @@ -1002,7 +1053,9 @@ zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t size, int numerrors = 0; for (c = 0; c < vd->vdev_children; c++) { - ASSERT(vd->vdev_ops == &vdev_mirror_ops); + ASSERT(vd->vdev_ops == &vdev_mirror_ops || + vd->vdev_ops == &vdev_replacing_ops || + vd->vdev_ops == &vdev_spare_ops); int err = zvol_dumpio_vdev(vd->vdev_child[c], addr, offset, size, doread, isdump); if (err != 0) { @@ -1078,6 +1131,7 @@ zvol_strategy(buf_t *bp) int error = 0; boolean_t doread = bp->b_flags & B_READ; boolean_t is_dump = zv->zv_flags & ZVOL_DUMPIFIED; + boolean_t sync; if (zv == NULL) { bioerror(bp, ENXIO); @@ -1091,9 +1145,7 @@ zvol_strategy(buf_t *bp) return (0); } - if (!(bp->b_flags & B_READ) && - (zv->zv_flags & ZVOL_RDONLY || - zv->zv_mode & DS_MODE_READONLY)) { + if (!(bp->b_flags & B_READ) && (zv->zv_flags & ZVOL_RDONLY)) { bioerror(bp, EROFS); biodone(bp); return (0); @@ -1115,6 +1167,9 @@ zvol_strategy(buf_t *bp) return (0); } + sync = !(bp->b_flags & B_ASYNC) && !doread && !is_dump && + !(zv->zv_flags & ZVOL_WCE) && !zil_disable; + /* * There must be no buffer changes when doing a dmu_sync() because * we can't change the data whilst calculating the checksum. 
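A concrete reading of the record-type choice in the zvol_log_write() rewrite above (sizes are only an example; zvol_immediate_write_sz defaults to 32768 per the declaration above):

/*
 * Example: 128K-volblocksize zvol, logbias=latency, no slog device,
 * synchronous 512K write at a volblocksize-aligned offset:
 *   blocksize (131072) > immediate_write_sz (32768), slogging == B_FALSE,
 *   resid >= blocksize, off % blocksize == 0
 * => four WR_INDIRECT records of one volblocksize each; the data itself
 *    is written once, later, via dmu_sync().
 * The same write with a separate log device present (slogging == B_TRUE)
 * falls through to the sync case and is logged as WR_COPIED records of
 * at most ZIL_MAX_LOG_DATA bytes, with the data copied into each itx.
 */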
@@ -1129,7 +1184,8 @@ zvol_strategy(buf_t *bp) error = zvol_dumpio(zv, addr, off, size, doread, B_FALSE); } else if (doread) { - error = dmu_read(os, ZVOL_OBJ, off, size, addr); + error = dmu_read(os, ZVOL_OBJ, off, size, addr, + DMU_READ_PREFETCH); } else { dmu_tx_t *tx = dmu_tx_create(os); dmu_tx_hold_write(tx, ZVOL_OBJ, off, size); @@ -1138,7 +1194,7 @@ zvol_strategy(buf_t *bp) dmu_tx_abort(tx); } else { dmu_write(os, ZVOL_OBJ, off, size, addr, tx); - zvol_log_write(zv, tx, off, size); + zvol_log_write(zv, tx, off, size, sync); dmu_tx_commit(tx); } } @@ -1157,7 +1213,7 @@ zvol_strategy(buf_t *bp) if ((bp->b_resid = resid) == bp->b_bcount) bioerror(bp, off > volsize ? EINVAL : error); - if (!(bp->b_flags & B_ASYNC) && !doread && !zil_disable && !is_dump) + if (sync) zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ); biodone(bp); @@ -1272,6 +1328,7 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr) uint64_t volsize; rl_t *rl; int error = 0; + boolean_t sync; if (minor == 0) /* This is the control device */ return (ENXIO); @@ -1291,6 +1348,8 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr) return (error); } + sync = !(zv->zv_flags & ZVOL_WCE) && !zil_disable; + rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid, RL_WRITER); while (uio->uio_resid > 0 && uio->uio_loffset < volsize) { @@ -1309,13 +1368,15 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr) } error = dmu_write_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes, tx); if (error == 0) - zvol_log_write(zv, tx, off, bytes); + zvol_log_write(zv, tx, off, bytes, sync); dmu_tx_commit(tx); if (error) break; } zfs_range_unlock(rl); + if (sync) + zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ); return (error); } @@ -1398,6 +1459,7 @@ zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) mutex_exit(&zvol_state_lock); return (ENXIO); } + ASSERT(zv->zv_total_opens > 0); switch (cmd) { @@ -1406,6 +1468,7 @@ zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) (void) strcpy(dki.dki_cname, "zvol"); (void) strcpy(dki.dki_dname, "zvol"); dki.dki_ctype = DKC_UNKNOWN; + dki.dki_unit = getminor(dev); dki.dki_maxtransfer = 1 << (SPA_MAXBLOCKSHIFT - zv->zv_min_bs); mutex_exit(&zvol_state_lock); if (ddi_copyout(&dki, (void *)arg, sizeof (dki), flag)) @@ -1434,12 +1497,40 @@ zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) case DKIOCFLUSHWRITECACHE: dkc = (struct dk_callback *)arg; + mutex_exit(&zvol_state_lock); zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ); if ((flag & FKIOCTL) && dkc != NULL && dkc->dkc_callback) { (*dkc->dkc_callback)(dkc->dkc_cookie, error); error = 0; } - break; + return (error); + + case DKIOCGETWCE: + { + int wce = (zv->zv_flags & ZVOL_WCE) ? 
1 : 0; + if (ddi_copyout(&wce, (void *)arg, sizeof (int), + flag)) + error = EFAULT; + break; + } + case DKIOCSETWCE: + { + int wce; + if (ddi_copyin((void *)arg, &wce, sizeof (int), + flag)) { + error = EFAULT; + break; + } + if (wce) { + zv->zv_flags |= ZVOL_WCE; + mutex_exit(&zvol_state_lock); + } else { + zv->zv_flags &= ~ZVOL_WCE; + mutex_exit(&zvol_state_lock); + zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ); + } + return (0); + } case DKIOCGGEOM: case DKIOCGVTOC: @@ -1458,6 +1549,8 @@ zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) break; case DKIOCDUMPFINI: + if (!(zv->zv_flags & ZVOL_DUMPIFIED)) + break; rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize, RL_WRITER); error = zvol_dump_fini(zv); @@ -1493,29 +1586,6 @@ zvol_fini(void) ddi_soft_state_fini(&zvol_state); } -static boolean_t -zvol_is_swap(zvol_state_t *zv) -{ - vnode_t *vp; - boolean_t ret = B_FALSE; - char *devpath; - size_t devpathlen; - int error; - - devpathlen = strlen(ZVOL_FULL_DEV_DIR) + strlen(zv->zv_name) + 1; - devpath = kmem_alloc(devpathlen, KM_SLEEP); - (void) sprintf(devpath, "%s%s", ZVOL_FULL_DEV_DIR, zv->zv_name); - error = lookupname(devpath, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp); - kmem_free(devpath, devpathlen); - - ret = !error && IS_SWAPVP(common_specvp(vp)); - - if (vp != NULL) - VN_RELE(vp); - - return (ret); -} - static int zvol_dump_init(zvol_state_t *zv, boolean_t resize) { @@ -1523,11 +1593,17 @@ zvol_dump_init(zvol_state_t *zv, boolean_t resize) int error = 0; objset_t *os = zv->zv_objset; nvlist_t *nv = NULL; + uint64_t version = spa_version(dmu_objset_spa(zv->zv_objset)); ASSERT(MUTEX_HELD(&zvol_state_lock)); + error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, 0, + DMU_OBJECT_END); + /* wait for dmu_free_long_range to actually free the blocks */ + txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0); tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL); + dmu_tx_hold_bonus(tx, ZVOL_OBJ); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); @@ -1545,7 +1621,7 @@ zvol_dump_init(zvol_state_t *zv, boolean_t resize) zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1, &zv->zv_volsize, tx); } else { - uint64_t checksum, compress, refresrv, vbs; + uint64_t checksum, compress, refresrv, vbs, dedup; error = dsl_prop_get_integer(zv->zv_name, zfs_prop_to_name(ZFS_PROP_COMPRESSION), &compress, NULL); @@ -1555,6 +1631,11 @@ zvol_dump_init(zvol_state_t *zv, boolean_t resize) zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &refresrv, NULL); error = error ? error : dsl_prop_get_integer(zv->zv_name, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &vbs, NULL); + if (version >= SPA_VERSION_DEDUP) { + error = error ? error : + dsl_prop_get_integer(zv->zv_name, + zfs_prop_to_name(ZFS_PROP_DEDUP), &dedup, NULL); + } error = error ? error : zap_update(os, ZVOL_ZAP_OBJ, zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1, @@ -1567,17 +1648,18 @@ zvol_dump_init(zvol_state_t *zv, boolean_t resize) error = error ? error : zap_update(os, ZVOL_ZAP_OBJ, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1, &vbs, tx); + error = error ? error : dmu_object_set_blocksize( + os, ZVOL_OBJ, SPA_MAXBLOCKSIZE, 0, tx); + if (version >= SPA_VERSION_DEDUP) { + error = error ? 
error : zap_update(os, ZVOL_ZAP_OBJ, + zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1, + &dedup, tx); + } + if (error == 0) + zv->zv_volblocksize = SPA_MAXBLOCKSIZE; } dmu_tx_commit(tx); - /* Truncate the file */ - if (!error) - error = dmu_free_long_range(zv->zv_objset, - ZVOL_OBJ, 0, DMU_OBJECT_END); - - if (error) - return (error); - /* * We only need update the zvol's property if we are initializing * the dump area for the first time. @@ -1592,11 +1674,14 @@ zvol_dump_init(zvol_state_t *zv, boolean_t resize) VERIFY(nvlist_add_uint64(nv, zfs_prop_to_name(ZFS_PROP_CHECKSUM), ZIO_CHECKSUM_OFF) == 0); - VERIFY(nvlist_add_uint64(nv, - zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), - SPA_MAXBLOCKSIZE) == 0); + if (version >= SPA_VERSION_DEDUP) { + VERIFY(nvlist_add_uint64(nv, + zfs_prop_to_name(ZFS_PROP_DEDUP), + ZIO_CHECKSUM_OFF) == 0); + } - error = zfs_set_prop_nvlist(zv->zv_name, nv); + error = zfs_set_prop_nvlist(zv->zv_name, ZPROP_SRC_LOCAL, + nv, NULL); nvlist_free(nv); if (error) @@ -1616,15 +1701,9 @@ zvol_dumpify(zvol_state_t *zv) dmu_tx_t *tx; objset_t *os = zv->zv_objset; - if (zv->zv_flags & ZVOL_RDONLY || (zv->zv_mode & DS_MODE_READONLY)) + if (zv->zv_flags & ZVOL_RDONLY) return (EROFS); - /* - * We do not support swap devices acting as dump devices. - */ - if (zvol_is_swap(zv)) - return (ENOTSUP); - if (zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, 8, 1, &dumpsize) != 0 || dumpsize != zv->zv_volsize) { boolean_t resize = (dumpsize > 0) ? B_TRUE : B_FALSE; @@ -1674,7 +1753,8 @@ zvol_dump_fini(zvol_state_t *zv) objset_t *os = zv->zv_objset; nvlist_t *nv; int error = 0; - uint64_t checksum, compress, refresrv, vbs; + uint64_t checksum, compress, refresrv, vbs, dedup; + uint64_t version = spa_version(dmu_objset_spa(zv->zv_objset)); /* * Attempt to restore the zvol back to its pre-dumpified state. @@ -1709,14 +1789,31 @@ zvol_dump_fini(zvol_state_t *zv) zfs_prop_to_name(ZFS_PROP_COMPRESSION), compress); (void) nvlist_add_uint64(nv, zfs_prop_to_name(ZFS_PROP_REFRESERVATION), refresrv); - (void) nvlist_add_uint64(nv, - zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), vbs); - (void) zfs_set_prop_nvlist(zv->zv_name, nv); + if (version >= SPA_VERSION_DEDUP && + zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, + zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1, &dedup) == 0) { + (void) nvlist_add_uint64(nv, + zfs_prop_to_name(ZFS_PROP_DEDUP), dedup); + } + (void) zfs_set_prop_nvlist(zv->zv_name, ZPROP_SRC_LOCAL, + nv, NULL); nvlist_free(nv); zvol_free_extents(zv); zv->zv_flags &= ~ZVOL_DUMPIFIED; (void) dmu_free_long_range(os, ZVOL_OBJ, 0, DMU_OBJECT_END); + /* wait for dmu_free_long_range to actually free the blocks */ + txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0); + tx = dmu_tx_create(os); + dmu_tx_hold_bonus(tx, ZVOL_OBJ); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + return (error); + } + if (dmu_object_set_blocksize(os, ZVOL_OBJ, vbs, 0, tx) == 0) + zv->zv_volblocksize = vbs; + dmu_tx_commit(tx); return (0); } diff --git a/external/cddl/osnet/dist/uts/common/rpc/xdr.c b/external/cddl/osnet/dist/uts/common/rpc/xdr.c index 8514f67300bb7..6720324cddbc0 100644 --- a/external/cddl/osnet/dist/uts/common/rpc/xdr.c +++ b/external/cddl/osnet/dist/uts/common/rpc/xdr.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -306,6 +306,29 @@ xdr_char(XDR *xdrs, char *cp) return (TRUE); } +/* + * XDR an unsigned char + */ +bool_t +xdr_u_char(XDR *xdrs, uchar_t *cp) +{ + int i; + + switch (xdrs->x_op) { + case XDR_ENCODE: + i = (*cp); + return (XDR_PUTINT32(xdrs, &i)); + case XDR_DECODE: + if (!XDR_GETINT32(xdrs, &i)) + return (FALSE); + *cp = (uchar_t)i; + return (TRUE); + case XDR_FREE: + return (TRUE); + } + return (FALSE); +} + /* * XDR booleans * @@ -607,6 +630,32 @@ xdr_string(XDR *xdrs, char **cpp, const uint_t maxsize) return (FALSE); } +/* + * xdr_vector(): + * + * XDR a fixed length array. Unlike variable-length arrays, the storage + * of fixed length arrays is static and unfreeable. + * > basep: base of the array + * > size: size of the array + * > elemsize: size of each element + * > xdr_elem: routine to XDR each element + */ +bool_t +xdr_vector(XDR *xdrs, char *basep, const uint_t nelem, + const uint_t elemsize, const xdrproc_t xdr_elem) +{ + uint_t i; + char *elptr; + + elptr = basep; + for (i = 0; i < nelem; i++) { + if (!(*xdr_elem)(xdrs, elptr, LASTUNSIGNED)) + return (FALSE); + elptr += elemsize; + } + return (TRUE); +} + /* * Wrapper for xdr_string that can be called directly from * routines like clnt_call diff --git a/external/cddl/osnet/dist/uts/common/rpc/xdr.h b/external/cddl/osnet/dist/uts/common/rpc/xdr.h index e335e4b83c428..3db775893c88a 100644 --- a/external/cddl/osnet/dist/uts/common/rpc/xdr.h +++ b/external/cddl/osnet/dist/uts/common/rpc/xdr.h @@ -18,7 +18,7 @@ * * CDDL HEADER END * - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ @@ -420,6 +420,8 @@ extern bool_t xdr_opaque(XDR *, caddr_t, const uint_t); extern bool_t xdr_string(XDR *, char **, const uint_t); extern bool_t xdr_union(XDR *, enum_t *, char *, const struct xdr_discrim *, const xdrproc_t); +extern bool_t xdr_vector(XDR *, char *, const uint_t, const uint_t, + const xdrproc_t); extern unsigned int xdr_sizeof(xdrproc_t, void *); extern bool_t xdr_hyper(XDR *, longlong_t *); @@ -428,6 +430,7 @@ extern bool_t xdr_u_hyper(XDR *, u_longlong_t *); extern bool_t xdr_u_longlong_t(XDR *, u_longlong_t *); extern bool_t xdr_char(XDR *, char *); +extern bool_t xdr_u_char(XDR *, uchar_t *); extern bool_t xdr_wrapstring(XDR *, char **); extern bool_t xdr_reference(XDR *, caddr_t *, uint_t, const xdrproc_t); extern bool_t xdr_pointer(XDR *, char **, uint_t, const xdrproc_t); @@ -446,9 +449,6 @@ extern bool_t xdr_uint64_t(XDR *, uint64_t *); #endif #ifndef _KERNEL -extern bool_t xdr_u_char(XDR *, uchar_t *); -extern bool_t xdr_vector(XDR *, char *, const uint_t, const uint_t, const -xdrproc_t); extern bool_t xdr_float(XDR *, float *); extern bool_t xdr_double(XDR *, double *); extern bool_t xdr_quadruple(XDR *, long double *); @@ -468,12 +468,14 @@ extern bool_t xdr_bytes(); extern bool_t xdr_opaque(); extern bool_t xdr_string(); extern bool_t xdr_union(); +extern bool_t xdr_vector(); extern bool_t xdr_hyper(); extern bool_t xdr_longlong_t(); extern bool_t xdr_u_hyper(); extern bool_t xdr_u_longlong_t(); extern bool_t xdr_char(); +extern bool_t xdr_u_char(); extern bool_t xdr_reference(); extern bool_t xdr_pointer(); extern void xdr_free(); @@ -492,8 +494,6 @@ extern bool_t xdr_uint64_t(); #endif #ifndef _KERNEL -extern bool_t xdr_u_char(); -extern bool_t xdr_vector(); extern bool_t xdr_float(); extern bool_t xdr_double(); extern bool_t 
xdr_quadruple(); @@ -585,6 +585,8 @@ extern uint_t xdrrec_readbytes(); #endif #else +#define DLEN(mp) (mp->b_cont ? msgdsize(mp) : (mp->b_wptr - mp->b_rptr)) + extern void xdrmem_create(XDR *, caddr_t, uint_t, enum xdr_op); extern void xdrmblk_init(XDR *, mblk_t *, enum xdr_op, int); extern bool_t xdrmblk_getmblk(XDR *, mblk_t **, uint_t *); diff --git a/external/cddl/osnet/dist/uts/common/sys/acl.h b/external/cddl/osnet/dist/uts/common/sys/acl.h index 27fd577371a97..35c9772b8e9be 100644 --- a/external/cddl/osnet/dist/uts/common/sys/acl.h +++ b/external/cddl/osnet/dist/uts/common/sys/acl.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_ACL_H #define _SYS_ACL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include @@ -156,6 +154,10 @@ typedef struct ace_object { ACE_WRITE_ATTRIBUTES|ACE_DELETE|ACE_READ_ACL|ACE_WRITE_ACL| \ ACE_WRITE_OWNER|ACE_SYNCHRONIZE) +#define ACE_ALL_WRITE_PERMS (ACE_WRITE_DATA|ACE_APPEND_DATA| \ + ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS|ACE_WRITE_ACL| \ + ACE_WRITE_OWNER|ACE_DELETE|ACE_DELETE_CHILD) + #define ACE_READ_PERMS (ACE_READ_DATA|ACE_READ_ACL|ACE_READ_ATTRIBUTES| \ ACE_READ_NAMED_ATTRS) diff --git a/external/cddl/osnet/dist/uts/common/sys/attr.h b/external/cddl/osnet/dist/uts/common/sys/attr.h index 86c4cd5d6c80b..b312b5a4297f1 100644 --- a/external/cddl/osnet/dist/uts/common/sys/attr.h +++ b/external/cddl/osnet/dist/uts/common/sys/attr.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_ATTR_H #define _SYS_ATTR_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -56,6 +54,7 @@ extern "C" { #define A_AV_SCANSTAMP "av_scanstamp" #define A_OWNERSID "ownersid" #define A_GROUPSID "groupsid" +#define A_REPARSE_POINT "reparse" /* Attribute option for utilities */ #define O_HIDDEN "H" @@ -68,6 +67,7 @@ extern "C" { #define O_NODUMP "d" #define O_AV_QUARANTINED "q" #define O_AV_MODIFIED "m" +#define O_REPARSE_POINT "r" #define O_NONE "" /* ownersid and groupsid are composed of two nvpairs */ @@ -92,6 +92,7 @@ typedef enum { F_OWNERSID, F_GROUPSID, F_FSID, + F_REPARSE, F_ATTR_ALL } f_attr_t; diff --git a/external/cddl/osnet/dist/uts/common/sys/avl.h b/external/cddl/osnet/dist/uts/common/sys/avl.h index 02263a5a0cf14..ba305c9082392 100644 --- a/external/cddl/osnet/dist/uts/common/sys/avl.h +++ b/external/cddl/osnet/dist/uts/common/sys/avl.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _AVL_H #define _AVL_H -#pragma ident "%Z%%M% %I% %E% SMI" - /* * This is a private header file. Applications should not directly include * this file. @@ -163,7 +161,7 @@ extern void avl_create(avl_tree_t *tree, * node - node that has the value being looked for * where - position for use with avl_nearest() or avl_insert(), may be NULL */ -extern void *avl_find(avl_tree_t *tree, void *node, avl_index_t *where); +extern void *avl_find(avl_tree_t *tree, const void *node, avl_index_t *where); /* * Insert a node into the tree. 
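A minimal usage sketch for the xdr_vector() and xdr_u_char() routines made available above. The helper name and the 16-byte key are hypothetical; each element is marshalled as a full XDR int, as the xdr_u_char() body shows:

static bool_t
xdr_example_key(XDR *xdrs, uchar_t key[16])
{
	/* fixed-length array: storage is caller-owned, nothing to free */
	return (xdr_vector(xdrs, (char *)key, 16, sizeof (uchar_t),
	    (xdrproc_t)xdr_u_char));
}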
diff --git a/external/cddl/osnet/dist/uts/common/sys/byteorder.h b/external/cddl/osnet/dist/uts/common/sys/byteorder.h index 2f4545c65da36..fd9f8a1d98289 100644 --- a/external/cddl/osnet/dist/uts/common/sys/byteorder.h +++ b/external/cddl/osnet/dist/uts/common/sys/byteorder.h @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -181,14 +181,13 @@ extern uint64_t ntohll(uint64_t); #define BE_IN32(xa) htonl(*((uint32_t *)(void *)(xa))) #endif /* !__i386 && !__amd64 */ -#if !defined(_LP64) && !defined(_LONGLONG_TYPE) -#if (!defined(__i386) && !defined(__amd64)) /* sparc */ +#if (!defined(__i386) && !defined(__amd64)) || \ + (!defined(_LP64) && !defined(_LONGLONG_TYPE)) #define BE_IN64(xa) \ (((uint64_t)BE_IN32(xa) << 32) | BE_IN32((uint8_t *)(xa) + 4)) #else /* x86 */ #define BE_IN64(xa) htonll(*((uint64_t *)(void *)(xa))) -#endif /* (!__i386 && !__amd64) */ -#endif /* !_LP64 && !_LONGLONG_TYPE */ +#endif /* (!__i386 && !__amd64) || (!_LP64 && !_LONGLONG_TYPE) */ #define LE_IN8(xa) \ *((uint8_t *)(xa)) diff --git a/external/cddl/osnet/dist/uts/common/sys/callb.h b/external/cddl/osnet/dist/uts/common/sys/callb.h index b548f4ca23b2e..302f314b800ae 100644 --- a/external/cddl/osnet/dist/uts/common/sys/callb.h +++ b/external/cddl/osnet/dist/uts/common/sys/callb.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_CALLB_H #define _SYS_CALLB_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include @@ -69,7 +66,8 @@ extern "C" { #define CB_CL_MDBOOT CB_CL_UADMIN #define CB_CL_ENTER_DEBUGGER 14 #define CB_CL_CPR_POST_KERNEL 15 -#define NCBCLASS 16 /* CHANGE ME if classes are added/removed */ +#define CB_CL_CPU_DEEP_IDLE 16 +#define NCBCLASS 17 /* CHANGE ME if classes are added/removed */ /* * CB_CL_CPR_DAEMON class specific definitions are given below: diff --git a/external/cddl/osnet/dist/uts/common/sys/cpupart.h b/external/cddl/osnet/dist/uts/common/sys/cpupart.h index b9e0da4e1993e..508637fa2680b 100644 --- a/external/cddl/osnet/dist/uts/common/sys/cpupart.h +++ b/external/cddl/osnet/dist/uts/common/sys/cpupart.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _SYS_CPUPART_H #define _SYS_CPUPART_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -58,16 +56,6 @@ typedef int cpupartid_t; #define CP_ALL 0 /* return all cpu partitions */ #define CP_NONEMPTY 1 /* return only non-empty ones */ -#if defined(_MACHDEP) -struct mach_cpupart { - cpuset_t mc_haltset; -}; - -extern struct mach_cpupart cp_default_mach; -#else -struct mach_cpupart; -#endif - typedef struct cpupart { disp_t cp_kp_queue; /* partition-wide kpreempt queue */ cpupartid_t cp_id; /* partition ID */ @@ -103,8 +91,7 @@ typedef struct cpupart { lgrp_gen_t cp_gen; /* generation number */ lgrp_id_t cp_lgrp_hint; /* last home lgroup chosen */ bitset_t cp_cmt_pgs; /* CMT PGs represented */ - - struct mach_cpupart *cp_mach; /* mach-specific */ + bitset_t cp_haltset; /* halted CPUs */ } cpupart_t; typedef struct cpupart_kstat { diff --git a/external/cddl/osnet/dist/uts/common/sys/cpuvar.h b/external/cddl/osnet/dist/uts/common/sys/cpuvar.h index 0a038e00d0e44..b52192b4197f0 100644 --- a/external/cddl/osnet/dist/uts/common/sys/cpuvar.h +++ b/external/cddl/osnet/dist/uts/common/sys/cpuvar.h @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -168,7 +168,7 @@ typedef struct cpu { ftrace_data_t cpu_ftrace; /* per cpu ftrace data */ - clock_t cpu_deadman_lbolt; /* used by deadman() */ + clock_t cpu_deadman_counter; /* used by deadman() */ uint_t cpu_deadman_countdown; /* used by deadman() */ kmutex_t cpu_cpc_ctxlock; /* protects context for idle thread */ @@ -211,12 +211,27 @@ typedef struct cpu { uint64_t cpu_curr_clock; /* current clock freq in Hz */ char *cpu_supp_freqs; /* supported freqs in Hz */ + uintptr_t cpu_cpcprofile_pc; /* kernel PC in cpc interrupt */ + uintptr_t cpu_cpcprofile_upc; /* user PC in cpc interrupt */ + /* * Interrupt load factor used by dispatcher & softcall */ hrtime_t cpu_intrlast; /* total interrupt time (nsec) */ int cpu_intrload; /* interrupt load factor (0-99%) */ + uint_t cpu_rotor; /* for cheap pseudo-random numbers */ + + struct cu_cpu_info *cpu_cu_info; /* capacity & util. info */ + + /* + * cpu_generation is updated whenever CPU goes on-line or off-line. + * Updates to cpu_generation are protected by cpu_lock. + * + * See CPU_NEW_GENERATION() macro below. + */ + volatile uint_t cpu_generation; /* tracking on/off-line */ + /* * New members must be added /before/ this member, as the CTF tools * rely on this being the last field before cpu_m, so they can @@ -238,12 +253,13 @@ typedef struct cpu { * is up to the platform to assure that this is performed properly. Note that * the structure is sized to avoid false sharing. */ -#define CPUC_SIZE (sizeof (uint16_t) + sizeof (uintptr_t) + \ - sizeof (kmutex_t)) +#define CPUC_SIZE (sizeof (uint16_t) + sizeof (uint8_t) + \ + sizeof (uintptr_t) + sizeof (kmutex_t)) #define CPUC_PADSIZE CPU_CACHE_COHERENCE_SIZE - CPUC_SIZE typedef struct cpu_core { uint16_t cpuc_dtrace_flags; /* DTrace flags */ + uint8_t cpuc_dcpc_intr_state; /* DCPC provider intr state */ uint8_t cpuc_pad[CPUC_PADSIZE]; /* padding */ uintptr_t cpuc_dtrace_illval; /* DTrace illegal value */ kmutex_t cpuc_pid_lock; /* DTrace pid provider lock */ @@ -261,6 +277,28 @@ extern cpu_core_t cpu_core[]; */ #define CPU_ON_INTR(cpup) ((cpup)->cpu_intr_actv >> (LOCK_LEVEL + 1)) +/* + * Check to see if an interrupt thread might be active at a given ipl. + * If so return true. 
+ * We must be conservative--it is ok to give a false yes, but a false no + * will cause disaster. (But if the situation changes after we check it is + * ok--the caller is trying to ensure that an interrupt routine has been + * exited). + * This is used when trying to remove an interrupt handler from an autovector + * list in avintr.c. + */ +#define INTR_ACTIVE(cpup, level) \ + ((level) <= LOCK_LEVEL ? \ + ((cpup)->cpu_intr_actv & (1 << (level))) : (CPU_ON_INTR(cpup))) + +/* + * CPU_PSEUDO_RANDOM() returns a per CPU value that changes each time one + * looks at it. It's meant as a cheap mechanism to be incorporated in routines + * wanting to avoid biasing, but where true randomness isn't needed (just + * something that changes). + */ +#define CPU_PSEUDO_RANDOM() (CPU->cpu_rotor++) + #if defined(_KERNEL) || defined(_KMEMUSER) #define INTR_STACK_SIZE MAX(DEFAULTSTKSZ, PAGESIZE) @@ -352,7 +390,6 @@ extern cpu_core_t cpu_core[]; #define CPU_DISP_DONTSTEAL 0x01 /* CPU undergoing context swtch */ #define CPU_DISP_HALTED 0x02 /* CPU halted waiting for interrupt */ - #endif /* _KERNEL || _KMEMUSER */ #if (defined(_KERNEL) || defined(_KMEMUSER)) && defined(_MACHDEP) @@ -516,6 +553,7 @@ extern cpuset_t cpu_seqid_inuse; #if defined(_KERNEL) || defined(_KMEMUSER) extern struct cpu *cpu[]; /* indexed by CPU number */ +extern struct cpu **cpu_seq; /* indexed by sequential CPU id */ extern cpu_t *cpu_list; /* list of CPUs */ extern cpu_t *cpu_active; /* list of active CPUs */ extern int ncpus; /* number of CPUs present */ @@ -569,6 +607,13 @@ extern struct cpu *curcpup(void); #define CPU_STATS(cp, stat) \ ((cp)->cpu_stats.stat) +/* + * Increment CPU generation value. + * This macro should be called whenever CPU goes on-line or off-line. + * Updates to cpu_generation should be protected by cpu_lock. + */ +#define CPU_NEW_GENERATION(cp) ((cp)->cpu_generation++) + #endif /* _KERNEL || _KMEMUSER */ /* @@ -658,6 +703,7 @@ int cpu_get_state(cpu_t *); /* get current cpu state */ const char *cpu_get_state_str(cpu_t *); /* get current cpu state as string */ +void cpu_set_curr_clock(uint64_t); /* indicate the current CPU's freq */ void cpu_set_supp_freqs(cpu_t *, const char *); /* set the CPU supported */ /* frequencies */ @@ -697,6 +743,49 @@ void cpu_enable_intr(struct cpu *cp); /* start issuing interrupts to cpu */ */ extern kmutex_t cpu_lock; /* lock protecting CPU data */ +/* + * CPU state change events + * + * Various subsystems need to know when CPUs change their state. They get this + * information by registering CPU state change callbacks using + * register_cpu_setup_func(). Whenever any CPU changes its state, the callback + * function is called. The callback function is passed three arguments: + * + * Event, described by cpu_setup_t + * CPU ID + * Transparent pointer passed when registering the callback + * + * The callback function is called with cpu_lock held. The return value from the + * callback function is usually ignored, except for CPU_CONFIG and CPU_UNCONFIG + * events. For these two events, non-zero return value indicates a failure and + * prevents successful completion of the operation. + * + * New events may be added in the future. Callback functions should ignore any + * events that they do not understand. 
+ * + * The following events provide notification callbacks: + * + * CPU_INIT A new CPU is started and added to the list of active CPUs + * This event is only used during boot + * + * CPU_CONFIG A newly inserted CPU is prepared for starting running code + * This event is called by DR code + * + * CPU_UNCONFIG CPU has been powered off and needs cleanup + * This event is called by DR code + * + * CPU_ON CPU is enabled but does not run anything yet + * + * CPU_INTR_ON CPU is enabled and has interrupts enabled + * + * CPU_OFF CPU is going offline but can still run threads + * + * CPU_CPUPART_OUT CPU is going to move out of its partition + * + * CPU_CPUPART_IN CPU is going to move to a new partition + * + * CPU_SETUP CPU is set up during boot and can run threads + */ typedef enum { CPU_INIT, CPU_CONFIG, @@ -704,7 +793,9 @@ typedef enum { CPU_ON, CPU_OFF, CPU_CPUPART_IN, - CPU_CPUPART_OUT + CPU_CPUPART_OUT, + CPU_SETUP, + CPU_INTR_ON } cpu_setup_t; typedef int cpu_setup_func_t(cpu_setup_t, int, void *); @@ -717,6 +808,13 @@ extern void register_cpu_setup_func(cpu_setup_func_t *, void *); extern void unregister_cpu_setup_func(cpu_setup_func_t *, void *); extern void cpu_state_change_notify(int, cpu_setup_t); +/* + * Call specified function on the given CPU + */ +typedef void (*cpu_call_func_t)(uintptr_t, uintptr_t); +extern void cpu_call(cpu_t *, cpu_call_func_t, uintptr_t, uintptr_t); + + /* * Create various strings that describe the given CPU for the * processor_info system call and configuration-related kstats. diff --git a/external/cddl/osnet/dist/uts/common/sys/cred.h b/external/cddl/osnet/dist/uts/common/sys/cred.h index e84f1e04305dd..5056f9a511053 100644 --- a/external/cddl/osnet/dist/uts/common/sys/cred.h +++ b/external/cddl/osnet/dist/uts/common/sys/cred.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -34,8 +34,6 @@ #ifndef _SYS_CRED_H #define _SYS_CRED_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #ifdef __cplusplus @@ -58,6 +56,7 @@ struct prcred; struct ksid; struct ksidlist; struct credklpd; +struct credgrp; struct auditinfo_addr; /* cred.h is included in audit.h */ @@ -79,6 +78,7 @@ extern cred_t *crdup(cred_t *); extern void crdup_to(cred_t *, cred_t *); extern cred_t *crgetcred(void); extern void crset(struct proc *, cred_t *); +extern void crset_zone_privall(cred_t *); extern int groupmember(gid_t, const cred_t *); extern int supgroupmember(gid_t, const cred_t *); extern int hasprocperm(const cred_t *, const cred_t *); @@ -104,6 +104,7 @@ extern struct auditinfo_addr *crgetauinfo_modifiable(cred_t *); extern uint_t crgetref(const cred_t *); extern const gid_t *crgetgroups(const cred_t *); +extern const gid_t *crgetggroups(const struct credgrp *); extern int crgetngroups(const cred_t *); @@ -120,7 +121,13 @@ extern int crsetresgid(cred_t *, gid_t, gid_t, gid_t); */ extern int crsetugid(cred_t *, uid_t, gid_t); +/* + * Functions to handle the supplemental group list. + */ extern int crsetgroups(cred_t *, int, gid_t *); +extern struct credgrp *crgrpcopyin(int, gid_t *); +extern void crgrprele(struct credgrp *); +extern void crsetcredgrp(cred_t *, struct credgrp *); /* * Private interface for setting zone association of credential. 
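The cpuvar.h hunk above spells out the CPU state-change callback contract: a callback registered with register_cpu_setup_func() receives the event (cpu_setup_t), the CPU id, and the opaque argument supplied at registration, is invoked with cpu_lock held, and may veto CPU_CONFIG/CPU_UNCONFIG by returning non-zero. A minimal consumer might look like the sketch below; it is illustrative only and not part of the patch, my_cpu_alloc()/my_cpu_free() are hypothetical per-CPU bookkeeping hooks, and the usual illumos/Solaris kernel environment is assumed.

	#include <sys/types.h>
	#include <sys/mutex.h>
	#include <sys/cpuvar.h>

	/* Hypothetical per-CPU bookkeeping hooks, for illustration only. */
	extern int  my_cpu_alloc(int);		/* returns non-zero on failure */
	extern void my_cpu_free(int);

	/*
	 * cpu_setup_func_t callback: called with cpu_lock held.  A non-zero
	 * return from the CPU_CONFIG/CPU_UNCONFIG cases aborts the
	 * (un)configuration; the return value is ignored for other events.
	 */
	static int
	my_cpu_setup(cpu_setup_t what, int cpuid, void *arg)
	{
		switch (what) {
		case CPU_CONFIG:
			if (my_cpu_alloc(cpuid) != 0)
				return (-1);	/* veto the configuration */
			break;
		case CPU_UNCONFIG:
			my_cpu_free(cpuid);
			break;
		default:
			break;			/* ignore unknown events */
		}
		return (0);
	}

	static void
	my_subsystem_cpu_init(void)
	{
		/* Registration is conventionally done under cpu_lock. */
		mutex_enter(&cpu_lock);
		register_cpu_setup_func(my_cpu_setup, NULL);
		mutex_exit(&cpu_lock);
	}

Unregistration is symmetric via unregister_cpu_setup_func(); per the comment in the hunk, callbacks should ignore events they do not recognize, since new cpu_setup_t values (such as CPU_SETUP and CPU_INTR_ON added here) may appear over time.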
diff --git a/external/cddl/osnet/dist/uts/common/sys/debug.h b/external/cddl/osnet/dist/uts/common/sys/debug.h index c156e7c463c3b..4de39d255e71e 100644 --- a/external/cddl/osnet/dist/uts/common/sys/debug.h +++ b/external/cddl/osnet/dist/uts/common/sys/debug.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,21 +19,19 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ - #ifndef _SYS_DEBUG_H #define _SYS_DEBUG_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include +#include #ifdef __cplusplus extern "C" { @@ -51,7 +48,7 @@ extern "C" { extern int assfail(const char *, const char *, int); #define VERIFY(EX) ((void)((EX) || assfail(#EX, __FILE__, __LINE__))) #if DEBUG -#define ASSERT(EX) VERIFY(EX) +#define ASSERT(EX) ((void)((EX) || assfail(#EX, __FILE__, __LINE__))) #else #define ASSERT(x) ((void)0) #endif @@ -59,7 +56,7 @@ extern int assfail(const char *, const char *, int); extern int assfail(); #define VERIFY(EX) ((void)((EX) || assfail("EX", __FILE__, __LINE__))) #if DEBUG -#define ASSERT(EX) VERIFY(EX) +#define ASSERT(EX) ((void)((EX) || assfail("EX", __FILE__, __LINE__))) #else #define ASSERT(x) ((void)0) #endif @@ -76,6 +73,25 @@ extern int assfail(); #define ASSERT32(x) ASSERT(x) #endif +/* + * IMPLY and EQUIV are assertions of the form: + * + * if (a) then (b) + * and + * if (a) then (b) *AND* if (b) then (a) + */ +#if DEBUG +#define IMPLY(A, B) \ + ((void)(((!(A)) || (B)) || \ + assfail("(" #A ") implies (" #B ")", __FILE__, __LINE__))) +#define EQUIV(A, B) \ + ((void)((!!(A) == !!(B)) || \ + assfail("(" #A ") is equivalent to (" #B ")", __FILE__, __LINE__))) +#else +#define IMPLY(A, B) ((void)0) +#define EQUIV(A, B) ((void)0) +#endif + /* * ASSERT3() behaves like ASSERT() except that it is an explicit conditional, * and prints out the values of the left and right hand expressions as part of @@ -99,9 +115,9 @@ _NOTE(CONSTCOND) } while (0) #define VERIFY3U(x, y, z) VERIFY3_IMPL(x, y, z, uint64_t) #define VERIFY3P(x, y, z) VERIFY3_IMPL(x, y, z, uintptr_t) #if DEBUG -#define ASSERT3S(x, y, z) VERIFY3S(x, y, z) -#define ASSERT3U(x, y, z) VERIFY3U(x, y, z) -#define ASSERT3P(x, y, z) VERIFY3P(x, y, z) +#define ASSERT3S(x, y, z) VERIFY3_IMPL(x, y, z, int64_t) +#define ASSERT3U(x, y, z) VERIFY3_IMPL(x, y, z, uint64_t) +#define ASSERT3P(x, y, z) VERIFY3_IMPL(x, y, z, uintptr_t) #else #define ASSERT3S(x, y, z) ((void)0) #define ASSERT3U(x, y, z) ((void)0) diff --git a/external/cddl/osnet/dist/uts/common/sys/dklabel.h b/external/cddl/osnet/dist/uts/common/sys/dklabel.h index 01baa7157cafc..457c1ecadc938 100644 --- a/external/cddl/osnet/dist/uts/common/sys/dklabel.h +++ b/external/cddl/osnet/dist/uts/common/sys/dklabel.h @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -207,7 +207,7 @@ struct dk_label { uint16_t dkl_ncyl; /* # of data cylinders */ uint16_t dkl_acyl; /* # of alternate cylinders */ uint16_t dkl_nhead; /* # of heads in this partition */ - uint16_t dkl_nsect; /* # of 512 byte sectors per track */ + uint16_t dkl_nsect; /* # of sectors per track */ uint16_t dkl_obs3; /* obsolete */ uint16_t dkl_obs4; /* obsolete */ struct dk_map32 dkl_map[NDKMAP]; /* logical partition headers */ diff --git a/external/cddl/osnet/dist/uts/common/sys/dtrace.h b/external/cddl/osnet/dist/uts/common/sys/dtrace.h index b6e52ec1c4da5..007502d7d8562 100644 --- a/external/cddl/osnet/dist/uts/common/sys/dtrace.h +++ b/external/cddl/osnet/dist/uts/common/sys/dtrace.h @@ -20,15 +20,13 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_DTRACE_H #define _SYS_DTRACE_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -664,6 +662,20 @@ typedef struct dof_sec { #define DOF_SECF_LOAD 1 /* section should be loaded */ +#define DOF_SEC_ISLOADABLE(x) \ + (((x) == DOF_SECT_ECBDESC) || ((x) == DOF_SECT_PROBEDESC) || \ + ((x) == DOF_SECT_ACTDESC) || ((x) == DOF_SECT_DIFOHDR) || \ + ((x) == DOF_SECT_DIF) || ((x) == DOF_SECT_STRTAB) || \ + ((x) == DOF_SECT_VARTAB) || ((x) == DOF_SECT_RELTAB) || \ + ((x) == DOF_SECT_TYPTAB) || ((x) == DOF_SECT_URELHDR) || \ + ((x) == DOF_SECT_KRELHDR) || ((x) == DOF_SECT_OPTDESC) || \ + ((x) == DOF_SECT_PROVIDER) || ((x) == DOF_SECT_PROBES) || \ + ((x) == DOF_SECT_PRARGS) || ((x) == DOF_SECT_PROFFS) || \ + ((x) == DOF_SECT_INTTAB) || ((x) == DOF_SECT_XLTAB) || \ + ((x) == DOF_SECT_XLMEMBERS) || ((x) == DOF_SECT_XLIMPORT) || \ + ((x) == DOF_SECT_XLIMPORT) || ((x) == DOF_SECT_XLEXPORT) || \ + ((x) == DOF_SECT_PREXPORT) || ((x) == DOF_SECT_PRENOFFS)) + typedef struct dof_ecbdesc { dof_secidx_t dofe_probes; /* link to DOF_SECT_PROBEDESC */ dof_secidx_t dofe_pred; /* link to DOF_SECT_DIFOHDR */ @@ -1382,7 +1394,7 @@ typedef struct dof_helper { * dtps_provide_module(); see "Arguments and Notes" for dtrace_register(), * below. * - * 1.4 void dtps_enable(void *arg, dtrace_id_t id, void *parg) + * 1.4 int dtps_enable(void *arg, dtrace_id_t id, void *parg) * * 1.4.1 Overview * @@ -1403,7 +1415,8 @@ typedef struct dof_helper { * * 1.4.3 Return value * - * None. + * On success, dtps_enable() should return 0. On failure, -1 should be + * returned. * * 1.4.4 Caller's context * @@ -1957,7 +1970,7 @@ typedef struct dof_helper { typedef struct dtrace_pops { void (*dtps_provide)(void *arg, const dtrace_probedesc_t *spec); void (*dtps_provide_module)(void *arg, struct modctl *mp); - void (*dtps_enable)(void *arg, dtrace_id_t id, void *parg); + int (*dtps_enable)(void *arg, dtrace_id_t id, void *parg); void (*dtps_disable)(void *arg, dtrace_id_t id, void *parg); void (*dtps_suspend)(void *arg, dtrace_id_t id, void *parg); void (*dtps_resume)(void *arg, dtrace_id_t id, void *parg); diff --git a/external/cddl/osnet/dist/uts/common/sys/fm/fs/zfs.h b/external/cddl/osnet/dist/uts/common/sys/fm/fs/zfs.h index 66ca9c5d7108b..c752edc99bbd9 100644 --- a/external/cddl/osnet/dist/uts/common/sys/fm/fs/zfs.h +++ b/external/cddl/osnet/dist/uts/common/sys/fm/fs/zfs.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _SYS_FM_FS_ZFS_H #define _SYS_FM_FS_ZFS_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -57,6 +55,7 @@ extern "C" { #define FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE "vdev_type" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH "vdev_path" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID "vdev_devid" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU "vdev_fru" #define FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID "parent_guid" #define FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE "parent_type" #define FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH "parent_path" @@ -69,6 +68,18 @@ extern "C" { #define FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET "zio_offset" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE "zio_size" #define FM_EREPORT_PAYLOAD_ZFS_PREV_STATE "prev_state" +#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED "cksum_expected" +#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL "cksum_actual" +#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO "cksum_algorithm" +#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP "cksum_byteswap" +#define FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES "bad_ranges" +#define FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_MIN_GAP "bad_ranges_min_gap" +#define FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_SETS "bad_range_sets" +#define FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS "bad_range_clears" +#define FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS "bad_set_bits" +#define FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS "bad_cleared_bits" +#define FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM "bad_set_histogram" +#define FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM "bad_cleared_histogram" #define FM_EREPORT_FAILMODE_WAIT "wait" #define FM_EREPORT_FAILMODE_CONTINUE "continue" @@ -76,6 +87,7 @@ extern "C" { #define FM_RESOURCE_REMOVED "removed" #define FM_RESOURCE_AUTOREPLACE "autoreplace" +#define FM_RESOURCE_STATECHANGE "statechange" #ifdef __cplusplus } diff --git a/external/cddl/osnet/dist/uts/common/sys/fm/protocol.h b/external/cddl/osnet/dist/uts/common/sys/fm/protocol.h index 388951bfce65f..fbf614caa20e9 100644 --- a/external/cddl/osnet/dist/uts/common/sys/fm/protocol.h +++ b/external/cddl/osnet/dist/uts/common/sys/fm/protocol.h @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -47,6 +47,7 @@ extern "C" { /* FM event class values */ #define FM_EREPORT_CLASS "ereport" #define FM_FAULT_CLASS "fault" +#define FM_DEFECT_CLASS "defect" #define FM_RSRC_CLASS "resource" #define FM_LIST_EVENT "list" @@ -83,9 +84,11 @@ extern "C" { #define FM_SUSPECT_FAULT_LIST "fault-list" #define FM_SUSPECT_FAULT_SZ "fault-list-sz" #define FM_SUSPECT_FAULT_STATUS "fault-status" +#define FM_SUSPECT_INJECTED "__injected" #define FM_SUSPECT_MESSAGE "message" #define FM_SUSPECT_RETIRE "retire" #define FM_SUSPECT_RESPONSE "response" +#define FM_SUSPECT_SEVERITY "severity" #define FM_SUSPECT_VERS0 0 #define FM_SUSPECT_VERSION FM_SUSPECT_VERS0 @@ -121,6 +124,7 @@ extern "C" { #define FM_RSRC_ASRU_REPAIRED "repaired" #define FM_RSRC_ASRU_REPLACED "replaced" #define FM_RSRC_ASRU_ACQUITTED "acquitted" +#define FM_RSRC_ASRU_RESOLVED "resolved" #define FM_RSRC_ASRU_UNUSABLE "unusable" #define FM_RSRC_ASRU_EVENT "event" @@ -129,6 +133,8 @@ extern "C" { #define FM_RSRC_XPRT_VERSION FM_RSRC_XPRT_VERS0 #define FM_RSRC_XPRT_UUID "uuid" #define FM_RSRC_XPRT_SUBCLASS "subclass" +#define FM_RSRC_XPRT_FAULT_STATUS "fault-status" +#define FM_RSRC_XPRT_FAULT_HAS_ASRU "fault-has-asru" /* * FM ENA Format Macros @@ -167,6 +173,7 @@ extern "C" { /* FMRI authority-type member names */ #define FM_FMRI_AUTH_CHASSIS "chassis-id" +#define FM_FMRI_AUTH_PRODUCT_SN "product-sn" #define FM_FMRI_AUTH_PRODUCT "product-id" #define FM_FMRI_AUTH_DOMAIN "domain-id" #define FM_FMRI_AUTH_SERVER "server-id" @@ -205,6 +212,8 @@ extern "C" { #define FM_PKG_SCHEME_VERSION PKG_SCHEME_VERSION0 #define LEGACY_SCHEME_VERSION0 0 #define FM_LEGACY_SCHEME_VERSION LEGACY_SCHEME_VERSION0 +#define SVC_SCHEME_VERSION0 0 +#define FM_SVC_SCHEME_VERSION SVC_SCHEME_VERSION0 #define ZFS_SCHEME_VERSION0 0 #define FM_ZFS_SCHEME_VERSION ZFS_SCHEME_VERSION0 @@ -246,14 +255,13 @@ extern "C" { #define FM_FMRI_PKG_VERSION "pkg-version" /* svc scheme member names */ -#define FM_FMRI_SVC_NAME "service-name" -#define FM_FMRI_SVC_VERSION "service-version" -#define FM_FMRI_SVC_INSTANCE "instance" -#define FM_FMRI_SVC_CONTRACT_ID "contract-id" +#define FM_FMRI_SVC_NAME "svc-name" +#define FM_FMRI_SVC_INSTANCE "svc-instance" +#define FM_FMRI_SVC_CONTRACT_ID "svc-contract-id" /* svc-authority member names */ #define FM_FMRI_SVC_AUTH_SCOPE "scope" -#define FM_FMRI_SVC_AUTH_SYSTEM_FQN "system-FQN" +#define FM_FMRI_SVC_AUTH_SYSTEM_FQN "system-fqn" /* cpu scheme member names */ #define FM_FMRI_CPU_ID "cpuid" @@ -316,6 +324,8 @@ extern void fm_fmri_mem_set(nvlist_t *, int, const nvlist_t *, const char *, extern void fm_authority_set(nvlist_t *, int, const char *, const char *, const char *, const char *); extern void fm_fmri_zfs_set(nvlist_t *, int, uint64_t, uint64_t); +extern void fm_fmri_hc_create(nvlist_t *, int, const nvlist_t *, nvlist_t *, + nvlist_t *, int, ...); extern uint64_t fm_ena_increment(uint64_t); extern uint64_t fm_ena_generate(uint64_t, uchar_t); diff --git a/external/cddl/osnet/dist/uts/common/sys/fs/zfs.h b/external/cddl/osnet/dist/uts/common/sys/fs/zfs.h index 95f04d842efa1..e986759a2d3eb 100644 --- a/external/cddl/osnet/dist/uts/common/sys/fs/zfs.h +++ b/external/cddl/osnet/dist/uts/common/sys/fs/zfs.h @@ -18,14 +18,17 @@ * * CDDL HEADER END */ + /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _SYS_FS_ZFS_H #define _SYS_FS_ZFS_H +#include + #ifdef __cplusplus extern "C" { #endif @@ -48,6 +51,10 @@ typedef enum { #define ZFS_TYPE_DATASET \ (ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME | ZFS_TYPE_SNAPSHOT) +#define ZAP_MAXNAMELEN 256 +#define ZAP_MAXVALUELEN (1024 * 8) +#define ZAP_OLDMAXVALUELEN 1024 + /* * Dataset properties are identified by these constants and must be added to * the end of this list to ensure that external consumers are not affected @@ -105,9 +112,28 @@ typedef enum { ZFS_PROP_USEDDS, ZFS_PROP_USEDCHILD, ZFS_PROP_USEDREFRESERV, + ZFS_PROP_USERACCOUNTING, /* not exposed to the user */ + ZFS_PROP_STMF_SHAREINFO, /* not exposed to the user */ + ZFS_PROP_DEFER_DESTROY, + ZFS_PROP_USERREFS, + ZFS_PROP_LOGBIAS, + ZFS_PROP_UNIQUE, /* not exposed to the user */ + ZFS_PROP_OBJSETID, /* not exposed to the user */ + ZFS_PROP_DEDUP, + ZFS_PROP_MLSLABEL, ZFS_NUM_PROPS } zfs_prop_t; +typedef enum { + ZFS_PROP_USERUSED, + ZFS_PROP_USERQUOTA, + ZFS_PROP_GROUPUSED, + ZFS_PROP_GROUPQUOTA, + ZFS_NUM_USERQUOTA_PROPS +} zfs_userquota_prop_t; + +extern const char *zfs_userquota_prop_prefixes[ZFS_NUM_USERQUOTA_PROPS]; + /* * Pool properties are identified by these constants and must be added to the * end of this list to ensure that external consumers are not affected @@ -117,8 +143,6 @@ typedef enum { typedef enum { ZPOOL_PROP_NAME, ZPOOL_PROP_SIZE, - ZPOOL_PROP_USED, - ZPOOL_PROP_AVAILABLE, ZPOOL_PROP_CAPACITY, ZPOOL_PROP_ALTROOT, ZPOOL_PROP_HEALTH, @@ -130,6 +154,11 @@ typedef enum { ZPOOL_PROP_CACHEFILE, ZPOOL_PROP_FAILUREMODE, ZPOOL_PROP_LISTSNAPS, + ZPOOL_PROP_AUTOEXPAND, + ZPOOL_PROP_DEDUPDITTO, + ZPOOL_PROP_DEDUPRATIO, + ZPOOL_PROP_FREE, + ZPOOL_PROP_ALLOCATED, ZPOOL_NUM_PROPS } zpool_prop_t; @@ -144,10 +173,27 @@ typedef enum { ZPROP_SRC_DEFAULT = 0x2, ZPROP_SRC_TEMPORARY = 0x4, ZPROP_SRC_LOCAL = 0x8, - ZPROP_SRC_INHERITED = 0x10 + ZPROP_SRC_INHERITED = 0x10, + ZPROP_SRC_RECEIVED = 0x20 } zprop_source_t; -#define ZPROP_SRC_ALL 0x1f +#define ZPROP_SRC_ALL 0x3f + +#define ZPROP_SOURCE_VAL_RECVD "$recvd" +#define ZPROP_N_MORE_ERRORS "N_MORE_ERRORS" +/* + * Dataset flag implemented as a special entry in the props zap object + * indicating that the dataset has received properties on or after + * SPA_VERSION_RECVD_PROPS. The first such receive blows away local properties + * just as it did in earlier versions, and thereafter, local properties are + * preserved. + */ +#define ZPROP_HAS_RECVD "$hasrecvd" + +typedef enum { + ZPROP_ERR_NOCLEAR = 0x1, /* failure to clear existing props */ + ZPROP_ERR_NORESTORE = 0x2 /* failure to restore props on error */ +} zprop_errflags_t; typedef int (*zprop_func)(int, void *); @@ -169,8 +215,10 @@ boolean_t zfs_prop_setonce(zfs_prop_t); const char *zfs_prop_to_name(zfs_prop_t); zfs_prop_t zfs_name_to_prop(const char *); boolean_t zfs_prop_user(const char *); +boolean_t zfs_prop_userquota(const char *); int zfs_prop_index_to_string(zfs_prop_t, uint64_t, const char **); int zfs_prop_string_to_index(zfs_prop_t, const char *, uint64_t *); +uint64_t zfs_prop_random_value(zfs_prop_t, uint64_t seed); boolean_t zfs_prop_valid_for_type(int, zfs_type_t); /* @@ -183,6 +231,7 @@ uint64_t zpool_prop_default_numeric(zpool_prop_t); boolean_t zpool_prop_readonly(zpool_prop_t); int zpool_prop_index_to_string(zpool_prop_t, uint64_t, const char **); int zpool_prop_string_to_index(zpool_prop_t, const char *, uint64_t *); +uint64_t zpool_prop_random_value(zpool_prop_t, uint64_t seed); /* * Definitions for the Delegation. 
@@ -213,12 +262,22 @@ typedef enum { #define ZFS_DELEG_PERM_GID "gid" #define ZFS_DELEG_PERM_GROUPS "groups" +#define ZFS_MLSLABEL_DEFAULT "none" + +#define ZFS_SMB_ACL_SRC "src" +#define ZFS_SMB_ACL_TARGET "target" + typedef enum { ZFS_CANMOUNT_OFF = 0, ZFS_CANMOUNT_ON = 1, ZFS_CANMOUNT_NOAUTO = 2 } zfs_canmount_type_t; +typedef enum { + ZFS_LOGBIAS_LATENCY = 0, + ZFS_LOGBIAS_THROUGHPUT = 1 +} zfs_logbias_op_t; + typedef enum zfs_share_op { ZFS_SHARE_NFS = 0, ZFS_UNSHARE_NFS = 1, @@ -226,6 +285,13 @@ typedef enum zfs_share_op { ZFS_UNSHARE_SMB = 3 } zfs_share_op_t; +typedef enum zfs_smb_acl_op { + ZFS_SMB_ACL_ADD, + ZFS_SMB_ACL_REMOVE, + ZFS_SMB_ACL_RENAME, + ZFS_SMB_ACL_PURGE +} zfs_smb_acl_op_t; + typedef enum zfs_cache_type { ZFS_CACHE_NONE = 0, ZFS_CACHE_METADATA = 1, @@ -250,13 +316,23 @@ typedef enum zfs_cache_type { #define SPA_VERSION_12 12ULL #define SPA_VERSION_13 13ULL #define SPA_VERSION_14 14ULL +#define SPA_VERSION_15 15ULL +#define SPA_VERSION_16 16ULL +#define SPA_VERSION_17 17ULL +#define SPA_VERSION_18 18ULL +#define SPA_VERSION_19 19ULL +#define SPA_VERSION_20 20ULL +#define SPA_VERSION_21 21ULL +#define SPA_VERSION_22 22ULL +#define SPA_VERSION_23 23ULL /* * When bumping up SPA_VERSION, make sure GRUB ZFS understands the on-disk - * format change. Go to usr/src/grub/grub-0.95/stage2/{zfs-include/, fsys_zfs*}, - * and do the appropriate changes. + * format change. Go to usr/src/grub/grub-0.97/stage2/{zfs-include/, fsys_zfs*}, + * and do the appropriate changes. Also bump the version number in + * usr/src/grub/capability. */ -#define SPA_VERSION SPA_VERSION_14 -#define SPA_VERSION_STRING "14" +#define SPA_VERSION SPA_VERSION_23 +#define SPA_VERSION_STRING "23" /* * Symbolic names for the changes that caused a SPA_VERSION switch. @@ -272,7 +348,7 @@ typedef enum zfs_cache_type { #define SPA_VERSION_INITIAL SPA_VERSION_1 #define SPA_VERSION_DITTO_BLOCKS SPA_VERSION_2 #define SPA_VERSION_SPARES SPA_VERSION_3 -#define SPA_VERSION_RAID6 SPA_VERSION_3 +#define SPA_VERSION_RAIDZ2 SPA_VERSION_3 #define SPA_VERSION_BPLIST_ACCOUNT SPA_VERSION_3 #define SPA_VERSION_RAIDZ_DEFLATE SPA_VERSION_3 #define SPA_VERSION_DNODE_BYTES SPA_VERSION_3 @@ -292,6 +368,15 @@ typedef enum zfs_cache_type { #define SPA_VERSION_SNAP_PROPS SPA_VERSION_12 #define SPA_VERSION_USED_BREAKDOWN SPA_VERSION_13 #define SPA_VERSION_PASSTHROUGH_X SPA_VERSION_14 +#define SPA_VERSION_USERSPACE SPA_VERSION_15 +#define SPA_VERSION_STMF_PROP SPA_VERSION_16 +#define SPA_VERSION_RAIDZ3 SPA_VERSION_17 +#define SPA_VERSION_USERREFS SPA_VERSION_18 +#define SPA_VERSION_HOLES SPA_VERSION_19 +#define SPA_VERSION_ZLE_COMPRESSION SPA_VERSION_20 +#define SPA_VERSION_DEDUP SPA_VERSION_21 +#define SPA_VERSION_RECVD_PROPS SPA_VERSION_22 +#define SPA_VERSION_SLIM_ZIL SPA_VERSION_23 /* * ZPL version - rev'd whenever an incompatible on-disk format change @@ -299,19 +384,37 @@ typedef enum zfs_cache_type { * also update the version_table[] and help message in zfs_prop.c. * * When changing, be sure to teach GRUB how to read the new format! 
- * See usr/src/grub/grub-0.95/stage2/{zfs-include/,fsys_zfs*} + * See usr/src/grub/grub-0.97/stage2/{zfs-include/,fsys_zfs*} */ #define ZPL_VERSION_1 1ULL #define ZPL_VERSION_2 2ULL #define ZPL_VERSION_3 3ULL -#define ZPL_VERSION ZPL_VERSION_3 -#define ZPL_VERSION_STRING "3" +#define ZPL_VERSION_4 4ULL +#define ZPL_VERSION ZPL_VERSION_4 +#define ZPL_VERSION_STRING "4" #define ZPL_VERSION_INITIAL ZPL_VERSION_1 #define ZPL_VERSION_DIRENT_TYPE ZPL_VERSION_2 #define ZPL_VERSION_FUID ZPL_VERSION_3 #define ZPL_VERSION_NORMALIZATION ZPL_VERSION_3 #define ZPL_VERSION_SYSATTR ZPL_VERSION_3 +#define ZPL_VERSION_USERSPACE ZPL_VERSION_4 + +/* Rewind request information */ +#define ZPOOL_NO_REWIND 1 /* No policy - default behavior */ +#define ZPOOL_NEVER_REWIND 2 /* Do not search for best txg or rewind */ +#define ZPOOL_TRY_REWIND 4 /* Search for best txg, but do not rewind */ +#define ZPOOL_DO_REWIND 8 /* Rewind to best txg w/in deferred frees */ +#define ZPOOL_EXTREME_REWIND 16 /* Allow extreme measures to find best txg */ +#define ZPOOL_REWIND_MASK 28 /* All the possible rewind bits */ +#define ZPOOL_REWIND_POLICIES 31 /* All the possible policy bits */ + +typedef struct zpool_rewind_policy { + uint32_t zrp_request; /* rewind behavior requested */ + uint64_t zrp_maxmeta; /* max acceptable meta-data errors */ + uint64_t zrp_maxdata; /* max acceptable data errors */ + uint64_t zrp_txg; /* specific txg to load */ +} zpool_rewind_policy_t; /* * The following are configuration names used in the nvlist describing a pool's @@ -349,6 +452,16 @@ typedef enum zfs_cache_type { #define ZPOOL_CONFIG_PHYS_PATH "phys_path" #define ZPOOL_CONFIG_IS_LOG "is_log" #define ZPOOL_CONFIG_L2CACHE "l2cache" +#define ZPOOL_CONFIG_HOLE_ARRAY "hole_array" +#define ZPOOL_CONFIG_VDEV_CHILDREN "vdev_children" +#define ZPOOL_CONFIG_IS_HOLE "is_hole" +#define ZPOOL_CONFIG_DDT_HISTOGRAM "ddt_histogram" +#define ZPOOL_CONFIG_DDT_OBJ_STATS "ddt_object_stats" +#define ZPOOL_CONFIG_DDT_STATS "ddt_stats" +#define ZPOOL_CONFIG_SPLIT "splitcfg" +#define ZPOOL_CONFIG_ORIG_GUID "orig_guid" +#define ZPOOL_CONFIG_SPLIT_GUID "split_guid" +#define ZPOOL_CONFIG_SPLIT_LIST "guid_list" #define ZPOOL_CONFIG_SUSPENDED "suspended" /* not stored on disk */ #define ZPOOL_CONFIG_TIMESTAMP "timestamp" /* not stored on disk */ #define ZPOOL_CONFIG_BOOTFS "bootfs" /* not stored on disk */ @@ -361,6 +474,20 @@ typedef enum zfs_cache_type { #define ZPOOL_CONFIG_FAULTED "faulted" #define ZPOOL_CONFIG_DEGRADED "degraded" #define ZPOOL_CONFIG_REMOVED "removed" +#define ZPOOL_CONFIG_FRU "fru" +#define ZPOOL_CONFIG_AUX_STATE "aux_state" + +/* Rewind policy parameters */ +#define ZPOOL_REWIND_POLICY "rewind-policy" +#define ZPOOL_REWIND_REQUEST "rewind-request" +#define ZPOOL_REWIND_REQUEST_TXG "rewind-request-txg" +#define ZPOOL_REWIND_META_THRESH "rewind-meta-thresh" +#define ZPOOL_REWIND_DATA_THRESH "rewind-data-thresh" + +/* Rewind data discovered */ +#define ZPOOL_CONFIG_LOAD_TIME "rewind_txg_ts" +#define ZPOOL_CONFIG_LOAD_DATA_ERRORS "verify_data_errors" +#define ZPOOL_CONFIG_REWIND_TIME "seconds_of_rewind" #define VDEV_TYPE_ROOT "root" #define VDEV_TYPE_MIRROR "mirror" @@ -369,6 +496,7 @@ typedef enum zfs_cache_type { #define VDEV_TYPE_DISK "disk" #define VDEV_TYPE_FILE "file" #define VDEV_TYPE_MISSING "missing" +#define VDEV_TYPE_HOLE "hole" #define VDEV_TYPE_SPARE "spare" #define VDEV_TYPE_LOG "log" #define VDEV_TYPE_L2CACHE "l2cache" @@ -418,7 +546,9 @@ typedef enum vdev_aux { VDEV_AUX_SPARED, /* hot spare used in another pool */ VDEV_AUX_ERR_EXCEEDED, 
/* too many errors */ VDEV_AUX_IO_FAILURE, /* experienced I/O failure */ - VDEV_AUX_BAD_LOG /* cannot read log chain(s) */ + VDEV_AUX_BAD_LOG, /* cannot read log chain(s) */ + VDEV_AUX_EXTERNAL, /* external diagnosis */ + VDEV_AUX_SPLIT_POOL /* vdev was split off into another pool */ } vdev_aux_t; /* @@ -488,25 +618,45 @@ typedef struct vdev_stat { uint64_t vs_scrub_end; /* UTC scrub end time */ } vdev_stat_t; +/* + * DDT statistics. Note: all fields should be 64-bit because this + * is passed between kernel and userland as an nvlist uint64 array. + */ +typedef struct ddt_object { + uint64_t ddo_count; /* number of elments in ddt */ + uint64_t ddo_dspace; /* size of ddt on disk */ + uint64_t ddo_mspace; /* size of ddt in-core */ +} ddt_object_t; + +typedef struct ddt_stat { + uint64_t dds_blocks; /* blocks */ + uint64_t dds_lsize; /* logical size */ + uint64_t dds_psize; /* physical size */ + uint64_t dds_dsize; /* deflated allocated size */ + uint64_t dds_ref_blocks; /* referenced blocks */ + uint64_t dds_ref_lsize; /* referenced lsize * refcnt */ + uint64_t dds_ref_psize; /* referenced psize * refcnt */ + uint64_t dds_ref_dsize; /* referenced dsize * refcnt */ +} ddt_stat_t; + +typedef struct ddt_histogram { + ddt_stat_t ddh_stat[64]; /* power-of-two histogram buckets */ +} ddt_histogram_t; + #define ZVOL_DRIVER "zvol" #define ZFS_DRIVER "zfs" #define ZFS_DEV "/dev/zfs" -/* - * zvol paths. Irritatingly, the devfsadm interfaces want all these - * paths without the /dev prefix, but for some things, we want the - * /dev prefix. Below are the names without /dev. - */ -#define ZVOL_DEV_DIR "zvol/dsk" -#define ZVOL_RDEV_DIR "zvol/rdsk" - -/* - * And here are the things we need with /dev, etc. in front of them. - */ -#define ZVOL_PSEUDO_DEV "/devices/pseudo/zvol@0:" -#define ZVOL_FULL_DEV_DIR "/dev/" ZVOL_DEV_DIR "/" +/* general zvol path */ +#define ZVOL_DIR "/dev/zvol" +/* expansion */ +#define ZVOL_PSEUDO_DEV "/devices/pseudo/zfs@0:" +/* for dump and swap */ +#define ZVOL_FULL_DEV_DIR ZVOL_DIR "/dsk/" +#define ZVOL_FULL_RDEV_DIR ZVOL_DIR "/rdsk/" #define ZVOL_PROP_NAME "name" +#define ZVOL_DEFAULT_BLOCKSIZE 8192 /* * /dev/zfs ioctl numbers. @@ -531,13 +681,12 @@ typedef enum zfs_ioc { ZFS_IOC_VDEV_ATTACH, ZFS_IOC_VDEV_DETACH, ZFS_IOC_VDEV_SETPATH, + ZFS_IOC_VDEV_SETFRU, ZFS_IOC_OBJSET_STATS, ZFS_IOC_OBJSET_ZPLPROPS, ZFS_IOC_DATASET_LIST_NEXT, ZFS_IOC_SNAPSHOT_LIST_NEXT, ZFS_IOC_SET_PROP, - ZFS_IOC_CREATE_MINOR, - ZFS_IOC_REMOVE_MINOR, ZFS_IOC_CREATE, ZFS_IOC_DESTROY, ZFS_IOC_ROLLBACK, @@ -560,17 +709,28 @@ typedef enum zfs_ioc { ZFS_IOC_GET_FSACL, ZFS_IOC_ISCSI_PERM_CHECK, ZFS_IOC_SHARE, - ZFS_IOC_INHERIT_PROP + ZFS_IOC_INHERIT_PROP, + ZFS_IOC_SMB_ACL, + ZFS_IOC_USERSPACE_ONE, + ZFS_IOC_USERSPACE_MANY, + ZFS_IOC_USERSPACE_UPGRADE, + ZFS_IOC_HOLD, + ZFS_IOC_RELEASE, + ZFS_IOC_GET_HOLDS, + ZFS_IOC_OBJSET_RECVD_PROPS, + ZFS_IOC_VDEV_SPLIT } zfs_ioc_t; /* * Internal SPA load state. Used by FMA diagnosis engine. 
*/ typedef enum { - SPA_LOAD_NONE, /* no load in progress */ - SPA_LOAD_OPEN, /* normal open */ - SPA_LOAD_IMPORT, /* import in progress */ - SPA_LOAD_TRYIMPORT /* tryimport in progress */ + SPA_LOAD_NONE, /* no load in progress */ + SPA_LOAD_OPEN, /* normal open */ + SPA_LOAD_IMPORT, /* import in progress */ + SPA_LOAD_TRYIMPORT, /* tryimport in progress */ + SPA_LOAD_RECOVER, /* recovery requested */ + SPA_LOAD_ERROR /* load failed */ } spa_load_state_t; /* @@ -602,6 +762,7 @@ typedef enum { #define ZFS_ONLINE_CHECKREMOVE 0x1 #define ZFS_ONLINE_UNSPARE 0x2 #define ZFS_ONLINE_FORCEFAULT 0x4 +#define ZFS_ONLINE_EXPAND 0x8 #define ZFS_OFFLINE_TEMPORARY 0x1 /* @@ -632,7 +793,7 @@ typedef enum { /* * Note: This is encoded on-disk, so new events must be added to the * end, and unused events can not be removed. Be sure to edit - * zpool_main.c: hist_event_table[]. + * libzfs_pool.c: hist_event_table[]. */ typedef enum history_internal_events { LOG_NO_EVENT = 0, @@ -673,6 +834,9 @@ typedef enum history_internal_events { LOG_DS_REFQUOTA, LOG_DS_REFRESERV, LOG_POOL_SCRUB_DONE, + LOG_DS_USER_HOLD, + LOG_DS_USER_RELEASE, + LOG_POOL_SPLIT, LOG_END } history_internal_events_t; diff --git a/external/cddl/osnet/dist/uts/common/sys/mnttab.h b/external/cddl/osnet/dist/uts/common/sys/mnttab.h index eeddd96a2efca..ff086370ec3ee 100644 --- a/external/cddl/osnet/dist/uts/common/sys/mnttab.h +++ b/external/cddl/osnet/dist/uts/common/sys/mnttab.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -24,15 +23,13 @@ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_MNTTAB_H #define _SYS_MNTTAB_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #ifdef __cplusplus @@ -53,6 +50,14 @@ extern "C" { #define putmntent(fd, mp) (-1) +/* + * The fields in struct extmnttab should match those in struct mnttab until new + * fields are encountered. This allows hasmntopt(), getmntent_common() and + * mntioctl() to cast one type to the other safely. + * + * The fields in struct mnttab, struct extmnttab and struct mntentbuf must all + * match those in the corresponding 32-bit versions defined in mntvnops.c. + */ struct mnttab { char *mnt_special; char *mnt_mountp; @@ -61,11 +66,6 @@ struct mnttab { char *mnt_time; }; -/* - * NOTE: fields in extmnttab should match struct mnttab till new fields - * are encountered, this allows hasmntopt to work properly when its arg is - * a pointer to an extmnttab struct cast to a mnttab struct pointer. 
- */ struct extmnttab { char *mnt_special; char *mnt_mountp; @@ -76,6 +76,12 @@ struct extmnttab { uint_t mnt_minor; }; +struct mntentbuf { + struct extmnttab *mbuf_emp; + size_t mbuf_bufsize; + char *mbuf_buf; +}; + #if !defined(_KERNEL) #ifdef __STDC__ extern void resetmnttab(FILE *); diff --git a/external/cddl/osnet/dist/uts/common/sys/nvpair.h b/external/cddl/osnet/dist/uts/common/sys/nvpair.h index 9e768541f2e71..58037b06537e5 100644 --- a/external/cddl/osnet/dist/uts/common/sys/nvpair.h +++ b/external/cddl/osnet/dist/uts/common/sys/nvpair.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_NVPAIR_H #define _SYS_NVPAIR_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -199,6 +197,7 @@ int nvlist_add_double(nvlist_t *, const char *, double); int nvlist_remove(nvlist_t *, const char *, data_type_t); int nvlist_remove_all(nvlist_t *, const char *); +int nvlist_remove_nvpair(nvlist_t *, nvpair_t *); int nvlist_lookup_boolean(nvlist_t *, const char *); int nvlist_lookup_boolean_value(nvlist_t *, const char *, boolean_t *); @@ -237,9 +236,11 @@ int nvlist_lookup_nvpair(nvlist_t *, const char *, nvpair_t **); int nvlist_lookup_nvpair_embedded_index(nvlist_t *, const char *, nvpair_t **, int *, char **); boolean_t nvlist_exists(nvlist_t *, const char *); +boolean_t nvlist_empty(nvlist_t *); /* processing nvpair */ nvpair_t *nvlist_next_nvpair(nvlist_t *, nvpair_t *); +nvpair_t *nvlist_prev_nvpair(nvlist_t *, nvpair_t *); char *nvpair_name(nvpair_t *); data_type_t nvpair_type(nvpair_t *); int nvpair_type_is_array(nvpair_t *); diff --git a/external/cddl/osnet/dist/uts/common/sys/priv.h b/external/cddl/osnet/dist/uts/common/sys/priv.h index d9be377cd9fef..2683446bd236d 100644 --- a/external/cddl/osnet/dist/uts/common/sys/priv.h +++ b/external/cddl/osnet/dist/uts/common/sys/priv.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _SYS_PRIV_H #define _SYS_PRIV_H -#pragma ident "%Z%%M% %I% %E% SMI" /* from TSOL 8 */ - #include #include #include @@ -137,11 +135,12 @@ typedef struct priv_impl_info { #define __PROC_PROTECT 0x0008 /* Private */ #define NET_MAC_AWARE 0x0010 /* Is MAC aware */ #define NET_MAC_AWARE_INHERIT 0x0020 /* Inherit MAC aware */ +#define PRIV_AWARE_RESET 0x0040 /* Reset on setuid() */ #define PRIV_XPOLICY 0x0080 /* Extended policy */ /* user-settable flags: */ #define PRIV_USER (PRIV_DEBUG | NET_MAC_AWARE | NET_MAC_AWARE_INHERIT |\ - PRIV_XPOLICY) + PRIV_XPOLICY | PRIV_AWARE_RESET) /* * Header of the privilege info data structure; multiple structures can @@ -199,6 +198,9 @@ typedef struct priv_info_names { #define PRIV_ALLOC 0x1 +extern int priv_debug; +extern int priv_basic_test; + struct proc; struct prpriv; struct cred; @@ -234,6 +236,7 @@ extern void priv_inverse(priv_set_t *); extern void priv_set_PA(cred_t *); extern void priv_adjust_PA(cred_t *); +extern void priv_reset_PA(cred_t *, boolean_t); extern boolean_t priv_can_clear_PA(const cred_t *); extern int setpflags(uint_t, uint_t, cred_t *); diff --git a/external/cddl/osnet/dist/uts/common/sys/processor.h b/external/cddl/osnet/dist/uts/common/sys/processor.h new file mode 100644 index 0000000000000..3a76c8c9b4200 --- /dev/null +++ b/external/cddl/osnet/dist/uts/common/sys/processor.h @@ -0,0 +1,150 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T + * All Rights Reserved + * + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_PROCESSOR_H +#define _SYS_PROCESSOR_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Definitions for p_online, processor_info & lgrp system calls. + */ + +/* + * Type for an lgrpid + */ +typedef uint16_t lgrpid_t; + +/* + * Type for processor name (CPU number). + */ +typedef int processorid_t; +typedef int chipid_t; + +/* + * Flags and return values for p_online(2), and pi_state for processor_info(2). + * These flags are *not* for in-kernel examination of CPU states. + * See for appropriate informational functions. 
+ */ +#define P_OFFLINE 0x0001 /* processor is offline, as quiet as possible */ +#define P_ONLINE 0x0002 /* processor is online */ +#define P_STATUS 0x0003 /* value passed to p_online to request status */ +#define P_FAULTED 0x0004 /* processor is offline, in faulted state */ +#define P_POWEROFF 0x0005 /* processor is powered off */ +#define P_NOINTR 0x0006 /* processor is online, but no I/O interrupts */ +#define P_SPARE 0x0007 /* processor is offline, can be reactivated */ +#define P_BAD P_FAULTED /* unused but defined by USL */ +#define P_FORCED 0x10000000 /* force processor offline */ + +/* + * String names for processor states defined above. + */ +#define PS_OFFLINE "off-line" +#define PS_ONLINE "on-line" +#define PS_FAULTED "faulted" +#define PS_POWEROFF "powered-off" +#define PS_NOINTR "no-intr" +#define PS_SPARE "spare" + +/* + * Structure filled in by processor_info(2). This structure + * SHOULD NOT BE MODIFIED. Changes to the structure would + * negate ABI compatibility. + * + * The string fields are guaranteed to contain a NULL. + * + * The pi_fputypes field contains a (possibly empty) comma-separated + * list of floating point identifier strings. + */ +#define PI_TYPELEN 16 /* max size of CPU type string */ +#define PI_FPUTYPE 32 /* max size of FPU types string */ + +typedef struct { + int pi_state; /* processor state, see above */ + char pi_processor_type[PI_TYPELEN]; /* ASCII CPU type */ + char pi_fputypes[PI_FPUTYPE]; /* ASCII FPU types */ + int pi_clock; /* CPU clock freq in MHz */ +} processor_info_t; + +/* + * Binding values for processor_bind(2) + */ +#define PBIND_NONE -1 /* LWP/thread is not bound */ +#define PBIND_QUERY -2 /* don't set, just return the binding */ +#define PBIND_HARD -3 /* prevents offlining CPU (default) */ +#define PBIND_SOFT -4 /* allows offlining CPU */ +#define PBIND_QUERY_TYPE -5 /* Return binding type */ + +/* + * User-level system call interface prototypes + */ +#ifndef _KERNEL +#ifdef __STDC__ + +extern int p_online(processorid_t processorid, int flag); +extern int processor_info(processorid_t processorid, + processor_info_t *infop); +extern int processor_bind(idtype_t idtype, id_t id, + processorid_t processorid, processorid_t *obind); +extern processorid_t getcpuid(void); +extern lgrpid_t gethomelgroup(void); + +#else + +extern int p_online(); +extern int processor_info(); +extern int processor_bind(); +extern processorid_t getcpuid(); +extern lgrpid_t gethomelgroup(); + +#endif /* __STDC__ */ + +#else /* _KERNEL */ + +/* + * Internal interface prototypes + */ +extern int p_online_internal(processorid_t, int, int *); + +#endif /* !_KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_PROCESSOR_H */ diff --git a/external/cddl/osnet/dist/uts/common/sys/sdt.h b/external/cddl/osnet/dist/uts/common/sys/sdt.h index ff04802196533..6ca064c9782b1 100644 --- a/external/cddl/osnet/dist/uts/common/sys/sdt.h +++ b/external/cddl/osnet/dist/uts/common/sys/sdt.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _SYS_SDT_H #define _SYS_SDT_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -131,6 +129,16 @@ extern "C" { (uintptr_t)(arg6), (uintptr_t)(arg7)); \ } +#define DTRACE_PROBE8(name, type1, arg1, type2, arg2, type3, arg3, \ + type4, arg4, type5, arg5, type6, arg6, type7, arg7, type8, arg8) { \ + extern void __dtrace_probe_##name(uintptr_t, uintptr_t, \ + uintptr_t, uintptr_t, uintptr_t, uintptr_t, \ + uintptr_t, uintptr_t); \ + __dtrace_probe_##name((uintptr_t)(arg1), (uintptr_t)(arg2), \ + (uintptr_t)(arg3), (uintptr_t)(arg4), (uintptr_t)(arg5), \ + (uintptr_t)(arg6), (uintptr_t)(arg7), (uintptr_t)(arg8)); \ +} + #define DTRACE_SCHED(name) \ DTRACE_PROBE(__sched_##name); @@ -182,6 +190,40 @@ extern "C" { DTRACE_PROBE4(__io_##name, type1, arg1, type2, arg2, \ type3, arg3, type4, arg4); +#define DTRACE_ISCSI_2(name, type1, arg1, type2, arg2) \ + DTRACE_PROBE2(__iscsi_##name, type1, arg1, type2, arg2); + +#define DTRACE_ISCSI_3(name, type1, arg1, type2, arg2, type3, arg3) \ + DTRACE_PROBE3(__iscsi_##name, type1, arg1, type2, arg2, type3, arg3); + +#define DTRACE_ISCSI_4(name, type1, arg1, type2, arg2, \ + type3, arg3, type4, arg4) \ + DTRACE_PROBE4(__iscsi_##name, type1, arg1, type2, arg2, \ + type3, arg3, type4, arg4); + +#define DTRACE_ISCSI_5(name, type1, arg1, type2, arg2, \ + type3, arg3, type4, arg4, type5, arg5) \ + DTRACE_PROBE5(__iscsi_##name, type1, arg1, type2, arg2, \ + type3, arg3, type4, arg4, type5, arg5); + +#define DTRACE_ISCSI_6(name, type1, arg1, type2, arg2, \ + type3, arg3, type4, arg4, type5, arg5, type6, arg6) \ + DTRACE_PROBE6(__iscsi_##name, type1, arg1, type2, arg2, \ + type3, arg3, type4, arg4, type5, arg5, type6, arg6); + +#define DTRACE_ISCSI_7(name, type1, arg1, type2, arg2, \ + type3, arg3, type4, arg4, type5, arg5, type6, arg6, type7, arg7) \ + DTRACE_PROBE7(__iscsi_##name, type1, arg1, type2, arg2, \ + type3, arg3, type4, arg4, type5, arg5, type6, arg6, \ + type7, arg7); + +#define DTRACE_ISCSI_8(name, type1, arg1, type2, arg2, \ + type3, arg3, type4, arg4, type5, arg5, type6, arg6, \ + type7, arg7, type8, arg8) \ + DTRACE_PROBE8(__iscsi_##name, type1, arg1, type2, arg2, \ + type3, arg3, type4, arg4, type5, arg5, type6, arg6, \ + type7, arg7, type8, arg8); + #define DTRACE_NFSV3_3(name, type1, arg1, type2, arg2, \ type3, arg3) \ DTRACE_PROBE3(__nfsv3_##name, type1, arg1, type2, arg2, \ @@ -259,6 +301,59 @@ extern "C" { DTRACE_PROBE4(__xpv_##name, type1, arg1, type2, arg2, \ type3, arg3, type4, arg4); +#define DTRACE_FC_1(name, type1, arg1) \ + DTRACE_PROBE1(__fc_##name, type1, arg1); + +#define DTRACE_FC_2(name, type1, arg1, type2, arg2) \ + DTRACE_PROBE2(__fc_##name, type1, arg1, type2, arg2); + +#define DTRACE_FC_3(name, type1, arg1, type2, arg2, type3, arg3) \ + DTRACE_PROBE3(__fc_##name, type1, arg1, type2, arg2, type3, arg3); + +#define DTRACE_FC_4(name, type1, arg1, type2, arg2, type3, arg3, type4, arg4) \ + DTRACE_PROBE4(__fc_##name, type1, arg1, type2, arg2, type3, arg3, \ + type4, arg4); + +#define DTRACE_FC_5(name, type1, arg1, type2, arg2, type3, arg3, \ + type4, arg4, type5, arg5) \ + DTRACE_PROBE5(__fc_##name, type1, arg1, type2, arg2, type3, arg3, \ + type4, arg4, type5, arg5); + +#define DTRACE_SRP_1(name, type1, arg1) \ + DTRACE_PROBE1(__srp_##name, type1, arg1); + +#define DTRACE_SRP_2(name, type1, arg1, type2, arg2) \ + DTRACE_PROBE2(__srp_##name, type1, arg1, type2, arg2); + +#define DTRACE_SRP_3(name, type1, arg1, type2, arg2, type3, arg3) \ + DTRACE_PROBE3(__srp_##name, type1, arg1, type2, arg2, 
type3, arg3); + +#define DTRACE_SRP_4(name, type1, arg1, type2, arg2, type3, arg3, \ + type4, arg4) \ + DTRACE_PROBE4(__srp_##name, type1, arg1, type2, arg2, \ + type3, arg3, type4, arg4); + +#define DTRACE_SRP_5(name, type1, arg1, type2, arg2, type3, arg3, \ + type4, arg4, type5, arg5) \ + DTRACE_PROBE5(__srp_##name, type1, arg1, type2, arg2, \ + type3, arg3, type4, arg4, type5, arg5); + +#define DTRACE_SRP_6(name, type1, arg1, type2, arg2, type3, arg3, \ + type4, arg4, type5, arg5, type6, arg6) \ + DTRACE_PROBE6(__srp_##name, type1, arg1, type2, arg2, \ + type3, arg3, type4, arg4, type5, arg5, type6, arg6); + +#define DTRACE_SRP_7(name, type1, arg1, type2, arg2, type3, arg3, \ + type4, arg4, type5, arg5, type6, arg6, type7, arg7) \ + DTRACE_PROBE7(__srp_##name, type1, arg1, type2, arg2, \ + type3, arg3, type4, arg4, type5, arg5, type6, arg6, type7, arg7); + +#define DTRACE_SRP_8(name, type1, arg1, type2, arg2, type3, arg3, \ + type4, arg4, type5, arg5, type6, arg6, type7, arg7, type8, arg8) \ + DTRACE_PROBE8(__srp_##name, type1, arg1, type2, arg2, \ + type3, arg3, type4, arg4, type5, arg5, type6, arg6, \ + type7, arg7, type8, arg8); + #endif /* _KERNEL */ extern const char *sdt_prefix; diff --git a/external/cddl/osnet/dist/uts/common/sys/sysevent.h b/external/cddl/osnet/dist/uts/common/sys/sysevent.h index 0a61e41de8490..44b5d069fb265 100644 --- a/external/cddl/osnet/dist/uts/common/sys/sysevent.h +++ b/external/cddl/osnet/dist/uts/common/sys/sysevent.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -19,16 +18,15 @@ * * CDDL HEADER END */ + /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_SYSEVENT_H #define _SYS_SYSEVENT_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #ifdef __cplusplus @@ -164,18 +162,50 @@ typedef struct sysevent_value { #define EVCH_QWAIT 0x0008 /* Wait for slot in event queue */ /* - * Meaning of flags for subscribe/unsubscribe. Bits 0 to 7 are dedicated to - * the consolidation private interface. + * Meaning of flags for subscribe. Bits 8 to 15 are dedicated to + * the consolidation private interface, so flags defined here are restricted + * to the LSB. + * + * EVCH_SUB_KEEP indicates that this subscription should persist even if + * this subscriber id should die unexpectedly; matching events will be + * queued (up to a limit) and will be delivered if/when we restart again + * with the same subscriber id. + */ +#define EVCH_SUB_KEEP 0x01 + +/* + * Subscriptions may be wildcarded, but we limit the number of + * wildcards permitted. + */ +#define EVCH_WILDCARD_MAX 10 + +/* + * Used in unsubscribe to indicate all subscriber ids for a channel. */ -#define EVCH_SUB_KEEP 0x0001 #define EVCH_ALLSUB "all_subs" /* * Meaning of flags parameter of channel bind function + * + * EVCH_CREAT indicates to create a channel if not already present. 
+ * + * EVCH_HOLD_PEND indicates that events should be published to this + * channel even if there are no matching subscribers present; when + * a subscriber belatedly binds to the channel and registers their + * subscriptions they will receive events that predate their bind. + * If the channel is closed, however, with no remaining bindings then + * the channel is destroyed. + * + * EVCH_HOLD_PEND_INDEF is a stronger version of EVCH_HOLD_PEND - + * even if the channel has no remaining bindings it will not be + * destroyed so long as events remain unconsumed. This is suitable for + * use with short-lived event producers that may bind to (create) the + * channel and exit before the intended consumer has started. */ -#define EVCH_CREAT 0x0001 /* Create a channel if not present */ +#define EVCH_CREAT 0x0001 #define EVCH_HOLD_PEND 0x0002 -#define EVCH_B_FLAGS 0x0003 /* All valid bits */ +#define EVCH_HOLD_PEND_INDEF 0x0004 +#define EVCH_B_FLAGS 0x0007 /* All valid bits */ /* * Meaning of commands of evc_control function @@ -186,37 +216,62 @@ typedef struct sysevent_value { #define EVCH_CMD_LAST EVCH_SET_CHAN_LEN /* Last command */ /* - * Event channel interface definitions + * Shared user/kernel event channel interface definitions */ -int sysevent_evc_bind(const char *, evchan_t **, uint32_t); -void sysevent_evc_unbind(evchan_t *); -int sysevent_evc_subscribe(evchan_t *, const char *, const char *, +extern int sysevent_evc_bind(const char *, evchan_t **, uint32_t); +extern int sysevent_evc_unbind(evchan_t *); +extern int sysevent_evc_subscribe(evchan_t *, const char *, const char *, int (*)(sysevent_t *, void *), void *, uint32_t); -void sysevent_evc_unsubscribe(evchan_t *, const char *); -int sysevent_evc_publish(evchan_t *, const char *, const char *, +extern int sysevent_evc_unsubscribe(evchan_t *, const char *); +extern int sysevent_evc_publish(evchan_t *, const char *, const char *, const char *, const char *, nvlist_t *, uint32_t); -int sysevent_evc_control(evchan_t *, int, ...); +extern int sysevent_evc_control(evchan_t *, int, ...); -#ifdef _KERNEL +#ifndef _KERNEL + +/* + * Userland-only event channel interfaces + */ + +#include + +typedef struct sysevent_subattr sysevent_subattr_t; + +extern sysevent_subattr_t *sysevent_subattr_alloc(void); +extern void sysevent_subattr_free(sysevent_subattr_t *); + +extern void sysevent_subattr_thrattr(sysevent_subattr_t *, pthread_attr_t *); +extern void sysevent_subattr_sigmask(sysevent_subattr_t *, sigset_t *); + +extern void sysevent_subattr_thrcreate(sysevent_subattr_t *, + door_xcreate_server_func_t *, void *); +extern void sysevent_subattr_thrsetup(sysevent_subattr_t *, + door_xcreate_thrsetup_func_t *, void *); + +extern int sysevent_evc_xsubscribe(evchan_t *, const char *, const char *, + int (*)(sysevent_t *, void *), void *, uint32_t, sysevent_subattr_t *); + +#else /* * Kernel log_event interfaces. 
*/ -int log_sysevent(sysevent_t *, int, sysevent_id_t *); - -sysevent_t *sysevent_alloc(char *, char *, char *, int); -void sysevent_free(sysevent_t *); -int sysevent_add_attr(sysevent_attr_list_t **, char *, sysevent_value_t *, int); -void sysevent_free_attr(sysevent_attr_list_t *); -int sysevent_attach_attributes(sysevent_t *, sysevent_attr_list_t *); -void sysevent_detach_attributes(sysevent_t *); -char *sysevent_get_class_name(sysevent_t *); -char *sysevent_get_subclass_name(sysevent_t *); -uint64_t sysevent_get_seq(sysevent_t *); -void sysevent_get_time(sysevent_t *, hrtime_t *); -size_t sysevent_get_size(sysevent_t *); -char *sysevent_get_pub(sysevent_t *); -int sysevent_get_attr_list(sysevent_t *, nvlist_t **); +extern int log_sysevent(sysevent_t *, int, sysevent_id_t *); + +extern sysevent_t *sysevent_alloc(char *, char *, char *, int); +extern void sysevent_free(sysevent_t *); +extern int sysevent_add_attr(sysevent_attr_list_t **, char *, + sysevent_value_t *, int); +extern void sysevent_free_attr(sysevent_attr_list_t *); +extern int sysevent_attach_attributes(sysevent_t *, sysevent_attr_list_t *); +extern void sysevent_detach_attributes(sysevent_t *); +extern char *sysevent_get_class_name(sysevent_t *); +extern char *sysevent_get_subclass_name(sysevent_t *); +extern uint64_t sysevent_get_seq(sysevent_t *); +extern void sysevent_get_time(sysevent_t *, hrtime_t *); +extern size_t sysevent_get_size(sysevent_t *); +extern char *sysevent_get_pub(sysevent_t *); +extern int sysevent_get_attr_list(sysevent_t *, nvlist_t **); #endif /* _KERNEL */ diff --git a/external/cddl/osnet/dist/uts/common/sys/sysevent/eventdefs.h b/external/cddl/osnet/dist/uts/common/sys/sysevent/eventdefs.h index ac21686e84b81..6a93416cc784e 100644 --- a/external/cddl/osnet/dist/uts/common/sys/sysevent/eventdefs.h +++ b/external/cddl/osnet/dist/uts/common/sys/sysevent/eventdefs.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
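/*
 * Illustrative sketch only: the kernel log_event interfaces declared
 * above (allocate an event, attach a single attribute, log it).  The
 * class/subclass/publisher strings and the attribute name are
 * hypothetical, and the sysevent_value_t field names follow customary
 * sysevent_add_attr(9F) usage -- treat them as an assumption here.
 * Cleanup on the error paths is simplified.
 */
#ifdef _KERNEL
#include <sys/sysevent.h>

static void
demo_log_event(void)
{
        sysevent_t              *ev;
        sysevent_attr_list_t    *attrs = NULL;
        sysevent_value_t        val;
        sysevent_id_t           eid;

        ev = sysevent_alloc("EC_example", "ESC_example_ping", "demo_pub",
            SE_SLEEP);
        if (ev == NULL)
                return;

        val.value_type = SE_DATA_TYPE_STRING;
        val.value.sv_string = "hello";

        if (sysevent_add_attr(&attrs, "message", &val, SE_SLEEP) == 0 &&
            sysevent_attach_attributes(ev, attrs) == 0)
                (void) log_sysevent(ev, SE_SLEEP, &eid);

        sysevent_free(ev);
}
#endif  /* _KERNEL */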
*/ @@ -52,6 +52,7 @@ extern "C" { #define EC_FM "EC_fm" /* FMA error report event */ #define EC_ZFS "EC_zfs" /* ZFS event */ #define EC_DATALINK "EC_datalink" /* datalink event */ +#define EC_VRRP "EC_vrrp" /* VRRP event */ /* * The following event class is reserved for exclusive use @@ -179,6 +180,8 @@ extern "C" { /* Interface within an IPMP group has changed state or type */ #define ESC_IPMP_IF_CHANGE "ESC_ipmp_if_change" +/* IPMP probe has changed state */ +#define ESC_IPMP_PROBE_STATE "ESC_ipmp_probe_state" /* * EC_DEV_ADD and EC_DEV_REMOVE subclass definitions - supporting attributes @@ -200,9 +203,16 @@ extern "C" { /* device tree branch removed */ #define ESC_DEV_BRANCH_REMOVE "ESC_dev_branch_remove" -/* device capacity dynamically changed */ +/* + * EC_DEV_STATUS subclass definitions + * + * device capacity dynamically changed + */ #define ESC_DEV_DLE "ESC_dev_dle" +/* LUN has received an eject request from the user */ +#define ESC_DEV_EJECT_REQUEST "ESC_dev_eject_request" + /* FMA Fault and Error event protocol subclass */ #define ESC_FM_ERROR "ESC_FM_error" #define ESC_FM_ERROR_REPLAY "ESC_FM_error_replay" @@ -223,6 +233,15 @@ extern "C" { #define ESC_PWRCTL_BRIGHTNESS_UP "ESC_pwrctl_brightness_up" #define ESC_PWRCTL_BRIGHTNESS_DOWN "ESC_pwrctl_brightness_down" +/* EC_ACPIEV subclass definitions */ +#define EC_ACPIEV "EC_acpiev" +#define ESC_ACPIEV_DISPLAY_SWITCH "ESC_acpiev_display_switch" +#define ESC_ACPIEV_SCREEN_LOCK "ESC_acpiev_screen_lock" +#define ESC_ACPIEV_SLEEP "ESC_acpiev_sleep" +#define ESC_ACPIEV_AUDIO_MUTE "ESC_acpiev_audio_mute" +#define ESC_ACPIEV_WIFI "ESC_acpiev_wifi" +#define ESC_ACPIEV_TOUCHPAD "ESC_acpiev_touchpad" + /* * ZFS subclass definitions. supporting attributes (name/value paris) are found * in sys/fs/zfs.h @@ -234,12 +253,21 @@ extern "C" { #define ESC_ZFS_VDEV_CLEAR "ESC_ZFS_vdev_clear" #define ESC_ZFS_VDEV_CHECK "ESC_ZFS_vdev_check" #define ESC_ZFS_CONFIG_SYNC "ESC_ZFS_config_sync" +#define ESC_ZFS_SCRUB_START "ESC_ZFS_scrub_start" +#define ESC_ZFS_SCRUB_FINISH "ESC_ZFS_scrub_finish" +#define ESC_ZFS_VDEV_SPARE "ESC_ZFS_vdev_spare" /* * datalink subclass definitions. */ #define ESC_DATALINK_PHYS_ADD "ESC_datalink_phys_add" /* new physical link */ +/* + * VRRP subclass definitions. Supporting attributes (name/value paris) are + * found in sys/sysevent/vrrp.h + */ +#define ESC_VRRP_STATE_CHANGE "ESC_vrrp_state_change" + #ifdef __cplusplus } #endif diff --git a/external/cddl/osnet/dist/uts/common/sys/systeminfo.h b/external/cddl/osnet/dist/uts/common/sys/systeminfo.h index 73a9922f18f35..3f7a465aa51bc 100644 --- a/external/cddl/osnet/dist/uts/common/sys/systeminfo.h +++ b/external/cddl/osnet/dist/uts/common/sys/systeminfo.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
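/*
 * Illustrative sketch only: reacting to the new ZFS scrub subclasses
 * from an event handler such as the one registered with
 * sysevent_evc_subscribe() earlier.  sysevent_get_subclass_name() is
 * assumed to be available to the consumer (it is declared for the
 * kernel above and provided to userland by libsysevent).
 */
#include <string.h>
#include <sys/sysevent.h>
#include <sys/sysevent/eventdefs.h>

static int
zfs_scrub_handler(sysevent_t *ev, void *cookie)
{
        char *sub = sysevent_get_subclass_name(ev);

        if (sub != NULL && strcmp(sub, ESC_ZFS_SCRUB_FINISH) == 0) {
                /* e.g. notify the operator that the scrub completed */
        }
        return (0);
}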
*/ @@ -31,8 +30,6 @@ #ifndef _SYS_SYSTEMINFO_H #define _SYS_SYSTEMINFO_H -#pragma ident "%Z%%M% %I% %E% SMI" /* SVr4.0 1.4 */ - #ifdef __cplusplus extern "C" { #endif @@ -40,7 +37,7 @@ extern "C" { #ifdef _KERNEL extern char architecture[]; extern char architecture_32[]; -extern char hw_serial[]; +extern char hw_serial[]; /* machine's 32-bit hostid; a decimal string */ extern char hw_provider[]; extern char srpc_domain[]; extern char platform[]; @@ -93,7 +90,12 @@ extern char platform[]; /* Solaris defined `set' commands (769-1024) (none currently assigned) */ -#define DOM_NM_LN 64 /* maximum length of domain name */ +#define HW_INVALID_HOSTID 0xFFFFFFFF /* an invalid hostid */ +#define HW_HOSTID_LEN 11 /* minimum buffer size needed */ + /* to hold a decimal or hex */ + /* hostid string */ +#define DOM_NM_LN 64 /* maximum length of domain */ + /* name */ #if !defined(_KERNEL) #if defined(__STDC__) diff --git a/external/cddl/osnet/dist/uts/common/sys/taskq.h b/external/cddl/osnet/dist/uts/common/sys/taskq.h index 1051531d9e4b7..8b601c86a5986 100644 --- a/external/cddl/osnet/dist/uts/common/sys/taskq.h +++ b/external/cddl/osnet/dist/uts/common/sys/taskq.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_TASKQ_H #define _SYS_TASKQ_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include @@ -42,12 +39,16 @@ typedef struct taskq taskq_t; typedef uintptr_t taskqid_t; typedef void (task_func_t)(void *); +struct proc; + /* * Public flags for taskq_create(): bit range 0-15 */ #define TASKQ_PREPOPULATE 0x0001 /* Prepopulate with threads and data */ #define TASKQ_CPR_SAFE 0x0002 /* Use CPR safe protocol */ #define TASKQ_DYNAMIC 0x0004 /* Use dynamic thread scheduling */ +#define TASKQ_THREADS_CPU_PCT 0x0008 /* number of threads as % of ncpu */ +#define TASKQ_DC_BATCH 0x0010 /* Taskq uses SDC in batch mode */ /* * Flags for taskq_dispatch. 
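/*
 * Illustrative sketch only: the new HW_HOSTID_LEN / HW_INVALID_HOSTID
 * definitions used with sysinfo(2)'s SI_HW_SERIAL command, which
 * reports the machine's 32-bit hostid as a decimal string (see the
 * hw_serial comment above).
 */
#include <sys/systeminfo.h>
#include <stdlib.h>

static unsigned long
demo_hostid(void)
{
        char buf[HW_HOSTID_LEN];        /* minimum size for the string form */

        if (sysinfo(SI_HW_SERIAL, buf, sizeof (buf)) == -1)
                return (HW_INVALID_HOSTID);
        return (strtoul(buf, NULL, 10));
}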
TQ_SLEEP/TQ_NOSLEEP should be same as @@ -57,16 +58,22 @@ typedef void (task_func_t)(void *); #define TQ_NOSLEEP 0x01 /* cannot block for memory; may fail */ #define TQ_NOQUEUE 0x02 /* Do not enqueue if can't dispatch */ #define TQ_NOALLOC 0x04 /* cannot allocate memory; may fail */ +#define TQ_FRONT 0x08 /* Put task at the front of the queue */ #ifdef _KERNEL extern taskq_t *system_taskq; extern void taskq_init(void); +extern void taskq_mp_init(void); extern taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t); extern taskq_t *taskq_create_instance(const char *, int, int, pri_t, int, int, uint_t); +extern taskq_t *taskq_create_proc(const char *, int, pri_t, int, int, + struct proc *, uint_t); +extern taskq_t *taskq_create_sysdc(const char *, int, int, int, + struct proc *, uint_t, uint_t); extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t); extern void nulltask(void *); extern void taskq_destroy(taskq_t *); diff --git a/external/cddl/osnet/dist/uts/common/sys/taskq_impl.h b/external/cddl/osnet/dist/uts/common/sys/taskq_impl.h index 07b7d1416b83e..a6f99fa3d969b 100644 --- a/external/cddl/osnet/dist/uts/common/sys/taskq_impl.h +++ b/external/cddl/osnet/dist/uts/common/sys/taskq_impl.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,17 +19,17 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
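/*
 * Illustrative sketch only: the TASKQ_THREADS_CPU_PCT creation flag
 * added earlier in this header together with the new TQ_FRONT dispatch
 * flag.  The taskq name and sizing numbers are arbitrary, and
 * minclsyspri is assumed to be in scope via the usual kernel headers.
 */
#ifdef _KERNEL
#include <sys/taskq.h>

static taskq_t *demo_tq;

static void
demo_task(void *arg)
{
        /* work item body */
}

static void
demo_taskq_setup(void)
{
        /* With TASKQ_THREADS_CPU_PCT, "50" means 50 percent of ncpus. */
        demo_tq = taskq_create("demo_tq", 50, minclsyspri, 1, 64,
            TASKQ_THREADS_CPU_PCT);

        /* TQ_FRONT queues the task ahead of already-pending entries. */
        (void) taskq_dispatch(demo_tq, demo_task, NULL,
            TQ_NOSLEEP | TQ_FRONT);
}
#endif  /* _KERNEL */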
*/ #ifndef _SYS_TASKQ_IMPL_H #define _SYS_TASKQ_IMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include +#include #include +#include #include #ifdef __cplusplus @@ -84,12 +83,16 @@ struct taskq_bucket { #define TQBUCKET_CLOSE 0x01 #define TQBUCKET_SUSPEND 0x02 +#define TASKQ_INTERFACE_FLAGS 0x0000ffff /* defined in */ + /* * taskq implementation flags: bit range 16-31 */ -#define TASKQ_ACTIVE 0x00010000 -#define TASKQ_SUSPENDED 0x00020000 -#define TASKQ_NOINSTANCE 0x00040000 +#define TASKQ_CHANGING 0x00010000 /* nthreads != target */ +#define TASKQ_SUSPENDED 0x00020000 /* taskq is suspended */ +#define TASKQ_NOINSTANCE 0x00040000 /* no instance number */ +#define TASKQ_THREAD_CREATED 0x00080000 /* a thread has been created */ +#define TASKQ_DUTY_CYCLE 0x00100000 /* using the SDC class */ struct taskq { char tq_name[TASKQ_NAMELEN + 1]; @@ -97,16 +100,20 @@ struct taskq { krwlock_t tq_threadlock; kcondvar_t tq_dispatch_cv; kcondvar_t tq_wait_cv; + kcondvar_t tq_exit_cv; + pri_t tq_pri; /* Scheduling priority */ uint_t tq_flags; int tq_active; int tq_nthreads; + int tq_nthreads_target; + int tq_nthreads_max; + int tq_threads_ncpus_pct; int tq_nalloc; int tq_minalloc; int tq_maxalloc; taskq_ent_t *tq_freelist; taskq_ent_t tq_task; int tq_maxsize; - pri_t tq_pri; /* Scheduling priority */ taskq_bucket_t *tq_buckets; /* Per-cpu array of buckets */ int tq_instance; uint_t tq_nbuckets; /* # of buckets (2^n) */ @@ -114,13 +121,19 @@ struct taskq { kthread_t *_tq_thread; kthread_t **_tq_threadlist; } tq_thr; + + list_node_t tq_cpupct_link; /* linkage for taskq_cpupct_list */ + struct proc *tq_proc; /* process for taskq threads */ + int tq_cpupart; /* cpupart id bound to */ + uint_t tq_DC; /* duty cycle for SDC */ + /* * Statistics. */ kstat_t *tq_kstat; /* Exported statistics */ hrtime_t tq_totaltime; /* Time spent processing tasks */ - int tq_tasks; /* Total # of tasks posted */ - int tq_executed; /* Total # of tasks executed */ + uint64_t tq_tasks; /* Total # of tasks posted */ + uint64_t tq_executed; /* Total # of tasks executed */ int tq_maxtasks; /* Max number of tasks in the queue */ int tq_tcreates; int tq_tdeaths; @@ -129,6 +142,9 @@ struct taskq { #define tq_thread tq_thr._tq_thread #define tq_threadlist tq_thr._tq_threadlist +/* The MAX guarantees we have at least one thread */ +#define TASKQ_THREADS_PCT(ncpus, pct) MAX(((ncpus) * (pct)) / 100, 1) + #ifdef __cplusplus } #endif diff --git a/external/cddl/osnet/dist/uts/common/sys/tsol/label.h b/external/cddl/osnet/dist/uts/common/sys/tsol/label.h index b496737334d61..f88f40973d5bf 100644 --- a/external/cddl/osnet/dist/uts/common/sys/tsol/label.h +++ b/external/cddl/osnet/dist/uts/common/sys/tsol/label.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_TSOL_LABEL_H #define _SYS_TSOL_LABEL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #ifdef _KERNEL #include @@ -45,6 +43,10 @@ extern "C" { #define EQUALITY_CHECK 0 #define DOMINANCE_CHECK 1 +/* Manifest human readable label names */ +#define ADMIN_LOW "ADMIN_LOW" +#define ADMIN_HIGH "ADMIN_HIGH" + /* Binary Label Structure Definitions */ typedef struct _mac_label_impl m_label_t; @@ -105,7 +107,21 @@ typedef struct ts_label_s { #define DEFAULT_DOI 1 -#define TSLF_UNLABELED 0x00000001 /* source was unlabeled */ +/* + * TSLF_UNLABELED is set in tsl_flags for packets with no explicit label + * when the peer is unlabeled. 
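/*
 * Worked examples for the TASKQ_THREADS_PCT() helper defined above:
 * integer division truncates, and the MAX() guarantees at least one
 * thread even for small percentages or single-CPU systems.
 *
 *      TASKQ_THREADS_PCT(8, 50)  == MAX((8 * 50) / 100, 1)  == 4
 *      TASKQ_THREADS_PCT(8, 30)  == MAX((8 * 30) / 100, 1)  == 2
 *      TASKQ_THREADS_PCT(1, 20)  == MAX((1 * 20) / 100, 1)  == 1
 */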
+ * + * TSLF_IMPLICIT_IN is set when a packet is received with no explicit label + * from a peer which is flagged in the tnrhdb as label-aware. + * + * TSLF_IMPLICIT_OUT is set when the packet should be sent without an + * explict label even if the peer or next-hop router is flagged in the + * tnrhdb as label-aware. + */ + +#define TSLF_UNLABELED 0x00000001 /* peer is unlabeled */ +#define TSLF_IMPLICIT_IN 0x00000002 /* inbound implicit */ +#define TSLF_IMPLICIT_OUT 0x00000004 /* outbound implicit */ #define CR_SL(cr) (label2bslabel(crgetlabel(cr))) @@ -116,21 +132,25 @@ extern int sys_labeling; extern void label_init(void); extern ts_label_t *labelalloc(const m_label_t *, uint32_t, int); +extern ts_label_t *labeldup(const ts_label_t *, int); extern void label_hold(ts_label_t *); extern void label_rele(ts_label_t *); extern m_label_t *label2bslabel(ts_label_t *); extern uint32_t label2doi(ts_label_t *); extern boolean_t label_equal(const ts_label_t *, const ts_label_t *); extern cred_t *newcred_from_bslabel(m_label_t *, uint32_t, int); -extern cred_t *copycred_from_bslabel(cred_t *, m_label_t *, +extern cred_t *copycred_from_bslabel(const cred_t *, m_label_t *, uint32_t, int); +extern cred_t *copycred_from_tslabel(const cred_t *, ts_label_t *, + int); extern ts_label_t *getflabel(vnode_t *); extern int getlabel(const char *, m_label_t *); extern int fgetlabel(int, m_label_t *); extern int _blinrange(const m_label_t *, const brange_t *); extern int blinlset(const m_label_t *, const blset_t); -extern ts_label_t *nfs_getflabel(vnode_t *); -extern boolean_t do_rfs_label_check(bslabel_t *, vnode_t *, int); + +extern int l_to_str_internal(const m_label_t *, char **); +extern int hexstr_to_label(const char *, m_label_t *); /* * The use of '!!' here prevents users from referencing this function-like diff --git a/external/cddl/osnet/dist/uts/common/sys/vtoc.h b/external/cddl/osnet/dist/uts/common/sys/vtoc.h index 3600fd85bd6d9..004b49097ae31 100644 --- a/external/cddl/osnet/dist/uts/common/sys/vtoc.h +++ b/external/cddl/osnet/dist/uts/common/sys/vtoc.h @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -158,7 +158,7 @@ struct extvtoc { v.v_part[i].p_flag = extv.v_part[i].p_flag; \ v.v_part[i].p_start = (daddr_t)extv.v_part[i].p_start; \ v.v_part[i].p_size = (long)extv.v_part[i].p_size; \ - v.timestamp[i] = (time_t)v.timestamp[i]; \ + v.timestamp[i] = (time_t)extv.timestamp[i]; \ } \ bcopy(extv.v_asciilabel, v.v_asciilabel, LEN_DKL_ASCII); \ }
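/*
 * Illustrative sketch only: the final hunk above makes the extended-VTOC
 * conversion copy each timestamp from the source extvtoc instead of
 * reading back the uninitialized destination.  A hedged usage sketch,
 * assuming the surrounding macro is the customary extvtoctovtoc():
 */
#include <sys/vtoc.h>

static void
demo_shrink_vtoc(struct extvtoc *srcp)
{
        struct extvtoc src = *srcp;     /* macro expects struct lvalues */
        struct vtoc dst;

        /* Copies partition start/size/tag/flag and, now, the timestamps. */
        extvtoctovtoc(src, dst);
}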