diff --git a/external/cddl/osnet/dist/cmd/zdb/zdb.c b/external/cddl/osnet/dist/cmd/zdb/zdb.c
index 48a84d693e449..6b2f3b7cd6a9d 100644
--- a/external/cddl/osnet/dist/cmd/zdb/zdb.c
+++ b/external/cddl/osnet/dist/cmd/zdb/zdb.c
@@ -19,7 +19,7 @@
  * CDDL HEADER END
  */
 /*
- * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved.
  * Use is subject to license terms.
  */

@@ -51,10 +51,25 @@
 #include
 #include
 #include
+#include
 #undef ZFS_MAXNAMELEN
 #undef verify
 #include
+#define ZDB_COMPRESS_NAME(idx) ((idx) < ZIO_COMPRESS_FUNCTIONS ? \
+	zio_compress_table[(idx)].ci_name : "UNKNOWN")
+#define ZDB_CHECKSUM_NAME(idx) ((idx) < ZIO_CHECKSUM_FUNCTIONS ? \
+	zio_checksum_table[(idx)].ci_name : "UNKNOWN")
+#define ZDB_OT_NAME(idx) ((idx) < DMU_OT_NUMTYPES ? \
+	dmu_ot[(idx)].ot_name : "UNKNOWN")
+#define ZDB_OT_TYPE(idx) ((idx) < DMU_OT_NUMTYPES ? (idx) : DMU_OT_NUMTYPES)
+
+#ifndef lint
+extern int zfs_recover;
+#else
+int zfs_recover;
+#endif
+
 const char cmdname[] = "zdb";
 uint8_t dump_opt[256];

@@ -64,8 +79,6 @@ extern void dump_intent_log(zilog_t *);
 uint64_t *zopt_object = NULL;
 int zopt_objects = 0;
 libzfs_handle_t *g_zfs;
-boolean_t zdb_sig_user_data = B_TRUE;
-int zdb_sig_cksumalg = ZIO_CHECKSUM_SHA256;

 /*
  * These libumem hooks provide a reasonable set of defaults for the allocator's
@@ -87,42 +100,66 @@ static void
 usage(void)
 {
 	(void) fprintf(stderr,
-	    "Usage: %s [-udibcsvL] [-U cachefile_path] "
-	    "[-S user:cksumalg] "
-	    "dataset [object...]\n"
-	    " %s -C [pool]\n"
-	    " %s -l dev\n"
-	    " %s -R pool:vdev:offset:size:flags\n"
-	    " %s [-p path_to_vdev_dir]\n"
-	    " %s -e pool | GUID | devid ...\n",
-	    cmdname, cmdname, cmdname, cmdname, cmdname, cmdname);
-
-	(void) fprintf(stderr, " -u uberblock\n");
-	(void) fprintf(stderr, " -d datasets\n");
-	(void) fprintf(stderr, " -C cached pool configuration\n");
-	(void) fprintf(stderr, " -i intent logs\n");
-	(void) fprintf(stderr, " -b block statistics\n");
-	(void) fprintf(stderr, " -c checksum all data blocks\n");
-	(void) fprintf(stderr, " -s report stats on zdb's I/O\n");
-	(void) fprintf(stderr, " -S : -- "
-	    "dump blkptr signatures\n");
-	(void) fprintf(stderr, " -v verbose (applies to all others)\n");
+	    "Usage: %s [-CumdibcsvhL] poolname [object...]\n"
+	    " %s [-div] dataset [object...]\n"
+	    " %s -m [-L] poolname [vdev [metaslab...]]\n"
+	    " %s -R poolname vdev:offset:size[:flags]\n"
+	    " %s -S poolname\n"
+	    " %s -l [-u] device\n"
+	    " %s -C\n\n",
+	    cmdname, cmdname, cmdname, cmdname, cmdname, cmdname, cmdname);
+
+	(void) fprintf(stderr, " Dataset name must include at least one " "separator character '/' or '@'\n");
+	(void) fprintf(stderr, " If dataset name is specified, only that " "dataset is dumped\n");
+	(void) fprintf(stderr, " If object numbers are specified, only " "those objects are dumped\n\n");
+	(void) fprintf(stderr, " Options to control amount of output:\n");
+	(void) fprintf(stderr, " -u uberblock\n");
+	(void) fprintf(stderr, " -d dataset(s)\n");
+	(void) fprintf(stderr, " -i intent logs\n");
+	(void) fprintf(stderr, " -C config (or cachefile if alone)\n");
+	(void) fprintf(stderr, " -h pool history\n");
+	(void) fprintf(stderr, " -b block statistics\n");
+	(void) fprintf(stderr, " -m metaslabs\n");
+	(void) fprintf(stderr, " -c checksum all metadata (twice for " "all data) blocks\n");
+	(void) fprintf(stderr, " -s report stats on zdb's I/O\n");
+	(void) fprintf(stderr, " -S simulate dedup to measure effect\n");
+	(void) fprintf(stderr, " -v verbose (applies to all others)\n");
 	(void) fprintf(stderr, " -l dump label contents\n");
 	(void) fprintf(stderr, " -L disable leak tracking (do not " "load spacemaps)\n");
-	(void) fprintf(stderr, " -U cachefile_path -- use alternate "
-	    "cachefile\n");
 	(void) fprintf(stderr, " -R read and display block from a "
-	    "device\n");
-	(void) fprintf(stderr, " -e Pool is exported/destroyed/"
-	    "has altroot\n");
-	(void) fprintf(stderr, " -p (use with -e)\n");
+	    "device\n\n");
+	(void) fprintf(stderr, " Below options are intended for use " "with other options (except -l):\n");
+	(void) fprintf(stderr, " -A ignore assertions (-A), enable " "panic recovery (-AA) or both (-AAA)\n");
+	(void) fprintf(stderr, " -F attempt automatic rewind within " "safe range of transaction groups\n");
+	(void) fprintf(stderr, " -U -- use alternate " "cachefile\n");
+	(void) fprintf(stderr, " -X attempt extreme rewind (does not " "work with dataset)\n");
+	(void) fprintf(stderr, " -e pool is exported/destroyed/" "has altroot/not in a cachefile\n");
+	(void) fprintf(stderr, " -p -- use one or more with " "-e to specify path to vdev dir\n");
+	(void) fprintf(stderr, " -t -- highest txg to use when " "searching for uberblocks\n");
 	(void) fprintf(stderr, "Specify an option more than once (e.g. -bb) " "to make only that option verbose\n");
 	(void) fprintf(stderr, "Default is to dump everything non-verbosely\n");
 	exit(1);
 }

+/*
+ * Called for usage errors that are discovered after a call to spa_open(),
+ * dmu_bonus_hold(), or pool_match(). abort() is called for other errors.
+ */
+
 static void
 fatal(const char *fmt, ...)
 {
@@ -134,69 +171,7 @@ fatal(const char *fmt, ...)
 	va_end(ap);
 	(void) fprintf(stderr, "\n");

-	abort();
-}
-
-static void
-dump_nvlist(nvlist_t *list, int indent)
-{
-	nvpair_t *elem = NULL;
-
-	while ((elem = nvlist_next_nvpair(list, elem)) != NULL) {
-		switch (nvpair_type(elem)) {
-		case DATA_TYPE_STRING:
-		{
-			char *value;
-
-			VERIFY(nvpair_value_string(elem, &value) == 0);
-			(void) printf("%*s%s='%s'\n", indent, "",
-			    nvpair_name(elem), value);
-		}
-		break;
-
-		case DATA_TYPE_UINT64:
-		{
-			uint64_t value;
-
-			VERIFY(nvpair_value_uint64(elem, &value) == 0);
-			(void) printf("%*s%s=%llu\n", indent, "",
-			    nvpair_name(elem), (u_longlong_t)value);
-		}
-		break;
-
-		case DATA_TYPE_NVLIST:
-		{
-			nvlist_t *value;
-
-			VERIFY(nvpair_value_nvlist(elem, &value) == 0);
-			(void) printf("%*s%s\n", indent, "",
-			    nvpair_name(elem));
-			dump_nvlist(value, indent + 4);
-		}
-		break;
-
-		case DATA_TYPE_NVLIST_ARRAY:
-		{
-			nvlist_t **value;
-			uint_t c, count;
-
-			VERIFY(nvpair_value_nvlist_array(elem, &value,
-			    &count) == 0);
-
-			for (c = 0; c < count; c++) {
-				(void) printf("%*s%s[%u]\n", indent, "",
-				    nvpair_name(elem), c);
-				dump_nvlist(value[c], indent + 8);
-			}
-		}
-		break;
-
-		default:
-
-			(void) printf("bad config type %d for %s\n",
-			    nvpair_type(elem), nvpair_name(elem));
-		}
-	}
+	exit(1);
 }

 /* ARGSUSED */
@@ -207,7 +182,7 @@ dump_packed_nvlist(objset_t *os, uint64_t object, void *data, size_t size)
 	size_t nvsize = *(uint64_t *)data;
 	char *packed = umem_alloc(nvsize, UMEM_NOFAIL);

-	VERIFY(0 == dmu_read(os, object, 0, nvsize, packed));
+	VERIFY(0 == dmu_read(os, object, 0, nvsize, packed, DMU_READ_PREFETCH));

 	VERIFY(nvlist_unpack(packed, nvsize, &nv, 0) == 0);

@@ -315,6 +290,13 @@ dump_none(objset_t *os, uint64_t object, void *data, size_t size)
 {
 }

+/*ARGSUSED*/
+static void
+dump_unknown(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	(void) printf("\tUNKNOWN OBJECT TYPE\n");
+}
+
 /*ARGSUSED*/
 void
 dump_uint8(objset_t *os, uint64_t object, void *data, size_t size)
@@ -377,6 +359,14 @@ dump_zap(objset_t *os, uint64_t object, void *data, size_t size)
 	zap_cursor_fini(&zc);
 }

+/*ARGSUSED*/
+static void
+dump_ddt_zap(objset_t *os, uint64_t object, void *data, size_t size)
+{
+	dump_zap_stats(os, object);
+	/* contents are printed elsewhere, properly decoded */
+}
+
 /*ARGSUSED*/
 static void
 dump_zpldir(objset_t *os, uint64_t object, void *data, size_t size)
@@ -433,16 +423,16 @@ dump_spacemap(objset_t *os, space_map_obj_t *smo, space_map_t *sm)
 	alloc = 0;
 	for (offset = 0; offset < smo->smo_objsize; offset += sizeof (entry)) {
 		VERIFY(0 == dmu_read(os, smo->smo_object, offset,
-		    sizeof (entry), &entry));
+		    sizeof (entry), &entry, DMU_READ_PREFETCH));
 		if (SM_DEBUG_DECODE(entry)) {
-			(void) printf("\t\t[%4llu] %s: txg %llu, pass %llu\n",
+			(void) printf("\t [%6llu] %s: txg %llu, pass %llu\n",
 			    (u_longlong_t)(offset / sizeof (entry)),
 			    ddata[SM_DEBUG_ACTION_DECODE(entry)],
 			    (u_longlong_t)SM_DEBUG_TXG_DECODE(entry),
 			    (u_longlong_t)SM_DEBUG_SYNCPASS_DECODE(entry));
 		} else {
-			(void) printf("\t\t[%4llu] %c range:"
-			    " %08llx-%08llx size: %06llx\n",
+			(void) printf("\t [%6llu] %c range:"
+			    " %010llx-%010llx size: %06llx\n",
 			    (u_longlong_t)(offset / sizeof (entry)),
 			    SM_TYPE_DECODE(entry) == SM_ALLOC ? 'A' : 'F',
 			    (u_longlong_t)((SM_OFFSET_DECODE(entry) <<
@@ -464,100 +454,348 @@ dump_spacemap(objset_t *os, space_map_obj_t *smo, space_map_t *sm)
 	}
 }

+static void
+dump_metaslab_stats(metaslab_t *msp)
+{
+	char maxbuf[5];
+	space_map_t *sm = &msp->ms_map;
+	avl_tree_t *t = sm->sm_pp_root;
+	int free_pct = sm->sm_space * 100 / sm->sm_size;
+
+	nicenum(space_map_maxsize(sm), maxbuf);
+
+	(void) printf("\t %25s %10lu %7s %6s %4s %4d%%\n",
+	    "segments", avl_numnodes(t), "maxsize", maxbuf,
+	    "freepct", free_pct);
+}
+
 static void
 dump_metaslab(metaslab_t *msp)
 {
-	char freebuf[5];
-	space_map_obj_t *smo = &msp->ms_smo;
 	vdev_t *vd = msp->ms_group->mg_vd;
 	spa_t *spa = vd->vdev_spa;
+	space_map_t *sm = &msp->ms_map;
+	space_map_obj_t *smo = &msp->ms_smo;
+	char freebuf[5];

-	nicenum(msp->ms_map.sm_size - smo->smo_alloc, freebuf);
+	nicenum(sm->sm_size - smo->smo_alloc, freebuf);

-	if (dump_opt['d'] <= 5) {
-		(void) printf("\t%10llx %10llu %5s\n",
-		    (u_longlong_t)msp->ms_map.sm_start,
-		    (u_longlong_t)smo->smo_object,
-		    freebuf);
-		return;
+	(void) printf(
+	    "\tmetaslab %6llu offset %12llx spacemap %6llu free %5s\n",
+	    (u_longlong_t)(sm->sm_start / sm->sm_size),
+	    (u_longlong_t)sm->sm_start, (u_longlong_t)smo->smo_object, freebuf);
+
+	if (dump_opt['m'] > 1 && !dump_opt['L']) {
+		mutex_enter(&msp->ms_lock);
+		space_map_load_wait(sm);
+		if (!sm->sm_loaded)
+			VERIFY(space_map_load(sm, zfs_metaslab_ops,
+			    SM_FREE, smo, spa->spa_meta_objset) == 0);
+		dump_metaslab_stats(msp);
+		space_map_unload(sm);
+		mutex_exit(&msp->ms_lock);
 	}

-	(void) printf(
-	    "\tvdev %llu offset %08llx spacemap %4llu free %5s\n",
-	    (u_longlong_t)vd->vdev_id, (u_longlong_t)msp->ms_map.sm_start,
-	    (u_longlong_t)smo->smo_object, freebuf);
+	if (dump_opt['d'] > 5 || dump_opt['m'] > 2) {
+		ASSERT(sm->sm_size == (1ULL << vd->vdev_ms_shift));

-	ASSERT(msp->ms_map.sm_size == (1ULL << vd->vdev_ms_shift));
+		mutex_enter(&msp->ms_lock);
+		dump_spacemap(spa->spa_meta_objset, smo, sm);
+		mutex_exit(&msp->ms_lock);
+	}
+}

-	dump_spacemap(spa->spa_meta_objset, smo, &msp->ms_map);
+static void
+print_vdev_metaslab_header(vdev_t *vd)
+{
+	(void) printf("\tvdev %10llu\n\t%-10s%5llu %-19s %-15s %-10s\n",
+	    (u_longlong_t)vd->vdev_id,
+	    "metaslabs", (u_longlong_t)vd->vdev_ms_count,
+	    "offset", "spacemap", "free");
+	(void) printf("\t%15s %19s %15s %10s\n",
+	    "---------------", "-------------------",
+	    "---------------", "-------------");
 }

 static void
 dump_metaslabs(spa_t *spa)
 {
-	vdev_t *rvd = spa->spa_root_vdev;
-	vdev_t *vd;
-	int c, m;
+	vdev_t *vd, *rvd = spa->spa_root_vdev;
+	uint64_t m, c = 0, children = rvd->vdev_children;

 	(void) printf("\nMetaslabs:\n");

-	for (c = 0; c < rvd->vdev_children; c++) {
-		vd = rvd->vdev_child[c];
+	if (!dump_opt['d'] && zopt_objects > 0) {
+		c = zopt_object[0];

-		(void) printf("\n vdev %llu\n\n", (u_longlong_t)vd->vdev_id);
+		if (c >= children)
+			(void) fatal("bad vdev id: %llu", (u_longlong_t)c);

-		if (dump_opt['d'] <= 5) {
-			(void) printf("\t%10s %10s %5s\n",
-			    "offset", "spacemap", "free");
-			(void) printf("\t%10s %10s %5s\n",
-			    "------", "--------", "----");
+		if (zopt_objects > 1) {
+			vd = rvd->vdev_child[c];
+			print_vdev_metaslab_header(vd);
+
+			for (m = 1; m < zopt_objects; m++) {
+				if (zopt_object[m] < vd->vdev_ms_count)
+					dump_metaslab(
+					    vd->vdev_ms[zopt_object[m]]);
+				else
+					(void) fprintf(stderr, "bad metaslab " "number %llu\n",
+					    (u_longlong_t)zopt_object[m]);
+			}
+			(void) printf("\n");
+			return;
 		}
+		children = c + 1;
+	}
+	for (; c < children; c++) {
+		vd = rvd->vdev_child[c];
+		print_vdev_metaslab_header(vd);
+
 		for (m = 0; m < vd->vdev_ms_count; m++)
 			dump_metaslab(vd->vdev_ms[m]);
 		(void) printf("\n");
 	}
 }

+static void
+dump_dde(const ddt_t *ddt, const ddt_entry_t *dde, uint64_t index)
+{
+	const ddt_phys_t *ddp = dde->dde_phys;
+	const ddt_key_t *ddk = &dde->dde_key;
+	char *types[4] = { "ditto", "single", "double", "triple" };
+	char blkbuf[BP_SPRINTF_LEN];
+	blkptr_t blk;
+
+	for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) {
+		if (ddp->ddp_phys_birth == 0)
+			continue;
+		ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk);
+		sprintf_blkptr(blkbuf, &blk);
+		(void) printf("index %llx refcnt %llu %s %s\n",
+		    (u_longlong_t)index, (u_longlong_t)ddp->ddp_refcnt,
+		    types[p], blkbuf);
+	}
+}
+
+static void
+dump_dedup_ratio(const ddt_stat_t *dds)
+{
+	double rL, rP, rD, D, dedup, compress, copies;
+
+	if (dds->dds_blocks == 0)
+		return;
+
+	rL = (double)dds->dds_ref_lsize;
+	rP = (double)dds->dds_ref_psize;
+	rD = (double)dds->dds_ref_dsize;
+	D = (double)dds->dds_dsize;
+
+	dedup = rD / D;
+	compress = rL / rP;
+	copies = rD / rP;
+
+	(void) printf("dedup = %.2f, compress = %.2f, copies = %.2f, "
+	    "dedup * compress / copies = %.2f\n\n",
+	    dedup, compress, copies, dedup * compress / copies);
+}
+
+static void
+dump_ddt(ddt_t *ddt, enum ddt_type type, enum ddt_class class)
+{
+	char name[DDT_NAMELEN];
+	ddt_entry_t dde;
+	uint64_t walk = 0;
+	dmu_object_info_t doi;
+	uint64_t count, dspace, mspace;
+	int error;
+
+	error = ddt_object_info(ddt, type, class, &doi);
+
+	if (error == ENOENT)
+		return;
+	ASSERT(error == 0);
+
+	count = ddt_object_count(ddt, type, class);
+	dspace = doi.doi_physical_blocks_512 << 9;
+	mspace = doi.doi_fill_count * doi.doi_data_block_size;
+
+	ASSERT(count != 0);	/* we should have destroyed it */
+
+	ddt_object_name(ddt, type, class, name);
+
+	(void) printf("%s: %llu entries, size %llu on disk, %llu in core\n",
+	    name,
+	    (u_longlong_t)count,
+	    (u_longlong_t)(dspace / count),
+	    (u_longlong_t)(mspace / count));
+
+	if (dump_opt['D'] < 3)
+		return;
+
+	zpool_dump_ddt(NULL, &ddt->ddt_histogram[type][class]);
+
+	if (dump_opt['D'] < 4)
+		return;
+
+	if (dump_opt['D'] < 5 && class == DDT_CLASS_UNIQUE)
+		return;
+
+	(void) printf("%s contents:\n\n", name);
+
+	while ((error = ddt_object_walk(ddt, type, class, &walk, &dde)) == 0)
+		dump_dde(ddt, &dde, walk);
+
+	ASSERT(error == ENOENT);
+
+	(void) printf("\n");
+}
+
+static void
+dump_all_ddts(spa_t *spa)
+{
+	ddt_histogram_t ddh_total = { 0 };
+	ddt_stat_t dds_total = { 0 };
+
+	for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) {
+		ddt_t *ddt = spa->spa_ddt[c];
+		for (enum ddt_type type = 0; type < DDT_TYPES; type++) {
+			for (enum ddt_class class = 0; class < DDT_CLASSES;
+			    class++) {
+				dump_ddt(ddt, type, class);
+			}
+		}
+	}
+
+	ddt_get_dedup_stats(spa, &dds_total);
+
+	if (dds_total.dds_blocks == 0) {
+		(void) printf("All DDTs are empty\n");
+		return;
+	}
+
+	(void) printf("\n");
+
+	if (dump_opt['D'] > 1) {
+		(void) printf("DDT histogram (aggregated over all DDTs):\n");
+		ddt_get_dedup_histogram(spa, &ddh_total);
+		zpool_dump_ddt(&dds_total, &ddh_total);
+	}
+
+	dump_dedup_ratio(&dds_total);
+}
+
+static void
+dump_dtl_seg(space_map_t *sm, uint64_t start, uint64_t size)
+{
+	char *prefix = (void *)sm;
+
+	(void) printf("%s [%llu,%llu) length %llu\n",
+	    prefix,
+	    (u_longlong_t)start,
+	    (u_longlong_t)(start + size),
+	    (u_longlong_t)(size));
+}
+
 static void
 dump_dtl(vdev_t *vd, int indent)
 {
-	avl_tree_t *t = &vd->vdev_dtl_map.sm_root;
-	space_seg_t *ss;
-	vdev_t *pvd;
-	int c;
+	spa_t *spa = vd->vdev_spa;
+	boolean_t required;
+	char *name[DTL_TYPES] = { "missing", "partial", "scrub", "outage" };
+	char prefix[256];
+
+	spa_vdev_state_enter(spa, SCL_NONE);
+	required = vdev_dtl_required(vd);
+	(void) spa_vdev_state_exit(spa, NULL, 0);

 	if (indent == 0)
 		(void) printf("\nDirty time logs:\n\n");

-	(void) printf("\t%*s%s\n", indent, "",
+	(void) printf("\t%*s%s [%s]\n", indent, "",
 	    vd->vdev_path ? vd->vdev_path :
-	    vd->vdev_parent ? vd->vdev_ops->vdev_op_type :
-	    spa_name(vd->vdev_spa));
+	    vd->vdev_parent ? vd->vdev_ops->vdev_op_type : spa_name(spa),
+	    required ? "DTL-required" : "DTL-expendable");

-	for (ss = avl_first(t); ss; ss = AVL_NEXT(t, ss)) {
-		/*
-		 * Everything in this DTL must appear in all parent DTL unions.
- */ - for (pvd = vd; pvd; pvd = pvd->vdev_parent) - ASSERT(vdev_dtl_contains(&pvd->vdev_dtl_map, - ss->ss_start, ss->ss_end - ss->ss_start)); - (void) printf("\t%*soutage [%llu,%llu] length %llu\n", - indent, "", - (u_longlong_t)ss->ss_start, - (u_longlong_t)ss->ss_end - 1, - (u_longlong_t)(ss->ss_end - ss->ss_start)); + for (int t = 0; t < DTL_TYPES; t++) { + space_map_t *sm = &vd->vdev_dtl[t]; + if (sm->sm_space == 0) + continue; + (void) snprintf(prefix, sizeof (prefix), "\t%*s%s", + indent + 2, "", name[t]); + mutex_enter(sm->sm_lock); + space_map_walk(sm, dump_dtl_seg, (void *)prefix); + mutex_exit(sm->sm_lock); + if (dump_opt['d'] > 5 && vd->vdev_children == 0) + dump_spacemap(spa->spa_meta_objset, + &vd->vdev_dtl_smo, sm); } - (void) printf("\n"); + for (int c = 0; c < vd->vdev_children; c++) + dump_dtl(vd->vdev_child[c], indent + 4); +} - if (dump_opt['d'] > 5 && vd->vdev_children == 0) { - dump_spacemap(vd->vdev_spa->spa_meta_objset, &vd->vdev_dtl, - &vd->vdev_dtl_map); - (void) printf("\n"); - } +static void +dump_history(spa_t *spa) +{ + nvlist_t **events = NULL; + char buf[SPA_MAXBLOCKSIZE]; + uint64_t resid, len, off = 0; + uint_t num = 0; + int error; + time_t tsec; + struct tm t; + char tbuf[30]; + char internalstr[MAXPATHLEN]; + + do { + len = sizeof (buf); + + if ((error = spa_history_get(spa, &off, &len, buf)) != 0) { + (void) fprintf(stderr, "Unable to read history: " + "error %d\n", error); + return; + } - for (c = 0; c < vd->vdev_children; c++) - dump_dtl(vd->vdev_child[c], indent + 4); + if (zpool_history_unpack(buf, len, &resid, &events, &num) != 0) + break; + + off -= resid; + } while (len != 0); + + (void) printf("\nHistory:\n"); + for (int i = 0; i < num; i++) { + uint64_t time, txg, ievent; + char *cmd, *intstr; + + if (nvlist_lookup_uint64(events[i], ZPOOL_HIST_TIME, + &time) != 0) + continue; + if (nvlist_lookup_string(events[i], ZPOOL_HIST_CMD, + &cmd) != 0) { + if (nvlist_lookup_uint64(events[i], + ZPOOL_HIST_INT_EVENT, &ievent) != 0) + continue; + verify(nvlist_lookup_uint64(events[i], + ZPOOL_HIST_TXG, &txg) == 0); + verify(nvlist_lookup_string(events[i], + ZPOOL_HIST_INT_STR, &intstr) == 0); + if (ievent >= LOG_END) + continue; + + (void) snprintf(internalstr, + sizeof (internalstr), + "[internal %s txg:%lld] %s", + hist_event_table[ievent], txg, + intstr); + cmd = internalstr; + } + tsec = time; + (void) localtime_r(&tsec, &t); + (void) strftime(tbuf, sizeof (tbuf), "%F.%T", &t); + (void) printf("%s %s\n", tbuf, cmd); + } } /*ARGSUSED*/ @@ -567,35 +805,48 @@ dump_dnode(objset_t *os, uint64_t object, void *data, size_t size) } static uint64_t -blkid2offset(const dnode_phys_t *dnp, int level, uint64_t blkid) +blkid2offset(const dnode_phys_t *dnp, const blkptr_t *bp, const zbookmark_t *zb) { - if (level < 0) - return (blkid); + if (dnp == NULL) { + ASSERT(zb->zb_level < 0); + if (zb->zb_object == 0) + return (zb->zb_blkid); + return (zb->zb_blkid * BP_GET_LSIZE(bp)); + } + + ASSERT(zb->zb_level >= 0); - return ((blkid << (level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) * + return ((zb->zb_blkid << + (zb->zb_level * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) * dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT); } static void -sprintf_blkptr_compact(char *blkbuf, blkptr_t *bp, int alldvas) +sprintf_blkptr_compact(char *blkbuf, blkptr_t *bp) { dva_t *dva = bp->blk_dva; - int ndvas = alldvas ? BP_GET_NDVAS(bp) : 1; - int i; + int ndvas = dump_opt['d'] > 5 ? 
BP_GET_NDVAS(bp) : 1; + + if (dump_opt['b'] >= 5) { + sprintf_blkptr(blkbuf, bp); + return; + } blkbuf[0] = '\0'; - for (i = 0; i < ndvas; i++) + for (int i = 0; i < ndvas; i++) (void) sprintf(blkbuf + strlen(blkbuf), "%llu:%llx:%llx ", (u_longlong_t)DVA_GET_VDEV(&dva[i]), (u_longlong_t)DVA_GET_OFFSET(&dva[i]), (u_longlong_t)DVA_GET_ASIZE(&dva[i])); - (void) sprintf(blkbuf + strlen(blkbuf), "%llxL/%llxP F=%llu B=%llu", + (void) sprintf(blkbuf + strlen(blkbuf), + "%llxL/%llxP F=%llu B=%llu/%llu", (u_longlong_t)BP_GET_LSIZE(bp), (u_longlong_t)BP_GET_PSIZE(bp), (u_longlong_t)bp->blk_fill, - (u_longlong_t)bp->blk_birth); + (u_longlong_t)bp->blk_birth, + (u_longlong_t)BP_PHYSICAL_BIRTH(bp)); } static void @@ -608,8 +859,7 @@ print_indirect(blkptr_t *bp, const zbookmark_t *zb, ASSERT3U(BP_GET_TYPE(bp), ==, dnp->dn_type); ASSERT3U(BP_GET_LEVEL(bp), ==, zb->zb_level); - (void) printf("%16llx ", - (u_longlong_t)blkid2offset(dnp, zb->zb_level, zb->zb_blkid)); + (void) printf("%16llx ", (u_longlong_t)blkid2offset(dnp, bp, zb)); ASSERT(zb->zb_level >= 0); @@ -621,23 +871,15 @@ print_indirect(blkptr_t *bp, const zbookmark_t *zb, } } - sprintf_blkptr_compact(blkbuf, bp, dump_opt['d'] > 5 ? 1 : 0); + sprintf_blkptr_compact(blkbuf, bp); (void) printf("%s\n", blkbuf); } -#define SET_BOOKMARK(zb, objset, object, level, blkid) \ -{ \ - (zb)->zb_objset = objset; \ - (zb)->zb_object = object; \ - (zb)->zb_level = level; \ - (zb)->zb_blkid = blkid; \ -} - static int visit_indirect(spa_t *spa, const dnode_phys_t *dnp, blkptr_t *bp, const zbookmark_t *zb) { - int err; + int err = 0; if (bp->blk_birth == 0) return (0); @@ -670,7 +912,8 @@ visit_indirect(spa_t *spa, const dnode_phys_t *dnp, break; fill += cbp->blk_fill; } - ASSERT3U(fill, ==, bp->blk_fill); + if (!err) + ASSERT3U(fill, ==, bp->blk_fill); (void) arc_buf_remove_ref(buf, &buf); } @@ -687,11 +930,11 @@ dump_indirect(dnode_t *dn) (void) printf("Indirect blocks:\n"); - SET_BOOKMARK(&czb, dmu_objset_id(&dn->dn_objset->os), + SET_BOOKMARK(&czb, dmu_objset_id(dn->dn_objset), dn->dn_object, dnp->dn_nlevels - 1, 0); for (j = 0; j < dnp->dn_nblkptr; j++) { czb.zb_blkid = j; - (void) visit_indirect(dmu_objset_spa(&dn->dn_objset->os), dnp, + (void) visit_indirect(dmu_objset_spa(dn->dn_objset), dnp, &dnp->dn_blkptr[j], &czb); } @@ -767,7 +1010,7 @@ dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size) nicenum(ds->ds_compressed_bytes, compressed); nicenum(ds->ds_uncompressed_bytes, uncompressed); nicenum(ds->ds_unique_bytes, unique); - sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, &ds->ds_bp); + sprintf_blkptr(blkbuf, &ds->ds_bp); (void) printf("\t\tdir_obj = %llu\n", (u_longlong_t)ds->ds_dir_obj); @@ -781,6 +1024,8 @@ dump_dsl_dataset(objset_t *os, uint64_t object, void *data, size_t size) (u_longlong_t)ds->ds_snapnames_zapobj); (void) printf("\t\tnum_children = %llu\n", (u_longlong_t)ds->ds_num_children); + (void) printf("\t\tuserrefs_obj = %llu\n", + (u_longlong_t)ds->ds_userrefs_obj); (void) printf("\t\tcreation_time = %s", ctime(&crtime)); (void) printf("\t\tcreation_txg = %llu\n", (u_longlong_t)ds->ds_creation_txg); @@ -816,11 +1061,11 @@ dump_bplist(objset_t *mos, uint64_t object, char *name) if (dump_opt['d'] < 3) return; - mutex_init(&bpl.bpl_lock, NULL, MUTEX_DEFAULT, NULL); + bplist_init(&bpl); VERIFY(0 == bplist_open(&bpl, mos, object)); if (bplist_empty(&bpl)) { bplist_close(&bpl); - mutex_destroy(&bpl.bpl_lock); + bplist_fini(&bpl); return; } @@ -838,7 +1083,7 @@ dump_bplist(objset_t *mos, uint64_t object, char *name) if (dump_opt['d'] < 
5) { bplist_close(&bpl); - mutex_destroy(&bpl.bpl_lock); + bplist_fini(&bpl); return; } @@ -848,13 +1093,13 @@ dump_bplist(objset_t *mos, uint64_t object, char *name) char blkbuf[BP_SPRINTF_LEN]; ASSERT(bp->blk_birth != 0); - sprintf_blkptr_compact(blkbuf, bp, dump_opt['d'] > 5 ? 1 : 0); + sprintf_blkptr_compact(blkbuf, bp); (void) printf("\tItem %3llu: %s\n", (u_longlong_t)itor - 1, blkbuf); } bplist_close(&bpl); - mutex_destroy(&bpl.bpl_lock); + bplist_fini(&bpl); } static avl_tree_t idx_tree; @@ -906,6 +1151,7 @@ dump_uidgid(objset_t *os, znode_phys_t *zp) /* first find the fuid object. It lives in the master node */ VERIFY(zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, &fuid_obj) == 0); + zfs_fuid_avl_tree_create(&idx_tree, &domain_tree); (void) zfs_fuid_table_load(os, fuid_obj, &idx_tree, &domain_tree); fuid_table_loaded = B_TRUE; @@ -969,7 +1215,7 @@ dump_dmu_objset(objset_t *os, uint64_t object, void *data, size_t size) { } -static object_viewer_t *object_viewer[DMU_OT_NUMTYPES] = { +static object_viewer_t *object_viewer[DMU_OT_NUMTYPES + 1] = { dump_none, /* unallocated */ dump_zap, /* object directory */ dump_uint64, /* object array */ @@ -1009,6 +1255,12 @@ static object_viewer_t *object_viewer[DMU_OT_NUMTYPES] = { dump_packed_nvlist, /* FUID nvlist size */ dump_zap, /* DSL dataset next clones */ dump_zap, /* DSL scrub queue */ + dump_zap, /* ZFS user/group used */ + dump_zap, /* ZFS user/group quota */ + dump_zap, /* snapshot refcount tags */ + dump_ddt_zap, /* DDT ZAP object */ + dump_zap, /* DDT statistics */ + dump_unknown /* Unknown type, must be last */ }; static void @@ -1019,18 +1271,19 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header) dnode_t *dn; void *bonus = NULL; size_t bsize = 0; - char iblk[6], dblk[6], lsize[6], asize[6], bonus_size[6], segsize[6]; + char iblk[6], dblk[6], lsize[6], asize[6], bonus_size[6], fill[7]; char aux[50]; int error; if (*print_header) { - (void) printf("\n Object lvl iblk dblk lsize" - " asize type\n"); + (void) printf("\n%10s %3s %5s %5s %5s %5s %6s %s\n", + "Object", "lvl", "iblk", "dblk", "dsize", "lsize", + "%full", "type"); *print_header = 0; } if (object == 0) { - dn = os->os->os_meta_dnode; + dn = os->os_meta_dnode; } else { error = dmu_bonus_hold(os, object, FTAG, &db); if (error) @@ -1044,36 +1297,47 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header) nicenum(doi.doi_metadata_block_size, iblk); nicenum(doi.doi_data_block_size, dblk); - nicenum(doi.doi_data_block_size * (doi.doi_max_block_offset + 1), - lsize); - nicenum(doi.doi_physical_blks << 9, asize); + nicenum(doi.doi_max_offset, lsize); + nicenum(doi.doi_physical_blocks_512 << 9, asize); nicenum(doi.doi_bonus_size, bonus_size); + (void) sprintf(fill, "%6.2f", 100.0 * doi.doi_fill_count * + doi.doi_data_block_size / (object == 0 ? 
DNODES_PER_BLOCK : 1) / + doi.doi_max_offset); aux[0] = '\0'; if (doi.doi_checksum != ZIO_CHECKSUM_INHERIT || verbosity >= 6) { (void) snprintf(aux + strlen(aux), sizeof (aux), " (K=%s)", - zio_checksum_table[doi.doi_checksum].ci_name); + ZDB_CHECKSUM_NAME(doi.doi_checksum)); } if (doi.doi_compress != ZIO_COMPRESS_INHERIT || verbosity >= 6) { (void) snprintf(aux + strlen(aux), sizeof (aux), " (Z=%s)", - zio_compress_table[doi.doi_compress].ci_name); + ZDB_COMPRESS_NAME(doi.doi_compress)); } - (void) printf("%10lld %3u %5s %5s %5s %5s %s%s\n", - (u_longlong_t)object, doi.doi_indirection, iblk, dblk, lsize, - asize, dmu_ot[doi.doi_type].ot_name, aux); + (void) printf("%10lld %3u %5s %5s %5s %5s %6s %s%s\n", + (u_longlong_t)object, doi.doi_indirection, iblk, dblk, + asize, lsize, fill, ZDB_OT_NAME(doi.doi_type), aux); if (doi.doi_bonus_type != DMU_OT_NONE && verbosity > 3) { - (void) printf("%10s %3s %5s %5s %5s %5s %s\n", - "", "", "", "", bonus_size, "bonus", - dmu_ot[doi.doi_bonus_type].ot_name); + (void) printf("%10s %3s %5s %5s %5s %5s %6s %s\n", + "", "", "", "", "", bonus_size, "bonus", + ZDB_OT_NAME(doi.doi_bonus_type)); } if (verbosity >= 4) { - object_viewer[doi.doi_bonus_type](os, object, bonus, bsize); - object_viewer[doi.doi_type](os, object, NULL, 0); + (void) printf("\tdnode flags: %s%s\n", + (dn->dn_phys->dn_flags & DNODE_FLAG_USED_BYTES) ? + "USED_BYTES " : "", + (dn->dn_phys->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED) ? + "USERUSED_ACCOUNTED " : ""); + (void) printf("\tdnode maxblkid: %llu\n", + (longlong_t)dn->dn_phys->dn_maxblkid); + + object_viewer[ZDB_OT_TYPE(doi.doi_bonus_type)](os, object, + bonus, bsize); + object_viewer[ZDB_OT_TYPE(doi.doi_type)](os, object, NULL, 0); *print_header = 1; } @@ -1095,6 +1359,7 @@ dump_object(objset_t *os, uint64_t object, int verbosity, int *print_header) } for (;;) { + char segsize[6]; error = dnode_next_offset(dn, 0, &start, minlvl, blkfill, 0); if (error) @@ -1126,7 +1391,7 @@ dump_dir(objset_t *os) uint64_t object, object_count; uint64_t refdbytes, usedobjs, scratch; char numbuf[8]; - char blkbuf[BP_SPRINTF_LEN]; + char blkbuf[BP_SPRINTF_LEN + 20]; char osname[MAXNAMELEN]; char *type = "UNKNOWN"; int verbosity = dump_opt['d']; @@ -1140,21 +1405,20 @@ dump_dir(objset_t *os) if (dds.dds_type == DMU_OST_META) { dds.dds_creation_txg = TXG_INITIAL; - usedobjs = os->os->os_rootbp->blk_fill; - refdbytes = os->os->os_spa->spa_dsl_pool-> + usedobjs = os->os_rootbp->blk_fill; + refdbytes = os->os_spa->spa_dsl_pool-> dp_mos_dir->dd_phys->dd_used_bytes; } else { dmu_objset_space(os, &refdbytes, &scratch, &usedobjs, &scratch); } - ASSERT3U(usedobjs, ==, os->os->os_rootbp->blk_fill); + ASSERT3U(usedobjs, ==, os->os_rootbp->blk_fill); nicenum(refdbytes, numbuf); if (verbosity >= 4) { - (void) strcpy(blkbuf, ", rootbp "); - sprintf_blkptr(blkbuf + strlen(blkbuf), - BP_SPRINTF_LEN - strlen(blkbuf), os->os->os_rootbp); + (void) sprintf(blkbuf, ", rootbp "); + (void) sprintf_blkptr(blkbuf + strlen(blkbuf), os->os_rootbp); } else { blkbuf[0] = '\0'; } @@ -1167,7 +1431,16 @@ dump_dir(objset_t *os) (u_longlong_t)dds.dds_creation_txg, numbuf, (u_longlong_t)usedobjs, blkbuf); - dump_intent_log(dmu_objset_zil(os)); + if (zopt_objects != 0) { + for (i = 0; i < zopt_objects; i++) + dump_object(os, zopt_object[i], verbosity, + &print_header); + (void) printf("\n"); + return; + } + + if (dump_opt['i'] != 0 || verbosity >= 2) + dump_intent_log(dmu_objset_zil(os)); if (dmu_objset_ds(os) != NULL) dump_bplist(dmu_objset_pool(os)->dp_meta_objset, @@ -1176,19 +1449,16 
@@ dump_dir(objset_t *os) if (verbosity < 2) return; - if (os->os->os_rootbp->blk_birth == 0) - return; - - if (zopt_objects != 0) { - for (i = 0; i < zopt_objects; i++) - dump_object(os, zopt_object[i], verbosity, - &print_header); - (void) printf("\n"); + if (os->os_rootbp->blk_birth == 0) return; - } dump_object(os, 0, verbosity, &print_header); - object_count = 1; + object_count = 0; + if (os->os_userused_dnode && + os->os_userused_dnode->dn_type != 0) { + dump_object(os, DMU_USERUSED_OBJECT, verbosity, &print_header); + dump_object(os, DMU_GROUPUSED_OBJECT, verbosity, &print_header); + } object = 0; while ((error = dmu_object_next(os, &object, B_FALSE, 0)) == 0) { @@ -1200,16 +1470,18 @@ dump_dir(objset_t *os) (void) printf("\n"); - if (error != ESRCH) - fatal("dmu_object_next() = %d", error); + if (error != ESRCH) { + (void) fprintf(stderr, "dmu_object_next() = %d\n", error); + abort(); + } } static void -dump_uberblock(uberblock_t *ub) +dump_uberblock(uberblock_t *ub, const char *header, const char *footer) { time_t timestamp = ub->ub_timestamp; - (void) printf("Uberblock\n\n"); + (void) printf(header ? header : ""); (void) printf("\tmagic = %016llx\n", (u_longlong_t)ub->ub_magic); (void) printf("\tversion = %llu\n", (u_longlong_t)ub->ub_version); (void) printf("\ttxg = %llu\n", (u_longlong_t)ub->ub_txg); @@ -1218,25 +1490,34 @@ dump_uberblock(uberblock_t *ub) (u_longlong_t)ub->ub_timestamp, asctime(localtime(×tamp))); if (dump_opt['u'] >= 3) { char blkbuf[BP_SPRINTF_LEN]; - sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, &ub->ub_rootbp); + sprintf_blkptr(blkbuf, &ub->ub_rootbp); (void) printf("\trootbp = %s\n", blkbuf); } - (void) printf("\n"); + (void) printf(footer ? footer : ""); } static void -dump_config(const char *pool) +dump_config(spa_t *spa) { - spa_t *spa = NULL; + dmu_buf_t *db; + size_t nvsize = 0; + int error = 0; - mutex_enter(&spa_namespace_lock); - while ((spa = spa_next(spa)) != NULL) { - if (pool == NULL) - (void) printf("%s\n", spa_name(spa)); - if (pool == NULL || strcmp(pool, spa_name(spa)) == 0) - dump_nvlist(spa->spa_config, 4); + + error = dmu_bonus_hold(spa->spa_meta_objset, + spa->spa_config_object, FTAG, &db); + + if (error == 0) { + nvsize = *(uint64_t *)db->db_data; + dmu_buf_rele(db, FTAG); + + (void) printf("\nMOS Configuration:\n"); + dump_packed_nvlist(spa->spa_meta_objset, + spa->spa_config_object, (void *)&nvsize, 1); + } else { + (void) fprintf(stderr, "dmu_bonus_hold(%llu) failed, errno %d", + (u_longlong_t)spa->spa_config_object, error); } - mutex_exit(&spa_namespace_lock); } static void @@ -1285,6 +1566,30 @@ dump_cachefile(const char *cachefile) nvlist_free(config); } +#define ZDB_MAX_UB_HEADER_SIZE 32 + +static void +dump_label_uberblocks(vdev_label_t *lbl, uint64_t ashift) +{ + vdev_t vd; + vdev_t *vdp = &vd; + char header[ZDB_MAX_UB_HEADER_SIZE]; + + vd.vdev_ashift = ashift; + vdp->vdev_top = vdp; + + for (int i = 0; i < VDEV_UBERBLOCK_COUNT(vdp); i++) { + uint64_t uoff = VDEV_UBERBLOCK_OFFSET(vdp, i); + uberblock_t *ub = (void *)((char *)lbl + uoff); + + if (uberblock_verify(ub)) + continue; + (void) snprintf(header, ZDB_MAX_UB_HEADER_SIZE, + "Uberblock[%d]\n", i); + dump_uberblock(ub, header, ""); + } +} + static void dump_label(const char *dev) { @@ -1293,8 +1598,7 @@ dump_label(const char *dev) char *buf = label.vl_vdev_phys.vp_nvlist; size_t buflen = sizeof (label.vl_vdev_phys.vp_nvlist); struct stat64 statbuf; - uint64_t psize; - int l; + uint64_t psize, ashift; if ((fd = open64(dev, O_RDONLY)) < 0) { (void) printf("cannot open '%s': %s\n", 
dev, strerror(errno)); @@ -1304,14 +1608,12 @@ dump_label(const char *dev) if (fstat64(fd, &statbuf) != 0) { (void) printf("failed to stat '%s': %s\n", dev, strerror(errno)); - exit(1); } psize = statbuf.st_size; psize = P2ALIGN(psize, (uint64_t)sizeof (vdev_label_t)); - for (l = 0; l < VDEV_LABELS; l++) { - + for (int l = 0; l < VDEV_LABELS; l++) { nvlist_t *config = NULL; (void) printf("--------------------------------------------\n"); @@ -1326,130 +1628,89 @@ dump_label(const char *dev) if (nvlist_unpack(buf, buflen, &config, 0) != 0) { (void) printf("failed to unpack label %d\n", l); - continue; + ashift = SPA_MINBLOCKSHIFT; + } else { + nvlist_t *vdev_tree = NULL; + + dump_nvlist(config, 4); + if ((nvlist_lookup_nvlist(config, + ZPOOL_CONFIG_VDEV_TREE, &vdev_tree) != 0) || + (nvlist_lookup_uint64(vdev_tree, + ZPOOL_CONFIG_ASHIFT, &ashift) != 0)) + ashift = SPA_MINBLOCKSHIFT; + nvlist_free(config); } - dump_nvlist(config, 4); - nvlist_free(config); + if (dump_opt['u']) + dump_label_uberblocks(&label, ashift); } } /*ARGSUSED*/ static int -dump_one_dir(char *dsname, void *arg) +dump_one_dir(const char *dsname, void *arg) { int error; objset_t *os; - error = dmu_objset_open(dsname, DMU_OST_ANY, - DS_MODE_USER | DS_MODE_READONLY, &os); + error = dmu_objset_own(dsname, DMU_OST_ANY, B_TRUE, FTAG, &os); if (error) { - (void) printf("Could not open %s\n", dsname); + (void) printf("Could not open %s, error %d\n", dsname, error); return (0); } dump_dir(os); - dmu_objset_close(os); + dmu_objset_disown(os, FTAG); fuid_table_destroy(); return (0); } -static void -zdb_leak(space_map_t *sm, uint64_t start, uint64_t size) -{ - vdev_t *vd = sm->sm_ppd; +/* + * Block statistics. + */ +typedef struct zdb_blkstats { + uint64_t zb_asize; + uint64_t zb_lsize; + uint64_t zb_psize; + uint64_t zb_count; +} zdb_blkstats_t; - (void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n", - (u_longlong_t)vd->vdev_id, (u_longlong_t)start, (u_longlong_t)size); -} +/* + * Extended object types to report deferred frees and dedup auto-ditto blocks. 
+ */ +#define ZDB_OT_DEFERRED (DMU_OT_NUMTYPES + 0) +#define ZDB_OT_DITTO (DMU_OT_NUMTYPES + 1) +#define ZDB_OT_TOTAL (DMU_OT_NUMTYPES + 2) + +static char *zdb_ot_extname[] = { + "deferred free", + "dedup ditto", + "Total", +}; -/* ARGSUSED */ -static void -zdb_space_map_load(space_map_t *sm) -{ -} +#define ZB_TOTAL DN_MAX_LEVELS -static void -zdb_space_map_unload(space_map_t *sm) -{ - space_map_vacate(sm, zdb_leak, sm); -} +typedef struct zdb_cb { + zdb_blkstats_t zcb_type[ZB_TOTAL + 1][ZDB_OT_TOTAL + 1]; + uint64_t zcb_dedup_asize; + uint64_t zcb_dedup_blocks; + uint64_t zcb_errors[256]; + int zcb_readfails; + int zcb_haderrors; +} zdb_cb_t; -/* ARGSUSED */ static void -zdb_space_map_claim(space_map_t *sm, uint64_t start, uint64_t size) +zdb_count_block(spa_t *spa, zilog_t *zilog, zdb_cb_t *zcb, const blkptr_t *bp, + dmu_object_type_t type) { -} + uint64_t refcnt = 0; -static space_map_ops_t zdb_space_map_ops = { - zdb_space_map_load, - zdb_space_map_unload, - NULL, /* alloc */ - zdb_space_map_claim, - NULL /* free */ -}; + ASSERT(type < ZDB_OT_TOTAL); -static void -zdb_leak_init(spa_t *spa) -{ - vdev_t *rvd = spa->spa_root_vdev; - - for (int c = 0; c < rvd->vdev_children; c++) { - vdev_t *vd = rvd->vdev_child[c]; - for (int m = 0; m < vd->vdev_ms_count; m++) { - metaslab_t *msp = vd->vdev_ms[m]; - mutex_enter(&msp->ms_lock); - VERIFY(space_map_load(&msp->ms_map, &zdb_space_map_ops, - SM_ALLOC, &msp->ms_smo, spa->spa_meta_objset) == 0); - msp->ms_map.sm_ppd = vd; - mutex_exit(&msp->ms_lock); - } - } -} - -static void -zdb_leak_fini(spa_t *spa) -{ - vdev_t *rvd = spa->spa_root_vdev; - - for (int c = 0; c < rvd->vdev_children; c++) { - vdev_t *vd = rvd->vdev_child[c]; - for (int m = 0; m < vd->vdev_ms_count; m++) { - metaslab_t *msp = vd->vdev_ms[m]; - mutex_enter(&msp->ms_lock); - space_map_unload(&msp->ms_map); - mutex_exit(&msp->ms_lock); - } - } -} - -/* - * Verify that the sum of the sizes of all blocks in the pool adds up - * to the SPA's sa_alloc total. - */ -typedef struct zdb_blkstats { - uint64_t zb_asize; - uint64_t zb_lsize; - uint64_t zb_psize; - uint64_t zb_count; -} zdb_blkstats_t; - -#define DMU_OT_DEFERRED DMU_OT_NONE -#define DMU_OT_TOTAL DMU_OT_NUMTYPES - -#define ZB_TOTAL DN_MAX_LEVELS - -typedef struct zdb_cb { - zdb_blkstats_t zcb_type[ZB_TOTAL + 1][DMU_OT_TOTAL + 1]; - uint64_t zcb_errors[256]; - int zcb_readfails; - int zcb_haderrors; -} zdb_cb_t; + if (zilog && zil_bp_tree_add(zilog, bp) != 0) + return; -static void -zdb_count_block(spa_t *spa, zdb_cb_t *zcb, blkptr_t *bp, dmu_object_type_t type) -{ for (int i = 0; i < 4; i++) { int l = (i < 2) ? BP_GET_LEVEL(bp) : ZB_TOTAL; - int t = (i & 1) ? type : DMU_OT_TOTAL; + int t = (i & 1) ? 
type : ZDB_OT_TOTAL; zdb_blkstats_t *zb = &zcb->zcb_type[l][t]; zb->zb_asize += BP_GET_ASIZE(bp); @@ -1458,114 +1719,240 @@ zdb_count_block(spa_t *spa, zdb_cb_t *zcb, blkptr_t *bp, dmu_object_type_t type) zb->zb_count++; } - if (dump_opt['S']) { - boolean_t print_sig; - - print_sig = !zdb_sig_user_data || (BP_GET_LEVEL(bp) == 0 && - BP_GET_TYPE(bp) == DMU_OT_PLAIN_FILE_CONTENTS); - - if (BP_GET_CHECKSUM(bp) < zdb_sig_cksumalg) - print_sig = B_FALSE; - - if (print_sig) { - (void) printf("%llu\t%lld\t%lld\t%s\t%s\t%s\t" - "%llx:%llx:%llx:%llx\n", - (u_longlong_t)BP_GET_LEVEL(bp), - (longlong_t)BP_GET_PSIZE(bp), - (longlong_t)BP_GET_NDVAS(bp), - dmu_ot[BP_GET_TYPE(bp)].ot_name, - zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name, - zio_compress_table[BP_GET_COMPRESS(bp)].ci_name, - (u_longlong_t)bp->blk_cksum.zc_word[0], - (u_longlong_t)bp->blk_cksum.zc_word[1], - (u_longlong_t)bp->blk_cksum.zc_word[2], - (u_longlong_t)bp->blk_cksum.zc_word[3]); + if (dump_opt['L']) + return; + + if (BP_GET_DEDUP(bp)) { + ddt_t *ddt; + ddt_entry_t *dde; + + ddt = ddt_select(spa, bp); + ddt_enter(ddt); + dde = ddt_lookup(ddt, bp, B_FALSE); + + if (dde == NULL) { + refcnt = 0; + } else { + ddt_phys_t *ddp = ddt_phys_select(dde, bp); + ddt_phys_decref(ddp); + refcnt = ddp->ddp_refcnt; + if (ddt_phys_total_refcnt(dde) == 0) + ddt_remove(ddt, dde); } + ddt_exit(ddt); } - if (!dump_opt['L']) - VERIFY(zio_wait(zio_claim(NULL, spa, spa_first_txg(spa), bp, - NULL, NULL, ZIO_FLAG_MUSTSUCCEED)) == 0); + VERIFY3U(zio_wait(zio_claim(NULL, spa, + refcnt ? 0 : spa_first_txg(spa), + bp, NULL, NULL, ZIO_FLAG_CANFAIL)), ==, 0); } static int -zdb_blkptr_cb(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb, - const dnode_phys_t *dnp, void *arg) +zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, + const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) { zdb_cb_t *zcb = arg; char blkbuf[BP_SPRINTF_LEN]; + dmu_object_type_t type; + boolean_t is_metadata; if (bp == NULL) return (0); - zdb_count_block(spa, zcb, bp, BP_GET_TYPE(bp)); + type = BP_GET_TYPE(bp); + + zdb_count_block(spa, zilog, zcb, bp, type); + + is_metadata = (BP_GET_LEVEL(bp) != 0 || dmu_ot[type].ot_metadata); - if (dump_opt['c'] || dump_opt['S']) { - int ioerr, size; - void *data; + if (dump_opt['c'] > 1 || (dump_opt['c'] && is_metadata)) { + int ioerr; + size_t size = BP_GET_PSIZE(bp); + void *data = malloc(size); + int flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB | ZIO_FLAG_RAW; + + /* If it's an intent log block, failure is expected. 
*/ + if (zb->zb_level == ZB_ZIL_LEVEL) + flags |= ZIO_FLAG_SPECULATIVE; - size = BP_GET_LSIZE(bp); - data = malloc(size); ioerr = zio_wait(zio_read(NULL, spa, bp, data, size, - NULL, NULL, ZIO_PRIORITY_ASYNC_READ, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SCRUB, zb)); + NULL, NULL, ZIO_PRIORITY_ASYNC_READ, flags, zb)); + free(data); - /* We expect io errors on intent log */ - if (ioerr && BP_GET_TYPE(bp) != DMU_OT_INTENT_LOG) { + if (ioerr && !(flags & ZIO_FLAG_SPECULATIVE)) { zcb->zcb_haderrors = 1; zcb->zcb_errors[ioerr]++; if (dump_opt['b'] >= 2) - sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, bp); + sprintf_blkptr(blkbuf, bp); else blkbuf[0] = '\0'; - if (!dump_opt['S']) { - (void) printf("zdb_blkptr_cb: " - "Got error %d reading " - "<%llu, %llu, %lld, %llx> %s -- skipping\n", - ioerr, - (u_longlong_t)zb->zb_objset, - (u_longlong_t)zb->zb_object, - (u_longlong_t)zb->zb_level, - (u_longlong_t)zb->zb_blkid, - blkbuf); - } + (void) printf("zdb_blkptr_cb: " + "Got error %d reading " + "<%llu, %llu, %lld, %llx> %s -- skipping\n", + ioerr, + (u_longlong_t)zb->zb_objset, + (u_longlong_t)zb->zb_object, + (u_longlong_t)zb->zb_level, + (u_longlong_t)zb->zb_blkid, + blkbuf); } } zcb->zcb_readfails = 0; if (dump_opt['b'] >= 4) { - sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, bp); - (void) printf("objset %llu object %llu offset 0x%llx %s\n", + sprintf_blkptr(blkbuf, bp); + (void) printf("objset %llu object %llu " + "level %lld offset 0x%llx %s\n", (u_longlong_t)zb->zb_objset, (u_longlong_t)zb->zb_object, - (u_longlong_t)blkid2offset(dnp, zb->zb_level, zb->zb_blkid), + (longlong_t)zb->zb_level, + (u_longlong_t)blkid2offset(dnp, bp, zb), blkbuf); } return (0); } +static void +zdb_leak(space_map_t *sm, uint64_t start, uint64_t size) +{ + vdev_t *vd = sm->sm_ppd; + + (void) printf("leaked space: vdev %llu, offset 0x%llx, size %llu\n", + (u_longlong_t)vd->vdev_id, (u_longlong_t)start, (u_longlong_t)size); +} + +/* ARGSUSED */ +static void +zdb_space_map_load(space_map_t *sm) +{ +} + +static void +zdb_space_map_unload(space_map_t *sm) +{ + space_map_vacate(sm, zdb_leak, sm); +} + +/* ARGSUSED */ +static void +zdb_space_map_claim(space_map_t *sm, uint64_t start, uint64_t size) +{ +} + +static space_map_ops_t zdb_space_map_ops = { + zdb_space_map_load, + zdb_space_map_unload, + NULL, /* alloc */ + zdb_space_map_claim, + NULL, /* free */ + NULL /* maxsize */ +}; + +static void +zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb) +{ + ddt_bookmark_t ddb = { 0 }; + ddt_entry_t dde; + int error; + + while ((error = ddt_walk(spa, &ddb, &dde)) == 0) { + blkptr_t blk; + ddt_phys_t *ddp = dde.dde_phys; + + if (ddb.ddb_class == DDT_CLASS_UNIQUE) + return; + + ASSERT(ddt_phys_total_refcnt(&dde) > 1); + + for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { + if (ddp->ddp_phys_birth == 0) + continue; + ddt_bp_create(ddb.ddb_checksum, + &dde.dde_key, ddp, &blk); + if (p == DDT_PHYS_DITTO) { + zdb_count_block(spa, NULL, zcb, &blk, + ZDB_OT_DITTO); + } else { + zcb->zcb_dedup_asize += + BP_GET_ASIZE(&blk) * (ddp->ddp_refcnt - 1); + zcb->zcb_dedup_blocks++; + } + } + if (!dump_opt['L']) { + ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum]; + ddt_enter(ddt); + VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL); + ddt_exit(ddt); + } + } + + ASSERT(error == ENOENT); +} + +static void +zdb_leak_init(spa_t *spa, zdb_cb_t *zcb) +{ + if (!dump_opt['L']) { + vdev_t *rvd = spa->spa_root_vdev; + for (int c = 0; c < rvd->vdev_children; c++) { + vdev_t *vd = rvd->vdev_child[c]; + for (int m = 0; m < vd->vdev_ms_count; m++) { + metaslab_t *msp = vd->vdev_ms[m]; + 
mutex_enter(&msp->ms_lock); + space_map_unload(&msp->ms_map); + VERIFY(space_map_load(&msp->ms_map, + &zdb_space_map_ops, SM_ALLOC, &msp->ms_smo, + spa->spa_meta_objset) == 0); + msp->ms_map.sm_ppd = vd; + mutex_exit(&msp->ms_lock); + } + } + } + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + + zdb_ddt_leak_init(spa, zcb); + + spa_config_exit(spa, SCL_CONFIG, FTAG); +} + +static void +zdb_leak_fini(spa_t *spa) +{ + if (!dump_opt['L']) { + vdev_t *rvd = spa->spa_root_vdev; + for (int c = 0; c < rvd->vdev_children; c++) { + vdev_t *vd = rvd->vdev_child[c]; + for (int m = 0; m < vd->vdev_ms_count; m++) { + metaslab_t *msp = vd->vdev_ms[m]; + mutex_enter(&msp->ms_lock); + space_map_unload(&msp->ms_map); + mutex_exit(&msp->ms_lock); + } + } + } +} + static int dump_block_stats(spa_t *spa) { zdb_cb_t zcb = { 0 }; zdb_blkstats_t *zb, *tzb; - uint64_t alloc, space, logalloc; - vdev_t *rvd = spa->spa_root_vdev; + uint64_t norm_alloc, norm_space, total_alloc, total_found; + int flags = TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA | TRAVERSE_HARD; int leaks = 0; - int c, e; - if (!dump_opt['S']) { - (void) printf("\nTraversing all blocks %s%s%s%s...\n", - (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "", - dump_opt['c'] ? "checksums " : "", - (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "", - !dump_opt['L'] ? "nothing leaked " : ""); - } + (void) printf("\nTraversing all blocks %s%s%s%s%s...\n", + (dump_opt['c'] || !dump_opt['L']) ? "to verify " : "", + (dump_opt['c'] == 1) ? "metadata " : "", + dump_opt['c'] ? "checksums " : "", + (dump_opt['c'] && !dump_opt['L']) ? "and verify " : "", + !dump_opt['L'] ? "nothing leaked " : ""); /* * Load all space maps as SM_ALLOC maps, then traverse the pool @@ -1575,39 +1962,41 @@ dump_block_stats(spa_t *spa) * it's not part of any space map) is a double allocation, * reference to a freed block, or an unclaimed log block. */ - if (!dump_opt['L']) - zdb_leak_init(spa); + zdb_leak_init(spa, &zcb); /* * If there's a deferred-free bplist, process that first. */ - if (spa->spa_sync_bplist_obj != 0) { - bplist_t *bpl = &spa->spa_sync_bplist; + if (spa->spa_deferred_bplist_obj != 0) { + bplist_t *bpl = &spa->spa_deferred_bplist; blkptr_t blk; uint64_t itor = 0; VERIFY(0 == bplist_open(bpl, spa->spa_meta_objset, - spa->spa_sync_bplist_obj)); + spa->spa_deferred_bplist_obj)); while (bplist_iterate(bpl, &itor, &blk) == 0) { if (dump_opt['b'] >= 4) { char blkbuf[BP_SPRINTF_LEN]; - sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, &blk); + sprintf_blkptr(blkbuf, &blk); (void) printf("[%s] %s\n", "deferred free", blkbuf); } - zdb_count_block(spa, &zcb, &blk, DMU_OT_DEFERRED); + zdb_count_block(spa, NULL, &zcb, &blk, ZDB_OT_DEFERRED); } bplist_close(bpl); } - zcb.zcb_haderrors |= traverse_pool(spa, zdb_blkptr_cb, &zcb); + if (dump_opt['c'] > 1) + flags |= TRAVERSE_PREFETCH_DATA; + + zcb.zcb_haderrors |= traverse_pool(spa, 0, flags, zdb_blkptr_cb, &zcb); - if (zcb.zcb_haderrors && !dump_opt['S']) { + if (zcb.zcb_haderrors) { (void) printf("\nError counts:\n\n"); (void) printf("\t%5s %s\n", "errno", "count"); - for (e = 0; e < 256; e++) { + for (int e = 0; e < 256; e++) { if (zcb.zcb_errors[e] != 0) { (void) printf("\t%5d %llu\n", e, (u_longlong_t)zcb.zcb_errors[e]); @@ -1618,43 +2007,27 @@ dump_block_stats(spa_t *spa) /* * Report any leaked segments. */ - if (!dump_opt['L']) - zdb_leak_fini(spa); + zdb_leak_fini(spa); - /* - * If we're interested in printing out the blkptr signatures, - * return now as we don't print out anything else (including - * errors and leaks). 
- */ - if (dump_opt['S']) - return (zcb.zcb_haderrors ? 3 : 0); - - alloc = spa_get_alloc(spa); - space = spa_get_space(spa); - - /* - * Log blocks allocated from a separate log device don't count - * as part of the normal pool space; factor them in here. - */ - logalloc = 0; + tzb = &zcb.zcb_type[ZB_TOTAL][ZDB_OT_TOTAL]; - for (c = 0; c < rvd->vdev_children; c++) - if (rvd->vdev_child[c]->vdev_islog) - logalloc += rvd->vdev_child[c]->vdev_stat.vs_alloc; + norm_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); + norm_space = metaslab_class_get_space(spa_normal_class(spa)); - tzb = &zcb.zcb_type[ZB_TOTAL][DMU_OT_TOTAL]; + total_alloc = norm_alloc + metaslab_class_get_alloc(spa_log_class(spa)); + total_found = tzb->zb_asize - zcb.zcb_dedup_asize; - if (tzb->zb_asize == alloc + logalloc) { + if (total_found == total_alloc) { if (!dump_opt['L']) (void) printf("\n\tNo leaks (block sum matches space" " maps exactly)\n"); } else { (void) printf("block traversal size %llu != alloc %llu " "(%s %lld)\n", - (u_longlong_t)tzb->zb_asize, - (u_longlong_t)alloc + logalloc, + (u_longlong_t)total_found, + (u_longlong_t)total_alloc, (dump_opt['L']) ? "unreachable" : "leaked", - (longlong_t)(alloc + logalloc - tzb->zb_asize)); + (longlong_t)(total_alloc - total_found)); leaks = 1; } @@ -1664,33 +2037,40 @@ dump_block_stats(spa_t *spa) (void) printf("\n"); (void) printf("\tbp count: %10llu\n", (u_longlong_t)tzb->zb_count); - (void) printf("\tbp logical: %10llu\t avg: %6llu\n", + (void) printf("\tbp logical: %10llu avg: %6llu\n", (u_longlong_t)tzb->zb_lsize, (u_longlong_t)(tzb->zb_lsize / tzb->zb_count)); - (void) printf("\tbp physical: %10llu\t avg:" - " %6llu\tcompression: %6.2f\n", + (void) printf("\tbp physical: %10llu avg:" + " %6llu compression: %6.2f\n", (u_longlong_t)tzb->zb_psize, (u_longlong_t)(tzb->zb_psize / tzb->zb_count), (double)tzb->zb_lsize / tzb->zb_psize); - (void) printf("\tbp allocated: %10llu\t avg:" - " %6llu\tcompression: %6.2f\n", + (void) printf("\tbp allocated: %10llu avg:" + " %6llu compression: %6.2f\n", (u_longlong_t)tzb->zb_asize, (u_longlong_t)(tzb->zb_asize / tzb->zb_count), (double)tzb->zb_lsize / tzb->zb_asize); - (void) printf("\tSPA allocated: %10llu\tused: %5.2f%%\n", - (u_longlong_t)alloc, 100.0 * alloc / space); + (void) printf("\tbp deduped: %10llu ref>1:" + " %6llu deduplication: %6.2f\n", + (u_longlong_t)zcb.zcb_dedup_asize, + (u_longlong_t)zcb.zcb_dedup_blocks, + (double)zcb.zcb_dedup_asize / tzb->zb_asize + 1.0); + (void) printf("\tSPA allocated: %10llu used: %5.2f%%\n", + (u_longlong_t)norm_alloc, 100.0 * norm_alloc / norm_space); if (dump_opt['b'] >= 2) { int l, t, level; (void) printf("\nBlocks\tLSIZE\tPSIZE\tASIZE" "\t avg\t comp\t%%Total\tType\n"); - for (t = 0; t <= DMU_OT_NUMTYPES; t++) { + for (t = 0; t <= ZDB_OT_TOTAL; t++) { char csize[6], lsize[6], psize[6], asize[6], avg[6]; char *typename; - typename = t == DMU_OT_DEFERRED ? "deferred free" : - t == DMU_OT_TOTAL ? 
"Total" : dmu_ot[t].ot_name; + if (t < DMU_OT_NUMTYPES) + typename = dmu_ot[t].ot_name; + else + typename = zdb_ot_extname[t - DMU_OT_NUMTYPES]; if (zcb.zcb_type[ZB_TOTAL][t].zb_asize == 0) { (void) printf("%6s\t%5s\t%5s\t%5s" @@ -1752,33 +2132,154 @@ dump_block_stats(spa_t *spa) return (0); } +typedef struct zdb_ddt_entry { + ddt_key_t zdde_key; + uint64_t zdde_ref_blocks; + uint64_t zdde_ref_lsize; + uint64_t zdde_ref_psize; + uint64_t zdde_ref_dsize; + avl_node_t zdde_node; +} zdb_ddt_entry_t; + +/* ARGSUSED */ +static int +zdb_ddt_add_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, + const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) +{ + avl_tree_t *t = arg; + avl_index_t where; + zdb_ddt_entry_t *zdde, zdde_search; + + if (bp == NULL) + return (0); + + if (dump_opt['S'] > 1 && zb->zb_level == ZB_ROOT_LEVEL) { + (void) printf("traversing objset %llu, %llu objects, " + "%lu blocks so far\n", + (u_longlong_t)zb->zb_objset, + (u_longlong_t)bp->blk_fill, + avl_numnodes(t)); + } + + if (BP_IS_HOLE(bp) || BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_OFF || + BP_GET_LEVEL(bp) > 0 || dmu_ot[BP_GET_TYPE(bp)].ot_metadata) + return (0); + + ddt_key_fill(&zdde_search.zdde_key, bp); + + zdde = avl_find(t, &zdde_search, &where); + + if (zdde == NULL) { + zdde = umem_zalloc(sizeof (*zdde), UMEM_NOFAIL); + zdde->zdde_key = zdde_search.zdde_key; + avl_insert(t, zdde, where); + } + + zdde->zdde_ref_blocks += 1; + zdde->zdde_ref_lsize += BP_GET_LSIZE(bp); + zdde->zdde_ref_psize += BP_GET_PSIZE(bp); + zdde->zdde_ref_dsize += bp_get_dsize_sync(spa, bp); + + return (0); +} + +static void +dump_simulated_ddt(spa_t *spa) +{ + avl_tree_t t; + void *cookie = NULL; + zdb_ddt_entry_t *zdde; + ddt_histogram_t ddh_total = { 0 }; + ddt_stat_t dds_total = { 0 }; + + avl_create(&t, ddt_entry_compare, + sizeof (zdb_ddt_entry_t), offsetof(zdb_ddt_entry_t, zdde_node)); + + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + + (void) traverse_pool(spa, 0, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, + zdb_ddt_add_cb, &t); + + spa_config_exit(spa, SCL_CONFIG, FTAG); + + while ((zdde = avl_destroy_nodes(&t, &cookie)) != NULL) { + ddt_stat_t dds; + uint64_t refcnt = zdde->zdde_ref_blocks; + ASSERT(refcnt != 0); + + dds.dds_blocks = zdde->zdde_ref_blocks / refcnt; + dds.dds_lsize = zdde->zdde_ref_lsize / refcnt; + dds.dds_psize = zdde->zdde_ref_psize / refcnt; + dds.dds_dsize = zdde->zdde_ref_dsize / refcnt; + + dds.dds_ref_blocks = zdde->zdde_ref_blocks; + dds.dds_ref_lsize = zdde->zdde_ref_lsize; + dds.dds_ref_psize = zdde->zdde_ref_psize; + dds.dds_ref_dsize = zdde->zdde_ref_dsize; + + ddt_stat_add(&ddh_total.ddh_stat[highbit(refcnt) - 1], &dds, 0); + + umem_free(zdde, sizeof (*zdde)); + } + + avl_destroy(&t); + + ddt_histogram_stat(&dds_total, &ddh_total); + + (void) printf("Simulated DDT histogram:\n"); + + zpool_dump_ddt(&dds_total, &ddh_total); + + dump_dedup_ratio(&dds_total); +} + static void dump_zpool(spa_t *spa) { dsl_pool_t *dp = spa_get_dsl(spa); int rc = 0; + if (dump_opt['S']) { + dump_simulated_ddt(spa); + return; + } + + if (!dump_opt['e'] && dump_opt['C'] > 1) { + (void) printf("\nCached configuration:\n"); + dump_nvlist(spa->spa_config, 8); + } + + if (dump_opt['C']) + dump_config(spa); + if (dump_opt['u']) - dump_uberblock(&spa->spa_uberblock); + dump_uberblock(&spa->spa_uberblock, "\nUberblock:\n", "\n"); + + if (dump_opt['D']) + dump_all_ddts(spa); + + if (dump_opt['d'] > 2 || dump_opt['m']) + dump_metaslabs(spa); if (dump_opt['d'] || dump_opt['i']) { dump_dir(dp->dp_meta_objset); if (dump_opt['d'] 
>= 3) { dump_bplist(dp->dp_meta_objset, - spa->spa_sync_bplist_obj, "Deferred frees"); + spa->spa_deferred_bplist_obj, "Deferred frees"); dump_dtl(spa->spa_root_vdev, 0); - dump_metaslabs(spa); } - (void) dmu_objset_find(spa_name(spa), dump_one_dir, NULL, - DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); + (void) dmu_objset_find(spa_name(spa), dump_one_dir, + NULL, DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); } - - if (dump_opt['b'] || dump_opt['c'] || dump_opt['S']) + if (dump_opt['b'] || dump_opt['c']) rc = dump_block_stats(spa); if (dump_opt['s']) show_pool_stats(spa); + if (dump_opt['h']) + dump_history(spa); + if (rc != 0) exit(rc); } @@ -1797,51 +2298,13 @@ int flagbits[256]; static void zdb_print_blkptr(blkptr_t *bp, int flags) { - dva_t *dva = bp->blk_dva; - int d; + char blkbuf[BP_SPRINTF_LEN]; if (flags & ZDB_FLAG_BSWAP) byteswap_uint64_array((void *)bp, sizeof (blkptr_t)); - /* - * Super-ick warning: This code is also duplicated in - * cmd/mdb/common/modules/zfs/zfs.c . Yeah, I hate code - * replication, too. - */ - for (d = 0; d < BP_GET_NDVAS(bp); d++) { - (void) printf("\tDVA[%d]: vdev_id %lld / %llx\n", d, - (longlong_t)DVA_GET_VDEV(&dva[d]), - (longlong_t)DVA_GET_OFFSET(&dva[d])); - (void) printf("\tDVA[%d]: GANG: %-5s GRID: %04llx\t" - "ASIZE: %llx\n", d, - DVA_GET_GANG(&dva[d]) ? "TRUE" : "FALSE", - (longlong_t)DVA_GET_GRID(&dva[d]), - (longlong_t)DVA_GET_ASIZE(&dva[d])); - (void) printf("\tDVA[%d]: :%llu:%llx:%llx:%s%s%s%s\n", d, - (u_longlong_t)DVA_GET_VDEV(&dva[d]), - (longlong_t)DVA_GET_OFFSET(&dva[d]), - (longlong_t)BP_GET_PSIZE(bp), - BP_SHOULD_BYTESWAP(bp) ? "e" : "", - !DVA_GET_GANG(&dva[d]) && BP_GET_LEVEL(bp) != 0 ? - "d" : "", - DVA_GET_GANG(&dva[d]) ? "g" : "", - BP_GET_COMPRESS(bp) != 0 ? "d" : ""); - } - (void) printf("\tLSIZE: %-16llx\t\tPSIZE: %llx\n", - (longlong_t)BP_GET_LSIZE(bp), (longlong_t)BP_GET_PSIZE(bp)); - (void) printf("\tENDIAN: %6s\t\t\t\t\tTYPE: %s\n", - BP_GET_BYTEORDER(bp) ? 
"LITTLE" : "BIG", - dmu_ot[BP_GET_TYPE(bp)].ot_name); - (void) printf("\tBIRTH: %-16llx LEVEL: %-2llu\tFILL: %llx\n", - (u_longlong_t)bp->blk_birth, (u_longlong_t)BP_GET_LEVEL(bp), - (u_longlong_t)bp->blk_fill); - (void) printf("\tCKFUNC: %-16s\t\tCOMP: %s\n", - zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name, - zio_compress_table[BP_GET_COMPRESS(bp)].ci_name); - (void) printf("\tCKSUM: %llx:%llx:%llx:%llx\n", - (u_longlong_t)bp->blk_cksum.zc_word[0], - (u_longlong_t)bp->blk_cksum.zc_word[1], - (u_longlong_t)bp->blk_cksum.zc_word[2], - (u_longlong_t)bp->blk_cksum.zc_word[3]); + + sprintf_blkptr(blkbuf, bp); + (void) printf("%s\n", blkbuf); } static void @@ -1864,7 +2327,7 @@ zdb_dump_block_raw(void *buf, uint64_t size, int flags) { if (flags & ZDB_FLAG_BSWAP) byteswap_uint64_array(buf, size); - (void) write(2, buf, size); + (void) write(1, buf, size); } static void @@ -1967,31 +2430,30 @@ zdb_vdev_lookup(vdev_t *vdev, char *path) * flags - A string of characters specifying options * b: Decode a blkptr at given offset within block * *c: Calculate and display checksums - * *d: Decompress data before dumping + * d: Decompress data before dumping * e: Byteswap data before dumping - * *g: Display data as a gang block header - * *i: Display as an indirect block + * g: Display data as a gang block header + * i: Display as an indirect block * p: Do I/O to physical offset * r: Dump raw data to stdout * * * = not yet implemented */ static void -zdb_read_block(char *thing, spa_t **spap) +zdb_read_block(char *thing, spa_t *spa) { - spa_t *spa = *spap; + blkptr_t blk, *bp = &blk; + dva_t *dva = bp->blk_dva; int flags = 0; - uint64_t offset = 0, size = 0, blkptr_offset = 0; + uint64_t offset = 0, size = 0, psize = 0, lsize = 0, blkptr_offset = 0; zio_t *zio; vdev_t *vd; - void *buf; - char *s, *p, *dup, *pool, *vdev, *flagstr; - int i, error, zio_flags; + void *pbuf, *lbuf, *buf; + char *s, *p, *dup, *vdev, *flagstr; + int i, error; dup = strdup(thing); s = strtok(dup, ":"); - pool = s ? s : ""; - s = strtok(NULL, ":"); vdev = s ? s : ""; s = strtok(NULL, ":"); offset = strtoull(s ? 
s : "", NULL, 16); @@ -2025,7 +2487,7 @@ zdb_read_block(char *thing, spa_t **spap) flags |= bit; /* If it's not something with an argument, keep going */ - if ((bit & (ZDB_FLAG_CHECKSUM | ZDB_FLAG_DECOMPRESS | + if ((bit & (ZDB_FLAG_CHECKSUM | ZDB_FLAG_PRINT_BLKPTR)) == 0) continue; @@ -2040,16 +2502,6 @@ zdb_read_block(char *thing, spa_t **spap) } } - if (spa == NULL || strcmp(spa_name(spa), pool) != 0) { - if (spa) - spa_close(spa, (void *)zdb_read_block); - error = spa_open(pool, spap, (void *)zdb_read_block); - if (error) - fatal("Failed to open pool '%s': %s", - pool, strerror(error)); - spa = *spap; - } - vd = zdb_vdev_lookup(spa->spa_root_vdev, vdev); if (vd == NULL) { (void) printf("***Invalid vdev: %s\n", vdev); @@ -2057,22 +2509,58 @@ zdb_read_block(char *thing, spa_t **spap) return; } else { if (vd->vdev_path) - (void) printf("Found vdev: %s\n", vd->vdev_path); + (void) fprintf(stderr, "Found vdev: %s\n", + vd->vdev_path); else - (void) printf("Found vdev type: %s\n", + (void) fprintf(stderr, "Found vdev type: %s\n", vd->vdev_ops->vdev_op_type); } - buf = umem_alloc(size, UMEM_NOFAIL); + psize = size; + lsize = size; + + pbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); + lbuf = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); + + BP_ZERO(bp); + + DVA_SET_VDEV(&dva[0], vd->vdev_id); + DVA_SET_OFFSET(&dva[0], offset); + DVA_SET_GANG(&dva[0], !!(flags & ZDB_FLAG_GBH)); + DVA_SET_ASIZE(&dva[0], vdev_psize_to_asize(vd, psize)); + + BP_SET_BIRTH(bp, TXG_INITIAL, TXG_INITIAL); - zio_flags = ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE | - ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY; + BP_SET_LSIZE(bp, lsize); + BP_SET_PSIZE(bp, psize); + BP_SET_COMPRESS(bp, ZIO_COMPRESS_OFF); + BP_SET_CHECKSUM(bp, ZIO_CHECKSUM_OFF); + BP_SET_TYPE(bp, DMU_OT_NONE); + BP_SET_LEVEL(bp, 0); + BP_SET_DEDUP(bp, 0); + BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); zio = zio_root(spa, NULL, NULL, 0); - /* XXX todo - cons up a BP so RAID-Z will be happy */ - zio_nowait(zio_vdev_child_io(zio, NULL, vd, offset, buf, size, - ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ, zio_flags, NULL, NULL)); + + if (vd == vd->vdev_top) { + /* + * Treat this as a normal block read. + */ + zio_nowait(zio_read(zio, spa, bp, pbuf, psize, NULL, NULL, + ZIO_PRIORITY_SYNC_READ, + ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL)); + } else { + /* + * Treat this as a vdev child I/O. + */ + zio_nowait(zio_vdev_child_io(zio, bp, vd, offset, pbuf, psize, + ZIO_TYPE_READ, ZIO_PRIORITY_SYNC_READ, + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE | + ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY | + ZIO_FLAG_CANFAIL | ZIO_FLAG_RAW, NULL, NULL)); + } + error = zio_wait(zio); spa_config_exit(spa, SCL_STATE, FTAG); @@ -2081,6 +2569,52 @@ zdb_read_block(char *thing, spa_t **spap) goto out; } + if (flags & ZDB_FLAG_DECOMPRESS) { + /* + * We don't know how the data was compressed, so just try + * every decompress function at every inflated blocksize. 
+ */ + enum zio_compress c; + void *pbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); + void *lbuf2 = umem_alloc(SPA_MAXBLOCKSIZE, UMEM_NOFAIL); + + bcopy(pbuf, pbuf2, psize); + + VERIFY(random_get_pseudo_bytes((uint8_t *)pbuf + psize, + SPA_MAXBLOCKSIZE - psize) == 0); + + VERIFY(random_get_pseudo_bytes((uint8_t *)pbuf2 + psize, + SPA_MAXBLOCKSIZE - psize) == 0); + + for (lsize = SPA_MAXBLOCKSIZE; lsize > psize; + lsize -= SPA_MINBLOCKSIZE) { + for (c = 0; c < ZIO_COMPRESS_FUNCTIONS; c++) { + if (zio_decompress_data(c, pbuf, lbuf, + psize, lsize) == 0 && + zio_decompress_data(c, pbuf2, lbuf2, + psize, lsize) == 0 && + bcmp(lbuf, lbuf2, lsize) == 0) + break; + } + if (c != ZIO_COMPRESS_FUNCTIONS) + break; + lsize -= SPA_MINBLOCKSIZE; + } + + umem_free(pbuf2, SPA_MAXBLOCKSIZE); + umem_free(lbuf2, SPA_MAXBLOCKSIZE); + + if (lsize <= psize) { + (void) printf("Decompress of %s failed\n", thing); + goto out; + } + buf = lbuf; + size = lsize; + } else { + buf = pbuf; + size = psize; + } + if (flags & ZDB_FLAG_PRINT_BLKPTR) zdb_print_blkptr((blkptr_t *)(void *) ((uintptr_t)buf + (uintptr_t)blkptr_offset), flags); @@ -2095,134 +2629,92 @@ zdb_read_block(char *thing, spa_t **spap) zdb_dump_block(thing, buf, size, flags); out: - umem_free(buf, size); + umem_free(pbuf, SPA_MAXBLOCKSIZE); + umem_free(lbuf, SPA_MAXBLOCKSIZE); free(dup); } static boolean_t -nvlist_string_match(nvlist_t *config, char *name, char *tgt) +pool_match(nvlist_t *cfg, char *tgt) { + uint64_t v, guid = strtoull(tgt, NULL, 0); char *s; - if (nvlist_lookup_string(config, name, &s) != 0) - return (B_FALSE); - - return (strcmp(s, tgt) == 0); -} - -static boolean_t -nvlist_uint64_match(nvlist_t *config, char *name, uint64_t tgt) -{ - uint64_t val; - - if (nvlist_lookup_uint64(config, name, &val) != 0) - return (B_FALSE); - - return (val == tgt); -} - -static boolean_t -vdev_child_guid_match(nvlist_t *vdev, uint64_t guid) -{ - nvlist_t **child; - uint_t c, children; - - verify(nvlist_lookup_nvlist_array(vdev, ZPOOL_CONFIG_CHILDREN, - &child, &children) == 0); - for (c = 0; c < children; ++c) - if (nvlist_uint64_match(child[c], ZPOOL_CONFIG_GUID, guid)) - return (B_TRUE); - return (B_FALSE); -} - -static boolean_t -vdev_child_string_match(nvlist_t *vdev, char *tgt) -{ - nvlist_t **child; - uint_t c, children; - - verify(nvlist_lookup_nvlist_array(vdev, ZPOOL_CONFIG_CHILDREN, - &child, &children) == 0); - for (c = 0; c < children; ++c) { - if (nvlist_string_match(child[c], ZPOOL_CONFIG_PATH, tgt) || - nvlist_string_match(child[c], ZPOOL_CONFIG_DEVID, tgt)) - return (B_TRUE); - } - return (B_FALSE); -} - -static boolean_t -vdev_guid_match(nvlist_t *config, uint64_t guid) -{ - nvlist_t *nvroot; - - verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, - &nvroot) == 0); - - return (nvlist_uint64_match(nvroot, ZPOOL_CONFIG_GUID, guid) || - vdev_child_guid_match(nvroot, guid)); -} - -static boolean_t -vdev_string_match(nvlist_t *config, char *tgt) -{ - nvlist_t *nvroot; - - verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, - &nvroot) == 0); - - return (vdev_child_string_match(nvroot, tgt)); -} - -static boolean_t -pool_match(nvlist_t *config, char *tgt) -{ - uint64_t guid = strtoull(tgt, NULL, 0); - if (guid != 0) { - return ( - nvlist_uint64_match(config, ZPOOL_CONFIG_POOL_GUID, guid) || - vdev_guid_match(config, guid)); + if (nvlist_lookup_uint64(cfg, ZPOOL_CONFIG_POOL_GUID, &v) == 0) + return (v == guid); } else { - return ( - nvlist_string_match(config, ZPOOL_CONFIG_POOL_NAME, tgt) || - vdev_string_match(config, tgt)); + 
if (nvlist_lookup_string(cfg, ZPOOL_CONFIG_POOL_NAME, &s) == 0) + return (strcmp(s, tgt) == 0); } + return (B_FALSE); } -static int -find_exported_zpool(char *pool_id, nvlist_t **configp, char *vdev_dir) +static char * +find_zpool(char **target, nvlist_t **configp, int dirc, char **dirv) { nvlist_t *pools; - int error = ENOENT; nvlist_t *match = NULL; + char *name = NULL; + char *sepp = NULL; + char sep; + int count = 0; + importargs_t args = { 0 }; - if (vdev_dir != NULL) - pools = zpool_find_import_activeok(g_zfs, 1, &vdev_dir); - else - pools = zpool_find_import_activeok(g_zfs, 0, NULL); + args.paths = dirc; + args.path = dirv; + args.can_be_active = B_TRUE; + + if ((sepp = strpbrk(*target, "/@")) != NULL) { + sep = *sepp; + *sepp = '\0'; + } + + pools = zpool_search_import(g_zfs, &args); if (pools != NULL) { nvpair_t *elem = NULL; - while ((elem = nvlist_next_nvpair(pools, elem)) != NULL) { verify(nvpair_value_nvlist(elem, configp) == 0); - if (pool_match(*configp, pool_id)) { + if (pool_match(*configp, *target)) { + count++; if (match != NULL) { - (void) fatal( - "More than one matching pool - " - "specify guid/devid/device path."); + /* print previously found config */ + if (name != NULL) { + (void) printf("%s\n", name); + dump_nvlist(match, 8); + name = NULL; + } + (void) printf("%s\n", + nvpair_name(elem)); + dump_nvlist(*configp, 8); } else { match = *configp; - error = 0; + name = nvpair_name(elem); } } } } + if (count > 1) + (void) fatal("\tMatched %d pools - use pool GUID " + "instead of pool name or \n" + "\tpool name part of a dataset name to select pool", count); - *configp = error ? NULL : match; + if (sepp) + *sepp = sep; + /* + * If pool GUID was specified for pool id, replace it with pool name + */ + if (name && (strstr(*target, name) != *target)) { + int sz = 1 + strlen(name) + ((sepp) ? strlen(sepp) : 0); - return (error); + *target = umem_alloc(sz, UMEM_NOFAIL); + (void) snprintf(*target, sz, "%s%s", name, sepp ? sepp : ""); + } + + *configp = name ? 
match : NULL; + + return (name); } int @@ -2230,66 +2722,76 @@ main(int argc, char **argv) { int i, c; struct rlimit rl = { 1024, 1024 }; - spa_t *spa; + spa_t *spa = NULL; objset_t *os = NULL; - char *endstr; int dump_all = 1; int verbose = 0; - int error; - int exported = 0; - char *vdev_dir = NULL; + int error = 0; + char **searchdirs = NULL; + int nsearch = 0; + char *target; + nvlist_t *policy = NULL; + uint64_t max_txg = UINT64_MAX; + int rewind = ZPOOL_NEVER_REWIND; (void) setrlimit(RLIMIT_NOFILE, &rl); (void) enable_extended_FILE_stdio(-1, -1); dprintf_setup(&argc, argv); - while ((c = getopt(argc, argv, "udibcsvCLS:U:lRep:")) != -1) { + while ((c = getopt(argc, argv, "bcdhilmsuCDRSAFLXevp:t:U:")) != -1) { switch (c) { - case 'u': - case 'd': - case 'i': case 'b': case 'c': + case 'd': + case 'h': + case 'i': + case 'l': + case 'm': case 's': + case 'u': case 'C': - case 'l': + case 'D': case 'R': + case 'S': dump_opt[c]++; dump_all = 0; break; + case 'A': + case 'F': case 'L': + case 'X': + case 'e': dump_opt[c]++; break; case 'v': verbose++; break; - case 'U': - spa_config_path = optarg; - break; - case 'e': - exported = 1; - break; case 'p': - vdev_dir = optarg; + if (searchdirs == NULL) { + searchdirs = umem_alloc(sizeof (char *), + UMEM_NOFAIL); + } else { + char **tmp = umem_alloc((nsearch + 1) * + sizeof (char *), UMEM_NOFAIL); + bcopy(searchdirs, tmp, nsearch * + sizeof (char *)); + umem_free(searchdirs, + nsearch * sizeof (char *)); + searchdirs = tmp; + } + searchdirs[nsearch++] = optarg; break; - case 'S': - dump_opt[c]++; - dump_all = 0; - zdb_sig_user_data = (strncmp(optarg, "user:", 5) == 0); - if (!zdb_sig_user_data && strncmp(optarg, "all:", 4)) - usage(); - endstr = strchr(optarg, ':') + 1; - if (strcmp(endstr, "fletcher2") == 0) - zdb_sig_cksumalg = ZIO_CHECKSUM_FLETCHER_2; - else if (strcmp(endstr, "fletcher4") == 0) - zdb_sig_cksumalg = ZIO_CHECKSUM_FLETCHER_4; - else if (strcmp(endstr, "sha256") == 0) - zdb_sig_cksumalg = ZIO_CHECKSUM_SHA256; - else if (strcmp(endstr, "all") == 0) - zdb_sig_cksumalg = ZIO_CHECKSUM_FLETCHER_2; - else + case 't': + max_txg = strtoull(optarg, NULL, 0); + if (max_txg < TXG_INITIAL) { + (void) fprintf(stderr, "incorrect txg " + "specified: %s\n", optarg); usage(); + } + break; + case 'U': + spa_config_path = optarg; break; default: usage(); @@ -2297,7 +2799,7 @@ main(int argc, char **argv) } } - if (vdev_dir != NULL && exported == 0) { + if (!dump_opt['e'] && searchdirs != NULL) { (void) fprintf(stderr, "-p option requires use of -e\n"); usage(); } @@ -2306,18 +2808,26 @@ main(int argc, char **argv) g_zfs = libzfs_init(); ASSERT(g_zfs != NULL); + if (dump_all) + verbose = MAX(verbose, 1); + for (c = 0; c < 256; c++) { - if (dump_all && c != 'l' && c != 'R') + if (dump_all && !strchr("elAFLRSX", c)) dump_opt[c] = 1; if (dump_opt[c]) dump_opt[c] += verbose; } + aok = (dump_opt['A'] == 1) || (dump_opt['A'] > 2); + zfs_recover = (dump_opt['A'] > 1); + argc -= optind; argv += optind; + if (argc < 2 && dump_opt['R']) + usage(); if (argc < 1) { - if (dump_opt['C']) { + if (!dump_opt['e'] && dump_opt['C']) { dump_cachefile(spa_config_path); return (0); } @@ -2329,98 +2839,102 @@ main(int argc, char **argv) return (0); } - if (dump_opt['R']) { - flagbits['b'] = ZDB_FLAG_PRINT_BLKPTR; - flagbits['c'] = ZDB_FLAG_CHECKSUM; - flagbits['d'] = ZDB_FLAG_DECOMPRESS; - flagbits['e'] = ZDB_FLAG_BSWAP; - flagbits['g'] = ZDB_FLAG_GBH; - flagbits['i'] = ZDB_FLAG_INDIRECT; - flagbits['p'] = ZDB_FLAG_PHYS; - flagbits['r'] = ZDB_FLAG_RAW; - - spa = NULL; - 
while (argv[0]) { - zdb_read_block(argv[0], &spa); - argv++; - argc--; - } - if (spa) - spa_close(spa, (void *)zdb_read_block); - return (0); - } + if (dump_opt['X'] || dump_opt['F']) + rewind = ZPOOL_DO_REWIND | + (dump_opt['X'] ? ZPOOL_EXTREME_REWIND : 0); - if (dump_opt['C']) - dump_config(argv[0]); + if (nvlist_alloc(&policy, NV_UNIQUE_NAME_TYPE, 0) != 0 || + nvlist_add_uint64(policy, ZPOOL_REWIND_REQUEST_TXG, max_txg) != 0 || + nvlist_add_uint32(policy, ZPOOL_REWIND_REQUEST, rewind) != 0) + fatal("internal error: %s", strerror(ENOMEM)); error = 0; - if (exported) { - /* - * Check to see if the name refers to an exported zpool - */ - char *slash; - nvlist_t *exported_conf = NULL; - - if ((slash = strchr(argv[0], '/')) != NULL) - *slash = '\0'; - - error = find_exported_zpool(argv[0], &exported_conf, vdev_dir); - if (error == 0) { - nvlist_t *nvl = NULL; - - if (vdev_dir != NULL) { - if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) - error = ENOMEM; - else if (nvlist_add_string(nvl, - zpool_prop_to_name(ZPOOL_PROP_ALTROOT), - vdev_dir) != 0) - error = ENOMEM; - } + target = argv[0]; - if (error == 0) - error = spa_import_faulted(argv[0], - exported_conf, nvl); + if (dump_opt['e']) { + nvlist_t *cfg = NULL; + char *name = find_zpool(&target, &cfg, nsearch, searchdirs); - nvlist_free(nvl); + error = ENOENT; + if (name) { + if (dump_opt['C'] > 1) { + (void) printf("\nConfiguration for import:\n"); + dump_nvlist(cfg, 8); + } + if (nvlist_add_nvlist(cfg, + ZPOOL_REWIND_POLICY, policy) != 0) { + fatal("can't open '%s': %s", + target, strerror(ENOMEM)); + } + if ((error = spa_import(name, cfg, NULL)) != 0) + error = spa_import_verbatim(name, cfg, NULL); } - - if (slash != NULL) - *slash = '/'; } if (error == 0) { - if (strchr(argv[0], '/') != NULL) { - error = dmu_objset_open(argv[0], DMU_OST_ANY, - DS_MODE_USER | DS_MODE_READONLY, &os); + if (strpbrk(target, "/@") == NULL || dump_opt['R']) { + error = spa_open_rewind(target, &spa, FTAG, policy, + NULL); + if (error) { + /* + * If we're missing the log device then + * try opening the pool after clearing the + * log state. + */ + mutex_enter(&spa_namespace_lock); + if ((spa = spa_lookup(target)) != NULL && + spa->spa_log_state == SPA_LOG_MISSING) { + spa->spa_log_state = SPA_LOG_CLEAR; + error = 0; + } + mutex_exit(&spa_namespace_lock); + + if (!error) { + error = spa_open_rewind(target, &spa, + FTAG, policy, NULL); + } + } } else { - error = spa_open(argv[0], &spa, FTAG); + error = dmu_objset_own(target, DMU_OST_ANY, + B_TRUE, FTAG, &os); } } + nvlist_free(policy); if (error) - fatal("can't open %s: %s", argv[0], strerror(error)); + fatal("can't open '%s': %s", target, strerror(error)); argv++; - if (--argc > 0) { - zopt_objects = argc; - zopt_object = calloc(zopt_objects, sizeof (uint64_t)); - for (i = 0; i < zopt_objects; i++) { - errno = 0; - zopt_object[i] = strtoull(argv[i], NULL, 0); - if (zopt_object[i] == 0 && errno != 0) - fatal("bad object number %s: %s", - argv[i], strerror(errno)); + argc--; + if (!dump_opt['R']) { + if (argc > 0) { + zopt_objects = argc; + zopt_object = calloc(zopt_objects, sizeof (uint64_t)); + for (i = 0; i < zopt_objects; i++) { + errno = 0; + zopt_object[i] = strtoull(argv[i], NULL, 0); + if (zopt_object[i] == 0 && errno != 0) + fatal("bad number %s: %s", + argv[i], strerror(errno)); + } } - } - - if (os != NULL) { - dump_dir(os); - dmu_objset_close(os); + (os != NULL) ? 
dump_dir(os) : dump_zpool(spa); } else { - dump_zpool(spa); - spa_close(spa, FTAG); + flagbits['b'] = ZDB_FLAG_PRINT_BLKPTR; + flagbits['c'] = ZDB_FLAG_CHECKSUM; + flagbits['d'] = ZDB_FLAG_DECOMPRESS; + flagbits['e'] = ZDB_FLAG_BSWAP; + flagbits['g'] = ZDB_FLAG_GBH; + flagbits['i'] = ZDB_FLAG_INDIRECT; + flagbits['p'] = ZDB_FLAG_PHYS; + flagbits['r'] = ZDB_FLAG_RAW; + + for (i = 0; i < argc; i++) + zdb_read_block(argv[i], spa); } + (os != NULL) ? dmu_objset_disown(os, FTAG) : spa_close(spa, FTAG); + fuid_table_destroy(); libzfs_fini(g_zfs); diff --git a/external/cddl/osnet/dist/cmd/zdb/zdb_il.c b/external/cddl/osnet/dist/cmd/zdb/zdb_il.c index 02d35a050332e..a0ed985f52b77 100644 --- a/external/cddl/osnet/dist/cmd/zdb/zdb_il.c +++ b/external/cddl/osnet/dist/cmd/zdb/zdb_il.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Print intent log header and statistics. */ @@ -42,12 +40,14 @@ extern uint8_t dump_opt[256]; +static char prefix[4] = "\t\t\t"; + static void print_log_bp(const blkptr_t *bp, const char *prefix) { char blkbuf[BP_SPRINTF_LEN]; - sprintf_blkptr(blkbuf, BP_SPRINTF_LEN, bp); + sprintf_blkptr(blkbuf, bp); (void) printf("%s%s\n", prefix, blkbuf); } @@ -56,19 +56,29 @@ static void zil_prt_rec_create(zilog_t *zilog, int txtype, lr_create_t *lr) { time_t crtime = lr->lr_crtime[0]; - char *name = (char *)(lr + 1); - char *link = name + strlen(name) + 1; + char *name, *link; + lr_attr_t *lrattr; - if (txtype == TX_SYMLINK) - (void) printf("\t\t\t%s -> %s\n", name, link); - else - (void) printf("\t\t\t%s\n", name); + name = (char *)(lr + 1); - (void) printf("\t\t\t%s", ctime(&crtime)); - (void) printf("\t\t\tdoid %llu, foid %llu, mode %llo\n", + if (lr->lr_common.lrc_txtype == TX_CREATE_ATTR || + lr->lr_common.lrc_txtype == TX_MKDIR_ATTR) { + lrattr = (lr_attr_t *)(lr + 1); + name += ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); + } + + if (txtype == TX_SYMLINK) { + link = name + strlen(name) + 1; + (void) printf("%s%s -> %s\n", prefix, name, link); + } else if (txtype != TX_MKXATTR) { + (void) printf("%s%s\n", prefix, name); + } + + (void) printf("%s%s", prefix, ctime(&crtime)); + (void) printf("%sdoid %llu, foid %llu, mode %llo\n", prefix, (u_longlong_t)lr->lr_doid, (u_longlong_t)lr->lr_foid, (longlong_t)lr->lr_mode); - (void) printf("\t\t\tuid %llu, gid %llu, gen %llu, rdev 0x%llx\n", + (void) printf("%suid %llu, gid %llu, gen %llu, rdev 0x%llx\n", prefix, (u_longlong_t)lr->lr_uid, (u_longlong_t)lr->lr_gid, (u_longlong_t)lr->lr_gen, (u_longlong_t)lr->lr_rdev); } @@ -77,7 +87,7 @@ zil_prt_rec_create(zilog_t *zilog, int txtype, lr_create_t *lr) static void zil_prt_rec_remove(zilog_t *zilog, int txtype, lr_remove_t *lr) { - (void) printf("\t\t\tdoid %llu, name %s\n", + (void) printf("%sdoid %llu, name %s\n", prefix, (u_longlong_t)lr->lr_doid, (char *)(lr + 1)); } @@ -85,7 +95,7 @@ zil_prt_rec_remove(zilog_t *zilog, int txtype, lr_remove_t *lr) static void zil_prt_rec_link(zilog_t *zilog, int txtype, lr_link_t *lr) { - (void) printf("\t\t\tdoid %llu, link_obj %llu, name %s\n", + (void) printf("%sdoid %llu, link_obj %llu, name %s\n", prefix, (u_longlong_t)lr->lr_doid, (u_longlong_t)lr->lr_link_obj, (char *)(lr + 1)); } @@ -97,9 +107,9 @@ zil_prt_rec_rename(zilog_t *zilog, int txtype, lr_rename_t *lr) char *snm = (char *)(lr + 1); char *tnm = snm + strlen(snm) + 1; - (void) printf("\t\t\tsdoid 
%llu, tdoid %llu\n", + (void) printf("%ssdoid %llu, tdoid %llu\n", prefix, (u_longlong_t)lr->lr_sdoid, (u_longlong_t)lr->lr_tdoid); - (void) printf("\t\t\tsrc %s tgt %s\n", snm, tnm); + (void) printf("%ssrc %s tgt %s\n", prefix, snm, tnm); } /* ARGSUSED */ @@ -108,43 +118,48 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr) { char *data, *dlimit; blkptr_t *bp = &lr->lr_blkptr; + zbookmark_t zb; char buf[SPA_MAXBLOCKSIZE]; int verbose = MAX(dump_opt['d'], dump_opt['i']); int error; - (void) printf("\t\t\tfoid %llu, offset 0x%llx," - " length 0x%llx, blkoff 0x%llx\n", - (u_longlong_t)lr->lr_foid, (longlong_t)lr->lr_offset, - (u_longlong_t)lr->lr_length, (u_longlong_t)lr->lr_blkoff); + (void) printf("%sfoid %llu, offset %llx, length %llx\n", prefix, + (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_offset, + (u_longlong_t)lr->lr_length); - if (verbose < 5) + if (txtype == TX_WRITE2 || verbose < 5) return; if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { - (void) printf("\t\t\thas blkptr, %s\n", + (void) printf("%shas blkptr, %s\n", prefix, bp->blk_birth >= spa_first_txg(zilog->zl_spa) ? "will claim" : "won't claim"); - print_log_bp(bp, "\t\t\t"); + print_log_bp(bp, prefix); + + if (BP_IS_HOLE(bp)) { + (void) printf("\t\t\tLSIZE 0x%llx\n", + (u_longlong_t)BP_GET_LSIZE(bp)); + } if (bp->blk_birth == 0) { bzero(buf, sizeof (buf)); - } else { - zbookmark_t zb; - - ASSERT3U(bp->blk_cksum.zc_word[ZIL_ZC_OBJSET], ==, - dmu_objset_id(zilog->zl_os)); - - zb.zb_objset = bp->blk_cksum.zc_word[ZIL_ZC_OBJSET]; - zb.zb_object = 0; - zb.zb_level = -1; - zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ]; - - error = zio_wait(zio_read(NULL, zilog->zl_spa, - bp, buf, BP_GET_LSIZE(bp), NULL, NULL, - ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &zb)); - if (error) - return; + (void) printf("%s\n", prefix); + return; } - data = buf + lr->lr_blkoff; + if (bp->blk_birth < zilog->zl_header->zh_claim_txg) { + (void) printf("%s\n", prefix); + return; + } + + SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), + lr->lr_foid, ZB_ZIL_LEVEL, + lr->lr_offset / BP_GET_LSIZE(bp)); + + error = zio_wait(zio_read(NULL, zilog->zl_spa, + bp, buf, BP_GET_LSIZE(bp), NULL, NULL, + ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &zb)); + if (error) + return; + data = buf; } else { data = (char *)(lr + 1); } @@ -152,7 +167,7 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr) dlimit = data + MIN(lr->lr_length, (verbose < 6 ? 
20 : SPA_MAXBLOCKSIZE)); - (void) printf("\t\t\t"); + (void) printf("%s", prefix); while (data < dlimit) { if (isprint(*data)) (void) printf("%c ", *data); @@ -167,7 +182,7 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, lr_write_t *lr) static void zil_prt_rec_truncate(zilog_t *zilog, int txtype, lr_truncate_t *lr) { - (void) printf("\t\t\tfoid %llu, offset 0x%llx, length 0x%llx\n", + (void) printf("%sfoid %llu, offset 0x%llx, length 0x%llx\n", prefix, (u_longlong_t)lr->lr_foid, (longlong_t)lr->lr_offset, (u_longlong_t)lr->lr_length); } @@ -179,38 +194,38 @@ zil_prt_rec_setattr(zilog_t *zilog, int txtype, lr_setattr_t *lr) time_t atime = (time_t)lr->lr_atime[0]; time_t mtime = (time_t)lr->lr_mtime[0]; - (void) printf("\t\t\tfoid %llu, mask 0x%llx\n", + (void) printf("%sfoid %llu, mask 0x%llx\n", prefix, (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_mask); if (lr->lr_mask & AT_MODE) { - (void) printf("\t\t\tAT_MODE %llo\n", + (void) printf("%sAT_MODE %llo\n", prefix, (longlong_t)lr->lr_mode); } if (lr->lr_mask & AT_UID) { - (void) printf("\t\t\tAT_UID %llu\n", + (void) printf("%sAT_UID %llu\n", prefix, (u_longlong_t)lr->lr_uid); } if (lr->lr_mask & AT_GID) { - (void) printf("\t\t\tAT_GID %llu\n", + (void) printf("%sAT_GID %llu\n", prefix, (u_longlong_t)lr->lr_gid); } if (lr->lr_mask & AT_SIZE) { - (void) printf("\t\t\tAT_SIZE %llu\n", + (void) printf("%sAT_SIZE %llu\n", prefix, (u_longlong_t)lr->lr_size); } if (lr->lr_mask & AT_ATIME) { - (void) printf("\t\t\tAT_ATIME %llu.%09llu %s", + (void) printf("%sAT_ATIME %llu.%09llu %s", prefix, (u_longlong_t)lr->lr_atime[0], (u_longlong_t)lr->lr_atime[1], ctime(&atime)); } if (lr->lr_mask & AT_MTIME) { - (void) printf("\t\t\tAT_MTIME %llu.%09llu %s", + (void) printf("%sAT_MTIME %llu.%09llu %s", prefix, (u_longlong_t)lr->lr_mtime[0], (u_longlong_t)lr->lr_mtime[1], ctime(&mtime)); @@ -221,7 +236,7 @@ zil_prt_rec_setattr(zilog_t *zilog, int txtype, lr_setattr_t *lr) static void zil_prt_rec_acl(zilog_t *zilog, int txtype, lr_acl_t *lr) { - (void) printf("\t\t\tfoid %llu, aclcnt %llu\n", + (void) printf("%sfoid %llu, aclcnt %llu\n", prefix, (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_aclcnt); } @@ -253,10 +268,11 @@ static zil_rec_info_t zil_rec_info[TX_MAX_TYPE] = { { zil_prt_rec_create, "TX_MKDIR_ACL " }, { zil_prt_rec_create, "TX_MKDIR_ATTR " }, { zil_prt_rec_create, "TX_MKDIR_ACL_ATTR " }, + { zil_prt_rec_write, "TX_WRITE2 " }, }; /* ARGSUSED */ -static void +static int print_log_record(zilog_t *zilog, lr_t *lr, void *arg, uint64_t claim_txg) { int txtype; @@ -280,23 +296,24 @@ print_log_record(zilog_t *zilog, lr_t *lr, void *arg, uint64_t claim_txg) zil_rec_info[txtype].zri_count++; zil_rec_info[0].zri_count++; + + return (0); } /* ARGSUSED */ -static void +static int print_log_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) { - char blkbuf[BP_SPRINTF_LEN]; + char blkbuf[BP_SPRINTF_LEN + 10]; int verbose = MAX(dump_opt['d'], dump_opt['i']); char *claim; if (verbose <= 3) - return; + return (0); if (verbose >= 5) { (void) strcpy(blkbuf, ", "); - sprintf_blkptr(blkbuf + strlen(blkbuf), - BP_SPRINTF_LEN - strlen(blkbuf), bp); + sprintf_blkptr(blkbuf + strlen(blkbuf), bp); } else { blkbuf[0] = '\0'; } @@ -310,6 +327,8 @@ print_log_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) (void) printf("\tBlock seqno %llu, %s%s\n", (u_longlong_t)bp->blk_cksum.zc_word[ZIL_ZC_SEQ], claim, blkbuf); + + return (0); } static void @@ -342,14 +361,16 @@ dump_intent_log(zilog_t *zilog) int verbose = MAX(dump_opt['d'], 
dump_opt['i']); int i; - if (zh->zh_log.blk_birth == 0 || verbose < 2) + if (zh->zh_log.blk_birth == 0 || verbose < 1) return; - (void) printf("\n ZIL header: claim_txg %llu, seq %llu\n", - (u_longlong_t)zh->zh_claim_txg, (u_longlong_t)zh->zh_replay_seq); - - if (verbose >= 4) - print_log_bp(&zh->zh_log, "\n\tfirst block: "); + (void) printf("\n ZIL header: claim_txg %llu, " + "claim_blk_seq %llu, claim_lr_seq %llu", + (u_longlong_t)zh->zh_claim_txg, + (u_longlong_t)zh->zh_claim_blk_seq, + (u_longlong_t)zh->zh_claim_lr_seq); + (void) printf(" replay_seq %llu, flags 0x%llx\n", + (u_longlong_t)zh->zh_replay_seq, (u_longlong_t)zh->zh_flags); for (i = 0; i < TX_MAX_TYPE; i++) zil_rec_info[i].zri_count = 0; diff --git a/external/cddl/osnet/dist/cmd/zfs/zfs_iter.c b/external/cddl/osnet/dist/cmd/zfs/zfs_iter.c index a22370a027956..f70bebe00b53a 100644 --- a/external/cddl/osnet/dist/cmd/zfs/zfs_iter.c +++ b/external/cddl/osnet/dist/cmd/zfs/zfs_iter.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -53,11 +53,14 @@ typedef struct zfs_node { } zfs_node_t; typedef struct callback_data { - uu_avl_t *cb_avl; - int cb_flags; - zfs_type_t cb_types; - zfs_sort_column_t *cb_sortcol; - zprop_list_t **cb_proplist; + uu_avl_t *cb_avl; + int cb_flags; + zfs_type_t cb_types; + zfs_sort_column_t *cb_sortcol; + zprop_list_t **cb_proplist; + int cb_depth_limit; + int cb_depth; + uint8_t cb_props_table[ZFS_NUM_PROPS]; } callback_data_t; uu_avl_pool_t *avl_pool; @@ -98,10 +101,18 @@ zfs_callback(zfs_handle_t *zhp, void *data) uu_avl_node_init(node, &node->zn_avlnode, avl_pool); if (uu_avl_find(cb->cb_avl, node, cb->cb_sortcol, &idx) == NULL) { - if (cb->cb_proplist && - zfs_expand_proplist(zhp, cb->cb_proplist) != 0) { - free(node); - return (-1); + if (cb->cb_proplist) { + if ((*cb->cb_proplist) && + !(*cb->cb_proplist)->pl_all) + zfs_prune_proplist(zhp, + cb->cb_props_table); + + if (zfs_expand_proplist(zhp, cb->cb_proplist, + (cb->cb_flags & ZFS_ITER_RECVD_PROPS)) + != 0) { + free(node); + return (-1); + } } uu_avl_insert(cb->cb_avl, node, idx); dontclose = 1; @@ -113,11 +124,15 @@ zfs_callback(zfs_handle_t *zhp, void *data) /* * Recurse if necessary. */ - if (cb->cb_flags & ZFS_ITER_RECURSE) { + if (cb->cb_flags & ZFS_ITER_RECURSE && + ((cb->cb_flags & ZFS_ITER_DEPTH_LIMIT) == 0 || + cb->cb_depth < cb->cb_depth_limit)) { + cb->cb_depth++; if (zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) (void) zfs_iter_filesystems(zhp, zfs_callback, data); if ((zfs_get_type(zhp) != ZFS_TYPE_SNAPSHOT) && include_snaps) (void) zfs_iter_snapshots(zhp, zfs_callback, data); + cb->cb_depth--; } if (!dontclose) @@ -325,10 +340,10 @@ zfs_sort(const void *larg, const void *rarg, void *data) int zfs_for_each(int argc, char **argv, int flags, zfs_type_t types, - zfs_sort_column_t *sortcol, zprop_list_t **proplist, + zfs_sort_column_t *sortcol, zprop_list_t **proplist, int limit, zfs_iter_f callback, void *data) { - callback_data_t cb; + callback_data_t cb = {0}; int ret = 0; zfs_node_t *node; uu_avl_walk_t *walk; @@ -346,6 +361,45 @@ zfs_for_each(int argc, char **argv, int flags, zfs_type_t types, cb.cb_flags = flags; cb.cb_proplist = proplist; cb.cb_types = types; + cb.cb_depth_limit = limit; + /* + * If cb_proplist is provided then in the zfs_handles created we + * retain only those properties listed in cb_proplist and sortcol. + * The rest are pruned. 
So, the caller should make sure that no other + * properties other than those listed in cb_proplist/sortcol are + * accessed. + * + * If cb_proplist is NULL then we retain all the properties. We + * always retain the zoned property, which some other properties + * need (userquota & friends), and the createtxg property, which + * we need to sort snapshots. + */ + if (cb.cb_proplist && *cb.cb_proplist) { + zprop_list_t *p = *cb.cb_proplist; + + while (p) { + if (p->pl_prop >= ZFS_PROP_TYPE && + p->pl_prop < ZFS_NUM_PROPS) { + cb.cb_props_table[p->pl_prop] = B_TRUE; + } + p = p->pl_next; + } + + while (sortcol) { + if (sortcol->sc_prop >= ZFS_PROP_TYPE && + sortcol->sc_prop < ZFS_NUM_PROPS) { + cb.cb_props_table[sortcol->sc_prop] = B_TRUE; + } + sortcol = sortcol->sc_next; + } + + cb.cb_props_table[ZFS_PROP_ZONED] = B_TRUE; + cb.cb_props_table[ZFS_PROP_CREATETXG] = B_TRUE; + } else { + (void) memset(cb.cb_props_table, B_TRUE, + sizeof (cb.cb_props_table)); + } + if ((cb.cb_avl = uu_avl_create(avl_pool, NULL, UU_DEFAULT)) == NULL) { (void) fprintf(stderr, gettext("internal error: out of memory\n")); diff --git a/external/cddl/osnet/dist/cmd/zfs/zfs_iter.h b/external/cddl/osnet/dist/cmd/zfs/zfs_iter.h index 76a11085a1ef5..8c6b9fdef54f0 100644 --- a/external/cddl/osnet/dist/cmd/zfs/zfs_iter.h +++ b/external/cddl/osnet/dist/cmd/zfs/zfs_iter.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -41,9 +41,11 @@ typedef struct zfs_sort_column { #define ZFS_ITER_RECURSE (1 << 0) #define ZFS_ITER_ARGS_CAN_BE_PATHS (1 << 1) #define ZFS_ITER_PROP_LISTSNAPS (1 << 2) +#define ZFS_ITER_DEPTH_LIMIT (1 << 3) +#define ZFS_ITER_RECVD_PROPS (1 << 4) int zfs_for_each(int, char **, int options, zfs_type_t, - zfs_sort_column_t *, zprop_list_t **, zfs_iter_f, void *); + zfs_sort_column_t *, zprop_list_t **, int, zfs_iter_f, void *); int zfs_add_sort_column(zfs_sort_column_t **, const char *, boolean_t); void zfs_free_sort_columns(zfs_sort_column_t *); diff --git a/external/cddl/osnet/dist/cmd/zfs/zfs_main.c b/external/cddl/osnet/dist/cmd/zfs/zfs_main.c index a343b5c563fbd..ce65fd57a8d6b 100644 --- a/external/cddl/osnet/dist/cmd/zfs/zfs_main.c +++ b/external/cddl/osnet/dist/cmd/zfs/zfs_main.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -39,12 +39,14 @@ #include #include #include +#include +#include #include #include #include #include #include -#include +#include #include #include @@ -56,6 +58,7 @@ libzfs_handle_t *g_zfs; static FILE *mnttab_file; static char history_str[HIS_MAX_RECORD_LEN]; +const char *pypath = "/usr/lib/zfs/pyzfs.py"; static int zfs_do_clone(int argc, char **argv); static int zfs_do_create(int argc, char **argv); @@ -75,8 +78,10 @@ static int zfs_do_unshare(int argc, char **argv); static int zfs_do_send(int argc, char **argv); static int zfs_do_receive(int argc, char **argv); static int zfs_do_promote(int argc, char **argv); -static int zfs_do_allow(int argc, char **argv); -static int zfs_do_unallow(int argc, char **argv); +static int zfs_do_userspace(int argc, char **argv); +static int zfs_do_python(int argc, char **argv); +static int zfs_do_hold(int argc, char **argv); +static int zfs_do_release(int argc, char **argv); /* * Enable a reasonable set of defaults for libumem debugging on DEBUG builds. 
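
The zfs_iter.c changes above bound recursion with a depth counter: the callback recurses only while ZFS_ITER_DEPTH_LIMIT is unset or cb_depth is still below cb_depth_limit, and the counter is incremented around the child walk. Stripped of the libzfs types, the pattern is roughly the following; iterate_children() is a hypothetical stand-in for zfs_iter_filesystems()/zfs_iter_snapshots():

#define	ITER_RECURSE		(1 << 0)
#define	ITER_DEPTH_LIMIT	(1 << 3)

struct walk_state {
	int flags;
	int depth;		/* current recursion depth */
	int depth_limit;	/* honoured only with ITER_DEPTH_LIMIT */
};

/* hypothetical: invoke cb on each direct child of `node' */
extern void iterate_children(void *node,
    int (*cb)(void *, void *), void *arg);

static int
visit(void *node, void *arg)
{
	struct walk_state *ws = arg;

	/* ... process `node' here ... */

	if ((ws->flags & ITER_RECURSE) &&
	    ((ws->flags & ITER_DEPTH_LIMIT) == 0 ||
	    ws->depth < ws->depth_limit)) {
		ws->depth++;
		iterate_children(node, visit, ws);
		ws->depth--;
	}
	return (0);
}
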
@@ -116,7 +121,12 @@ typedef enum { HELP_UNMOUNT, HELP_UNSHARE, HELP_ALLOW, - HELP_UNALLOW + HELP_UNALLOW, + HELP_USERSPACE, + HELP_GROUPSPACE, + HELP_HOLD, + HELP_HOLDS, + HELP_RELEASE } zfs_help_t; typedef struct zfs_command { @@ -147,9 +157,11 @@ static zfs_command_t command_table[] = { { "list", zfs_do_list, HELP_LIST }, { NULL }, { "set", zfs_do_set, HELP_SET }, - { "get", zfs_do_get, HELP_GET }, + { "get", zfs_do_get, HELP_GET }, { "inherit", zfs_do_inherit, HELP_INHERIT }, { "upgrade", zfs_do_upgrade, HELP_UPGRADE }, + { "userspace", zfs_do_userspace, HELP_USERSPACE }, + { "groupspace", zfs_do_userspace, HELP_GROUPSPACE }, { NULL }, { "mount", zfs_do_mount, HELP_MOUNT }, { "unmount", zfs_do_unmount, HELP_UNMOUNT }, @@ -159,9 +171,13 @@ static zfs_command_t command_table[] = { { "send", zfs_do_send, HELP_SEND }, { "receive", zfs_do_receive, HELP_RECEIVE }, { NULL }, - { "allow", zfs_do_allow, HELP_ALLOW }, + { "allow", zfs_do_python, HELP_ALLOW }, { NULL }, - { "unallow", zfs_do_unallow, HELP_UNALLOW }, + { "unallow", zfs_do_python, HELP_UNALLOW }, + { NULL }, + { "hold", zfs_do_hold, HELP_HOLD }, + { "holds", zfs_do_python, HELP_HOLDS }, + { "release", zfs_do_release, HELP_RELEASE }, }; #define NCOMMAND (sizeof (command_table) / sizeof (command_table[0])) @@ -181,22 +197,22 @@ get_usage(zfs_help_t idx) "\tcreate [-ps] [-b blocksize] [-o property=value] ... " "-V \n")); case HELP_DESTROY: - return (gettext("\tdestroy [-rRf] " - "\n")); + return (gettext("\tdestroy [-rRf] \n" + "\tdestroy [-rRd] \n")); case HELP_GET: - return (gettext("\tget [-rHp] [-o field[,...]] " - "[-s source[,...]]\n" + return (gettext("\tget [-rHp] [-d max] " + "[-o \"all\" | field[,...]] [-s source[,...]]\n" "\t <\"all\" | property[,...]> " "[filesystem|volume|snapshot] ...\n")); case HELP_INHERIT: - return (gettext("\tinherit [-r] " + return (gettext("\tinherit [-rS] " " ...\n")); case HELP_UPGRADE: return (gettext("\tupgrade [-v]\n" "\tupgrade [-r] [-V version] <-a | filesystem ...>\n")); case HELP_LIST: - return (gettext("\tlist [-rH] [-o property[,...]] " - "[-t type[,...]] [-s property] ...\n" + return (gettext("\tlist [-rH][-d max] " + "[-o property[,...]] [-t type[,...]] [-s property] ...\n" "\t [-S property] ... " "[filesystem|volume|snapshot] ...\n")); case HELP_MOUNT: @@ -216,7 +232,7 @@ get_usage(zfs_help_t idx) case HELP_ROLLBACK: return (gettext("\trollback [-rRf] \n")); case HELP_SEND: - return (gettext("\tsend [-R] [-[iI] snapshot] \n")); + return (gettext("\tsend [-RDp] [-[iI] snapshot] \n")); case HELP_SET: return (gettext("\tset " " ...\n")); @@ -229,10 +245,11 @@ get_usage(zfs_help_t idx) return (gettext("\tunmount [-f] " "<-a | filesystem|mountpoint>\n")); case HELP_UNSHARE: - return (gettext("\tunshare [-f] " + return (gettext("\tunshare " "<-a | filesystem|mountpoint>\n")); case HELP_ALLOW: - return (gettext("\tallow [-ldug] " + return (gettext("\tallow \n" + "\tallow [-ldug] " "<\"everyone\"|user|group>[,...] [,...]\n" "\t \n" "\tallow [-ld] -e [,...] " @@ -250,6 +267,20 @@ get_usage(zfs_help_t idx) "\n" "\tunallow [-r] -s @setname [[,...]] " "\n")); + case HELP_USERSPACE: + return (gettext("\tuserspace [-hniHp] [-o field[,...]] " + "[-sS field] ... [-t type[,...]]\n" + "\t \n")); + case HELP_GROUPSPACE: + return (gettext("\tgroupspace [-hniHpU] [-o field[,...]] " + "[-sS field] ... 
[-t type[,...]]\n" + "\t \n")); + case HELP_HOLD: + return (gettext("\thold [-r] ...\n")); + case HELP_HOLDS: + return (gettext("\tholds [-r] ...\n")); + case HELP_RELEASE: + return (gettext("\trelease [-r] ...\n")); } abort(); @@ -311,7 +342,6 @@ usage(boolean_t requested) { int i; boolean_t show_properties = B_FALSE; - boolean_t show_permissions = B_FALSE; FILE *fp = requested ? stdout : stderr; if (current_command == NULL) { @@ -342,13 +372,7 @@ usage(boolean_t requested) strcmp(current_command->name, "list") == 0)) show_properties = B_TRUE; - if (current_command != NULL && - (strcmp(current_command->name, "allow") == 0 || - strcmp(current_command->name, "unallow") == 0)) - show_permissions = B_TRUE; - if (show_properties) { - (void) fprintf(fp, gettext("\nThe following properties are supported:\n")); @@ -359,29 +383,33 @@ usage(boolean_t requested) (void) zprop_iter(usage_prop_cb, fp, B_FALSE, B_TRUE, ZFS_TYPE_DATASET); + (void) fprintf(fp, "\t%-15s ", "userused@..."); + (void) fprintf(fp, " NO NO \n"); + (void) fprintf(fp, "\t%-15s ", "groupused@..."); + (void) fprintf(fp, " NO NO \n"); + (void) fprintf(fp, "\t%-15s ", "userquota@..."); + (void) fprintf(fp, "YES NO | none\n"); + (void) fprintf(fp, "\t%-15s ", "groupquota@..."); + (void) fprintf(fp, "YES NO | none\n"); + (void) fprintf(fp, gettext("\nSizes are specified in bytes " "with standard units such as K, M, G, etc.\n")); (void) fprintf(fp, gettext("\nUser-defined properties can " "be specified by using a name containing a colon (:).\n")); - - } else if (show_permissions) { - (void) fprintf(fp, - gettext("\nThe following permissions are supported:\n")); - - zfs_deleg_permissions(); + (void) fprintf(fp, gettext("\nThe {user|group}{used|quota}@ " + "properties must be appended with\n" + "a user or group specifier of one of these forms:\n" + " POSIX name (eg: \"matt\")\n" + " POSIX id (eg: \"126829\")\n" + " SMB name@domain (eg: \"matt@sun\")\n" + " SMB SID (eg: \"S-1-234-567-89\")\n")); } else { - /* - * TRANSLATION NOTE: - * "zfs set|get" must not be localised this is the - * command name and arguments. 
- */ - (void) fprintf(fp, - gettext("\nFor the property list, run: zfs set|get\n")); - + gettext("\nFor the property list, run: %s\n"), + "zfs set|get"); (void) fprintf(fp, - gettext("\nFor the delegated permission list, run:" - " zfs allow|unallow\n")); + gettext("\nFor the delegated permission list, run: %s\n"), + "zfs allow|unallow"); } /* @@ -419,7 +447,27 @@ parseprop(nvlist_t *props) return (-1); } return (0); +} +static int +parse_depth(char *opt, int *flags) +{ + char *tmp; + int depth; + + depth = (int)strtol(opt, &tmp, 0); + if (*tmp) { + (void) fprintf(stderr, + gettext("%s is not an integer\n"), optarg); + usage(B_FALSE); + } + if (depth < 0) { + (void) fprintf(stderr, + gettext("Depth can not be negative.\n")); + usage(B_FALSE); + } + *flags |= (ZFS_ITER_DEPTH_LIMIT|ZFS_ITER_RECURSE); + return (depth); } /* @@ -666,6 +714,7 @@ zfs_do_create(int argc, char **argv) resv_prop = ZFS_PROP_REFRESERVATION; else resv_prop = ZFS_PROP_RESERVATION; + volsize = zvol_volsize_to_reservation(volsize, props); if (nvlist_lookup_string(props, zfs_prop_to_name(resv_prop), &strval) != 0) { @@ -736,11 +785,13 @@ zfs_do_create(int argc, char **argv) } /* - * zfs destroy [-rf] + * zfs destroy [-rRf] + * zfs destroy [-rRd] * - * -r Recursively destroy all children - * -R Recursively destroy all dependents, including clones - * -f Force unmounting of any dependents + * -r Recursively destroy all children + * -R Recursively destroy all dependents, including clones + * -f Force unmounting of any dependents + * -d If we can't destroy now, mark for deferred destruction * * Destroys the given dataset. By default, it will unmount any filesystems, * and refuse to destroy a dataset that has any dependents. A dependent can @@ -756,6 +807,7 @@ typedef struct destroy_cbdata { boolean_t cb_closezhp; zfs_handle_t *cb_target; char *cb_snapname; + boolean_t cb_defer_destroy; } destroy_cbdata_t; /* @@ -824,7 +876,7 @@ destroy_callback(zfs_handle_t *zhp, void *data) /* * Ignore pools (which we've already flagged as an error before getting - * here. + * here). */ if (strchr(zfs_get_name(zhp), '/') == NULL && zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) { @@ -836,7 +888,7 @@ destroy_callback(zfs_handle_t *zhp, void *data) * Bail out on the first error. */ if (zfs_unmount(zhp, NULL, cbp->cb_force ? MS_FORCE : 0) != 0 || - zfs_destroy(zhp) != 0) { + zfs_destroy(zhp, cbp->cb_defer_destroy) != 0) { zfs_close(zhp); return (-1); } @@ -888,10 +940,15 @@ zfs_do_destroy(int argc, char **argv) int c; zfs_handle_t *zhp; char *cp; + zfs_type_t type = ZFS_TYPE_DATASET; /* check options */ - while ((c = getopt(argc, argv, "frR")) != -1) { + while ((c = getopt(argc, argv, "dfrR")) != -1) { switch (c) { + case 'd': + cb.cb_defer_destroy = B_TRUE; + type = ZFS_TYPE_SNAPSHOT; + break; case 'f': cb.cb_force = 1; break; @@ -937,14 +994,22 @@ zfs_do_destroy(int argc, char **argv) cp++; if (cb.cb_doclones) { + boolean_t defer = cb.cb_defer_destroy; + + /* + * Temporarily ignore the defer_destroy setting since + * it's not supported for clones. 
+ */ + cb.cb_defer_destroy = B_FALSE; cb.cb_snapname = cp; if (destroy_snap_clones(zhp, &cb) != 0) { zfs_close(zhp); return (1); } + cb.cb_defer_destroy = defer; } - ret = zfs_destroy_snaps(zhp, cp); + ret = zfs_destroy_snaps(zhp, cp, cb.cb_defer_destroy); zfs_close(zhp); if (ret) { (void) fprintf(stderr, @@ -953,9 +1018,8 @@ zfs_do_destroy(int argc, char **argv) return (ret != 0); } - /* Open the given dataset */ - if ((zhp = zfs_open(g_zfs, argv[0], ZFS_TYPE_DATASET)) == NULL) + if ((zhp = zfs_open(g_zfs, argv[0], type)) == NULL) return (1); cb.cb_target = zhp; @@ -981,15 +1045,15 @@ zfs_do_destroy(int argc, char **argv) * Check for any dependents and/or clones. */ cb.cb_first = B_TRUE; - if (!cb.cb_doclones && + if (!cb.cb_doclones && !cb.cb_defer_destroy && zfs_iter_dependents(zhp, B_TRUE, destroy_check_dependent, &cb) != 0) { zfs_close(zhp); return (1); } - if (cb.cb_error || - zfs_iter_dependents(zhp, B_FALSE, destroy_callback, &cb) != 0) { + if (cb.cb_error || (!cb.cb_defer_destroy && + (zfs_iter_dependents(zhp, B_FALSE, destroy_callback, &cb) != 0))) { zfs_close(zhp); return (1); } @@ -1002,22 +1066,35 @@ zfs_do_destroy(int argc, char **argv) if (destroy_callback(zhp, &cb) != 0) return (1); - return (0); } +static boolean_t +is_recvd_column(zprop_get_cbdata_t *cbp) +{ + int i; + zfs_get_column_t col; + + for (i = 0; i < ZFS_GET_NCOLS && + (col = cbp->cb_columns[i]) != GET_COL_NONE; i++) + if (col == GET_COL_RECVD) + return (B_TRUE); + return (B_FALSE); +} + /* - * zfs get [-rHp] [-o field[,field]...] [-s source[,source]...] - * < all | property[,property]... > < fs | snap | vol > ... + * zfs get [-rHp] [-o all | field[,field]...] [-s source[,source]...] + * < all | property[,property]... > < fs | snap | vol > ... * * -r recurse over any child datasets * -H scripted mode. Headers are stripped, and fields are separated * by tabs instead of spaces. - * -o Set of fields to display. One of "name,property,value,source". - * Default is all four. + * -o Set of fields to display. One of "name,property,value, + * received,source". Default is "name,property,value,source". + * "all" is an alias for all five. * -s Set of sources to allow. One of - * "local,default,inherited,temporary,none". Default is all - * five. + * "local,default,inherited,received,temporary,none". Default is + * all six. * -p Display values in parsable (literal) format. * * Prints properties for the given datasets. The user can control which @@ -1031,16 +1108,19 @@ static int get_callback(zfs_handle_t *zhp, void *data) { char buf[ZFS_MAXPROPLEN]; + char rbuf[ZFS_MAXPROPLEN]; zprop_source_t sourcetype; char source[ZFS_MAXNAMELEN]; zprop_get_cbdata_t *cbp = data; - nvlist_t *userprop = zfs_get_user_props(zhp); + nvlist_t *user_props = zfs_get_user_props(zhp); zprop_list_t *pl = cbp->cb_proplist; nvlist_t *propval; char *strval; char *sourceval; + boolean_t received = is_recvd_column(cbp); for (; pl != NULL; pl = pl->pl_next) { + char *recvdval = NULL; /* * Skip the special fake placeholder. This will also skip over * the name property when 'all' is specified. 
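
The destroy changes above thread one new flag, cb_defer_destroy, through the whole path: -d switches the expected type to a snapshot, skips the dependent walk, and passes the flag down to zfs_destroy()/zfs_destroy_snaps() so a busy snapshot is marked for deferred destruction instead of failing outright. A minimal sketch of that control flow, with fs_unmount()/fs_destroy()/fs_has_dependents() as hypothetical stand-ins for the libzfs calls:

#include <stdbool.h>

extern int fs_unmount(void *ds, int force);
extern int fs_destroy(void *ds, bool defer);
extern int fs_has_dependents(void *ds);

/*
 * Destroy one dataset.  With `defer' set (zfs destroy -d), the
 * dependent check is skipped and the destroy call is asked to mark
 * the snapshot for deferred destruction rather than fail while it is
 * still held or cloned.
 */
static int
destroy_one(void *ds, bool force, bool defer)
{
	if (!defer && fs_has_dependents(ds))
		return (-1);		/* caller reports the dependents */
	if (fs_unmount(ds, force) != 0)
		return (-1);
	return (fs_destroy(ds, defer));
}
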
@@ -1067,11 +1147,27 @@ get_callback(zfs_handle_t *zhp, void *data) (void) strlcpy(buf, "-", sizeof (buf)); } + if (received && (zfs_prop_get_recvd(zhp, + zfs_prop_to_name(pl->pl_prop), rbuf, sizeof (rbuf), + cbp->cb_literal) == 0)) + recvdval = rbuf; + zprop_print_one_property(zfs_get_name(zhp), cbp, zfs_prop_to_name(pl->pl_prop), - buf, sourcetype, source); + buf, sourcetype, source, recvdval); + } else if (zfs_prop_userquota(pl->pl_user_prop)) { + sourcetype = ZPROP_SRC_LOCAL; + + if (zfs_prop_get_userquota(zhp, pl->pl_user_prop, + buf, sizeof (buf), cbp->cb_literal) != 0) { + sourcetype = ZPROP_SRC_NONE; + (void) strlcpy(buf, "-", sizeof (buf)); + } + + zprop_print_one_property(zfs_get_name(zhp), cbp, + pl->pl_user_prop, buf, sourcetype, source, NULL); } else { - if (nvlist_lookup_nvlist(userprop, + if (nvlist_lookup_nvlist(user_props, pl->pl_user_prop, &propval) != 0) { if (pl->pl_all) continue; @@ -1086,6 +1182,9 @@ get_callback(zfs_handle_t *zhp, void *data) if (strcmp(sourceval, zfs_get_name(zhp)) == 0) { sourcetype = ZPROP_SRC_LOCAL; + } else if (strcmp(sourceval, + ZPROP_SOURCE_VAL_RECVD) == 0) { + sourcetype = ZPROP_SRC_RECEIVED; } else { sourcetype = ZPROP_SRC_INHERITED; (void) strlcpy(source, @@ -1093,9 +1192,14 @@ get_callback(zfs_handle_t *zhp, void *data) } } + if (received && (zfs_prop_get_recvd(zhp, + pl->pl_user_prop, rbuf, sizeof (rbuf), + cbp->cb_literal) == 0)) + recvdval = rbuf; + zprop_print_one_property(zfs_get_name(zhp), cbp, pl->pl_user_prop, strval, sourcetype, - source); + source, recvdval); } } @@ -1109,6 +1213,7 @@ zfs_do_get(int argc, char **argv) int i, c, flags = 0; char *value, *fields; int ret; + int limit = 0; zprop_list_t fake_name = { 0 }; /* @@ -1122,11 +1227,14 @@ zfs_do_get(int argc, char **argv) cb.cb_type = ZFS_TYPE_DATASET; /* check options */ - while ((c = getopt(argc, argv, ":o:s:rHp")) != -1) { + while ((c = getopt(argc, argv, ":d:o:s:rHp")) != -1) { switch (c) { case 'p': cb.cb_literal = B_TRUE; break; + case 'd': + limit = parse_depth(optarg, &flags); + break; case 'r': flags |= ZFS_ITER_RECURSE; break; @@ -1147,10 +1255,10 @@ zfs_do_get(int argc, char **argv) i = 0; while (*optarg != '\0') { static char *col_subopts[] = - { "name", "property", "value", "source", - NULL }; + { "name", "property", "value", "received", + "source", "all", NULL }; - if (i == 4) { + if (i == ZFS_GET_NCOLS) { (void) fprintf(stderr, gettext("too " "many fields given to -o " "option\n")); @@ -1169,8 +1277,28 @@ zfs_do_get(int argc, char **argv) cb.cb_columns[i++] = GET_COL_VALUE; break; case 3: + cb.cb_columns[i++] = GET_COL_RECVD; + flags |= ZFS_ITER_RECVD_PROPS; + break; + case 4: cb.cb_columns[i++] = GET_COL_SOURCE; break; + case 5: + if (i > 0) { + (void) fprintf(stderr, + gettext("\"all\" conflicts " + "with specific fields " + "given to -o option\n")); + usage(B_FALSE); + } + cb.cb_columns[0] = GET_COL_NAME; + cb.cb_columns[1] = GET_COL_PROPERTY; + cb.cb_columns[2] = GET_COL_VALUE; + cb.cb_columns[3] = GET_COL_RECVD; + cb.cb_columns[4] = GET_COL_SOURCE; + flags |= ZFS_ITER_RECVD_PROPS; + i = ZFS_GET_NCOLS; + break; default: (void) fprintf(stderr, gettext("invalid column name " @@ -1185,7 +1313,8 @@ zfs_do_get(int argc, char **argv) while (*optarg != '\0') { static char *source_subopts[] = { "local", "default", "inherited", - "temporary", "none", NULL }; + "received", "temporary", "none", + NULL }; switch (getsubopt(&optarg, source_subopts, &value)) { @@ -1199,9 +1328,12 @@ zfs_do_get(int argc, char **argv) cb.cb_sources |= ZPROP_SRC_INHERITED; break; case 3: 
- cb.cb_sources |= ZPROP_SRC_TEMPORARY; + cb.cb_sources |= ZPROP_SRC_RECEIVED; break; case 4: + cb.cb_sources |= ZPROP_SRC_TEMPORARY; + break; + case 5: cb.cb_sources |= ZPROP_SRC_NONE; break; default: @@ -1257,7 +1389,7 @@ zfs_do_get(int argc, char **argv) /* run for each object */ ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_DATASET, NULL, - &cb.cb_proplist, get_callback, &cb); + &cb.cb_proplist, limit, get_callback, &cb); if (cb.cb_proplist == &fake_name) zprop_free_list(fake_name.pl_next); @@ -1268,9 +1400,10 @@ zfs_do_get(int argc, char **argv) } /* - * inherit [-r] ... + * inherit [-rS] ... * - * -r Recurse over all children + * -r Recurse over all children + * -S Revert to received value, if any * * For each dataset specified on the command line, inherit the given property * from its parent. Inheriting a property at the pool level will cause it to @@ -1279,11 +1412,16 @@ zfs_do_get(int argc, char **argv) * local modifications for each dataset. */ +typedef struct inherit_cbdata { + const char *cb_propname; + boolean_t cb_received; +} inherit_cbdata_t; + static int inherit_recurse_cb(zfs_handle_t *zhp, void *data) { - char *propname = data; - zfs_prop_t prop = zfs_name_to_prop(propname); + inherit_cbdata_t *cb = data; + zfs_prop_t prop = zfs_name_to_prop(cb->cb_propname); /* * If we're doing it recursively, then ignore properties that @@ -1293,15 +1431,15 @@ inherit_recurse_cb(zfs_handle_t *zhp, void *data) !zfs_prop_valid_for_type(prop, zfs_get_type(zhp))) return (0); - return (zfs_prop_inherit(zhp, propname) != 0); + return (zfs_prop_inherit(zhp, cb->cb_propname, cb->cb_received) != 0); } static int inherit_cb(zfs_handle_t *zhp, void *data) { - char *propname = data; + inherit_cbdata_t *cb = data; - return (zfs_prop_inherit(zhp, propname) != 0); + return (zfs_prop_inherit(zhp, cb->cb_propname, cb->cb_received) != 0); } static int @@ -1309,16 +1447,21 @@ zfs_do_inherit(int argc, char **argv) { int c; zfs_prop_t prop; + inherit_cbdata_t cb = { 0 }; char *propname; int ret; int flags = 0; + boolean_t received = B_FALSE; /* check options */ - while ((c = getopt(argc, argv, "r")) != -1) { + while ((c = getopt(argc, argv, "rS")) != -1) { switch (c) { case 'r': flags |= ZFS_ITER_RECURSE; break; + case 'S': + received = B_TRUE; + break; case '?': default: (void) fprintf(stderr, gettext("invalid option '%c'\n"), @@ -1351,7 +1494,7 @@ zfs_do_inherit(int argc, char **argv) propname); return (1); } - if (!zfs_prop_inheritable(prop)) { + if (!zfs_prop_inheritable(prop) && !received) { (void) fprintf(stderr, gettext("'%s' property cannot " "be inherited\n"), propname); if (prop == ZFS_PROP_QUOTA || @@ -1362,18 +1505,27 @@ zfs_do_inherit(int argc, char **argv) "%s=none' to clear\n"), propname); return (1); } + if (received && (prop == ZFS_PROP_VOLSIZE || + prop == ZFS_PROP_VERSION)) { + (void) fprintf(stderr, gettext("'%s' property cannot " + "be reverted to a received value\n"), propname); + return (1); + } } else if (!zfs_prop_user(propname)) { (void) fprintf(stderr, gettext("invalid property '%s'\n"), propname); usage(B_FALSE); } + cb.cb_propname = propname; + cb.cb_received = received; + if (flags & ZFS_ITER_RECURSE) { ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_DATASET, - NULL, NULL, inherit_recurse_cb, propname); + NULL, NULL, 0, inherit_recurse_cb, &cb); } else { ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_DATASET, - NULL, NULL, inherit_cb, propname); + NULL, NULL, 0, inherit_cb, &cb); } return (ret); @@ -1442,21 +1594,30 @@ upgrade_set_callback(zfs_handle_t *zhp, void *data) { 
upgrade_cbdata_t *cb = data; int version = zfs_prop_get_int(zhp, ZFS_PROP_VERSION); - - if (cb->cb_version >= ZPL_VERSION_FUID) { - int spa_version; - - if (zfs_spa_version(zhp, &spa_version) < 0) - return (-1); - - if (spa_version < SPA_VERSION_FUID) { - /* can't upgrade */ - (void) printf(gettext("%s: can not be upgraded; " - "the pool version needs to first be upgraded\nto " - "version %d\n\n"), - zfs_get_name(zhp), SPA_VERSION_FUID); - cb->cb_numfailed++; - return (0); + int i; + static struct { int zplver; int spaver; } table[] = { + {ZPL_VERSION_FUID, SPA_VERSION_FUID}, + {ZPL_VERSION_USERSPACE, SPA_VERSION_USERSPACE}, + {0, 0} + }; + + + for (i = 0; table[i].zplver; i++) { + if (cb->cb_version >= table[i].zplver) { + int spa_version; + + if (zfs_spa_version(zhp, &spa_version) < 0) + return (-1); + + if (spa_version < table[i].spaver) { + /* can't upgrade */ + (void) printf(gettext("%s: can not be " + "upgraded; the pool version needs to first " + "be upgraded\nto version %d\n\n"), + zfs_get_name(zhp), table[i].spaver); + cb->cb_numfailed++; + return (0); + } } } @@ -1556,7 +1717,9 @@ zfs_do_upgrade(int argc, char **argv) (void) printf(gettext(" 1 Initial ZFS filesystem version\n")); (void) printf(gettext(" 2 Enhanced directory entries\n")); (void) printf(gettext(" 3 Case insensitive and File system " - "unique identifer (FUID)\n")); + "unique identifier (FUID)\n")); + (void) printf(gettext(" 4 userquota, groupquota " + "properties\n")); (void) printf(gettext("\nFor more information on a particular " "version, including supported releases, see:\n\n")); (void) printf("http://www.opensolaris.org/os/community/zfs/" @@ -1568,7 +1731,7 @@ zfs_do_upgrade(int argc, char **argv) if (cb.cb_version == 0) cb.cb_version = ZPL_VERSION; ret = zfs_for_each(argc, argv, flags, ZFS_TYPE_FILESYSTEM, - NULL, NULL, upgrade_set_callback, &cb); + NULL, NULL, 0, upgrade_set_callback, &cb); (void) printf(gettext("%llu filesystems upgraded\n"), cb.cb_numupgraded); if (cb.cb_numsamegraded) { @@ -1586,14 +1749,14 @@ zfs_do_upgrade(int argc, char **argv) flags |= ZFS_ITER_RECURSE; ret = zfs_for_each(0, NULL, flags, ZFS_TYPE_FILESYSTEM, - NULL, NULL, upgrade_list_callback, &cb); + NULL, NULL, 0, upgrade_list_callback, &cb); found = cb.cb_foundone; cb.cb_foundone = B_FALSE; cb.cb_newer = B_TRUE; ret = zfs_for_each(0, NULL, flags, ZFS_TYPE_FILESYSTEM, - NULL, NULL, upgrade_list_callback, &cb); + NULL, NULL, 0, upgrade_list_callback, &cb); if (!cb.cb_foundone && !found) { (void) printf(gettext("All filesystems are " @@ -1605,14 +1768,93 @@ zfs_do_upgrade(int argc, char **argv) } /* - * list [-rH] [-o property[,property]...] [-t type[,type]...] 
+ * zfs userspace + */ +static int +userspace_cb(void *arg, const char *domain, uid_t rid, uint64_t space) +{ + zfs_userquota_prop_t *typep = arg; + zfs_userquota_prop_t p = *typep; + char *name = NULL; + char *ug, *propname; + char namebuf[32]; + char sizebuf[32]; + + if (domain == NULL || domain[0] == '\0') { + if (p == ZFS_PROP_GROUPUSED || p == ZFS_PROP_GROUPQUOTA) { + struct group *g = getgrgid(rid); + if (g) + name = g->gr_name; + } else { + struct passwd *p = getpwuid(rid); + if (p) + name = p->pw_name; + } + } + + if (p == ZFS_PROP_GROUPUSED || p == ZFS_PROP_GROUPQUOTA) + ug = "group"; + else + ug = "user"; + + if (p == ZFS_PROP_USERUSED || p == ZFS_PROP_GROUPUSED) + propname = "used"; + else + propname = "quota"; + + if (name == NULL) { + (void) snprintf(namebuf, sizeof (namebuf), + "%llu", (longlong_t)rid); + name = namebuf; + } + zfs_nicenum(space, sizebuf, sizeof (sizebuf)); + + (void) printf("%s %s %s%c%s %s\n", propname, ug, domain, + domain[0] ? '-' : ' ', name, sizebuf); + + return (0); +} + +static int +zfs_do_userspace(int argc, char **argv) +{ + zfs_handle_t *zhp; + zfs_userquota_prop_t p; + int error; + + /* + * Try the python version. If the execv fails, we'll continue + * and do a simplistic implementation. + */ + (void) execv(pypath, argv-1); + + (void) printf("internal error: %s not found\n" + "falling back on built-in implementation, " + "some features will not work\n", pypath); + + if ((zhp = zfs_open(g_zfs, argv[argc-1], ZFS_TYPE_DATASET)) == NULL) + return (1); + + (void) printf("PROP TYPE NAME VALUE\n"); + + for (p = 0; p < ZFS_NUM_USERQUOTA_PROPS; p++) { + error = zfs_userspace(zhp, p, userspace_cb, &p); + if (error) + break; + } + return (error); +} + +/* + * list [-r][-d max] [-H] [-o property[,property]...] [-t type[,type]...] * [-s property [-s property]...] [-S property [-S property]...] * ... * - * -r Recurse over all children - * -H Scripted mode; elide headers and separate columns by tabs - * -o Control which fields to display. - * -t Control which object types to display. + * -r Recurse over all children + * -d Limit recursion by depth. + * -H Scripted mode; elide headers and separate columns by tabs + * -o Control which fields to display. + * -t Control which object types to display. * -s Specify sort columns, descending order. * -S Specify sort columns, ascending order. 
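
userspace_cb() above resolves a numeric id to a user or group name with getpwuid()/getgrgid() and falls back to printing the raw id when no local entry exists. That lookup-with-fallback is plain POSIX and can be factored into a small helper; the function below is a sketch, not part of the patch:

#include <sys/types.h>
#include <grp.h>
#include <pwd.h>
#include <stdio.h>

/*
 * Resolve a numeric uid/gid to a printable name, falling back to the
 * number itself when there is no local passwd/group entry (the same
 * fallback the built-in `zfs userspace' implementation uses).
 */
static const char *
id_to_name(uid_t rid, int is_group, char *buf, size_t buflen)
{
	if (is_group) {
		struct group *g = getgrgid(rid);
		if (g != NULL)
			return (g->gr_name);
	} else {
		struct passwd *p = getpwuid(rid);
		if (p != NULL)
			return (p->pw_name);
	}
	(void) snprintf(buf, buflen, "%llu", (unsigned long long)rid);
	return (buf);
}
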
* @@ -1692,7 +1934,6 @@ print_dataset(zfs_handle_t *zhp, zprop_list_t *pl, boolean_t scripted) first = B_FALSE; } - right_justify = B_FALSE; if (pl->pl_prop != ZPROP_INVAL) { if (zfs_prop_get(zhp, pl->pl_prop, property, sizeof (property), NULL, NULL, 0, B_FALSE) != 0) @@ -1701,6 +1942,13 @@ print_dataset(zfs_handle_t *zhp, zprop_list_t *pl, boolean_t scripted) propstr = property; right_justify = zfs_prop_align_right(pl->pl_prop); + } else if (zfs_prop_userquota(pl->pl_user_prop)) { + if (zfs_prop_get_userquota(zhp, pl->pl_user_prop, + property, sizeof (property), B_FALSE) != 0) + propstr = "-"; + else + propstr = property; + right_justify = B_TRUE; } else { if (nvlist_lookup_nvlist(userprops, pl->pl_user_prop, &propval) != 0) @@ -1708,6 +1956,7 @@ print_dataset(zfs_handle_t *zhp, zprop_list_t *pl, boolean_t scripted) else verify(nvlist_lookup_string(propval, ZPROP_VALUE, &propstr) == 0); + right_justify = B_FALSE; } width = pl->pl_width; @@ -1759,16 +2008,20 @@ zfs_do_list(int argc, char **argv) char *fields = NULL; list_cbdata_t cb = { 0 }; char *value; + int limit = 0; int ret; zfs_sort_column_t *sortcol = NULL; int flags = ZFS_ITER_PROP_LISTSNAPS | ZFS_ITER_ARGS_CAN_BE_PATHS; /* check options */ - while ((c = getopt(argc, argv, ":o:rt:Hs:S:")) != -1) { + while ((c = getopt(argc, argv, ":d:o:rt:Hs:S:")) != -1) { switch (c) { case 'o': fields = optarg; break; + case 'd': + limit = parse_depth(optarg, &flags); + break; case 'r': flags |= ZFS_ITER_RECURSE; break; @@ -1859,7 +2112,7 @@ zfs_do_list(int argc, char **argv) cb.cb_first = B_TRUE; ret = zfs_for_each(argc, argv, flags, types, sortcol, &cb.cb_proplist, - list_callback, &cb); + limit, list_callback, &cb); zprop_free_list(cb.cb_proplist); zfs_free_sort_columns(sortcol); @@ -1998,9 +2251,9 @@ zfs_do_promote(int argc, char **argv) /* * zfs rollback [-rRf] * - * -r Delete any intervening snapshots before doing rollback - * -R Delete any snapshots and their clones - * -f ignored for backwards compatability + * -r Delete any intervening snapshots before doing rollback + * -R Delete any snapshots and their clones + * -f ignored for backwards compatability * * Given a filesystem, rollback to a specific snapshot, discarding any changes * since then and making it the active dataset. If more recent snapshots exist, @@ -2242,7 +2495,7 @@ zfs_do_set(int argc, char **argv) } ret = zfs_for_each(argc - 2, argv + 2, NULL, - ZFS_TYPE_DATASET, NULL, NULL, set_callback, &cb); + ZFS_TYPE_DATASET, NULL, NULL, 0, set_callback, &cb); return (ret); } @@ -2310,8 +2563,8 @@ zfs_do_snapshot(int argc, char **argv) } /* - * zfs send [-v] -R [-i|-I <@snap>] - * zfs send [-v] [-i|-I <@snap>] + * zfs send [-vDp] -R [-i|-I <@snap>] + * zfs send [-vDp] [-i|-I <@snap>] * * Send a backup stream to stdout. 
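
The send usage above documents the new -D (dedup) and -p (properties) options; the following hunk replaces zfs_do_send()'s separate booleans with a single flags structure filled from getopt() and handed to zfs_send() as one argument. A reduced sketch of that option-collection pattern (the struct layout here is illustrative, not the real libzfs sendflags_t definition):

#include <unistd.h>

/* illustrative only; not the real libzfs sendflags_t layout */
typedef struct send_flags {
	int verbose;
	int replicate;	/* -R: full replication stream */
	int doall;	/* -I: send all intermediate snapshots */
	int dedup;	/* -D: deduplicated stream */
	int props;	/* -p: include properties */
} send_flags_t;

/* collect options into one struct; returns optind, or -1 on bad usage */
static int
parse_send_opts(int argc, char **argv, send_flags_t *fp, char **fromp)
{
	int c;

	while ((c = getopt(argc, argv, ":i:I:RDpv")) != -1) {
		switch (c) {
		case 'i':
			*fromp = optarg;
			break;
		case 'I':
			*fromp = optarg;
			fp->doall = 1;
			break;
		case 'R':
			fp->replicate = 1;
			break;
		case 'D':
			fp->dedup = 1;
			break;
		case 'p':
			fp->props = 1;
			break;
		case 'v':
			fp->verbose = 1;
			break;
		default:
			return (-1);
		}
	}
	return (optind);
}
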
*/ @@ -2322,14 +2575,11 @@ zfs_do_send(int argc, char **argv) char *toname = NULL; char *cp; zfs_handle_t *zhp; - boolean_t doall = B_FALSE; - boolean_t replicate = B_FALSE; - boolean_t fromorigin = B_FALSE; - boolean_t verbose = B_FALSE; + sendflags_t flags = { 0 }; int c, err; /* check options */ - while ((c = getopt(argc, argv, ":i:I:Rv")) != -1) { + while ((c = getopt(argc, argv, ":i:I:RDpv")) != -1) { switch (c) { case 'i': if (fromname) @@ -2340,13 +2590,19 @@ zfs_do_send(int argc, char **argv) if (fromname) usage(B_FALSE); fromname = optarg; - doall = B_TRUE; + flags.doall = B_TRUE; break; case 'R': - replicate = B_TRUE; + flags.replicate = B_TRUE; + break; + case 'p': + flags.props = B_TRUE; break; case 'v': - verbose = B_TRUE; + flags.verbose = B_TRUE; + break; + case 'D': + flags.dedup = B_TRUE; break; case ':': (void) fprintf(stderr, gettext("missing argument for " @@ -2406,7 +2662,7 @@ zfs_do_send(int argc, char **argv) if (strcmp(origin, fromname) == 0) { fromname = NULL; - fromorigin = B_TRUE; + flags.fromorigin = B_TRUE; } else { *cp = '\0'; if (cp != fromname && strcmp(argv[0], fromname)) { @@ -2424,18 +2680,17 @@ zfs_do_send(int argc, char **argv) } } - if (replicate && fromname == NULL) - doall = B_TRUE; + if (flags.replicate && fromname == NULL) + flags.doall = B_TRUE; - err = zfs_send(zhp, fromname, toname, replicate, doall, fromorigin, - verbose, STDOUT_FILENO); + err = zfs_send(zhp, fromname, toname, flags, STDOUT_FILENO, NULL, 0); zfs_close(zhp); return (err != 0); } /* - * zfs receive [-dnvF] + * zfs receive [-denvF] * * Restore a backup stream from stdin. */ @@ -2443,18 +2698,24 @@ static int zfs_do_receive(int argc, char **argv) { int c, err; - recvflags_t flags; + recvflags_t flags = { 0 }; - bzero(&flags, sizeof (recvflags_t)); /* check options */ - while ((c = getopt(argc, argv, ":dnvF")) != -1) { + while ((c = getopt(argc, argv, ":denuvF")) != -1) { switch (c) { case 'd': flags.isprefix = B_TRUE; break; + case 'e': + flags.isprefix = B_TRUE; + flags.istail = B_TRUE; + break; case 'n': flags.dryrun = B_TRUE; break; + case 'u': + flags.nomount = B_TRUE; + break; case 'v': flags.verbose = B_TRUE; break; @@ -2499,386 +2760,111 @@ zfs_do_receive(int argc, char **argv) return (err != 0); } -typedef struct allow_cb { - int a_permcnt; - size_t a_treeoffset; -} allow_cb_t; - -static void -zfs_print_perms(avl_tree_t *tree) -{ - zfs_perm_node_t *permnode; - - permnode = avl_first(tree); - while (permnode != NULL) { - (void) printf("%s", permnode->z_pname); - permnode = AVL_NEXT(tree, permnode); - if (permnode) - (void) printf(","); - else - (void) printf("\n"); - } -} - -/* - * Iterate over user/groups/everyone/... and the call perm_iter - * function to print actual permission when tree has >0 nodes. 
- */ -static void -zfs_iter_perms(avl_tree_t *tree, const char *banner, allow_cb_t *cb) -{ - zfs_allow_node_t *item; - avl_tree_t *ptree; - - item = avl_first(tree); - while (item) { - ptree = (void *)((char *)item + cb->a_treeoffset); - if (avl_numnodes(ptree)) { - if (cb->a_permcnt++ == 0) - (void) printf("%s\n", banner); - (void) printf("\t%s", item->z_key); - /* - * Avoid an extra space being printed - * for "everyone" which is keyed with a null - * string - */ - if (item->z_key[0] != '\0') - (void) printf(" "); - zfs_print_perms(ptree); - } - item = AVL_NEXT(tree, item); - } -} - -#define LINES "-------------------------------------------------------------\n" static int -zfs_print_allows(char *ds) -{ - zfs_allow_t *curperms, *perms; - zfs_handle_t *zhp; - allow_cb_t allowcb = { 0 }; - char banner[MAXPATHLEN]; - - if (ds[0] == '-') - usage(B_FALSE); - - if (strrchr(ds, '@')) { - (void) fprintf(stderr, gettext("Snapshots don't have 'allow'" - " permissions\n")); - return (1); - } - if ((zhp = zfs_open(g_zfs, ds, ZFS_TYPE_DATASET)) == NULL) - return (1); - - if (zfs_perm_get(zhp, &perms)) { - (void) fprintf(stderr, - gettext("Failed to retrieve 'allows' on %s\n"), ds); - zfs_close(zhp); - return (1); - } - - zfs_close(zhp); - - if (perms != NULL) - (void) printf("%s", LINES); - for (curperms = perms; curperms; curperms = curperms->z_next) { - - (void) snprintf(banner, sizeof (banner), - "Permission sets on (%s)", curperms->z_setpoint); - allowcb.a_treeoffset = - offsetof(zfs_allow_node_t, z_localdescend); - allowcb.a_permcnt = 0; - zfs_iter_perms(&curperms->z_sets, banner, &allowcb); - - (void) snprintf(banner, sizeof (banner), - "Create time permissions on (%s)", curperms->z_setpoint); - allowcb.a_treeoffset = - offsetof(zfs_allow_node_t, z_localdescend); - allowcb.a_permcnt = 0; - zfs_iter_perms(&curperms->z_crperms, banner, &allowcb); - - - (void) snprintf(banner, sizeof (banner), - "Local permissions on (%s)", curperms->z_setpoint); - allowcb.a_treeoffset = offsetof(zfs_allow_node_t, z_local); - allowcb.a_permcnt = 0; - zfs_iter_perms(&curperms->z_user, banner, &allowcb); - zfs_iter_perms(&curperms->z_group, banner, &allowcb); - zfs_iter_perms(&curperms->z_everyone, banner, &allowcb); - - (void) snprintf(banner, sizeof (banner), - "Descendent permissions on (%s)", curperms->z_setpoint); - allowcb.a_treeoffset = offsetof(zfs_allow_node_t, z_descend); - allowcb.a_permcnt = 0; - zfs_iter_perms(&curperms->z_user, banner, &allowcb); - zfs_iter_perms(&curperms->z_group, banner, &allowcb); - zfs_iter_perms(&curperms->z_everyone, banner, &allowcb); - - (void) snprintf(banner, sizeof (banner), - "Local+Descendent permissions on (%s)", - curperms->z_setpoint); - allowcb.a_treeoffset = - offsetof(zfs_allow_node_t, z_localdescend); - allowcb.a_permcnt = 0; - zfs_iter_perms(&curperms->z_user, banner, &allowcb); - zfs_iter_perms(&curperms->z_group, banner, &allowcb); - zfs_iter_perms(&curperms->z_everyone, banner, &allowcb); - - (void) printf("%s", LINES); - } - zfs_free_allows(perms); - return (0); -} - -#define ALLOWOPTIONS "ldcsu:g:e" -#define UNALLOWOPTIONS "ldcsu:g:er" - -/* - * Validate options, and build necessary datastructure to display/remove/add - * permissions. - * Returns 0 - If permissions should be added/removed - * Returns 1 - If permissions should be displayed. 
- * Returns -1 - on failure - */ -int -parse_allow_args(int *argc, char **argv[], boolean_t unallow, - char **ds, int *recurse, nvlist_t **zperms) +zfs_do_hold_rele_impl(int argc, char **argv, boolean_t holding) { + int errors = 0; + int i; + const char *tag; + boolean_t recursive = B_FALSE; + boolean_t temphold = B_FALSE; + const char *opts = holding ? "rt" : "r"; int c; - char *options = unallow ? UNALLOWOPTIONS : ALLOWOPTIONS; - zfs_deleg_inherit_t deleg_type = ZFS_DELEG_NONE; - zfs_deleg_who_type_t who_type = ZFS_DELEG_WHO_UNKNOWN; - char *who = NULL; - char *perms = NULL; - zfs_handle_t *zhp; - while ((c = getopt(*argc, *argv, options)) != -1) { + /* check options */ + while ((c = getopt(argc, argv, opts)) != -1) { switch (c) { - case 'l': - if (who_type == ZFS_DELEG_CREATE || - who_type == ZFS_DELEG_NAMED_SET) - usage(B_FALSE); - - deleg_type |= ZFS_DELEG_PERM_LOCAL; - break; - case 'd': - if (who_type == ZFS_DELEG_CREATE || - who_type == ZFS_DELEG_NAMED_SET) - usage(B_FALSE); - - deleg_type |= ZFS_DELEG_PERM_DESCENDENT; - break; case 'r': - *recurse = B_TRUE; - break; - case 'c': - if (who_type != ZFS_DELEG_WHO_UNKNOWN) - usage(B_FALSE); - if (deleg_type) - usage(B_FALSE); - who_type = ZFS_DELEG_CREATE; - break; - case 's': - if (who_type != ZFS_DELEG_WHO_UNKNOWN) - usage(B_FALSE); - if (deleg_type) - usage(B_FALSE); - who_type = ZFS_DELEG_NAMED_SET; - break; - case 'u': - if (who_type != ZFS_DELEG_WHO_UNKNOWN) - usage(B_FALSE); - who_type = ZFS_DELEG_USER; - who = optarg; - break; - case 'g': - if (who_type != ZFS_DELEG_WHO_UNKNOWN) - usage(B_FALSE); - who_type = ZFS_DELEG_GROUP; - who = optarg; + recursive = B_TRUE; break; - case 'e': - if (who_type != ZFS_DELEG_WHO_UNKNOWN) - usage(B_FALSE); - who_type = ZFS_DELEG_EVERYONE; + case 't': + temphold = B_TRUE; break; - default: + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); usage(B_FALSE); - break; } } - if (deleg_type == 0) - deleg_type = ZFS_DELEG_PERM_LOCALDESCENDENT; - - *argc -= optind; - *argv += optind; - - if (unallow == B_FALSE && *argc == 1) { - /* - * Only print permissions if no options were processed - */ - if (optind == 1) - return (1); - else - usage(B_FALSE); - } - - /* - * initialize variables for zfs_build_perms based on number - * of arguments. - * 3 arguments ==> zfs [un]allow joe perm,perm,perm or - * zfs [un]allow -s @set1 perm,perm - * 2 arguments ==> zfs [un]allow -c perm,perm or - * zfs [un]allow -u|-g perm or - * zfs [un]allow -e perm,perm - * zfs unallow joe - * zfs unallow -s @set1 - * 1 argument ==> zfs [un]allow -e or - * zfs [un]allow -c - */ - - switch (*argc) { - case 3: - perms = (*argv)[1]; - who = (*argv)[0]; - *ds = (*argv)[2]; - - /* - * advance argc/argv for do_allow cases. - * for do_allow case make sure who have a know who type - * and its not a permission set. 
- */ - if (unallow == B_TRUE) { - *argc -= 2; - *argv += 2; - } else if (who_type != ZFS_DELEG_WHO_UNKNOWN && - who_type != ZFS_DELEG_NAMED_SET) - usage(B_FALSE); - break; - - case 2: - if (unallow == B_TRUE && (who_type == ZFS_DELEG_EVERYONE || - who_type == ZFS_DELEG_CREATE || who != NULL)) { - perms = (*argv)[0]; - *ds = (*argv)[1]; - } else { - if (unallow == B_FALSE && - (who_type == ZFS_DELEG_WHO_UNKNOWN || - who_type == ZFS_DELEG_NAMED_SET)) - usage(B_FALSE); - else if (who_type == ZFS_DELEG_WHO_UNKNOWN || - who_type == ZFS_DELEG_NAMED_SET) - who = (*argv)[0]; - else if (who_type != ZFS_DELEG_NAMED_SET) - perms = (*argv)[0]; - *ds = (*argv)[1]; - } - if (unallow == B_TRUE) { - (*argc)--; - (*argv)++; - } - break; - - case 1: - if (unallow == B_FALSE) - usage(B_FALSE); - if (who == NULL && who_type != ZFS_DELEG_CREATE && - who_type != ZFS_DELEG_EVERYONE) - usage(B_FALSE); - *ds = (*argv)[0]; - break; + argc -= optind; + argv += optind; - default: + /* check number of arguments */ + if (argc < 2) usage(B_FALSE); - } - if (strrchr(*ds, '@')) { - (void) fprintf(stderr, - gettext("Can't set or remove 'allow' permissions " - "on snapshots.\n")); - return (-1); - } + tag = argv[0]; + --argc; + ++argv; - if ((zhp = zfs_open(g_zfs, *ds, ZFS_TYPE_DATASET)) == NULL) - return (-1); - - if ((zfs_build_perms(zhp, who, perms, - who_type, deleg_type, zperms)) != 0) { - zfs_close(zhp); - return (-1); + if (holding && tag[0] == '.') { + /* tags starting with '.' are reserved for libzfs */ + (void) fprintf(stderr, gettext("tag may not start with '.'\n")); + usage(B_FALSE); } - zfs_close(zhp); - return (0); -} -static int -zfs_do_allow(int argc, char **argv) -{ - char *ds; - nvlist_t *zperms = NULL; - zfs_handle_t *zhp; - int unused; - int ret; - - if ((ret = parse_allow_args(&argc, &argv, B_FALSE, &ds, - &unused, &zperms)) == -1) - return (1); - - if (ret == 1) - return (zfs_print_allows(argv[0])); + for (i = 0; i < argc; ++i) { + zfs_handle_t *zhp; + char parent[ZFS_MAXNAMELEN]; + const char *delim; + char *path = argv[i]; - if ((zhp = zfs_open(g_zfs, ds, ZFS_TYPE_DATASET)) == NULL) - return (1); + delim = strchr(path, '@'); + if (delim == NULL) { + (void) fprintf(stderr, + gettext("'%s' is not a snapshot\n"), path); + ++errors; + continue; + } + (void) strncpy(parent, path, delim - path); + parent[delim - path] = '\0'; - if (zfs_perm_set(zhp, zperms)) { + zhp = zfs_open(g_zfs, parent, + ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME); + if (zhp == NULL) { + ++errors; + continue; + } + if (holding) { + if (zfs_hold(zhp, delim+1, tag, recursive, + temphold, B_FALSE) != 0) + ++errors; + } else { + if (zfs_release(zhp, delim+1, tag, recursive) != 0) + ++errors; + } zfs_close(zhp); - nvlist_free(zperms); - return (1); } - nvlist_free(zperms); - zfs_close(zhp); - return (0); + return (errors != 0); } +/* + * zfs hold [-r] [-t] ... + * + * -r Recursively hold + * -t Temporary hold (hidden option) + * + * Apply a user-hold with the given tag to the list of snapshots. + */ static int -unallow_callback(zfs_handle_t *zhp, void *data) +zfs_do_hold(int argc, char **argv) { - nvlist_t *nvp = (nvlist_t *)data; - int error; - - error = zfs_perm_remove(zhp, nvp); - if (error) { - (void) fprintf(stderr, gettext("Failed to remove permissions " - "on %s\n"), zfs_get_name(zhp)); - } - return (error); + return (zfs_do_hold_rele_impl(argc, argv, B_TRUE)); } +/* + * zfs release [-r] ... + * + * -r Recursively release + * + * Release a user-hold with the given tag from the list of snapshots. 
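Editorial note: zfs_do_hold_rele_impl() above takes snapshot arguments of the form "fs@snap", opens the dataset part and passes the text after '@' to zfs_hold()/zfs_release(). A minimal sketch of that argument splitting follows; split_snapshot_arg() is a hypothetical helper and the 256-byte buffer merely stands in for ZFS_MAXNAMELEN.

#include <stdio.h>
#include <string.h>

static int
split_snapshot_arg(const char *path, char *parent, size_t parentlen,
    const char **snapname)
{
	const char *delim = strchr(path, '@');

	/* Reject non-snapshots and names too long for the buffer. */
	if (delim == NULL || (size_t)(delim - path) >= parentlen)
		return (-1);

	(void) strncpy(parent, path, delim - path);
	parent[delim - path] = '\0';
	*snapname = delim + 1;	/* the hold/release APIs take the short name */
	return (0);
}

int
main(void)
{
	char parent[256];
	const char *snap;

	if (split_snapshot_arg("tank/home@backup", parent,
	    sizeof (parent), &snap) == 0)
		(void) printf("dataset=%s snapshot=%s\n", parent, snap);
	return (0);
}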
+ */ static int -zfs_do_unallow(int argc, char **argv) +zfs_do_release(int argc, char **argv) { - int recurse = B_FALSE; - char *ds; - int error; - nvlist_t *zperms = NULL; - int flags = 0; - - if (parse_allow_args(&argc, &argv, B_TRUE, - &ds, &recurse, &zperms) == -1) - return (1); - - if (recurse) - flags |= ZFS_ITER_RECURSE; - error = zfs_for_each(argc, argv, flags, - ZFS_TYPE_FILESYSTEM|ZFS_TYPE_VOLUME, NULL, - NULL, unallow_callback, (void *)zperms); - - if (zperms) - nvlist_free(zperms); - - return (error); + return (zfs_do_hold_rele_impl(argc, argv, B_FALSE)); } typedef struct get_all_cbdata { @@ -3071,7 +3057,6 @@ share_mount_one(zfs_handle_t *zhp, int op, int flags, char *protocol, sizeof (shareopts), NULL, NULL, 0, B_FALSE) == 0); verify(zfs_prop_get(zhp, ZFS_PROP_SHARESMB, smbshareopts, sizeof (smbshareopts), NULL, NULL, 0, B_FALSE) == 0); - canmount = zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT); if (op == OP_SHARE && strcmp(shareopts, "off") == 0 && strcmp(smbshareopts, "off") == 0) { @@ -3081,7 +3066,8 @@ share_mount_one(zfs_handle_t *zhp, int op, int flags, char *protocol, (void) fprintf(stderr, gettext("cannot share '%s': " "legacy share\n"), zfs_get_name(zhp)); (void) fprintf(stderr, gettext("use share(1M) to " - "share this filesystem\n")); + "share this filesystem, or set " + "sharenfs property on\n")); return (1); } @@ -3119,6 +3105,7 @@ share_mount_one(zfs_handle_t *zhp, int op, int flags, char *protocol, * noauto no return 0 * noauto yes pass through */ + canmount = zfs_prop_get_int(zhp, ZFS_PROP_CANMOUNT); if (canmount == ZFS_CANMOUNT_OFF) { if (!explicit) return (0); @@ -3945,6 +3932,15 @@ zfs_do_unshare(int argc, char **argv) return (unshare_unmount(OP_SHARE, argc, argv)); } +/* ARGSUSED */ +static int +zfs_do_python(int argc, char **argv) +{ + (void) execv(pypath, argv-1); + (void) printf("internal error: %s not found\n", pypath); + return (-1); +} + /* * Called when invoked as /etc/fs/zfs/mount. Do the mount if the mountpoint is * 'legacy'. Otherwise, complain that use should be using 'zfs mount'. @@ -4080,27 +4076,6 @@ manual_unmount(int argc, char **argv) return (unshare_unmount_path(OP_MOUNT, argv[0], flags, B_TRUE)); } -static int -volcheck(zpool_handle_t *zhp, void *data) -{ - boolean_t isinit = *((boolean_t *)data); - - if (isinit) - return (zpool_create_zvol_links(zhp)); - else - return (zpool_remove_zvol_links(zhp)); -} - -/* - * Iterate over all pools in the system and either create or destroy /dev/zvol - * links, depending on the value of 'isinit'. - */ -static int -do_volcheck(boolean_t isinit) -{ - return (zpool_iter(g_zfs, volcheck, &isinit) ? 1 : 0); -} - static int find_command_idx(char *command, int *idx) { @@ -4186,18 +4161,10 @@ main(int argc, char **argv) if (strcmp(cmdname, "-?") == 0) usage(B_TRUE); - /* - * 'volinit' and 'volfini' do not appear in the usage message, - * so we have to special case them here. - */ - if (strcmp(cmdname, "volinit") == 0) - return (do_volcheck(B_TRUE)); - else if (strcmp(cmdname, "volfini") == 0) - return (do_volcheck(B_FALSE)); - /* * Run the appropriate command. 
*/ + libzfs_mnttab_cache(g_zfs, B_TRUE); if (find_command_idx(cmdname, &i) == 0) { current_command = &command_table[i]; ret = command_table[i].func(argc - 1, argv + 1); @@ -4210,6 +4177,7 @@ main(int argc, char **argv) "command '%s'\n"), cmdname); usage(B_FALSE); } + libzfs_mnttab_cache(g_zfs, B_FALSE); } (void) fclose(mnttab_file); diff --git a/external/cddl/osnet/dist/cmd/zpool/zpool_main.c b/external/cddl/osnet/dist/cmd/zpool/zpool_main.c index 54bba8645c669..96fba62d0f886 100644 --- a/external/cddl/osnet/dist/cmd/zpool/zpool_main.c +++ b/external/cddl/osnet/dist/cmd/zpool/zpool_main.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -50,6 +50,8 @@ #include "zpool_util.h" #include "zfs_comutil.h" +#include "statcommon.h" + static int zpool_do_create(int, char **); static int zpool_do_destroy(int, char **); @@ -67,6 +69,7 @@ static int zpool_do_clear(int, char **); static int zpool_do_attach(int, char **); static int zpool_do_detach(int, char **); static int zpool_do_replace(int, char **); +static int zpool_do_split(int, char **); static int zpool_do_scrub(int, char **); @@ -119,7 +122,8 @@ typedef enum { HELP_STATUS, HELP_UPGRADE, HELP_GET, - HELP_SET + HELP_SET, + HELP_SPLIT } zpool_help_t; @@ -156,6 +160,7 @@ static zpool_command_t command_table[] = { { "attach", zpool_do_attach, HELP_ATTACH }, { "detach", zpool_do_detach, HELP_DETACH }, { "replace", zpool_do_replace, HELP_REPLACE }, + { "split", zpool_do_split, HELP_SPLIT }, { NULL }, { "scrub", zpool_do_scrub, HELP_SCRUB }, { NULL }, @@ -173,6 +178,8 @@ static zpool_command_t command_table[] = { zpool_command_t *current_command; static char history_str[HIS_MAX_RECORD_LEN]; +static uint_t timestamp_fmt = NODATE; + static const char * get_usage(zpool_help_t idx) { switch (idx) { @@ -182,7 +189,7 @@ get_usage(zpool_help_t idx) { return (gettext("\tattach [-f] " "\n")); case HELP_CLEAR: - return (gettext("\tclear [device]\n")); + return (gettext("\tclear [-nF] [device]\n")); case HELP_CREATE: return (gettext("\tcreate [-fn] [-o property=value] ... \n" "\t [-O file-system-property=value] ... \n" @@ -197,13 +204,14 @@ get_usage(zpool_help_t idx) { return (gettext("\thistory [-il] [] ...\n")); case HELP_IMPORT: return (gettext("\timport [-d dir] [-D]\n" + "\timport [-d dir | -c cachefile] [-n] -F \n" "\timport [-o mntopts] [-o property=value] ... \n" "\t [-d dir | -c cachefile] [-D] [-f] [-R root] -a\n" "\timport [-o mntopts] [-o property=value] ... \n" "\t [-d dir | -c cachefile] [-D] [-f] [-R root] " " [newpool]\n")); case HELP_IOSTAT: - return (gettext("\tiostat [-v] [pool] ... [interval " + return (gettext("\tiostat [-v] [-T d|u] [pool] ... 
[interval " "[count]]\n")); case HELP_LIST: return (gettext("\tlist [-H] [-o property[,...]] " @@ -230,6 +238,10 @@ get_usage(zpool_help_t idx) { " ...\n")); case HELP_SET: return (gettext("\tset \n")); + case HELP_SPLIT: + return (gettext("\tsplit [-n] [-R altroot] [-o mntopts]\n" + "\t [-o property=value] " + "[ ...]\n")); } abort(); @@ -245,12 +257,12 @@ print_prop_cb(int prop, void *cb) { FILE *fp = cb; - (void) fprintf(fp, "\t%-13s ", zpool_prop_to_name(prop)); + (void) fprintf(fp, "\t%-15s ", zpool_prop_to_name(prop)); if (zpool_prop_readonly(prop)) (void) fprintf(fp, " NO "); else - (void) fprintf(fp, " YES "); + (void) fprintf(fp, " YES "); if (zpool_prop_values(prop) == NULL) (void) fprintf(fp, "-\n"); @@ -297,7 +309,7 @@ usage(boolean_t requested) (void) fprintf(fp, gettext("\nthe following properties are supported:\n")); - (void) fprintf(fp, "\n\t%-13s %s %s\n\n", + (void) fprintf(fp, "\n\t%-15s %s %s\n\n", "PROPERTY", "EDIT", "VALUES"); /* Iterate over all properties */ @@ -339,7 +351,7 @@ print_vdev_tree(zpool_handle_t *zhp, const char *name, nvlist_t *nv, int indent, if ((is_log && !print_logs) || (!is_log && print_logs)) continue; - vname = zpool_vdev_name(g_zfs, zhp, child[c]); + vname = zpool_vdev_name(g_zfs, zhp, child[c], B_FALSE); print_vdev_tree(zhp, vname, child[c], indent + 2, B_FALSE); free(vname); @@ -376,12 +388,11 @@ add_prop_list(const char *propname, char *propval, nvlist_t **props, } normnm = zpool_prop_to_name(prop); } else { - if ((fprop = zfs_name_to_prop(propname)) == ZPROP_INVAL) { - (void) fprintf(stderr, gettext("property '%s' is " - "not a valid file system property\n"), propname); - return (2); + if ((fprop = zfs_name_to_prop(propname)) != ZPROP_INVAL) { + normnm = zfs_prop_to_name(fprop); + } else { + normnm = propname; } - normnm = zfs_prop_to_name(fprop); } if (nvlist_lookup_string(proplist, normnm, &strval) == 0 && @@ -877,17 +888,21 @@ int zpool_do_export(int argc, char **argv) { boolean_t force = B_FALSE; + boolean_t hardforce = B_FALSE; int c; zpool_handle_t *zhp; int ret; int i; /* check options */ - while ((c = getopt(argc, argv, "f")) != -1) { + while ((c = getopt(argc, argv, "fF")) != -1) { switch (c) { case 'f': force = B_TRUE; break; + case 'F': + hardforce = B_TRUE; + break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); @@ -917,8 +932,12 @@ zpool_do_export(int argc, char **argv) continue; } - if (zpool_export(zhp, force) != 0) + if (hardforce) { + if (zpool_export_force(zhp) != 0) + ret = 1; + } else if (zpool_export(zhp, force) != 0) { ret = 1; + } zpool_close(zhp); } @@ -933,7 +952,7 @@ zpool_do_export(int argc, char **argv) static int max_width(zpool_handle_t *zhp, nvlist_t *nv, int depth, int max) { - char *name = zpool_vdev_name(g_zfs, zhp, nv); + char *name = zpool_vdev_name(g_zfs, zhp, nv, B_TRUE); nvlist_t **child; uint_t c, children; int ret; @@ -971,14 +990,199 @@ max_width(zpool_handle_t *zhp, nvlist_t *nv, int depth, int max) return (max); } +typedef struct spare_cbdata { + uint64_t cb_guid; + zpool_handle_t *cb_zhp; +} spare_cbdata_t; + +static boolean_t +find_vdev(nvlist_t *nv, uint64_t search) +{ + uint64_t guid; + nvlist_t **child; + uint_t c, children; + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0 && + search == guid) + return (B_TRUE); + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) == 0) { + for (c = 0; c < children; c++) + if (find_vdev(child[c], search)) + return (B_TRUE); + } + + return (B_FALSE); +} + +static int 
+find_spare(zpool_handle_t *zhp, void *data) +{ + spare_cbdata_t *cbp = data; + nvlist_t *config, *nvroot; + + config = zpool_get_config(zhp, NULL); + verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0); + + if (find_vdev(nvroot, cbp->cb_guid)) { + cbp->cb_zhp = zhp; + return (1); + } + + zpool_close(zhp); + return (0); +} + +/* + * Print out configuration state as requested by status_callback. + */ +void +print_status_config(zpool_handle_t *zhp, const char *name, nvlist_t *nv, + int namewidth, int depth, boolean_t isspare) +{ + nvlist_t **child; + uint_t c, children; + vdev_stat_t *vs; + char rbuf[6], wbuf[6], cbuf[6], repaired[7]; + char *vname; + uint64_t notpresent; + spare_cbdata_t cb; + char *state; + + verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_STATS, + (uint64_t **)&vs, &c) == 0); + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children) != 0) + children = 0; + + state = zpool_state_to_name(vs->vs_state, vs->vs_aux); + if (isspare) { + /* + * For hot spares, we use the terms 'INUSE' and 'AVAILABLE' for + * online drives. + */ + if (vs->vs_aux == VDEV_AUX_SPARED) + state = "INUSE"; + else if (vs->vs_state == VDEV_STATE_HEALTHY) + state = "AVAIL"; + } + + (void) printf("\t%*s%-*s %-8s", depth, "", namewidth - depth, + name, state); + + if (!isspare) { + zfs_nicenum(vs->vs_read_errors, rbuf, sizeof (rbuf)); + zfs_nicenum(vs->vs_write_errors, wbuf, sizeof (wbuf)); + zfs_nicenum(vs->vs_checksum_errors, cbuf, sizeof (cbuf)); + (void) printf(" %5s %5s %5s", rbuf, wbuf, cbuf); + } + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, + &notpresent) == 0) { + char *path; + verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0); + (void) printf(" was %s", path); + } else if (vs->vs_aux != 0) { + (void) printf(" "); + + switch (vs->vs_aux) { + case VDEV_AUX_OPEN_FAILED: + (void) printf(gettext("cannot open")); + break; + + case VDEV_AUX_BAD_GUID_SUM: + (void) printf(gettext("missing device")); + break; + + case VDEV_AUX_NO_REPLICAS: + (void) printf(gettext("insufficient replicas")); + break; + + case VDEV_AUX_VERSION_NEWER: + (void) printf(gettext("newer version")); + break; + + case VDEV_AUX_SPARED: + verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, + &cb.cb_guid) == 0); + if (zpool_iter(g_zfs, find_spare, &cb) == 1) { + if (strcmp(zpool_get_name(cb.cb_zhp), + zpool_get_name(zhp)) == 0) + (void) printf(gettext("currently in " + "use")); + else + (void) printf(gettext("in use by " + "pool '%s'"), + zpool_get_name(cb.cb_zhp)); + zpool_close(cb.cb_zhp); + } else { + (void) printf(gettext("currently in use")); + } + break; + + case VDEV_AUX_ERR_EXCEEDED: + (void) printf(gettext("too many errors")); + break; + + case VDEV_AUX_IO_FAILURE: + (void) printf(gettext("experienced I/O failures")); + break; + + case VDEV_AUX_BAD_LOG: + (void) printf(gettext("bad intent log")); + break; + + case VDEV_AUX_EXTERNAL: + (void) printf(gettext("external device fault")); + break; + + case VDEV_AUX_SPLIT_POOL: + (void) printf(gettext("split into new pool")); + break; + + default: + (void) printf(gettext("corrupted data")); + break; + } + } else if (vs->vs_scrub_repaired != 0 && children == 0) { + /* + * Report bytes resilvered/repaired on leaf devices. + */ + zfs_nicenum(vs->vs_scrub_repaired, repaired, sizeof (repaired)); + (void) printf(gettext(" %s %s"), repaired, + (vs->vs_scrub_type == POOL_SCRUB_RESILVER) ?
+ "resilvered" : "repaired"); + } + + (void) printf("\n"); + + for (c = 0; c < children; c++) { + uint64_t islog = B_FALSE, ishole = B_FALSE; + + /* Don't print logs or holes here */ + (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, + &islog); + (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, + &ishole); + if (islog || ishole) + continue; + vname = zpool_vdev_name(g_zfs, zhp, child[c], B_TRUE); + print_status_config(zhp, vname, child[c], + namewidth, depth + 2, isspare); + free(vname); + } +} + /* * Print the configuration of an exported pool. Iterate over all vdevs in the * pool, printing out the name and status for each one. */ void -print_import_config(const char *name, nvlist_t *nv, int namewidth, int depth, - boolean_t print_logs) +print_import_config(const char *name, nvlist_t *nv, int namewidth, int depth) { nvlist_t **child; uint_t c, children; @@ -986,7 +1190,8 @@ print_import_config(const char *name, nvlist_t *nv, int namewidth, int depth, char *type, *vname; verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) == 0); - if (strcmp(type, VDEV_TYPE_MISSING) == 0) + if (strcmp(type, VDEV_TYPE_MISSING) == 0 || + strcmp(type, VDEV_TYPE_HOLE) == 0) return; verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_STATS, @@ -1035,12 +1240,11 @@ print_import_config(const char *name, nvlist_t *nv, int namewidth, int depth, (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, &is_log); - if ((is_log && !print_logs) || (!is_log && print_logs)) + if (is_log) continue; - vname = zpool_vdev_name(g_zfs, NULL, child[c]); - print_import_config(vname, child[c], - namewidth, depth + 2, B_FALSE); + vname = zpool_vdev_name(g_zfs, NULL, child[c], B_TRUE); + print_import_config(vname, child[c], namewidth, depth + 2); free(vname); } @@ -1048,7 +1252,7 @@ print_import_config(const char *name, nvlist_t *nv, int namewidth, int depth, &child, &children) == 0) { (void) printf(gettext("\tcache\n")); for (c = 0; c < children; c++) { - vname = zpool_vdev_name(g_zfs, NULL, child[c]); + vname = zpool_vdev_name(g_zfs, NULL, child[c], B_FALSE); (void) printf("\t %s\n", vname); free(vname); } @@ -1058,13 +1262,51 @@ print_import_config(const char *name, nvlist_t *nv, int namewidth, int depth, &child, &children) == 0) { (void) printf(gettext("\tspares\n")); for (c = 0; c < children; c++) { - vname = zpool_vdev_name(g_zfs, NULL, child[c]); + vname = zpool_vdev_name(g_zfs, NULL, child[c], B_FALSE); (void) printf("\t %s\n", vname); free(vname); } } } +/* + * Print log vdevs. + * Logs are recorded as top level vdevs in the main pool child array + * but with "is_log" set to 1. 
We use either print_status_config() or + * print_import_config() to print the top level logs then any log + * children (eg mirrored slogs) are printed recursively - which + * works because only the top level vdev is marked "is_log" + */ +static void +print_logs(zpool_handle_t *zhp, nvlist_t *nv, int namewidth, boolean_t verbose) +{ + uint_t c, children; + nvlist_t **child; + + if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, + &children) != 0) + return; + + (void) printf(gettext("\tlogs\n")); + + for (c = 0; c < children; c++) { + uint64_t is_log = B_FALSE; + char *name; + + (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, + &is_log); + if (!is_log) + continue; + name = zpool_vdev_name(g_zfs, zhp, child[c], B_TRUE); + if (verbose) + print_status_config(zhp, name, child[c], namewidth, + 2, B_FALSE); + else + print_import_config(name, child[c], namewidth, 2); + free(name); + } +} + /* * Display the status for the given pool. */ @@ -1233,11 +1475,9 @@ show_import(nvlist_t *config) if (namewidth < 10) namewidth = 10; - print_import_config(name, nvroot, namewidth, 0, B_FALSE); - if (num_logs(nvroot) > 0) { - (void) printf(gettext("\tlogs\n")); - print_import_config(name, nvroot, namewidth, 0, B_TRUE); - } + print_import_config(name, nvroot, namewidth, 0); + if (num_logs(nvroot) > 0) + print_logs(NULL, nvroot, namewidth, B_FALSE); if (reason == ZPOOL_STATUS_BAD_GUID_SUM) { (void) printf(gettext("\n\tAdditional devices are known to " @@ -1253,13 +1493,12 @@ show_import(nvlist_t *config) */ static int do_import(nvlist_t *config, const char *newname, const char *mntopts, - int force, nvlist_t *props, boolean_t allowfaulted) + int force, nvlist_t *props, boolean_t do_verbatim) { zpool_handle_t *zhp; char *name; uint64_t state; uint64_t version; - int error = 0; verify(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, &name) == 0); @@ -1306,22 +1545,23 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts, } } - if (zpool_import_props(g_zfs, config, newname, props, - allowfaulted) != 0) + if (zpool_import_props(g_zfs, config, newname, props, do_verbatim) != 0) return (1); if (newname != NULL) name = (char *)newname; - verify((zhp = zpool_open_canfail(g_zfs, name)) != NULL); + if ((zhp = zpool_open_canfail(g_zfs, name)) == NULL) + return (1); - if (zpool_enable_datasets(zhp, mntopts, 0) != 0) { + if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL && + zpool_enable_datasets(zhp, mntopts, 0) != 0) { zpool_close(zhp); return (1); } zpool_close(zhp); - return (error); + return (0); } /* @@ -1329,7 +1569,7 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts, * import [-o mntopts] [-o prop=value] ... [-R root] [-D] * [-d dir | -c cachefile] [-f] -a * import [-o mntopts] [-o prop=value] ... [-R root] [-D] - * [-d dir | -c cachefile] [-f] [newpool] + * [-d dir | -c cachefile] [-f] [-n] [-F] [newpool] * * -c Read pool information from a cachefile instead of searching * devices. @@ -1344,12 +1584,17 @@ do_import(nvlist_t *config, const char *newname, const char *mntopts, * the given root. The pool will remain exported when the machine * is rebooted. * - * -f Force import, even if it appears that the pool is active. - * - * -F Import even in the presence of faulted vdevs. This is an + * -V Import even in the presence of faulted vdevs. This is an * intentionally undocumented option for testing purposes, and * treats the pool configuration as complete, leaving any bad - * vdevs in the FAULTED state. + * vdevs in the FAULTED state. 
In other words, it does verbatim + * import. + * + * -f Force import, even if it appears that the pool is active. + * + * -F Attempt rewind if necessary. + * + * -n See if rewind would work, but don't actually rewind. * * -a Import all pools found. * @@ -1364,7 +1609,7 @@ zpool_do_import(int argc, char **argv) char **searchdirs = NULL; int nsearch = 0; int c; - int err; + int err = 0; nvlist_t *pools = NULL; boolean_t do_all = B_FALSE; boolean_t do_destroyed = B_FALSE; @@ -1376,14 +1621,20 @@ zpool_do_import(int argc, char **argv) char *searchname = NULL; char *propval; nvlist_t *found_config; + nvlist_t *policy = NULL; nvlist_t *props = NULL; boolean_t first; - boolean_t allow_faulted = B_FALSE; + boolean_t do_verbatim = B_FALSE; + uint32_t rewind_policy = ZPOOL_NO_REWIND; + boolean_t dryrun = B_FALSE; + boolean_t do_rewind = B_FALSE; + boolean_t xtreme_rewind = B_FALSE; uint64_t pool_state; char *cachefile = NULL; + importargs_t idata = { 0 }; /* check options */ - while ((c = getopt(argc, argv, ":ac:d:DfFo:p:R:")) != -1) { + while ((c = getopt(argc, argv, ":aCc:d:DEfFno:rR:VX")) != -1) { switch (c) { case 'a': do_all = B_TRUE; @@ -1411,7 +1662,10 @@ zpool_do_import(int argc, char **argv) do_force = B_TRUE; break; case 'F': - allow_faulted = B_TRUE; + do_rewind = B_TRUE; + break; + case 'n': + dryrun = B_TRUE; break; case 'o': if ((propval = strchr(optarg, '=')) != NULL) { @@ -1436,6 +1690,12 @@ zpool_do_import(int argc, char **argv) ZPOOL_PROP_CACHEFILE), "none", &props, B_TRUE)) goto error; break; + case 'V': + do_verbatim = B_TRUE; + break; + case 'X': + xtreme_rewind = B_TRUE; + break; case ':': (void) fprintf(stderr, gettext("missing argument for " "'%c' option\n"), optopt); @@ -1456,6 +1716,23 @@ zpool_do_import(int argc, char **argv) usage(B_FALSE); } + if ((dryrun || xtreme_rewind) && !do_rewind) { + (void) fprintf(stderr, + gettext("-n or -X only meaningful with -F\n")); + usage(B_FALSE); + } + if (dryrun) + rewind_policy = ZPOOL_TRY_REWIND; + else if (do_rewind) + rewind_policy = ZPOOL_DO_REWIND; + if (xtreme_rewind) + rewind_policy |= ZPOOL_EXTREME_REWIND; + + /* In the future, we can capture further policy and include it here */ + if (nvlist_alloc(&policy, NV_UNIQUE_NAME, 0) != 0 || + nvlist_add_uint32(policy, ZPOOL_REWIND_REQUEST, rewind_policy) != 0) + goto error; + if (searchdirs == NULL) { searchdirs = safe_malloc(sizeof (char *)); searchdirs[0] = "/dev/dsk"; @@ -1483,6 +1760,7 @@ zpool_do_import(int argc, char **argv) (void) fprintf(stderr, gettext("cannot " "discover pools: permission denied\n")); free(searchdirs); + nvlist_free(policy); return (1); } } @@ -1508,28 +1786,49 @@ zpool_do_import(int argc, char **argv) if (errno != 0 || *endptr != '\0') searchname = argv[0]; found_config = NULL; - } - if (cachefile) { - pools = zpool_find_import_cached(g_zfs, cachefile, searchname, - searchguid); - } else if (searchname != NULL) { - pools = zpool_find_import_byname(g_zfs, nsearch, searchdirs, - searchname); - } else { /* - * It's OK to search by guid even if searchguid is 0. + * User specified a name or guid. Ensure it's unique. 
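Editorial note: as the surrounding import code shows, a bare argument is treated as a pool GUID only when it parses completely as an unsigned integer; anything else is taken as a pool name. A small self-contained sketch of that test follows; looks_like_guid() is a hypothetical helper and the sample values are made up for illustration.

#include <stdio.h>
#include <stdlib.h>
#include <errno.h>

static int
looks_like_guid(const char *arg, unsigned long long *guid)
{
	char *endptr;

	errno = 0;
	*guid = strtoull(arg, &endptr, 10);
	/* Reject empty strings, partial parses ("123abc"), and overflow. */
	return (errno == 0 && endptr != arg && *endptr == '\0');
}

int
main(void)
{
	const char *args[] = { "tank", "9675344459001849909" };
	unsigned long long guid;

	for (int i = 0; i < 2; i++) {
		if (looks_like_guid(args[i], &guid))
			(void) printf("%s -> search by guid %llu\n",
			    args[i], guid);
		else
			(void) printf("%s -> search by name\n", args[i]);
	}
	return (0);
}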
*/ - pools = zpool_find_import_byguid(g_zfs, nsearch, searchdirs, - searchguid); - } - - if (pools == NULL) { + idata.unique = B_TRUE; + } + + + idata.path = searchdirs; + idata.paths = nsearch; + idata.poolname = searchname; + idata.guid = searchguid; + idata.cachefile = cachefile; + + pools = zpool_search_import(g_zfs, &idata); + + if (pools != NULL && idata.exists && + (argc == 1 || strcmp(argv[0], argv[1]) == 0)) { + (void) fprintf(stderr, gettext("cannot import '%s': " + "a pool with that name already exists\n"), + argv[0]); + (void) fprintf(stderr, gettext("use the form '%s " + " ' to give it a new name\n"), + "zpool import"); + err = 1; + } else if (pools == NULL && idata.exists) { + (void) fprintf(stderr, gettext("cannot import '%s': " + "a pool with that name is already created/imported,\n"), + argv[0]); + (void) fprintf(stderr, gettext("and no additional pools " + "with that name were found\n")); + err = 1; + } else if (pools == NULL) { if (argc != 0) { (void) fprintf(stderr, gettext("cannot import '%s': " "no such pool available\n"), argv[0]); } + err = 1; + } + + if (err == 1) { free(searchdirs); + nvlist_free(policy); return (1); } @@ -1553,17 +1852,21 @@ zpool_do_import(int argc, char **argv) if (do_destroyed && pool_state != POOL_STATE_DESTROYED) continue; + verify(nvlist_add_nvlist(config, ZPOOL_REWIND_POLICY, + policy) == 0); + if (argc == 0) { if (first) first = B_FALSE; else if (!do_all) (void) printf("\n"); - if (do_all) + if (do_all) { err |= do_import(config, NULL, mntopts, - do_force, props, allow_faulted); - else + do_force, props, do_verbatim); + } else { show_import(config); + } } else if (searchname != NULL) { char *name; @@ -1609,7 +1912,7 @@ zpool_do_import(int argc, char **argv) err = B_TRUE; } else { err |= do_import(found_config, argc == 1 ? NULL : - argv[1], mntopts, do_force, props, allow_faulted); + argv[1], mntopts, do_force, props, do_verbatim); } } @@ -1624,6 +1927,7 @@ zpool_do_import(int argc, char **argv) error: nvlist_free(props); nvlist_free(pools); + nvlist_free(policy); free(searchdirs); return (err ? 1 : 0); @@ -1651,7 +1955,7 @@ print_iostat_header(iostat_cbdata_t *cb) { (void) printf("%*s capacity operations bandwidth\n", cb->cb_namewidth, ""); - (void) printf("%-*s used avail read write read write\n", + (void) printf("%-*s alloc free read write read write\n", cb->cb_namewidth, "pool"); print_iostat_separator(cb); } @@ -1742,7 +2046,7 @@ print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv, return; for (c = 0; c < children; c++) { - vname = zpool_vdev_name(g_zfs, zhp, newchild[c]); + vname = zpool_vdev_name(g_zfs, zhp, newchild[c], B_FALSE); print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL, newchild[c], cb, depth + 2); free(vname); @@ -1763,7 +2067,8 @@ print_vdev_stats(zpool_handle_t *zhp, const char *name, nvlist_t *oldnv, (void) printf("%-*s - - - - - " "-\n", cb->cb_namewidth, "cache"); for (c = 0; c < children; c++) { - vname = zpool_vdev_name(g_zfs, zhp, newchild[c]); + vname = zpool_vdev_name(g_zfs, zhp, newchild[c], + B_FALSE); print_vdev_stats(zhp, vname, oldnv ? oldchild[c] : NULL, newchild[c], cb, depth + 2); free(vname); @@ -1852,8 +2157,9 @@ get_namewidth(zpool_handle_t *zhp, void *data) } /* - * zpool iostat [-v] [pool] ... [interval [count]] + * zpool iostat [-T d|u] [-v] [pool] ... 
[interval [count]] * + * -T Display a timestamp in date(1) or Unix format * -v Display statistics for individual vdevs * * This command can be tricky because we want to be able to deal with pool @@ -1874,8 +2180,20 @@ zpool_do_iostat(int argc, char **argv) iostat_cbdata_t cb; /* check options */ - while ((c = getopt(argc, argv, "v")) != -1) { + while ((c = getopt(argc, argv, "T:v")) != -1) { switch (c) { + case 'T': + if (optarg) { + if (*optarg == 'u') + timestamp_fmt = UDATE; + else if (*optarg == 'd') + timestamp_fmt = DDATE; + else + usage(B_FALSE); + } else { + usage(B_FALSE); + } + break; case 'v': verbose = B_TRUE; break; @@ -1992,6 +2310,9 @@ zpool_do_iostat(int argc, char **argv) cb.cb_namewidth = 0; (void) pool_list_iter(list, B_FALSE, get_namewidth, &cb); + if (timestamp_fmt != NODATE) + print_timestamp(timestamp_fmt); + /* * If it's the first time, or verbose mode, print the header. */ @@ -2148,7 +2469,7 @@ list_callback(zpool_handle_t *zhp, void *data) * -H Scripted mode. Don't display headers, and separate properties * by a single tab. * -o List of properties to display. Defaults to - * "name,size,used,available,capacity,health,altroot" + * "name,size,allocated,free,capacity,health,altroot" * * List all pools in the system, whether or not they're healthy. Output space * statistics for each one, as well as health status summary. @@ -2160,7 +2481,7 @@ zpool_do_list(int argc, char **argv) int ret; list_cbdata_t cb = { 0 }; static char default_props[] = - "name,size,used,available,capacity,health,altroot"; + "name,size,allocated,free,capacity,dedupratio,health,altroot"; char *props = default_props; /* check options */ @@ -2408,20 +2729,164 @@ zpool_do_detach(int argc, char **argv) } /* - * zpool online ... + * zpool split [-n] [-o prop=val] ... + * [-o mntopt] ... + * [-R altroot] [ ...] + * + * -n Do not split the pool, but display the resulting layout if + * it were to be split. + * -o Set property=value, or set mount options. + * -R Mount the split-off pool under an alternate root. + * + * Splits the named pool and gives it the new pool name. Devices to be split + * off may be listed, provided that no more than one device is specified + * per top-level vdev mirror. The newly split pool is left in an exported + * state unless -R is specified. + * + * Restrictions: the top-level of the pool must only be made up of + * mirrors; all devices in the pool must be healthy; no device may be + * undergoing a resilvering operation.
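Editorial note: the -o handling in zpool_do_split() below (and in zpool import) distinguishes "property=value" pairs from mount-option strings by whether the argument contains '='. A minimal illustration of that classification; classify_opt() is a hypothetical helper, not part of the patch.

#include <stdio.h>
#include <string.h>

static void
classify_opt(char *arg)
{
	char *propval = strchr(arg, '=');

	if (propval != NULL) {
		*propval++ = '\0';	/* terminate the name, step to value */
		(void) printf("property '%s' = '%s'\n", arg, propval);
	} else {
		(void) printf("mount options '%s'\n", arg);
	}
}

int
main(void)
{
	char a[] = "altroot=/mnt";
	char b[] = "ro,noatime";

	classify_opt(a);	/* treated as a property assignment */
	classify_opt(b);	/* treated as mount options */
	return (0);
}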
*/ int -zpool_do_online(int argc, char **argv) +zpool_do_split(int argc, char **argv) { - int c, i; - char *poolname; + char *srcpool, *newpool, *propval; + char *mntopts = NULL; + splitflags_t flags; + int c, ret = 0; zpool_handle_t *zhp; - int ret = 0; - vdev_state_t newstate; + nvlist_t *config, *props = NULL; + + flags.dryrun = B_FALSE; + flags.import = B_FALSE; /* check options */ - while ((c = getopt(argc, argv, "t")) != -1) { + while ((c = getopt(argc, argv, ":R:no:")) != -1) { switch (c) { + case 'R': + flags.import = B_TRUE; + if (add_prop_list( + zpool_prop_to_name(ZPOOL_PROP_ALTROOT), optarg, + &props, B_TRUE) != 0) { + if (props) + nvlist_free(props); + usage(B_FALSE); + } + break; + case 'n': + flags.dryrun = B_TRUE; + break; + case 'o': + if ((propval = strchr(optarg, '=')) != NULL) { + *propval = '\0'; + propval++; + if (add_prop_list(optarg, propval, + &props, B_TRUE) != 0) { + if (props) + nvlist_free(props); + usage(B_FALSE); + } + } else { + mntopts = optarg; + } + break; + case ':': + (void) fprintf(stderr, gettext("missing argument for " + "'%c' option\n"), optopt); + usage(B_FALSE); + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + break; + } + } + + if (!flags.import && mntopts != NULL) { + (void) fprintf(stderr, gettext("setting mntopts is only " + "valid when importing the pool\n")); + usage(B_FALSE); + } + + argc -= optind; + argv += optind; + + if (argc < 1) { + (void) fprintf(stderr, gettext("Missing pool name\n")); + usage(B_FALSE); + } + if (argc < 2) { + (void) fprintf(stderr, gettext("Missing new pool name\n")); + usage(B_FALSE); + } + + srcpool = argv[0]; + newpool = argv[1]; + + argc -= 2; + argv += 2; + + if ((zhp = zpool_open(g_zfs, srcpool)) == NULL) + return (1); + + config = split_mirror_vdev(zhp, newpool, props, flags, argc, argv); + if (config == NULL) { + ret = 1; + } else { + if (flags.dryrun) { + (void) printf(gettext("would create '%s' with the " + "following layout:\n\n"), newpool); + print_vdev_tree(NULL, newpool, config, 0, B_FALSE); + } + nvlist_free(config); + } + + zpool_close(zhp); + + if (ret != 0 || flags.dryrun || !flags.import) + return (ret); + + /* + * The split was successful. Now we need to open the new + * pool and import it. + */ + if ((zhp = zpool_open_canfail(g_zfs, newpool)) == NULL) + return (1); + if (zpool_get_state(zhp) != POOL_STATE_UNAVAIL && + zpool_enable_datasets(zhp, mntopts, 0) != 0) { + ret = 1; + (void) fprintf(stderr, gettext("Split was successful, but " + "the datasets could not all be mounted\n")); + (void) fprintf(stderr, gettext("Try doing '%s' with a " + "different altroot\n"), "zpool import"); + } + zpool_close(zhp); + + return (ret); +} + + + +/* + * zpool online ...
+ */ +int +zpool_do_online(int argc, char **argv) +{ + int c, i; + char *poolname; + zpool_handle_t *zhp; + int ret = 0; + vdev_state_t newstate; + int flags = 0; + + /* check options */ + while ((c = getopt(argc, argv, "et")) != -1) { + switch (c) { + case 'e': + flags |= ZFS_ONLINE_EXPAND; + break; case 't': case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), @@ -2449,7 +2914,7 @@ zpool_do_online(int argc, char **argv) return (1); for (i = 1; i < argc; i++) { - if (zpool_vdev_online(zhp, argv[i], 0, &newstate) == 0) { + if (zpool_vdev_online(zhp, argv[i], flags, &newstate) == 0) { if (newstate != VDEV_STATE_HEALTHY) { (void) printf(gettext("warning: device '%s' " "onlined, but remains in faulted state\n"), @@ -2543,31 +3008,80 @@ zpool_do_offline(int argc, char **argv) int zpool_do_clear(int argc, char **argv) { + int c; int ret = 0; + boolean_t dryrun = B_FALSE; + boolean_t do_rewind = B_FALSE; + boolean_t xtreme_rewind = B_FALSE; + uint32_t rewind_policy = ZPOOL_NO_REWIND; + nvlist_t *policy = NULL; zpool_handle_t *zhp; char *pool, *device; - if (argc < 2) { + /* check options */ + while ((c = getopt(argc, argv, "FnX")) != -1) { + switch (c) { + case 'F': + do_rewind = B_TRUE; + break; + case 'n': + dryrun = B_TRUE; + break; + case 'X': + xtreme_rewind = B_TRUE; + break; + case '?': + (void) fprintf(stderr, gettext("invalid option '%c'\n"), + optopt); + usage(B_FALSE); + } + } + + argc -= optind; + argv += optind; + + if (argc < 1) { (void) fprintf(stderr, gettext("missing pool name\n")); usage(B_FALSE); } - if (argc > 3) { + if (argc > 2) { (void) fprintf(stderr, gettext("too many arguments\n")); usage(B_FALSE); } - pool = argv[1]; - device = argc == 3 ? argv[2] : NULL; + if ((dryrun || xtreme_rewind) && !do_rewind) { + (void) fprintf(stderr, + gettext("-n or -X only meaningful with -F\n")); + usage(B_FALSE); + } + if (dryrun) + rewind_policy = ZPOOL_TRY_REWIND; + else if (do_rewind) + rewind_policy = ZPOOL_DO_REWIND; + if (xtreme_rewind) + rewind_policy |= ZPOOL_EXTREME_REWIND; + + /* In future, further rewind policy choices can be passed along here */ + if (nvlist_alloc(&policy, NV_UNIQUE_NAME, 0) != 0 || + nvlist_add_uint32(policy, ZPOOL_REWIND_REQUEST, rewind_policy) != 0) + return (1); + + pool = argv[0]; + device = argc == 2 ? 
argv[1] : NULL; - if ((zhp = zpool_open_canfail(g_zfs, pool)) == NULL) + if ((zhp = zpool_open_canfail(g_zfs, pool)) == NULL) { + nvlist_free(policy); return (1); + } - if (zpool_clear(zhp, device) != 0) + if (zpool_clear(zhp, device, policy) != 0) ret = 1; zpool_close(zhp); + nvlist_free(policy); + return (ret); } @@ -2642,6 +3156,7 @@ typedef struct status_cbdata { boolean_t cb_verbose; boolean_t cb_explain; boolean_t cb_first; + boolean_t cb_dedup_stats; } status_cbdata_t; /* @@ -2706,181 +3221,6 @@ print_scrub_status(nvlist_t *nvroot) (u_longlong_t)(minutes_left / 60), (uint_t)(minutes_left % 60)); } -typedef struct spare_cbdata { - uint64_t cb_guid; - zpool_handle_t *cb_zhp; -} spare_cbdata_t; - -static boolean_t -find_vdev(nvlist_t *nv, uint64_t search) -{ - uint64_t guid; - nvlist_t **child; - uint_t c, children; - - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) == 0 && - search == guid) - return (B_TRUE); - - if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, - &child, &children) == 0) { - for (c = 0; c < children; c++) - if (find_vdev(child[c], search)) - return (B_TRUE); - } - - return (B_FALSE); -} - -static int -find_spare(zpool_handle_t *zhp, void *data) -{ - spare_cbdata_t *cbp = data; - nvlist_t *config, *nvroot; - - config = zpool_get_config(zhp, NULL); - verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, - &nvroot) == 0); - - if (find_vdev(nvroot, cbp->cb_guid)) { - cbp->cb_zhp = zhp; - return (1); - } - - zpool_close(zhp); - return (0); -} - -/* - * Print out configuration state as requested by status_callback. - */ -void -print_status_config(zpool_handle_t *zhp, const char *name, nvlist_t *nv, - int namewidth, int depth, boolean_t isspare, boolean_t print_logs) -{ - nvlist_t **child; - uint_t c, children; - vdev_stat_t *vs; - char rbuf[6], wbuf[6], cbuf[6], repaired[7]; - char *vname; - uint64_t notpresent; - spare_cbdata_t cb; - char *state; - - verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_STATS, - (uint64_t **)&vs, &c) == 0); - - if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, - &child, &children) != 0) - children = 0; - - state = zpool_state_to_name(vs->vs_state, vs->vs_aux); - if (isspare) { - /* - * For hot spares, we use the terms 'INUSE' and 'AVAILABLE' for - * online drives. 
- */ - if (vs->vs_aux == VDEV_AUX_SPARED) - state = "INUSE"; - else if (vs->vs_state == VDEV_STATE_HEALTHY) - state = "AVAIL"; - } - - (void) printf("\t%*s%-*s %-8s", depth, "", namewidth - depth, - name, state); - - if (!isspare) { - zfs_nicenum(vs->vs_read_errors, rbuf, sizeof (rbuf)); - zfs_nicenum(vs->vs_write_errors, wbuf, sizeof (wbuf)); - zfs_nicenum(vs->vs_checksum_errors, cbuf, sizeof (cbuf)); - (void) printf(" %5s %5s %5s", rbuf, wbuf, cbuf); - } - - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, - &notpresent) == 0) { - char *path; - verify(nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0); - (void) printf(" was %s", path); - } else if (vs->vs_aux != 0) { - (void) printf(" "); - - switch (vs->vs_aux) { - case VDEV_AUX_OPEN_FAILED: - (void) printf(gettext("cannot open")); - break; - - case VDEV_AUX_BAD_GUID_SUM: - (void) printf(gettext("missing device")); - break; - - case VDEV_AUX_NO_REPLICAS: - (void) printf(gettext("insufficient replicas")); - break; - - case VDEV_AUX_VERSION_NEWER: - (void) printf(gettext("newer version")); - break; - - case VDEV_AUX_SPARED: - verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, - &cb.cb_guid) == 0); - if (zpool_iter(g_zfs, find_spare, &cb) == 1) { - if (strcmp(zpool_get_name(cb.cb_zhp), - zpool_get_name(zhp)) == 0) - (void) printf(gettext("currently in " - "use")); - else - (void) printf(gettext("in use by " - "pool '%s'"), - zpool_get_name(cb.cb_zhp)); - zpool_close(cb.cb_zhp); - } else { - (void) printf(gettext("currently in use")); - } - break; - - case VDEV_AUX_ERR_EXCEEDED: - (void) printf(gettext("too many errors")); - break; - - case VDEV_AUX_IO_FAILURE: - (void) printf(gettext("experienced I/O failures")); - break; - - case VDEV_AUX_BAD_LOG: - (void) printf(gettext("bad intent log")); - break; - - default: - (void) printf(gettext("corrupted data")); - break; - } - } else if (vs->vs_scrub_repaired != 0 && children == 0) { - /* - * Report bytes resilvered/repaired on leaf devices. - */ - zfs_nicenum(vs->vs_scrub_repaired, repaired, sizeof (repaired)); - (void) printf(gettext(" %s %s"), repaired, - (vs->vs_scrub_type == POOL_SCRUB_RESILVER) ?
- "resilvered" : "repaired"); - } - - (void) printf("\n"); - - for (c = 0; c < children; c++) { - uint64_t is_log = B_FALSE; - - (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, - &is_log); - if ((is_log && !print_logs) || (!is_log && print_logs)) - continue; - vname = zpool_vdev_name(g_zfs, zhp, child[c]); - print_status_config(zhp, vname, child[c], - namewidth, depth + 2, isspare, B_FALSE); - free(vname); - } -} - static void print_error_log(zpool_handle_t *zhp) { @@ -2929,9 +3269,9 @@ print_spares(zpool_handle_t *zhp, nvlist_t **spares, uint_t nspares, (void) printf(gettext("\tspares\n")); for (i = 0; i < nspares; i++) { - name = zpool_vdev_name(g_zfs, zhp, spares[i]); + name = zpool_vdev_name(g_zfs, zhp, spares[i], B_FALSE); print_status_config(zhp, name, spares[i], - namewidth, 2, B_TRUE, B_FALSE); + namewidth, 2, B_TRUE); free(name); } } @@ -2949,13 +3289,43 @@ print_l2cache(zpool_handle_t *zhp, nvlist_t **l2cache, uint_t nl2cache, (void) printf(gettext("\tcache\n")); for (i = 0; i < nl2cache; i++) { - name = zpool_vdev_name(g_zfs, zhp, l2cache[i]); + name = zpool_vdev_name(g_zfs, zhp, l2cache[i], B_FALSE); print_status_config(zhp, name, l2cache[i], - namewidth, 2, B_FALSE, B_FALSE); + namewidth, 2, B_FALSE); free(name); } } +static void +print_dedup_stats(nvlist_t *config) +{ + ddt_histogram_t *ddh; + ddt_stat_t *dds; + ddt_object_t *ddo; + uint_t c; + + /* + * If the pool was faulted then we may not have been able to + * obtain the config. Otherwise, if have anything in the dedup + * table continue processing the stats. + */ + if (nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_OBJ_STATS, + (uint64_t **)&ddo, &c) != 0 || ddo->ddo_count == 0) + return; + + (void) printf("\n"); + (void) printf("DDT entries %llu, size %llu on disk, %llu in core\n", + (u_longlong_t)ddo->ddo_count, + (u_longlong_t)ddo->ddo_dspace, + (u_longlong_t)ddo->ddo_mspace); + + verify(nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_STATS, + (uint64_t **)&dds, &c) == 0); + verify(nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_DDT_HISTOGRAM, + (uint64_t **)&ddh, &c) == 0); + zpool_dump_ddt(dds, ddh); +} + /* * Display a summary of pool status. Displays a summary such as: * @@ -3046,8 +3416,8 @@ status_callback(zpool_handle_t *zhp, void *data) "be used because the label is missing \n\tor invalid. " "There are insufficient replicas for the pool to " "continue\n\tfunctioning.\n")); - (void) printf(gettext("action: Destroy and re-create the pool " - "from a backup source.\n")); + zpool_explain_recover(zpool_get_handle(zhp), + zpool_get_name(zhp), reason, config); break; case ZPOOL_STATUS_FAILING_DEV: @@ -3071,6 +3441,17 @@ status_callback(zpool_handle_t *zhp, void *data) "replace'.\n")); break; + case ZPOOL_STATUS_REMOVED_DEV: + (void) printf(gettext("status: One or more devices has " + "been removed by the administrator.\n\tSufficient " + "replicas exist for the pool to continue functioning in " + "a\n\tdegraded state.\n")); + (void) printf(gettext("action: Online the device using " + "'zpool online' or replace the device with\n\t'zpool " + "replace'.\n")); + break; + + case ZPOOL_STATUS_RESILVERING: (void) printf(gettext("status: One or more devices is " "currently being resilvered. 
The pool will\n\tcontinue " @@ -3091,8 +3472,8 @@ status_callback(zpool_handle_t *zhp, void *data) case ZPOOL_STATUS_CORRUPT_POOL: (void) printf(gettext("status: The pool metadata is corrupted " "and the pool cannot be opened.\n")); - (void) printf(gettext("action: Destroy and re-create the pool " - "from a backup source.\n")); + zpool_explain_recover(zpool_get_handle(zhp), + zpool_get_name(zhp), reason, config); break; case ZPOOL_STATUS_VERSION_OLDER: @@ -3181,11 +3562,10 @@ status_callback(zpool_handle_t *zhp, void *data) (void) printf(gettext("\t%-*s %-8s %5s %5s %5s\n"), namewidth, "NAME", "STATE", "READ", "WRITE", "CKSUM"); print_status_config(zhp, zpool_get_name(zhp), nvroot, - namewidth, 0, B_FALSE, B_FALSE); - if (num_logs(nvroot) > 0) - print_status_config(zhp, "logs", nvroot, namewidth, 0, - B_FALSE, B_TRUE); + namewidth, 0, B_FALSE); + if (num_logs(nvroot) > 0) + print_logs(zhp, nvroot, namewidth, B_TRUE); if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache) == 0) print_l2cache(zhp, l2cache, nl2cache, namewidth); @@ -3228,6 +3608,9 @@ status_callback(zpool_handle_t *zhp, void *data) else print_error_log(zhp); } + + if (cbp->cb_dedup_stats) + print_dedup_stats(config); } else { (void) printf(gettext("config: The configuration cannot be " "determined.\n")); @@ -3241,6 +3624,7 @@ status_callback(zpool_handle_t *zhp, void *data) * * -v Display complete error logs * -x Display only pools with potential problems + * -D Display dedup status (undocumented) * * Describes the health status of all pools or some subset. */ @@ -3252,7 +3636,7 @@ zpool_do_status(int argc, char **argv) status_cbdata_t cb = { 0 }; /* check options */ - while ((c = getopt(argc, argv, "vx")) != -1) { + while ((c = getopt(argc, argv, "vxD")) != -1) { switch (c) { case 'v': cb.cb_verbose = B_TRUE; @@ -3260,6 +3644,9 @@ zpool_do_status(int argc, char **argv) case 'x': cb.cb_explain = B_TRUE; break; + case 'D': + cb.cb_dedup_stats = B_TRUE; + break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); @@ -3409,7 +3796,7 @@ zpool_do_upgrade(int argc, char **argv) /* check options */ - while ((c = getopt(argc, argv, "avV:")) != -1) { + while ((c = getopt(argc, argv, ":avV:")) != -1) { switch (c) { case 'a': cb.cb_all = B_TRUE; @@ -3426,6 +3813,11 @@ zpool_do_upgrade(int argc, char **argv) usage(B_FALSE); } break; + case ':': + (void) fprintf(stderr, gettext("missing argument for " + "'%c' option\n"), optopt); + usage(B_FALSE); + break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); @@ -3486,9 +3878,18 @@ zpool_do_upgrade(int argc, char **argv) (void) printf(gettext(" 11 Improved scrub performance\n")); (void) printf(gettext(" 12 Snapshot properties\n")); (void) printf(gettext(" 13 snapused property\n")); - (void) printf(gettext(" 14 passthrough-x aclinherit " - "support\n")); - (void) printf(gettext("For more information on a particular " + (void) printf(gettext(" 14 passthrough-x aclinherit\n")); + (void) printf(gettext(" 15 user/group space accounting\n")); + (void) printf(gettext(" 16 stmf property support\n")); + (void) printf(gettext(" 17 Triple-parity RAID-Z\n")); + (void) printf(gettext(" 18 Snapshot user holds\n")); + (void) printf(gettext(" 19 Log device removal\n")); + (void) printf(gettext(" 20 Compression using zle " + "(zero-length encoding)\n")); + (void) printf(gettext(" 21 Deduplication\n")); + (void) printf(gettext(" 22 Received properties\n")); + (void) printf(gettext(" 23 Slim ZIL\n")); + (void) printf(gettext("\nFor 
more information on a particular " "version, including supported releases, see:\n\n")); (void) printf("http://www.opensolaris.org/os/community/zfs/" "version/N\n\n"); @@ -3534,47 +3935,6 @@ typedef struct hist_cbdata { int internal; } hist_cbdata_t; -char *hist_event_table[LOG_END] = { - "invalid event", - "pool create", - "vdev add", - "pool remove", - "pool destroy", - "pool export", - "pool import", - "vdev attach", - "vdev replace", - "vdev detach", - "vdev online", - "vdev offline", - "vdev upgrade", - "pool clear", - "pool scrub", - "pool property set", - "create", - "clone", - "destroy", - "destroy_begin_sync", - "inherit", - "property set", - "quota set", - "permission update", - "permission remove", - "permission who remove", - "promote", - "receive", - "rename", - "reservation set", - "replay_inc_sync", - "replay_full_sync", - "rollback", - "snapshot", - "filesystem version upgrade", - "refquota set", - "refreservation set", - "pool scrub done", -}; - /* * Print out the command history for a specific pool. */ @@ -3744,7 +4104,8 @@ get_callback(zpool_handle_t *zhp, void *data) continue; zprop_print_one_property(zpool_get_name(zhp), cbp, - zpool_prop_to_name(pl->pl_prop), value, srctype, NULL); + zpool_prop_to_name(pl->pl_prop), value, srctype, NULL, + NULL); } return (0); } diff --git a/external/cddl/osnet/dist/cmd/zpool/zpool_util.c b/external/cddl/osnet/dist/cmd/zpool/zpool_util.c index f44da4ff60f53..c7a002efb17cf 100644 --- a/external/cddl/osnet/dist/cmd/zpool/zpool_util.c +++ b/external/cddl/osnet/dist/cmd/zpool/zpool_util.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -50,22 +48,6 @@ safe_malloc(size_t size) return (data); } -/* - * Same as above, but for strdup() - */ -char * -safe_strdup(const char *str) -{ - char *ret; - - if ((ret = strdup(str)) == NULL) { - (void) fprintf(stderr, "internal error: out of memory\n"); - exit(1); - } - - return (ret); -} - /* * Display an out of memory error message and abort the current program. */ diff --git a/external/cddl/osnet/dist/cmd/zpool/zpool_util.h b/external/cddl/osnet/dist/cmd/zpool/zpool_util.h index e82f3202af2ab..a18b8b705fd9a 100644 --- a/external/cddl/osnet/dist/cmd/zpool/zpool_util.h +++ b/external/cddl/osnet/dist/cmd/zpool/zpool_util.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -37,7 +37,6 @@ extern "C" { * Basic utility functions */ void *safe_malloc(size_t); -char *safe_strdup(const char *); void zpool_no_memory(void); uint_t num_logs(nvlist_t *nv); @@ -47,6 +46,8 @@ uint_t num_logs(nvlist_t *nv); nvlist_t *make_root_vdev(zpool_handle_t *zhp, int force, int check_rep, boolean_t isreplace, boolean_t dryrun, int argc, char **argv); +nvlist_t *split_mirror_vdev(zpool_handle_t *zhp, char *newname, + nvlist_t *props, splitflags_t flags, int argc, char **argv); /* * Pool list functions diff --git a/external/cddl/osnet/dist/cmd/zpool/zpool_vdev.c b/external/cddl/osnet/dist/cmd/zpool/zpool_vdev.c index 10007c14927f6..3c725d232c77c 100644 --- a/external/cddl/osnet/dist/cmd/zpool/zpool_vdev.c +++ b/external/cddl/osnet/dist/cmd/zpool/zpool_vdev.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -67,6 +67,7 @@ #include #include #include +#include #include #include #include @@ -1093,20 +1094,35 @@ check_in_use(nvlist_t *config, nvlist_t *nv, int force, int isreplacing, } static const char * -is_grouping(const char *type, int *mindev) +is_grouping(const char *type, int *mindev, int *maxdev) { - if (strcmp(type, "raidz") == 0 || strcmp(type, "raidz1") == 0) { - if (mindev != NULL) - *mindev = 2; - return (VDEV_TYPE_RAIDZ); - } + if (strncmp(type, "raidz", 5) == 0) { + const char *p = type + 5; + char *end; + long nparity; + + if (*p == '\0') { + nparity = 1; + } else if (*p == '0') { + return (NULL); /* no zero prefixes allowed */ + } else { + errno = 0; + nparity = strtol(p, &end, 10); + if (errno != 0 || nparity < 1 || nparity >= 255 || + *end != '\0') + return (NULL); + } - if (strcmp(type, "raidz2") == 0) { if (mindev != NULL) - *mindev = 3; + *mindev = nparity + 1; + if (maxdev != NULL) + *maxdev = 255; return (VDEV_TYPE_RAIDZ); } + if (maxdev != NULL) + *maxdev = INT_MAX; + if (strcmp(type, "mirror") == 0) { if (mindev != NULL) *mindev = 2; @@ -1144,7 +1160,7 @@ nvlist_t * construct_spec(int argc, char **argv) { nvlist_t *nvroot, *nv, **top, **spares, **l2cache; - int t, toplevels, mindev, nspares, nlogs, nl2cache; + int t, toplevels, mindev, maxdev, nspares, nlogs, nl2cache; const char *type; uint64_t is_log; boolean_t seen_logs; @@ -1166,7 +1182,7 @@ construct_spec(int argc, char **argv) * If it's a mirror or raidz, the subsequent arguments are * its leaves -- until we encounter the next mirror or raidz. */ - if ((type = is_grouping(argv[0], &mindev)) != NULL) { + if ((type = is_grouping(argv[0], &mindev, &maxdev)) != NULL) { nvlist_t **child = NULL; int c, children = 0; @@ -1223,7 +1239,7 @@ construct_spec(int argc, char **argv) } for (c = 1; c < argc; c++) { - if (is_grouping(argv[c], NULL) != NULL) + if (is_grouping(argv[c], NULL, NULL) != NULL) break; children++; child = realloc(child, @@ -1243,6 +1259,13 @@ construct_spec(int argc, char **argv) return (NULL); } + if (children > maxdev) { + (void) fprintf(stderr, gettext("invalid vdev " + "specification: %s supports no more than " + "%d devices\n"), argv[0], maxdev); + return (NULL); + } + argc -= c; argv += c; @@ -1337,6 +1360,52 @@ construct_spec(int argc, char **argv) return (nvroot); } +nvlist_t * +split_mirror_vdev(zpool_handle_t *zhp, char *newname, nvlist_t *props, + splitflags_t flags, int argc, char **argv) +{ + nvlist_t *newroot = NULL, **child; + uint_t c, children; + + if (argc > 0) { + if ((newroot = construct_spec(argc, argv)) == NULL) { + (void) fprintf(stderr, gettext("Unable to build a " + "pool from the specified devices\n")); + return (NULL); + } + + if (!flags.dryrun && make_disks(zhp, newroot) != 0) { + nvlist_free(newroot); + return (NULL); + } + + /* avoid any tricks in the spec */ + verify(nvlist_lookup_nvlist_array(newroot, + ZPOOL_CONFIG_CHILDREN, &child, &children) == 0); + for (c = 0; c < children; c++) { + char *path; + const char *type; + int min, max; + + verify(nvlist_lookup_string(child[c], + ZPOOL_CONFIG_PATH, &path) == 0); + if ((type = is_grouping(path, &min, &max)) != NULL) { + (void) fprintf(stderr, gettext("Cannot use " + "'%s' as a device for splitting\n"), type); + nvlist_free(newroot); + return (NULL); + } + } + } + + if (zpool_vdev_split(zhp, newname, &newroot, props, flags) != 0) { + if (newroot != NULL) + nvlist_free(newroot); + return (NULL); + } + + return (newroot); +} /* 
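(Aside: the rewritten is_grouping() above accepts any "raidzN" spelling, treating bare "raidz" as single parity, rejecting a leading zero, and bounding parity to [1, 254], with the minimum device count then being parity + 1. The following self-contained sketch, using the hypothetical helper name raidz_parity(), distills the same parsing rules outside of zpool:)

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Return the parity level for a "raidz[N]" type string, or -1 if invalid. */
static long
raidz_parity(const char *type)
{
	const char *p;
	char *end;
	long nparity;

	if (strncmp(type, "raidz", 5) != 0)
		return (-1);
	p = type + 5;
	if (*p == '\0')
		return (1);		/* plain "raidz" means parity 1 */
	if (*p == '0')
		return (-1);		/* no zero prefixes allowed */
	errno = 0;
	nparity = strtol(p, &end, 10);
	if (errno != 0 || nparity < 1 || nparity >= 255 || *end != '\0')
		return (-1);
	return (nparity);
}

int
main(void)
{
	const char *samples[] =
	    { "raidz", "raidz1", "raidz3", "raidz03", "raidzx", "mirror" };

	for (size_t i = 0; i < sizeof (samples) / sizeof (samples[0]); i++)
		(void) printf("%-8s -> %ld\n", samples[i],
		    raidz_parity(samples[i]));
	return (0);
}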
* Get and validate the contents of the given vdev specification. This ensures diff --git a/external/cddl/osnet/dist/cmd/ztest/ztest.c b/external/cddl/osnet/dist/cmd/ztest/ztest.c index 53cc6c7093b72..24464b4594b9e 100644 --- a/external/cddl/osnet/dist/cmd/ztest/ztest.c +++ b/external/cddl/osnet/dist/cmd/ztest/ztest.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -76,6 +76,7 @@ #include #include #include +#include #include #include #include @@ -85,13 +86,14 @@ #include #include #include -#include -#include #include +#include #include #include #include +#include #include +#include #include #include #include @@ -103,6 +105,7 @@ #include #include #include +#include static char cmdname[] = "ztest"; static char *zopt_pool = cmdname; @@ -122,41 +125,103 @@ static int zopt_verbose = 0; static int zopt_init = 1; static char *zopt_dir = "/tmp"; static uint64_t zopt_time = 300; /* 5 minutes */ -static int zopt_maxfaults; + +#define BT_MAGIC 0x123456789abcdefULL +#define MAXFAULTS() (MAX(zs->zs_mirrors, 1) * (zopt_raidz_parity + 1) - 1) + +enum ztest_io_type { + ZTEST_IO_WRITE_TAG, + ZTEST_IO_WRITE_PATTERN, + ZTEST_IO_WRITE_ZEROES, + ZTEST_IO_TRUNCATE, + ZTEST_IO_SETATTR, + ZTEST_IO_TYPES +}; typedef struct ztest_block_tag { + uint64_t bt_magic; uint64_t bt_objset; uint64_t bt_object; uint64_t bt_offset; + uint64_t bt_gen; uint64_t bt_txg; - uint64_t bt_thread; - uint64_t bt_seq; + uint64_t bt_crtxg; } ztest_block_tag_t; -typedef struct ztest_args { - char za_pool[MAXNAMELEN]; - spa_t *za_spa; - objset_t *za_os; - zilog_t *za_zilog; - thread_t za_thread; - uint64_t za_instance; - uint64_t za_random; - uint64_t za_diroff; - uint64_t za_diroff_shared; - uint64_t za_zil_seq; - hrtime_t za_start; - hrtime_t za_stop; - hrtime_t za_kill; - /* - * Thread-local variables can go here to aid debugging. - */ - ztest_block_tag_t za_rbt; - ztest_block_tag_t za_wbt; - dmu_object_info_t za_doi; - dmu_buf_t *za_dbuf; -} ztest_args_t; - -typedef void ztest_func_t(ztest_args_t *); +typedef struct bufwad { + uint64_t bw_index; + uint64_t bw_txg; + uint64_t bw_data; +} bufwad_t; + +/* + * XXX -- fix zfs range locks to be generic so we can use them here. + */ +typedef enum { + RL_READER, + RL_WRITER, + RL_APPEND +} rl_type_t; + +typedef struct rll { + void *rll_writer; + int rll_readers; + mutex_t rll_lock; + cond_t rll_cv; +} rll_t; + +typedef struct rl { + uint64_t rl_object; + uint64_t rl_offset; + uint64_t rl_size; + rll_t *rl_lock; +} rl_t; + +#define ZTEST_RANGE_LOCKS 64 +#define ZTEST_OBJECT_LOCKS 64 + +/* + * Object descriptor. Used as a template for object lookup/create/remove. + */ +typedef struct ztest_od { + uint64_t od_dir; + uint64_t od_object; + dmu_object_type_t od_type; + dmu_object_type_t od_crtype; + uint64_t od_blocksize; + uint64_t od_crblocksize; + uint64_t od_gen; + uint64_t od_crgen; + char od_name[MAXNAMELEN]; +} ztest_od_t; + +/* + * Per-dataset state. + */ +typedef struct ztest_ds { + objset_t *zd_os; + zilog_t *zd_zilog; + uint64_t zd_seq; + ztest_od_t *zd_od; /* debugging aid */ + char zd_name[MAXNAMELEN]; + mutex_t zd_dirobj_lock; + rll_t zd_object_lock[ZTEST_OBJECT_LOCKS]; + rll_t zd_range_lock[ZTEST_RANGE_LOCKS]; +} ztest_ds_t; + +/* + * Per-iteration state. 
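(Aside: rll_t above is a small home-grown reader/writer lock built on Solaris mutex_t/cond_t, used as a stand-in until the ZFS range locks become generic. A rough POSIX-threads rendition, shown only to illustrate the admission rules that ztest_rll_lock()/ztest_rll_unlock() implement further down, might look like this:)

#include <pthread.h>

/* Readers enter when no writer holds the lock; a writer waits for a drain. */
typedef struct rwlock_lite {
	pthread_t	writer;
	int		has_writer;
	int		readers;
	pthread_mutex_t	lock;
	pthread_cond_t	cv;
} rwlock_lite_t;

static void
rwl_init(rwlock_lite_t *r)
{
	r->has_writer = 0;
	r->readers = 0;
	(void) pthread_mutex_init(&r->lock, NULL);
	(void) pthread_cond_init(&r->cv, NULL);
}

static void
rwl_lock(rwlock_lite_t *r, int as_writer)
{
	(void) pthread_mutex_lock(&r->lock);
	if (as_writer) {
		while (r->has_writer || r->readers != 0)
			(void) pthread_cond_wait(&r->cv, &r->lock);
		r->has_writer = 1;
		r->writer = pthread_self();
	} else {
		while (r->has_writer)
			(void) pthread_cond_wait(&r->cv, &r->lock);
		r->readers++;
	}
	(void) pthread_mutex_unlock(&r->lock);
}

static void
rwl_unlock(rwlock_lite_t *r)
{
	(void) pthread_mutex_lock(&r->lock);
	if (r->has_writer)
		r->has_writer = 0;
	else
		r->readers--;
	/* Last holder out wakes everyone, as ztest_rll_unlock() does. */
	if (!r->has_writer && r->readers == 0)
		(void) pthread_cond_broadcast(&r->cv);
	(void) pthread_mutex_unlock(&r->lock);
}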
+ */ +typedef void ztest_func_t(ztest_ds_t *zd, uint64_t id); + +typedef struct ztest_info { + ztest_func_t *zi_func; /* test function */ + uint64_t zi_iters; /* iterations per execution */ + uint64_t *zi_interval; /* execute every seconds */ + uint64_t zi_call_count; /* per-pass count */ + uint64_t zi_call_time; /* per-pass time */ + uint64_t zi_call_next; /* next time to call this function */ +} ztest_info_t; /* * Note: these aren't static because we want dladdr() to work. @@ -164,94 +229,126 @@ typedef void ztest_func_t(ztest_args_t *); ztest_func_t ztest_dmu_read_write; ztest_func_t ztest_dmu_write_parallel; ztest_func_t ztest_dmu_object_alloc_free; +ztest_func_t ztest_dmu_commit_callbacks; ztest_func_t ztest_zap; ztest_func_t ztest_zap_parallel; -ztest_func_t ztest_traverse; -ztest_func_t ztest_dsl_prop_get_set; +ztest_func_t ztest_zil_commit; +ztest_func_t ztest_dmu_read_write_zcopy; ztest_func_t ztest_dmu_objset_create_destroy; +ztest_func_t ztest_dmu_prealloc; +ztest_func_t ztest_fzap; ztest_func_t ztest_dmu_snapshot_create_destroy; +ztest_func_t ztest_dsl_prop_get_set; +ztest_func_t ztest_spa_prop_get_set; ztest_func_t ztest_spa_create_destroy; ztest_func_t ztest_fault_inject; +ztest_func_t ztest_ddt_repair; +ztest_func_t ztest_dmu_snapshot_hold; ztest_func_t ztest_spa_rename; +ztest_func_t ztest_scrub; +ztest_func_t ztest_dsl_dataset_promote_busy; ztest_func_t ztest_vdev_attach_detach; ztest_func_t ztest_vdev_LUN_growth; ztest_func_t ztest_vdev_add_remove; ztest_func_t ztest_vdev_aux_add_remove; -ztest_func_t ztest_scrub; - -typedef struct ztest_info { - ztest_func_t *zi_func; /* test function */ - uint64_t zi_iters; /* iterations per execution */ - uint64_t *zi_interval; /* execute every seconds */ - uint64_t zi_calls; /* per-pass count */ - uint64_t zi_call_time; /* per-pass time */ - uint64_t zi_call_total; /* cumulative total */ - uint64_t zi_call_target; /* target cumulative total */ -} ztest_info_t; +ztest_func_t ztest_split_pool; -uint64_t zopt_always = 0; /* all the time */ -uint64_t zopt_often = 1; /* every second */ -uint64_t zopt_sometimes = 10; /* every 10 seconds */ -uint64_t zopt_rarely = 60; /* every 60 seconds */ +uint64_t zopt_always = 0ULL * NANOSEC; /* all the time */ +uint64_t zopt_incessant = 1ULL * NANOSEC / 10; /* every 1/10 second */ +uint64_t zopt_often = 1ULL * NANOSEC; /* every second */ +uint64_t zopt_sometimes = 10ULL * NANOSEC; /* every 10 seconds */ +uint64_t zopt_rarely = 60ULL * NANOSEC; /* every 60 seconds */ ztest_info_t ztest_info[] = { { ztest_dmu_read_write, 1, &zopt_always }, - { ztest_dmu_write_parallel, 30, &zopt_always }, + { ztest_dmu_write_parallel, 10, &zopt_always }, { ztest_dmu_object_alloc_free, 1, &zopt_always }, + { ztest_dmu_commit_callbacks, 1, &zopt_always }, { ztest_zap, 30, &zopt_always }, { ztest_zap_parallel, 100, &zopt_always }, - { ztest_dsl_prop_get_set, 1, &zopt_sometimes }, - { ztest_dmu_objset_create_destroy, 1, &zopt_sometimes }, - { ztest_dmu_snapshot_create_destroy, 1, &zopt_sometimes }, - { ztest_spa_create_destroy, 1, &zopt_sometimes }, + { ztest_split_pool, 1, &zopt_always }, + { ztest_zil_commit, 1, &zopt_incessant }, + { ztest_dmu_read_write_zcopy, 1, &zopt_often }, + { ztest_dmu_objset_create_destroy, 1, &zopt_often }, + { ztest_dsl_prop_get_set, 1, &zopt_often }, + { ztest_spa_prop_get_set, 1, &zopt_sometimes }, +#if 0 + { ztest_dmu_prealloc, 1, &zopt_sometimes }, +#endif + { ztest_fzap, 1, &zopt_sometimes }, + { ztest_dmu_snapshot_create_destroy, 1, &zopt_sometimes }, + { ztest_spa_create_destroy, 1, 
&zopt_sometimes }, { ztest_fault_inject, 1, &zopt_sometimes }, + { ztest_ddt_repair, 1, &zopt_sometimes }, + { ztest_dmu_snapshot_hold, 1, &zopt_sometimes }, { ztest_spa_rename, 1, &zopt_rarely }, + { ztest_scrub, 1, &zopt_rarely }, + { ztest_dsl_dataset_promote_busy, 1, &zopt_rarely }, { ztest_vdev_attach_detach, 1, &zopt_rarely }, { ztest_vdev_LUN_growth, 1, &zopt_rarely }, { ztest_vdev_add_remove, 1, &zopt_vdevtime }, { ztest_vdev_aux_add_remove, 1, &zopt_vdevtime }, - { ztest_scrub, 1, &zopt_vdevtime }, }; #define ZTEST_FUNCS (sizeof (ztest_info) / sizeof (ztest_info_t)) -#define ZTEST_SYNC_LOCKS 16 +/* + * The following struct is used to hold a list of uncalled commit callbacks. + * The callbacks are ordered by txg number. + */ +typedef struct ztest_cb_list { + mutex_t zcl_callbacks_lock; + list_t zcl_callbacks; +} ztest_cb_list_t; /* * Stuff we need to share writably between parent and child. */ typedef struct ztest_shared { - mutex_t zs_vdev_lock; - rwlock_t zs_name_lock; - uint64_t zs_vdev_primaries; - uint64_t zs_vdev_aux; + char *zs_pool; + spa_t *zs_spa; + hrtime_t zs_proc_start; + hrtime_t zs_proc_stop; + hrtime_t zs_thread_start; + hrtime_t zs_thread_stop; + hrtime_t zs_thread_kill; uint64_t zs_enospc_count; - hrtime_t zs_start_time; - hrtime_t zs_stop_time; + uint64_t zs_vdev_next_leaf; + uint64_t zs_vdev_aux; uint64_t zs_alloc; uint64_t zs_space; + mutex_t zs_vdev_lock; + rwlock_t zs_name_lock; ztest_info_t zs_info[ZTEST_FUNCS]; - mutex_t zs_sync_lock[ZTEST_SYNC_LOCKS]; - uint64_t zs_seq[ZTEST_SYNC_LOCKS]; + uint64_t zs_splits; + uint64_t zs_mirrors; + ztest_ds_t zs_zd[]; } ztest_shared_t; +#define ID_PARALLEL -1ULL + static char ztest_dev_template[] = "%s/%s.%llua"; static char ztest_aux_template[] = "%s/%s.%s.%llu"; -static ztest_shared_t *ztest_shared; +ztest_shared_t *ztest_shared; +uint64_t *ztest_seq; static int ztest_random_fd; static int ztest_dump_core = 1; static boolean_t ztest_exiting; -extern uint64_t metaslab_gang_bang; +/* Global commit callback list */ +static ztest_cb_list_t zcl; -#define ZTEST_DIROBJ 1 -#define ZTEST_MICROZAP_OBJ 2 -#define ZTEST_FATZAP_OBJ 3 +extern uint64_t metaslab_gang_bang; +extern uint64_t metaslab_df_alloc_threshold; +static uint64_t metaslab_sz; -#define ZTEST_DIROBJ_BLOCKSIZE (1 << 10) -#define ZTEST_DIRSIZE 256 +enum ztest_object { + ZTEST_META_DNODE = 0, + ZTEST_DIROBJ, + ZTEST_OBJECTS +}; static void usage(boolean_t) __NORETURN; @@ -405,27 +502,6 @@ usage(boolean_t requested) exit(requested ? 0 : 1); } -static uint64_t -ztest_random(uint64_t range) -{ - uint64_t r; - - if (range == 0) - return (0); - - if (read(ztest_random_fd, &r, sizeof (r)) != sizeof (r)) - fatal(1, "short read from /dev/urandom"); - - return (r % range); -} - -static void -ztest_record_enospc(char *s) -{ - dprintf("ENOSPC doing: %s\n", s ? s : ""); - ztest_shared->zs_enospc_count++; -} - static void process_options(int argc, char **argv) { @@ -471,7 +547,7 @@ process_options(int argc, char **argv) zopt_raidz = MAX(1, value); break; case 'R': - zopt_raidz_parity = MIN(MAX(value, 1), 2); + zopt_raidz_parity = MIN(MAX(value, 1), 3); break; case 'd': zopt_datasets = MAX(1, value); @@ -518,8 +594,37 @@ process_options(int argc, char **argv) zopt_raidz_parity = MIN(zopt_raidz_parity, zopt_raidz - 1); - zopt_vdevtime = (zopt_vdevs > 0 ? zopt_time / zopt_vdevs : UINT64_MAX); - zopt_maxfaults = MAX(zopt_mirrors, 1) * (zopt_raidz_parity + 1) - 1; + zopt_vdevtime = (zopt_vdevs > 0 ? 
zopt_time * NANOSEC / zopt_vdevs : + UINT64_MAX >> 2); +} + +static void +ztest_kill(ztest_shared_t *zs) +{ + zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(zs->zs_spa)); + zs->zs_space = metaslab_class_get_space(spa_normal_class(zs->zs_spa)); + (void) kill(getpid(), SIGKILL); +} + +static uint64_t +ztest_random(uint64_t range) +{ + uint64_t r; + + if (range == 0) + return (0); + + if (read(ztest_random_fd, &r, sizeof (r)) != sizeof (r)) + fatal(1, "short read from /dev/urandom"); + + return (r % range); +} + +/* ARGSUSED */ +static void +ztest_record_enospc(const char *s) +{ + ztest_shared->zs_enospc_count++; } static uint64_t @@ -548,7 +653,7 @@ make_vdev_file(char *path, char *aux, size_t size, uint64_t ashift) (void) sprintf(path, ztest_aux_template, zopt_dir, zopt_pool, aux, vdev); } else { - vdev = ztest_shared->zs_vdev_primaries++; + vdev = ztest_shared->zs_vdev_next_leaf++; (void) sprintf(path, ztest_dev_template, zopt_dir, zopt_pool, vdev); } @@ -659,270 +764,1479 @@ make_vdev_root(char *path, char *aux, size_t size, uint64_t ashift, return (root); } -static void -ztest_set_random_blocksize(objset_t *os, uint64_t object, dmu_tx_t *tx) +static int +ztest_random_blocksize(void) { - int bs = SPA_MINBLOCKSHIFT + - ztest_random(SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1); - int ibs = DN_MIN_INDBLKSHIFT + - ztest_random(DN_MAX_INDBLKSHIFT - DN_MIN_INDBLKSHIFT + 1); - int error; + return (1 << (SPA_MINBLOCKSHIFT + + ztest_random(SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1))); +} - error = dmu_object_set_blocksize(os, object, 1ULL << bs, ibs, tx); - if (error) { - char osname[300]; - dmu_objset_name(os, osname); - fatal(0, "dmu_object_set_blocksize('%s', %llu, %d, %d) = %d", - osname, object, 1 << bs, ibs, error); - } +static int +ztest_random_ibshift(void) +{ + return (DN_MIN_INDBLKSHIFT + + ztest_random(DN_MAX_INDBLKSHIFT - DN_MIN_INDBLKSHIFT + 1)); } -static uint8_t -ztest_random_checksum(void) +static uint64_t +ztest_random_vdev_top(spa_t *spa, boolean_t log_ok) { - uint8_t checksum; + uint64_t top; + vdev_t *rvd = spa->spa_root_vdev; + vdev_t *tvd; - do { - checksum = ztest_random(ZIO_CHECKSUM_FUNCTIONS); - } while (zio_checksum_table[checksum].ci_zbt); + ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); - if (checksum == ZIO_CHECKSUM_OFF) - checksum = ZIO_CHECKSUM_ON; + do { + top = ztest_random(rvd->vdev_children); + tvd = rvd->vdev_child[top]; + } while (tvd->vdev_ishole || (tvd->vdev_islog && !log_ok) || + tvd->vdev_mg == NULL || tvd->vdev_mg->mg_class == NULL); - return (checksum); + return (top); } -static uint8_t -ztest_random_compress(void) +static uint64_t +ztest_random_dsl_prop(zfs_prop_t prop) { - return ((uint8_t)ztest_random(ZIO_COMPRESS_FUNCTIONS)); -} + uint64_t value; -typedef struct ztest_replay { - objset_t *zr_os; - uint64_t zr_assign; -} ztest_replay_t; + do { + value = zfs_prop_random_value(prop, ztest_random(-1ULL)); + } while (prop == ZFS_PROP_CHECKSUM && value == ZIO_CHECKSUM_OFF); + + return (value); +} static int -ztest_replay_create(ztest_replay_t *zr, lr_create_t *lr, boolean_t byteswap) +ztest_dsl_prop_set_uint64(char *osname, zfs_prop_t prop, uint64_t value, + boolean_t inherit) { - objset_t *os = zr->zr_os; - dmu_tx_t *tx; + const char *propname = zfs_prop_to_name(prop); + const char *valname; + char setpoint[MAXPATHLEN]; + uint64_t curval; int error; - if (byteswap) - byteswap_uint64_array(lr, sizeof (*lr)); + error = dsl_prop_set(osname, propname, + (inherit ? 
ZPROP_SRC_NONE : ZPROP_SRC_LOCAL), + sizeof (value), 1, &value); - tx = dmu_tx_create(os); - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - error = dmu_tx_assign(tx, zr->zr_assign); - if (error) { - dmu_tx_abort(tx); + if (error == ENOSPC) { + ztest_record_enospc(FTAG); return (error); } - - error = dmu_object_claim(os, lr->lr_doid, lr->lr_mode, 0, - DMU_OT_NONE, 0, tx); ASSERT3U(error, ==, 0); - dmu_tx_commit(tx); - if (zopt_verbose >= 5) { - char osname[MAXNAMELEN]; - dmu_objset_name(os, osname); - (void) printf("replay create of %s object %llu" - " in txg %llu = %d\n", - osname, (u_longlong_t)lr->lr_doid, - (u_longlong_t)zr->zr_assign, error); + VERIFY3U(dsl_prop_get(osname, propname, sizeof (curval), + 1, &curval, setpoint), ==, 0); + + if (zopt_verbose >= 6) { + VERIFY(zfs_prop_index_to_string(prop, curval, &valname) == 0); + (void) printf("%s %s = %s at '%s'\n", + osname, propname, valname, setpoint); } return (error); } static int -ztest_replay_remove(ztest_replay_t *zr, lr_remove_t *lr, boolean_t byteswap) +ztest_spa_prop_set_uint64(ztest_shared_t *zs, zpool_prop_t prop, uint64_t value) { - objset_t *os = zr->zr_os; - dmu_tx_t *tx; + spa_t *spa = zs->zs_spa; + nvlist_t *props = NULL; int error; - if (byteswap) - byteswap_uint64_array(lr, sizeof (*lr)); + VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0); + VERIFY(nvlist_add_uint64(props, zpool_prop_to_name(prop), value) == 0); - tx = dmu_tx_create(os); - dmu_tx_hold_free(tx, lr->lr_doid, 0, DMU_OBJECT_END); - error = dmu_tx_assign(tx, zr->zr_assign); - if (error) { - dmu_tx_abort(tx); + error = spa_prop_set(spa, props); + + nvlist_free(props); + + if (error == ENOSPC) { + ztest_record_enospc(FTAG); return (error); } - - error = dmu_object_free(os, lr->lr_doid, tx); - dmu_tx_commit(tx); + ASSERT3U(error, ==, 0); return (error); } -zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = { - NULL, /* 0 no such transaction type */ - ztest_replay_create, /* TX_CREATE */ - NULL, /* TX_MKDIR */ - NULL, /* TX_MKXATTR */ - NULL, /* TX_SYMLINK */ - ztest_replay_remove, /* TX_REMOVE */ - NULL, /* TX_RMDIR */ - NULL, /* TX_LINK */ - NULL, /* TX_RENAME */ - NULL, /* TX_WRITE */ - NULL, /* TX_TRUNCATE */ - NULL, /* TX_SETATTR */ - NULL, /* TX_ACL */ -}; +static void +ztest_rll_init(rll_t *rll) +{ + rll->rll_writer = NULL; + rll->rll_readers = 0; + VERIFY(_mutex_init(&rll->rll_lock, USYNC_THREAD, NULL) == 0); + VERIFY(cond_init(&rll->rll_cv, USYNC_THREAD, NULL) == 0); +} -/* - * Verify that we can't destroy an active pool, create an existing pool, - * or create a pool with a bad vdev spec. - */ -void -ztest_spa_create_destroy(ztest_args_t *za) +static void +ztest_rll_destroy(rll_t *rll) { - int error; - spa_t *spa; - nvlist_t *nvroot; + ASSERT(rll->rll_writer == NULL); + ASSERT(rll->rll_readers == 0); + VERIFY(_mutex_destroy(&rll->rll_lock) == 0); + VERIFY(cond_destroy(&rll->rll_cv) == 0); +} - /* - * Attempt to create using a bad file. - */ - nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 0, 1); - error = spa_create("ztest_bad_file", nvroot, NULL, NULL, NULL); - nvlist_free(nvroot); - if (error != ENOENT) - fatal(0, "spa_create(bad_file) = %d", error); +static void +ztest_rll_lock(rll_t *rll, rl_type_t type) +{ + VERIFY(mutex_lock(&rll->rll_lock) == 0); - /* - * Attempt to create using a bad mirror. 
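(Aside: the new ztest_spa_prop_set_uint64() above packages a single pool property as a name/uint64 pair in an NV_UNIQUE_NAME nvlist before calling spa_prop_set(). A minimal sketch of that packaging, assuming libnvpair is available; "autoexpand" is only an example property name, not taken from the patch:)

#include <stdio.h>
#include <libnvpair.h>

int
main(void)
{
	nvlist_t *props = NULL;

	if (nvlist_alloc(&props, NV_UNIQUE_NAME, 0) != 0)
		return (1);
	if (nvlist_add_uint64(props, "autoexpand", 1) != 0) {
		nvlist_free(props);
		return (1);
	}
	nvlist_print(stdout, props);	/* dump the name/value pair */
	nvlist_free(props);
	return (0);
}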
- */ - nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 2, 1); - error = spa_create("ztest_bad_mirror", nvroot, NULL, NULL, NULL); - nvlist_free(nvroot); - if (error != ENOENT) - fatal(0, "spa_create(bad_mirror) = %d", error); + if (type == RL_READER) { + while (rll->rll_writer != NULL) + (void) cond_wait(&rll->rll_cv, &rll->rll_lock); + rll->rll_readers++; + } else { + while (rll->rll_writer != NULL || rll->rll_readers) + (void) cond_wait(&rll->rll_cv, &rll->rll_lock); + rll->rll_writer = curthread; + } - /* - * Attempt to create an existing pool. It shouldn't matter - * what's in the nvroot; we should fail with EEXIST. - */ - (void) rw_rdlock(&ztest_shared->zs_name_lock); - nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 0, 1); - error = spa_create(za->za_pool, nvroot, NULL, NULL, NULL); - nvlist_free(nvroot); - if (error != EEXIST) - fatal(0, "spa_create(whatever) = %d", error); + VERIFY(mutex_unlock(&rll->rll_lock) == 0); +} - error = spa_open(za->za_pool, &spa, FTAG); - if (error) - fatal(0, "spa_open() = %d", error); +static void +ztest_rll_unlock(rll_t *rll) +{ + VERIFY(mutex_lock(&rll->rll_lock) == 0); - error = spa_destroy(za->za_pool); - if (error != EBUSY) - fatal(0, "spa_destroy() = %d", error); + if (rll->rll_writer) { + ASSERT(rll->rll_readers == 0); + rll->rll_writer = NULL; + } else { + ASSERT(rll->rll_readers != 0); + ASSERT(rll->rll_writer == NULL); + rll->rll_readers--; + } - spa_close(spa, FTAG); - (void) rw_unlock(&ztest_shared->zs_name_lock); + if (rll->rll_writer == NULL && rll->rll_readers == 0) + VERIFY(cond_broadcast(&rll->rll_cv) == 0); + + VERIFY(mutex_unlock(&rll->rll_lock) == 0); } -static vdev_t * -vdev_lookup_by_path(vdev_t *vd, const char *path) +static void +ztest_object_lock(ztest_ds_t *zd, uint64_t object, rl_type_t type) { - vdev_t *mvd; + rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)]; - if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0) - return (vd); + ztest_rll_lock(rll, type); +} - for (int c = 0; c < vd->vdev_children; c++) - if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) != - NULL) - return (mvd); +static void +ztest_object_unlock(ztest_ds_t *zd, uint64_t object) +{ + rll_t *rll = &zd->zd_object_lock[object & (ZTEST_OBJECT_LOCKS - 1)]; - return (NULL); + ztest_rll_unlock(rll); } -/* - * Verify that vdev_add() works as expected. - */ -void -ztest_vdev_add_remove(ztest_args_t *za) +static rl_t * +ztest_range_lock(ztest_ds_t *zd, uint64_t object, uint64_t offset, + uint64_t size, rl_type_t type) { - spa_t *spa = za->za_spa; - uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz; - nvlist_t *nvroot; - int error; + uint64_t hash = object ^ (offset % (ZTEST_RANGE_LOCKS + 1)); + rll_t *rll = &zd->zd_range_lock[hash & (ZTEST_RANGE_LOCKS - 1)]; + rl_t *rl; - (void) mutex_lock(&ztest_shared->zs_vdev_lock); + rl = umem_alloc(sizeof (*rl), UMEM_NOFAIL); + rl->rl_object = object; + rl->rl_offset = offset; + rl->rl_size = size; + rl->rl_lock = rll; - spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + ztest_rll_lock(rll, type); - ztest_shared->zs_vdev_primaries = - spa->spa_root_vdev->vdev_children * leaves; + return (rl); +} - spa_config_exit(spa, SCL_VDEV, FTAG); +static void +ztest_range_unlock(rl_t *rl) +{ + rll_t *rll = rl->rl_lock; - /* - * Make 1/4 of the devices be log devices. 
- */ - nvroot = make_vdev_root(NULL, NULL, zopt_vdev_size, 0, - ztest_random(4) == 0, zopt_raidz, zopt_mirrors, 1); + ztest_rll_unlock(rll); - error = spa_vdev_add(spa, nvroot); - nvlist_free(nvroot); + umem_free(rl, sizeof (*rl)); +} + +static void +ztest_zd_init(ztest_ds_t *zd, objset_t *os) +{ + zd->zd_os = os; + zd->zd_zilog = dmu_objset_zil(os); + zd->zd_seq = 0; + dmu_objset_name(os, zd->zd_name); - (void) mutex_unlock(&ztest_shared->zs_vdev_lock); + VERIFY(_mutex_init(&zd->zd_dirobj_lock, USYNC_THREAD, NULL) == 0); - if (error == ENOSPC) - ztest_record_enospc("spa_vdev_add"); - else if (error != 0) - fatal(0, "spa_vdev_add() = %d", error); + for (int l = 0; l < ZTEST_OBJECT_LOCKS; l++) + ztest_rll_init(&zd->zd_object_lock[l]); + + for (int l = 0; l < ZTEST_RANGE_LOCKS; l++) + ztest_rll_init(&zd->zd_range_lock[l]); } -/* - * Verify that adding/removing aux devices (l2arc, hot spare) works as expected. - */ -void -ztest_vdev_aux_add_remove(ztest_args_t *za) +static void +ztest_zd_fini(ztest_ds_t *zd) { - spa_t *spa = za->za_spa; - vdev_t *rvd = spa->spa_root_vdev; - spa_aux_vdev_t *sav; - char *aux; - uint64_t guid = 0; - int error; + VERIFY(_mutex_destroy(&zd->zd_dirobj_lock) == 0); - if (ztest_random(2) == 0) { - sav = &spa->spa_spares; - aux = ZPOOL_CONFIG_SPARES; - } else { - sav = &spa->spa_l2cache; - aux = ZPOOL_CONFIG_L2CACHE; - } + for (int l = 0; l < ZTEST_OBJECT_LOCKS; l++) + ztest_rll_destroy(&zd->zd_object_lock[l]); - (void) mutex_lock(&ztest_shared->zs_vdev_lock); + for (int l = 0; l < ZTEST_RANGE_LOCKS; l++) + ztest_rll_destroy(&zd->zd_range_lock[l]); +} - spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); +#define TXG_MIGHTWAIT (ztest_random(10) == 0 ? TXG_NOWAIT : TXG_WAIT) - if (sav->sav_count != 0 && ztest_random(4) == 0) { - /* - * Pick a random device to remove. - */ - guid = sav->sav_vdevs[ztest_random(sav->sav_count)]->vdev_guid; - } else { - /* - * Find an unused device we can add. - */ - ztest_shared->zs_vdev_aux = 0; - for (;;) { - char path[MAXPATHLEN]; - int c; +static uint64_t +ztest_tx_assign(dmu_tx_t *tx, uint64_t txg_how, const char *tag) +{ + uint64_t txg; + int error; + + /* + * Attempt to assign tx to some transaction group. 
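(Aside: ztest_object_lock() and ztest_range_lock() above each pick one of 64 lock buckets per dataset; object locks hash on the object number alone, while range locks fold the offset in exactly as shown. A tiny standalone check of that bucket arithmetic, with arbitrary example values:)

#include <stdio.h>
#include <inttypes.h>

#define ZTEST_OBJECT_LOCKS	64
#define ZTEST_RANGE_LOCKS	64

int
main(void)
{
	uint64_t object = 7, offset = 3 << 20;
	uint64_t obucket = object & (ZTEST_OBJECT_LOCKS - 1);
	uint64_t rbucket = (object ^ (offset % (ZTEST_RANGE_LOCKS + 1))) &
	    (ZTEST_RANGE_LOCKS - 1);

	(void) printf("object lock bucket %" PRIu64
	    ", range lock bucket %" PRIu64 "\n", obucket, rbucket);
	return (0);
}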
+ */ + error = dmu_tx_assign(tx, txg_how); + if (error) { + if (error == ERESTART) { + ASSERT(txg_how == TXG_NOWAIT); + dmu_tx_wait(tx); + } else { + ASSERT3U(error, ==, ENOSPC); + ztest_record_enospc(tag); + } + dmu_tx_abort(tx); + return (0); + } + txg = dmu_tx_get_txg(tx); + ASSERT(txg != 0); + return (txg); +} + +static void +ztest_pattern_set(void *buf, uint64_t size, uint64_t value) +{ + uint64_t *ip = buf; + uint64_t *ip_end = (uint64_t *)((uintptr_t)buf + (uintptr_t)size); + + while (ip < ip_end) + *ip++ = value; +} + +static boolean_t +ztest_pattern_match(void *buf, uint64_t size, uint64_t value) +{ + uint64_t *ip = buf; + uint64_t *ip_end = (uint64_t *)((uintptr_t)buf + (uintptr_t)size); + uint64_t diff = 0; + + while (ip < ip_end) + diff |= (value - *ip++); + + return (diff == 0); +} + +static void +ztest_bt_generate(ztest_block_tag_t *bt, objset_t *os, uint64_t object, + uint64_t offset, uint64_t gen, uint64_t txg, uint64_t crtxg) +{ + bt->bt_magic = BT_MAGIC; + bt->bt_objset = dmu_objset_id(os); + bt->bt_object = object; + bt->bt_offset = offset; + bt->bt_gen = gen; + bt->bt_txg = txg; + bt->bt_crtxg = crtxg; +} + +static void +ztest_bt_verify(ztest_block_tag_t *bt, objset_t *os, uint64_t object, + uint64_t offset, uint64_t gen, uint64_t txg, uint64_t crtxg) +{ + ASSERT(bt->bt_magic == BT_MAGIC); + ASSERT(bt->bt_objset == dmu_objset_id(os)); + ASSERT(bt->bt_object == object); + ASSERT(bt->bt_offset == offset); + ASSERT(bt->bt_gen <= gen); + ASSERT(bt->bt_txg <= txg); + ASSERT(bt->bt_crtxg == crtxg); +} + +static ztest_block_tag_t * +ztest_bt_bonus(dmu_buf_t *db) +{ + dmu_object_info_t doi; + ztest_block_tag_t *bt; + + dmu_object_info_from_db(db, &doi); + ASSERT3U(doi.doi_bonus_size, <=, db->db_size); + ASSERT3U(doi.doi_bonus_size, >=, sizeof (*bt)); + bt = (void *)((char *)db->db_data + doi.doi_bonus_size - sizeof (*bt)); + + return (bt); +} + +/* + * ZIL logging ops + */ + +#define lrz_type lr_mode +#define lrz_blocksize lr_uid +#define lrz_ibshift lr_gid +#define lrz_bonustype lr_rdev +#define lrz_bonuslen lr_crtime[1] + +static uint64_t +ztest_log_create(ztest_ds_t *zd, dmu_tx_t *tx, lr_create_t *lr) +{ + char *name = (void *)(lr + 1); /* name follows lr */ + size_t namesize = strlen(name) + 1; + itx_t *itx; + + if (zil_replaying(zd->zd_zilog, tx)) + return (0); + + itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize); + bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, + sizeof (*lr) + namesize - sizeof (lr_t)); + + return (zil_itx_assign(zd->zd_zilog, itx, tx)); +} + +static uint64_t +ztest_log_remove(ztest_ds_t *zd, dmu_tx_t *tx, lr_remove_t *lr) +{ + char *name = (void *)(lr + 1); /* name follows lr */ + size_t namesize = strlen(name) + 1; + itx_t *itx; + + if (zil_replaying(zd->zd_zilog, tx)) + return (0); + + itx = zil_itx_create(TX_REMOVE, sizeof (*lr) + namesize); + bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, + sizeof (*lr) + namesize - sizeof (lr_t)); + + return (zil_itx_assign(zd->zd_zilog, itx, tx)); +} + +static uint64_t +ztest_log_write(ztest_ds_t *zd, dmu_tx_t *tx, lr_write_t *lr) +{ + itx_t *itx; + itx_wr_state_t write_state = ztest_random(WR_NUM_STATES); + + if (zil_replaying(zd->zd_zilog, tx)) + return (0); + + if (lr->lr_length > ZIL_MAX_LOG_DATA) + write_state = WR_INDIRECT; + + itx = zil_itx_create(TX_WRITE, + sizeof (*lr) + (write_state == WR_COPIED ? 
lr->lr_length : 0)); + + if (write_state == WR_COPIED && + dmu_read(zd->zd_os, lr->lr_foid, lr->lr_offset, lr->lr_length, + ((lr_write_t *)&itx->itx_lr) + 1, DMU_READ_NO_PREFETCH) != 0) { + zil_itx_destroy(itx); + itx = zil_itx_create(TX_WRITE, sizeof (*lr)); + write_state = WR_NEED_COPY; + } + itx->itx_private = zd; + itx->itx_wr_state = write_state; + itx->itx_sync = (ztest_random(8) == 0); + itx->itx_sod += (write_state == WR_NEED_COPY ? lr->lr_length : 0); + + bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, + sizeof (*lr) - sizeof (lr_t)); + + return (zil_itx_assign(zd->zd_zilog, itx, tx)); +} + +static uint64_t +ztest_log_truncate(ztest_ds_t *zd, dmu_tx_t *tx, lr_truncate_t *lr) +{ + itx_t *itx; + + if (zil_replaying(zd->zd_zilog, tx)) + return (0); + + itx = zil_itx_create(TX_TRUNCATE, sizeof (*lr)); + bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, + sizeof (*lr) - sizeof (lr_t)); + + return (zil_itx_assign(zd->zd_zilog, itx, tx)); +} + +static uint64_t +ztest_log_setattr(ztest_ds_t *zd, dmu_tx_t *tx, lr_setattr_t *lr) +{ + itx_t *itx; + + if (zil_replaying(zd->zd_zilog, tx)) + return (0); + + itx = zil_itx_create(TX_SETATTR, sizeof (*lr)); + bcopy(&lr->lr_common + 1, &itx->itx_lr + 1, + sizeof (*lr) - sizeof (lr_t)); + + return (zil_itx_assign(zd->zd_zilog, itx, tx)); +} + +/* + * ZIL replay ops + */ +static int +ztest_replay_create(ztest_ds_t *zd, lr_create_t *lr, boolean_t byteswap) +{ + char *name = (void *)(lr + 1); /* name follows lr */ + objset_t *os = zd->zd_os; + ztest_block_tag_t *bbt; + dmu_buf_t *db; + dmu_tx_t *tx; + uint64_t txg; + int error = 0; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + ASSERT(lr->lr_doid == ZTEST_DIROBJ); + ASSERT(name[0] != '\0'); + + tx = dmu_tx_create(os); + + dmu_tx_hold_zap(tx, lr->lr_doid, B_TRUE, name); + + if (lr->lrz_type == DMU_OT_ZAP_OTHER) { + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, B_TRUE, NULL); + } else { + dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); + } + + txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); + if (txg == 0) + return (ENOSPC); + + ASSERT(dmu_objset_zil(os)->zl_replay == !!lr->lr_foid); + + if (lr->lrz_type == DMU_OT_ZAP_OTHER) { + if (lr->lr_foid == 0) { + lr->lr_foid = zap_create(os, + lr->lrz_type, lr->lrz_bonustype, + lr->lrz_bonuslen, tx); + } else { + error = zap_create_claim(os, lr->lr_foid, + lr->lrz_type, lr->lrz_bonustype, + lr->lrz_bonuslen, tx); + } + } else { + if (lr->lr_foid == 0) { + lr->lr_foid = dmu_object_alloc(os, + lr->lrz_type, 0, lr->lrz_bonustype, + lr->lrz_bonuslen, tx); + } else { + error = dmu_object_claim(os, lr->lr_foid, + lr->lrz_type, 0, lr->lrz_bonustype, + lr->lrz_bonuslen, tx); + } + } + + if (error) { + ASSERT3U(error, ==, EEXIST); + ASSERT(zd->zd_zilog->zl_replay); + dmu_tx_commit(tx); + return (error); + } + + ASSERT(lr->lr_foid != 0); + + if (lr->lrz_type != DMU_OT_ZAP_OTHER) + VERIFY3U(0, ==, dmu_object_set_blocksize(os, lr->lr_foid, + lr->lrz_blocksize, lr->lrz_ibshift, tx)); + + VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); + bbt = ztest_bt_bonus(db); + dmu_buf_will_dirty(db, tx); + ztest_bt_generate(bbt, os, lr->lr_foid, -1ULL, lr->lr_gen, txg, txg); + dmu_buf_rele(db, FTAG); + + VERIFY3U(0, ==, zap_add(os, lr->lr_doid, name, sizeof (uint64_t), 1, + &lr->lr_foid, tx)); + + (void) ztest_log_create(zd, tx, lr); + + dmu_tx_commit(tx); + + return (0); +} + +static int +ztest_replay_remove(ztest_ds_t *zd, lr_remove_t *lr, boolean_t byteswap) +{ + char *name = (void *)(lr + 1); /* name follows lr */ + objset_t *os = zd->zd_os; + dmu_object_info_t doi; + dmu_tx_t *tx; + 
uint64_t object, txg; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + ASSERT(lr->lr_doid == ZTEST_DIROBJ); + ASSERT(name[0] != '\0'); + + VERIFY3U(0, ==, + zap_lookup(os, lr->lr_doid, name, sizeof (object), 1, &object)); + ASSERT(object != 0); + + ztest_object_lock(zd, object, RL_WRITER); + + VERIFY3U(0, ==, dmu_object_info(os, object, &doi)); + + tx = dmu_tx_create(os); + + dmu_tx_hold_zap(tx, lr->lr_doid, B_FALSE, name); + dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END); + + txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); + if (txg == 0) { + ztest_object_unlock(zd, object); + return (ENOSPC); + } + + if (doi.doi_type == DMU_OT_ZAP_OTHER) { + VERIFY3U(0, ==, zap_destroy(os, object, tx)); + } else { + VERIFY3U(0, ==, dmu_object_free(os, object, tx)); + } + + VERIFY3U(0, ==, zap_remove(os, lr->lr_doid, name, tx)); + + (void) ztest_log_remove(zd, tx, lr); + + dmu_tx_commit(tx); + + ztest_object_unlock(zd, object); + + return (0); +} + +static int +ztest_replay_write(ztest_ds_t *zd, lr_write_t *lr, boolean_t byteswap) +{ + objset_t *os = zd->zd_os; + void *data = lr + 1; /* data follows lr */ + uint64_t offset, length; + ztest_block_tag_t *bt = data; + ztest_block_tag_t *bbt; + uint64_t gen, txg, lrtxg, crtxg; + dmu_object_info_t doi; + dmu_tx_t *tx; + dmu_buf_t *db; + arc_buf_t *abuf = NULL; + rl_t *rl; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + offset = lr->lr_offset; + length = lr->lr_length; + + /* If it's a dmu_sync() block, write the whole block */ + if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { + uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr); + if (length < blocksize) { + offset -= offset % blocksize; + length = blocksize; + } + } + + if (bt->bt_magic == BSWAP_64(BT_MAGIC)) + byteswap_uint64_array(bt, sizeof (*bt)); + + if (bt->bt_magic != BT_MAGIC) + bt = NULL; + + ztest_object_lock(zd, lr->lr_foid, RL_READER); + rl = ztest_range_lock(zd, lr->lr_foid, offset, length, RL_WRITER); + + VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); + + dmu_object_info_from_db(db, &doi); + + bbt = ztest_bt_bonus(db); + ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); + gen = bbt->bt_gen; + crtxg = bbt->bt_crtxg; + lrtxg = lr->lr_common.lrc_txg; + + tx = dmu_tx_create(os); + + dmu_tx_hold_write(tx, lr->lr_foid, offset, length); + + if (ztest_random(8) == 0 && length == doi.doi_data_block_size && + P2PHASE(offset, length) == 0) + abuf = dmu_request_arcbuf(db, length); + + txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); + if (txg == 0) { + if (abuf != NULL) + dmu_return_arcbuf(abuf); + dmu_buf_rele(db, FTAG); + ztest_range_unlock(rl); + ztest_object_unlock(zd, lr->lr_foid); + return (ENOSPC); + } + + if (bt != NULL) { + /* + * Usually, verify the old data before writing new data -- + * but not always, because we also want to verify correct + * behavior when the data was not recently read into cache. + */ + ASSERT(offset % doi.doi_data_block_size == 0); + if (ztest_random(4) != 0) { + int prefetch = ztest_random(2) ? + DMU_READ_PREFETCH : DMU_READ_NO_PREFETCH; + ztest_block_tag_t rbt; + + VERIFY(dmu_read(os, lr->lr_foid, offset, + sizeof (rbt), &rbt, prefetch) == 0); + if (rbt.bt_magic == BT_MAGIC) { + ztest_bt_verify(&rbt, os, lr->lr_foid, + offset, gen, txg, crtxg); + } + } + + /* + * Writes can appear to be newer than the bonus buffer because + * the ztest_get_data() callback does a dmu_read() of the + * open-context data, which may be different than the data + * as it was when the write was generated. 
+ */ + if (zd->zd_zilog->zl_replay) { + ztest_bt_verify(bt, os, lr->lr_foid, offset, + MAX(gen, bt->bt_gen), MAX(txg, lrtxg), + bt->bt_crtxg); + } + + /* + * Set the bt's gen/txg to the bonus buffer's gen/txg + * so that all of the usual ASSERTs will work. + */ + ztest_bt_generate(bt, os, lr->lr_foid, offset, gen, txg, crtxg); + } + + if (abuf == NULL) { + dmu_write(os, lr->lr_foid, offset, length, data, tx); + } else { + bcopy(data, abuf->b_data, length); + dmu_assign_arcbuf(db, offset, abuf, tx); + } + + (void) ztest_log_write(zd, tx, lr); + + dmu_buf_rele(db, FTAG); + + dmu_tx_commit(tx); + + ztest_range_unlock(rl); + ztest_object_unlock(zd, lr->lr_foid); + + return (0); +} + +static int +ztest_replay_truncate(ztest_ds_t *zd, lr_truncate_t *lr, boolean_t byteswap) +{ + objset_t *os = zd->zd_os; + dmu_tx_t *tx; + uint64_t txg; + rl_t *rl; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + ztest_object_lock(zd, lr->lr_foid, RL_READER); + rl = ztest_range_lock(zd, lr->lr_foid, lr->lr_offset, lr->lr_length, + RL_WRITER); + + tx = dmu_tx_create(os); + + dmu_tx_hold_free(tx, lr->lr_foid, lr->lr_offset, lr->lr_length); + + txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); + if (txg == 0) { + ztest_range_unlock(rl); + ztest_object_unlock(zd, lr->lr_foid); + return (ENOSPC); + } + + VERIFY(dmu_free_range(os, lr->lr_foid, lr->lr_offset, + lr->lr_length, tx) == 0); + + (void) ztest_log_truncate(zd, tx, lr); + + dmu_tx_commit(tx); + + ztest_range_unlock(rl); + ztest_object_unlock(zd, lr->lr_foid); + + return (0); +} + +static int +ztest_replay_setattr(ztest_ds_t *zd, lr_setattr_t *lr, boolean_t byteswap) +{ + objset_t *os = zd->zd_os; + dmu_tx_t *tx; + dmu_buf_t *db; + ztest_block_tag_t *bbt; + uint64_t txg, lrtxg, crtxg; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + ztest_object_lock(zd, lr->lr_foid, RL_WRITER); + + VERIFY3U(0, ==, dmu_bonus_hold(os, lr->lr_foid, FTAG, &db)); + + tx = dmu_tx_create(os); + dmu_tx_hold_bonus(tx, lr->lr_foid); + + txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); + if (txg == 0) { + dmu_buf_rele(db, FTAG); + ztest_object_unlock(zd, lr->lr_foid); + return (ENOSPC); + } + + bbt = ztest_bt_bonus(db); + ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); + crtxg = bbt->bt_crtxg; + lrtxg = lr->lr_common.lrc_txg; + + if (zd->zd_zilog->zl_replay) { + ASSERT(lr->lr_size != 0); + ASSERT(lr->lr_mode != 0); + ASSERT(lrtxg != 0); + } else { + /* + * Randomly change the size and increment the generation. + */ + lr->lr_size = (ztest_random(db->db_size / sizeof (*bbt)) + 1) * + sizeof (*bbt); + lr->lr_mode = bbt->bt_gen + 1; + ASSERT(lrtxg == 0); + } + + /* + * Verify that the current bonus buffer is not newer than our txg. 
+ */ + ztest_bt_verify(bbt, os, lr->lr_foid, -1ULL, lr->lr_mode, + MAX(txg, lrtxg), crtxg); + + dmu_buf_will_dirty(db, tx); + + ASSERT3U(lr->lr_size, >=, sizeof (*bbt)); + ASSERT3U(lr->lr_size, <=, db->db_size); + VERIFY3U(dmu_set_bonus(db, lr->lr_size, tx), ==, 0); + bbt = ztest_bt_bonus(db); + + ztest_bt_generate(bbt, os, lr->lr_foid, -1ULL, lr->lr_mode, txg, crtxg); + + dmu_buf_rele(db, FTAG); + + (void) ztest_log_setattr(zd, tx, lr); + + dmu_tx_commit(tx); + + ztest_object_unlock(zd, lr->lr_foid); + + return (0); +} + +zil_replay_func_t *ztest_replay_vector[TX_MAX_TYPE] = { + NULL, /* 0 no such transaction type */ + ztest_replay_create, /* TX_CREATE */ + NULL, /* TX_MKDIR */ + NULL, /* TX_MKXATTR */ + NULL, /* TX_SYMLINK */ + ztest_replay_remove, /* TX_REMOVE */ + NULL, /* TX_RMDIR */ + NULL, /* TX_LINK */ + NULL, /* TX_RENAME */ + ztest_replay_write, /* TX_WRITE */ + ztest_replay_truncate, /* TX_TRUNCATE */ + ztest_replay_setattr, /* TX_SETATTR */ + NULL, /* TX_ACL */ + NULL, /* TX_CREATE_ACL */ + NULL, /* TX_CREATE_ATTR */ + NULL, /* TX_CREATE_ACL_ATTR */ + NULL, /* TX_MKDIR_ACL */ + NULL, /* TX_MKDIR_ATTR */ + NULL, /* TX_MKDIR_ACL_ATTR */ + NULL, /* TX_WRITE2 */ +}; + +/* + * ZIL get_data callbacks + */ + +static void +ztest_get_done(zgd_t *zgd, int error) +{ + ztest_ds_t *zd = zgd->zgd_private; + uint64_t object = zgd->zgd_rl->rl_object; + + if (zgd->zgd_db) + dmu_buf_rele(zgd->zgd_db, zgd); + + ztest_range_unlock(zgd->zgd_rl); + ztest_object_unlock(zd, object); + + if (error == 0 && zgd->zgd_bp) + zil_add_block(zgd->zgd_zilog, zgd->zgd_bp); + + umem_free(zgd, sizeof (*zgd)); +} + +static int +ztest_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) +{ + ztest_ds_t *zd = arg; + objset_t *os = zd->zd_os; + uint64_t object = lr->lr_foid; + uint64_t offset = lr->lr_offset; + uint64_t size = lr->lr_length; + blkptr_t *bp = &lr->lr_blkptr; + uint64_t txg = lr->lr_common.lrc_txg; + uint64_t crtxg; + dmu_object_info_t doi; + dmu_buf_t *db; + zgd_t *zgd; + int error; + + ztest_object_lock(zd, object, RL_READER); + error = dmu_bonus_hold(os, object, FTAG, &db); + if (error) { + ztest_object_unlock(zd, object); + return (error); + } + + crtxg = ztest_bt_bonus(db)->bt_crtxg; + + if (crtxg == 0 || crtxg > txg) { + dmu_buf_rele(db, FTAG); + ztest_object_unlock(zd, object); + return (ENOENT); + } + + dmu_object_info_from_db(db, &doi); + dmu_buf_rele(db, FTAG); + db = NULL; + + zgd = umem_zalloc(sizeof (*zgd), UMEM_NOFAIL); + zgd->zgd_zilog = zd->zd_zilog; + zgd->zgd_private = zd; + + if (buf != NULL) { /* immediate write */ + zgd->zgd_rl = ztest_range_lock(zd, object, offset, size, + RL_READER); + + error = dmu_read(os, object, offset, size, buf, + DMU_READ_NO_PREFETCH); + ASSERT(error == 0); + } else { + size = doi.doi_data_block_size; + if (ISP2(size)) { + offset = P2ALIGN(offset, size); + } else { + ASSERT(offset < size); + offset = 0; + } + + zgd->zgd_rl = ztest_range_lock(zd, object, offset, size, + RL_READER); + + error = dmu_buf_hold(os, object, offset, zgd, &db); + + if (error == 0) { + zgd->zgd_db = db; + zgd->zgd_bp = bp; + + ASSERT(db->db_offset == offset); + ASSERT(db->db_size == size); + + error = dmu_sync(zio, lr->lr_common.lrc_txg, + ztest_get_done, zgd); + + if (error == 0) + return (0); + } + } + + ztest_get_done(zgd, error); + + return (error); +} + +static void * +ztest_lr_alloc(size_t lrsize, char *name) +{ + char *lr; + size_t namesize = name ? 
strlen(name) + 1 : 0; + + lr = umem_zalloc(lrsize + namesize, UMEM_NOFAIL); + + if (name) + bcopy(name, lr + lrsize, namesize); + + return (lr); +} + +void +ztest_lr_free(void *lr, size_t lrsize, char *name) +{ + size_t namesize = name ? strlen(name) + 1 : 0; + + umem_free(lr, lrsize + namesize); +} + +/* + * Lookup a bunch of objects. Returns the number of objects not found. + */ +static int +ztest_lookup(ztest_ds_t *zd, ztest_od_t *od, int count) +{ + int missing = 0; + int error; + + ASSERT(_mutex_held(&zd->zd_dirobj_lock)); + + for (int i = 0; i < count; i++, od++) { + od->od_object = 0; + error = zap_lookup(zd->zd_os, od->od_dir, od->od_name, + sizeof (uint64_t), 1, &od->od_object); + if (error) { + ASSERT(error == ENOENT); + ASSERT(od->od_object == 0); + missing++; + } else { + dmu_buf_t *db; + ztest_block_tag_t *bbt; + dmu_object_info_t doi; + + ASSERT(od->od_object != 0); + ASSERT(missing == 0); /* there should be no gaps */ + + ztest_object_lock(zd, od->od_object, RL_READER); + VERIFY3U(0, ==, dmu_bonus_hold(zd->zd_os, + od->od_object, FTAG, &db)); + dmu_object_info_from_db(db, &doi); + bbt = ztest_bt_bonus(db); + ASSERT3U(bbt->bt_magic, ==, BT_MAGIC); + od->od_type = doi.doi_type; + od->od_blocksize = doi.doi_data_block_size; + od->od_gen = bbt->bt_gen; + dmu_buf_rele(db, FTAG); + ztest_object_unlock(zd, od->od_object); + } + } + + return (missing); +} + +static int +ztest_create(ztest_ds_t *zd, ztest_od_t *od, int count) +{ + int missing = 0; + + ASSERT(_mutex_held(&zd->zd_dirobj_lock)); + + for (int i = 0; i < count; i++, od++) { + if (missing) { + od->od_object = 0; + missing++; + continue; + } + + lr_create_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name); + + lr->lr_doid = od->od_dir; + lr->lr_foid = 0; /* 0 to allocate, > 0 to claim */ + lr->lrz_type = od->od_crtype; + lr->lrz_blocksize = od->od_crblocksize; + lr->lrz_ibshift = ztest_random_ibshift(); + lr->lrz_bonustype = DMU_OT_UINT64_OTHER; + lr->lrz_bonuslen = dmu_bonus_max(); + lr->lr_gen = od->od_crgen; + lr->lr_crtime[0] = time(NULL); + + if (ztest_replay_create(zd, lr, B_FALSE) != 0) { + ASSERT(missing == 0); + od->od_object = 0; + missing++; + } else { + od->od_object = lr->lr_foid; + od->od_type = od->od_crtype; + od->od_blocksize = od->od_crblocksize; + od->od_gen = od->od_crgen; + ASSERT(od->od_object != 0); + } + + ztest_lr_free(lr, sizeof (*lr), od->od_name); + } + + return (missing); +} + +static int +ztest_remove(ztest_ds_t *zd, ztest_od_t *od, int count) +{ + int missing = 0; + int error; + + ASSERT(_mutex_held(&zd->zd_dirobj_lock)); + + od += count - 1; + + for (int i = count - 1; i >= 0; i--, od--) { + if (missing) { + missing++; + continue; + } + + if (od->od_object == 0) + continue; + + lr_remove_t *lr = ztest_lr_alloc(sizeof (*lr), od->od_name); + + lr->lr_doid = od->od_dir; + + if ((error = ztest_replay_remove(zd, lr, B_FALSE)) != 0) { + ASSERT3U(error, ==, ENOSPC); + missing++; + } else { + od->od_object = 0; + } + ztest_lr_free(lr, sizeof (*lr), od->od_name); + } + + return (missing); +} + +static int +ztest_write(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size, + void *data) +{ + lr_write_t *lr; + int error; + + lr = ztest_lr_alloc(sizeof (*lr) + size, NULL); + + lr->lr_foid = object; + lr->lr_offset = offset; + lr->lr_length = size; + lr->lr_blkoff = 0; + BP_ZERO(&lr->lr_blkptr); + + bcopy(data, lr + 1, size); + + error = ztest_replay_write(zd, lr, B_FALSE); + + ztest_lr_free(lr, sizeof (*lr) + size, NULL); + + return (error); +} + +static int +ztest_truncate(ztest_ds_t *zd, 
uint64_t object, uint64_t offset, uint64_t size) +{ + lr_truncate_t *lr; + int error; + + lr = ztest_lr_alloc(sizeof (*lr), NULL); + + lr->lr_foid = object; + lr->lr_offset = offset; + lr->lr_length = size; + + error = ztest_replay_truncate(zd, lr, B_FALSE); + + ztest_lr_free(lr, sizeof (*lr), NULL); + + return (error); +} + +static int +ztest_setattr(ztest_ds_t *zd, uint64_t object) +{ + lr_setattr_t *lr; + int error; + + lr = ztest_lr_alloc(sizeof (*lr), NULL); + + lr->lr_foid = object; + lr->lr_size = 0; + lr->lr_mode = 0; + + error = ztest_replay_setattr(zd, lr, B_FALSE); + + ztest_lr_free(lr, sizeof (*lr), NULL); + + return (error); +} + +static void +ztest_prealloc(ztest_ds_t *zd, uint64_t object, uint64_t offset, uint64_t size) +{ + objset_t *os = zd->zd_os; + dmu_tx_t *tx; + uint64_t txg; + rl_t *rl; + + txg_wait_synced(dmu_objset_pool(os), 0); + + ztest_object_lock(zd, object, RL_READER); + rl = ztest_range_lock(zd, object, offset, size, RL_WRITER); + + tx = dmu_tx_create(os); + + dmu_tx_hold_write(tx, object, offset, size); + + txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); + + if (txg != 0) { + dmu_prealloc(os, object, offset, size, tx); + dmu_tx_commit(tx); + txg_wait_synced(dmu_objset_pool(os), txg); + } else { + (void) dmu_free_long_range(os, object, offset, size); + } + + ztest_range_unlock(rl); + ztest_object_unlock(zd, object); +} + +static void +ztest_io(ztest_ds_t *zd, uint64_t object, uint64_t offset) +{ + ztest_block_tag_t wbt; + dmu_object_info_t doi; + enum ztest_io_type io_type; + uint64_t blocksize; + void *data; + + VERIFY(dmu_object_info(zd->zd_os, object, &doi) == 0); + blocksize = doi.doi_data_block_size; + data = umem_alloc(blocksize, UMEM_NOFAIL); + + /* + * Pick an i/o type at random, biased toward writing block tags. + */ + io_type = ztest_random(ZTEST_IO_TYPES); + if (ztest_random(2) == 0) + io_type = ZTEST_IO_WRITE_TAG; + + switch (io_type) { + + case ZTEST_IO_WRITE_TAG: + ztest_bt_generate(&wbt, zd->zd_os, object, offset, 0, 0, 0); + (void) ztest_write(zd, object, offset, sizeof (wbt), &wbt); + break; + + case ZTEST_IO_WRITE_PATTERN: + (void) memset(data, 'a' + (object + offset) % 5, blocksize); + if (ztest_random(2) == 0) { + /* + * Induce fletcher2 collisions to ensure that + * zio_ddt_collision() detects and resolves them + * when using fletcher2-verify for deduplication. + */ + ((uint64_t *)data)[0] ^= 1ULL << 63; + ((uint64_t *)data)[4] ^= 1ULL << 63; + } + (void) ztest_write(zd, object, offset, blocksize, data); + break; + + case ZTEST_IO_WRITE_ZEROES: + bzero(data, blocksize); + (void) ztest_write(zd, object, offset, blocksize, data); + break; + + case ZTEST_IO_TRUNCATE: + (void) ztest_truncate(zd, object, offset, blocksize); + break; + + case ZTEST_IO_SETATTR: + (void) ztest_setattr(zd, object); + break; + } + + umem_free(data, blocksize); +} + +/* + * Initialize an object description template. + */ +static void +ztest_od_init(ztest_od_t *od, uint64_t id, char *tag, uint64_t index, + dmu_object_type_t type, uint64_t blocksize, uint64_t gen) +{ + od->od_dir = ZTEST_DIROBJ; + od->od_object = 0; + + od->od_crtype = type; + od->od_crblocksize = blocksize ? blocksize : ztest_random_blocksize(); + od->od_crgen = gen; + + od->od_type = DMU_OT_NONE; + od->od_blocksize = 0; + od->od_gen = 0; + + (void) snprintf(od->od_name, sizeof (od->od_name), "%s(%lld)[%llu]", + tag, (int64_t)id, index); +} + +/* + * Lookup or create the objects for a test using the od template. 
+ * If the objects do not all exist, or if 'remove' is specified, + * remove any existing objects and create new ones. Otherwise, + * use the existing objects. + */ +static int +ztest_object_init(ztest_ds_t *zd, ztest_od_t *od, size_t size, boolean_t remove) +{ + int count = size / sizeof (*od); + int rv = 0; + + VERIFY(mutex_lock(&zd->zd_dirobj_lock) == 0); + if ((ztest_lookup(zd, od, count) != 0 || remove) && + (ztest_remove(zd, od, count) != 0 || + ztest_create(zd, od, count) != 0)) + rv = -1; + zd->zd_od = od; + VERIFY(mutex_unlock(&zd->zd_dirobj_lock) == 0); + + return (rv); +} + +/* ARGSUSED */ +void +ztest_zil_commit(ztest_ds_t *zd, uint64_t id) +{ + zilog_t *zilog = zd->zd_zilog; + + zil_commit(zilog, UINT64_MAX, ztest_random(ZTEST_OBJECTS)); + + /* + * Remember the committed values in zd, which is in parent/child + * shared memory. If we die, the next iteration of ztest_run() + * will verify that the log really does contain this record. + */ + mutex_enter(&zilog->zl_lock); + ASSERT(zd->zd_seq <= zilog->zl_commit_lr_seq); + zd->zd_seq = zilog->zl_commit_lr_seq; + mutex_exit(&zilog->zl_lock); +} + +/* + * Verify that we can't destroy an active pool, create an existing pool, + * or create a pool with a bad vdev spec. + */ +/* ARGSUSED */ +void +ztest_spa_create_destroy(ztest_ds_t *zd, uint64_t id) +{ + ztest_shared_t *zs = ztest_shared; + spa_t *spa; + nvlist_t *nvroot; + + /* + * Attempt to create using a bad file. + */ + nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 0, 1); + VERIFY3U(ENOENT, ==, + spa_create("ztest_bad_file", nvroot, NULL, NULL, NULL)); + nvlist_free(nvroot); + + /* + * Attempt to create using a bad mirror. + */ + nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 2, 1); + VERIFY3U(ENOENT, ==, + spa_create("ztest_bad_mirror", nvroot, NULL, NULL, NULL)); + nvlist_free(nvroot); + + /* + * Attempt to create an existing pool. It shouldn't matter + * what's in the nvroot; we should fail with EEXIST. + */ + (void) rw_rdlock(&zs->zs_name_lock); + nvroot = make_vdev_root("/dev/bogus", NULL, 0, 0, 0, 0, 0, 1); + VERIFY3U(EEXIST, ==, spa_create(zs->zs_pool, nvroot, NULL, NULL, NULL)); + nvlist_free(nvroot); + VERIFY3U(0, ==, spa_open(zs->zs_pool, &spa, FTAG)); + VERIFY3U(EBUSY, ==, spa_destroy(zs->zs_pool)); + spa_close(spa, FTAG); + + (void) rw_unlock(&zs->zs_name_lock); +} + +static vdev_t * +vdev_lookup_by_path(vdev_t *vd, const char *path) +{ + vdev_t *mvd; + + if (vd->vdev_path != NULL && strcmp(path, vd->vdev_path) == 0) + return (vd); + + for (int c = 0; c < vd->vdev_children; c++) + if ((mvd = vdev_lookup_by_path(vd->vdev_child[c], path)) != + NULL) + return (mvd); + + return (NULL); +} + +/* + * Find the first available hole which can be used as a top-level. + */ +int +find_vdev_hole(spa_t *spa) +{ + vdev_t *rvd = spa->spa_root_vdev; + int c; + + ASSERT(spa_config_held(spa, SCL_VDEV, RW_READER) == SCL_VDEV); + + for (c = 0; c < rvd->vdev_children; c++) { + vdev_t *cvd = rvd->vdev_child[c]; + + if (cvd->vdev_ishole) + break; + } + return (c); +} + +/* + * Verify that vdev_add() works as expected. 
+ */ +/* ARGSUSED */ +void +ztest_vdev_add_remove(ztest_ds_t *zd, uint64_t id) +{ + ztest_shared_t *zs = ztest_shared; + spa_t *spa = zs->zs_spa; + uint64_t leaves; + uint64_t guid; + nvlist_t *nvroot; + int error; + + VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0); + leaves = MAX(zs->zs_mirrors + zs->zs_splits, 1) * zopt_raidz; + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + + ztest_shared->zs_vdev_next_leaf = find_vdev_hole(spa) * leaves; + + /* + * If we have slogs then remove them 1/4 of the time. + */ + if (spa_has_slogs(spa) && ztest_random(4) == 0) { + /* + * Grab the guid from the head of the log class rotor. + */ + guid = spa_log_class(spa)->mc_rotor->mg_vd->vdev_guid; + + spa_config_exit(spa, SCL_VDEV, FTAG); + + /* + * We have to grab the zs_name_lock as writer to + * prevent a race between removing a slog (dmu_objset_find) + * and destroying a dataset. Removing the slog will + * grab a reference on the dataset which may cause + * dmu_objset_destroy() to fail with EBUSY thus + * leaving the dataset in an inconsistent state. + */ + VERIFY(rw_wrlock(&ztest_shared->zs_name_lock) == 0); + error = spa_vdev_remove(spa, guid, B_FALSE); + VERIFY(rw_unlock(&ztest_shared->zs_name_lock) == 0); + + if (error && error != EEXIST) + fatal(0, "spa_vdev_remove() = %d", error); + } else { + spa_config_exit(spa, SCL_VDEV, FTAG); + + /* + * Make 1/4 of the devices be log devices. + */ + nvroot = make_vdev_root(NULL, NULL, zopt_vdev_size, 0, + ztest_random(4) == 0, zopt_raidz, zs->zs_mirrors, 1); + + error = spa_vdev_add(spa, nvroot); + nvlist_free(nvroot); + + if (error == ENOSPC) + ztest_record_enospc("spa_vdev_add"); + else if (error != 0) + fatal(0, "spa_vdev_add() = %d", error); + } + + VERIFY(mutex_unlock(&ztest_shared->zs_vdev_lock) == 0); +} + +/* + * Verify that adding/removing aux devices (l2arc, hot spare) works as expected. + */ +/* ARGSUSED */ +void +ztest_vdev_aux_add_remove(ztest_ds_t *zd, uint64_t id) +{ + ztest_shared_t *zs = ztest_shared; + spa_t *spa = zs->zs_spa; + vdev_t *rvd = spa->spa_root_vdev; + spa_aux_vdev_t *sav; + char *aux; + uint64_t guid = 0; + int error; + + if (ztest_random(2) == 0) { + sav = &spa->spa_spares; + aux = ZPOOL_CONFIG_SPARES; + } else { + sav = &spa->spa_l2cache; + aux = ZPOOL_CONFIG_L2CACHE; + } + + VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0); + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + + if (sav->sav_count != 0 && ztest_random(4) == 0) { + /* + * Pick a random device to remove. + */ + guid = sav->sav_vdevs[ztest_random(sav->sav_count)]->vdev_guid; + } else { + /* + * Find an unused device we can add. + */ + zs->zs_vdev_aux = 0; + for (;;) { + char path[MAXPATHLEN]; + int c; (void) sprintf(path, ztest_aux_template, zopt_dir, - zopt_pool, aux, ztest_shared->zs_vdev_aux); + zopt_pool, aux, zs->zs_vdev_aux); for (c = 0; c < sav->sav_count; c++) if (strcmp(sav->sav_vdevs[c]->vdev_path, path) == 0) @@ -930,7 +2244,7 @@ ztest_vdev_aux_add_remove(ztest_args_t *za) if (c == sav->sav_count && vdev_lookup_by_path(rvd, path) == NULL) break; - ztest_shared->zs_vdev_aux++; + zs->zs_vdev_aux++; } } @@ -953,31 +2267,126 @@ ztest_vdev_aux_add_remove(ztest_args_t *za) * of devices that have pending state changes. 
*/ if (ztest_random(2) == 0) - (void) vdev_online(spa, guid, B_FALSE, NULL); + (void) vdev_online(spa, guid, 0, NULL); error = spa_vdev_remove(spa, guid, B_FALSE); if (error != 0 && error != EBUSY) fatal(0, "spa_vdev_remove(%llu) = %d", guid, error); } - (void) mutex_unlock(&ztest_shared->zs_vdev_lock); + VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); +} + +/* + * split a pool if it has mirror tlvdevs + */ +/* ARGSUSED */ +void +ztest_split_pool(ztest_ds_t *zd, uint64_t id) +{ + ztest_shared_t *zs = ztest_shared; + spa_t *spa = zs->zs_spa; + vdev_t *rvd = spa->spa_root_vdev; + nvlist_t *tree, **child, *config, *split, **schild; + uint_t c, children, schildren = 0, lastlogid = 0; + int error = 0; + + VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0); + + /* ensure we have a useable config; mirrors of raidz aren't supported */ + if (zs->zs_mirrors < 3 || zopt_raidz > 1) { + VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); + return; + } + + /* clean up the old pool, if any */ + (void) spa_destroy("splitp"); + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + + /* generate a config from the existing config */ + VERIFY(nvlist_lookup_nvlist(spa->spa_config, ZPOOL_CONFIG_VDEV_TREE, + &tree) == 0); + VERIFY(nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &child, + &children) == 0); + + schild = malloc(rvd->vdev_children * sizeof (nvlist_t *)); + for (c = 0; c < children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + nvlist_t **mchild; + uint_t mchildren; + + if (tvd->vdev_islog || tvd->vdev_ops == &vdev_hole_ops) { + VERIFY(nvlist_alloc(&schild[schildren], NV_UNIQUE_NAME, + 0) == 0); + VERIFY(nvlist_add_string(schild[schildren], + ZPOOL_CONFIG_TYPE, VDEV_TYPE_HOLE) == 0); + VERIFY(nvlist_add_uint64(schild[schildren], + ZPOOL_CONFIG_IS_HOLE, 1) == 0); + if (lastlogid == 0) + lastlogid = schildren; + ++schildren; + continue; + } + lastlogid = 0; + VERIFY(nvlist_lookup_nvlist_array(child[c], + ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren) == 0); + VERIFY(nvlist_dup(mchild[0], &schild[schildren++], 0) == 0); + } + + /* OK, create a config that can be used to split */ + VERIFY(nvlist_alloc(&split, NV_UNIQUE_NAME, 0) == 0); + VERIFY(nvlist_add_string(split, ZPOOL_CONFIG_TYPE, + VDEV_TYPE_ROOT) == 0); + VERIFY(nvlist_add_nvlist_array(split, ZPOOL_CONFIG_CHILDREN, schild, + lastlogid != 0 ? lastlogid : schildren) == 0); + + VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, 0) == 0); + VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, split) == 0); + + for (c = 0; c < schildren; c++) + nvlist_free(schild[c]); + free(schild); + nvlist_free(split); + + spa_config_exit(spa, SCL_VDEV, FTAG); + + (void) rw_wrlock(&zs->zs_name_lock); + error = spa_vdev_split_mirror(spa, "splitp", config, NULL, B_FALSE); + (void) rw_unlock(&zs->zs_name_lock); + + nvlist_free(config); + + if (error == 0) { + (void) printf("successful split - results:\n"); + mutex_enter(&spa_namespace_lock); + show_pool_stats(spa); + show_pool_stats(spa_lookup("splitp")); + mutex_exit(&spa_namespace_lock); + ++zs->zs_splits; + --zs->zs_mirrors; + } + VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); + } /* * Verify that we can attach and detach devices. 
*/ +/* ARGSUSED */ void -ztest_vdev_attach_detach(ztest_args_t *za) +ztest_vdev_attach_detach(ztest_ds_t *zd, uint64_t id) { - spa_t *spa = za->za_spa; + ztest_shared_t *zs = ztest_shared; + spa_t *spa = zs->zs_spa; spa_aux_vdev_t *sav = &spa->spa_spares; vdev_t *rvd = spa->spa_root_vdev; vdev_t *oldvd, *newvd, *pvd; nvlist_t *root; - uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz; + uint64_t leaves; uint64_t leaf, top; uint64_t ashift = ztest_get_ashift(); - uint64_t oldguid; + uint64_t oldguid, pguid; size_t oldsize, newsize; char oldpath[MAXPATHLEN], newpath[MAXPATHLEN]; int replacing; @@ -986,7 +2395,8 @@ ztest_vdev_attach_detach(ztest_args_t *za) int oldvd_is_log; int error, expected_error; - (void) mutex_lock(&ztest_shared->zs_vdev_lock); + VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0); + leaves = MAX(zs->zs_mirrors, 1) * zopt_raidz; spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); @@ -998,7 +2408,7 @@ ztest_vdev_attach_detach(ztest_args_t *za) /* * Pick a random top-level vdev. */ - top = ztest_random(rvd->vdev_children); + top = ztest_random_vdev_top(spa, B_TRUE); /* * Pick a random leaf within it. @@ -1009,10 +2419,16 @@ ztest_vdev_attach_detach(ztest_args_t *za) * Locate this vdev. */ oldvd = rvd->vdev_child[top]; - if (zopt_mirrors >= 1) + if (zs->zs_mirrors >= 1) { + ASSERT(oldvd->vdev_ops == &vdev_mirror_ops); + ASSERT(oldvd->vdev_children >= zs->zs_mirrors); oldvd = oldvd->vdev_child[leaf / zopt_raidz]; - if (zopt_raidz > 1) + } + if (zopt_raidz > 1) { + ASSERT(oldvd->vdev_ops == &vdev_raidz_ops); + ASSERT(oldvd->vdev_children == zopt_raidz); oldvd = oldvd->vdev_child[leaf % zopt_raidz]; + } /* * If we're already doing an attach or replace, oldvd may be a @@ -1020,26 +2436,27 @@ ztest_vdev_attach_detach(ztest_args_t *za) */ while (oldvd->vdev_children != 0) { oldvd_has_siblings = B_TRUE; - ASSERT(oldvd->vdev_children == 2); - oldvd = oldvd->vdev_child[ztest_random(2)]; + ASSERT(oldvd->vdev_children >= 2); + oldvd = oldvd->vdev_child[ztest_random(oldvd->vdev_children)]; } oldguid = oldvd->vdev_guid; - oldsize = vdev_get_rsize(oldvd); + oldsize = vdev_get_min_asize(oldvd); oldvd_is_log = oldvd->vdev_top->vdev_islog; (void) strcpy(oldpath, oldvd->vdev_path); pvd = oldvd->vdev_parent; + pguid = pvd->vdev_guid; /* * If oldvd has siblings, then half of the time, detach it. */ if (oldvd_has_siblings && ztest_random(2) == 0) { spa_config_exit(spa, SCL_VDEV, FTAG); - error = spa_vdev_detach(spa, oldguid, B_FALSE); - if (error != 0 && error != ENODEV && error != EBUSY) - fatal(0, "detach (%s) returned %d", - oldpath, error); - (void) mutex_unlock(&ztest_shared->zs_vdev_lock); + error = spa_vdev_detach(spa, oldguid, pguid, B_FALSE); + if (error != 0 && error != ENODEV && error != EBUSY && + error != ENOTSUP) + fatal(0, "detach (%s) returned %d", oldpath, error); + VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); return; } @@ -1060,7 +2477,7 @@ ztest_vdev_attach_detach(ztest_args_t *za) } if (newvd) { - newsize = vdev_get_rsize(newvd); + newsize = vdev_get_min_asize(newvd); } else { /* * Make newsize a little bigger or smaller than oldsize. @@ -1132,7 +2549,117 @@ ztest_vdev_attach_detach(ztest_args_t *za) (longlong_t)newsize, replacing, error, expected_error); } - (void) mutex_unlock(&ztest_shared->zs_vdev_lock); + VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); +} + +/* + * Callback function which expands the physical size of the vdev. 
+ */ +vdev_t * +grow_vdev(vdev_t *vd, void *arg) +{ + spa_t *spa = vd->vdev_spa; + size_t *newsize = arg; + size_t fsize; + int fd; + + ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE); + ASSERT(vd->vdev_ops->vdev_op_leaf); + + if ((fd = open(vd->vdev_path, O_RDWR)) == -1) + return (vd); + + fsize = lseek(fd, 0, SEEK_END); + (void) ftruncate(fd, *newsize); + + if (zopt_verbose >= 6) { + (void) printf("%s grew from %lu to %lu bytes\n", + vd->vdev_path, (ulong_t)fsize, (ulong_t)*newsize); + } + (void) close(fd); + return (NULL); +} + +/* + * Callback function which expands a given vdev by calling vdev_online(). + */ +/* ARGSUSED */ +vdev_t * +online_vdev(vdev_t *vd, void *arg) +{ + spa_t *spa = vd->vdev_spa; + vdev_t *tvd = vd->vdev_top; + uint64_t guid = vd->vdev_guid; + uint64_t generation = spa->spa_config_generation + 1; + vdev_state_t newstate = VDEV_STATE_UNKNOWN; + int error; + + ASSERT(spa_config_held(spa, SCL_STATE, RW_READER) == SCL_STATE); + ASSERT(vd->vdev_ops->vdev_op_leaf); + + /* Calling vdev_online will initialize the new metaslabs */ + spa_config_exit(spa, SCL_STATE, spa); + error = vdev_online(spa, guid, ZFS_ONLINE_EXPAND, &newstate); + spa_config_enter(spa, SCL_STATE, spa, RW_READER); + + /* + * If vdev_online returned an error or the underlying vdev_open + * failed then we abort the expand. The only way to know that + * vdev_open fails is by checking the returned newstate. + */ + if (error || newstate != VDEV_STATE_HEALTHY) { + if (zopt_verbose >= 5) { + (void) printf("Unable to expand vdev, state %llu, " + "error %d\n", (u_longlong_t)newstate, error); + } + return (vd); + } + ASSERT3U(newstate, ==, VDEV_STATE_HEALTHY); + + /* + * Since we dropped the lock we need to ensure that we're + * still talking to the original vdev. It's possible this + * vdev may have been detached/replaced while we were + * trying to online it. + */ + if (generation != spa->spa_config_generation) { + if (zopt_verbose >= 5) { + (void) printf("vdev configuration has changed, " + "guid %llu, state %llu, expected gen %llu, " + "got gen %llu\n", + (u_longlong_t)guid, + (u_longlong_t)tvd->vdev_state, + (u_longlong_t)generation, + (u_longlong_t)spa->spa_config_generation); + } + return (vd); + } + return (NULL); +} + +/* + * Traverse the vdev tree calling the supplied function. + * We continue to walk the tree until we either have walked all + * children or we receive a non-NULL return from the callback. + * If a NULL callback is passed, then we just return back the first + * leaf vdev we encounter. 
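/*
 * Sketch of the grow-by-truncation idea used by grow_vdev() above, reduced
 * to plain POSIX calls on an ordinary file; grow_file() is a hypothetical
 * helper, not a ztest function.  It returns -1 if the file cannot be opened,
 * echoing grow_vdev()'s "non-NULL means stop the walk" convention.
 */
#include <sys/types.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

static int
grow_file(const char *path, off_t newsize)
{
	off_t oldsize;
	int fd;

	if ((fd = open(path, O_RDWR)) == -1)
		return (-1);

	oldsize = lseek(fd, 0, SEEK_END);	/* current physical size */
	(void) ftruncate(fd, newsize);		/* extend the backing file */

	(void) printf("%s grew from %lld to %lld bytes\n",
	    path, (long long)oldsize, (long long)newsize);
	(void) close(fd);
	return (0);
}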
+ */ +vdev_t * +vdev_walk_tree(vdev_t *vd, vdev_t *(*func)(vdev_t *, void *), void *arg) +{ + if (vd->vdev_ops->vdev_op_leaf) { + if (func == NULL) + return (vd); + else + return (func(vd, arg)); + } + + for (uint_t c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + if ((cvd = vdev_walk_tree(cvd, func, arg)) != NULL) + return (cvd); + } + return (NULL); } /* @@ -1140,164 +2667,240 @@ ztest_vdev_attach_detach(ztest_args_t *za) */ /* ARGSUSED */ void -ztest_vdev_LUN_growth(ztest_args_t *za) +ztest_vdev_LUN_growth(ztest_ds_t *zd, uint64_t id) { - spa_t *spa = za->za_spa; - char dev_name[MAXPATHLEN]; - uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz; - uint64_t vdev; - size_t fsize; - int fd; + ztest_shared_t *zs = ztest_shared; + spa_t *spa = zs->zs_spa; + vdev_t *vd, *tvd; + metaslab_class_t *mc; + metaslab_group_t *mg; + size_t psize, newsize; + uint64_t top; + uint64_t old_class_space, new_class_space, old_ms_count, new_ms_count; + + VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0); + spa_config_enter(spa, SCL_STATE, spa, RW_READER); + + top = ztest_random_vdev_top(spa, B_TRUE); - (void) mutex_lock(&ztest_shared->zs_vdev_lock); + tvd = spa->spa_root_vdev->vdev_child[top]; + mg = tvd->vdev_mg; + mc = mg->mg_class; + old_ms_count = tvd->vdev_ms_count; + old_class_space = metaslab_class_get_space(mc); /* - * Pick a random leaf vdev. + * Determine the size of the first leaf vdev associated with + * our top-level device. */ - spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); - vdev = ztest_random(spa->spa_root_vdev->vdev_children * leaves); - spa_config_exit(spa, SCL_VDEV, FTAG); + vd = vdev_walk_tree(tvd, NULL, NULL); + ASSERT3P(vd, !=, NULL); + ASSERT(vd->vdev_ops->vdev_op_leaf); - (void) sprintf(dev_name, ztest_dev_template, zopt_dir, zopt_pool, vdev); + psize = vd->vdev_psize; - if ((fd = open(dev_name, O_RDWR)) != -1) { - /* - * Determine the size. - */ - fsize = lseek(fd, 0, SEEK_END); + /* + * We only try to expand the vdev if it's healthy, less than 4x its + * original size, and it has a valid psize. + */ + if (tvd->vdev_state != VDEV_STATE_HEALTHY || + psize == 0 || psize >= 4 * zopt_vdev_size) { + spa_config_exit(spa, SCL_STATE, spa); + VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); + return; + } + ASSERT(psize > 0); + newsize = psize + psize / 8; + ASSERT3U(newsize, >, psize); - /* - * If it's less than 2x the original size, grow by around 3%. - */ - if (fsize < 2 * zopt_vdev_size) { - size_t newsize = fsize + ztest_random(fsize / 32); - (void) ftruncate(fd, newsize); - if (zopt_verbose >= 6) { - (void) printf("%s grew from %lu to %lu bytes\n", - dev_name, (ulong_t)fsize, (ulong_t)newsize); - } + if (zopt_verbose >= 6) { + (void) printf("Expanding LUN %s from %lu to %lu\n", + vd->vdev_path, (ulong_t)psize, (ulong_t)newsize); + } + + /* + * Growing the vdev is a two step process: + * 1). expand the physical size (i.e. relabel) + * 2). 
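/*
 * Sketch of the traversal contract implemented by vdev_walk_tree() above,
 * restated over a generic n-ary tree (struct node and walk_cb_t are
 * hypothetical).  A NULL callback returns the first leaf encountered; a
 * callback that returns non-NULL stops the walk and propagates that value.
 */
#include <stddef.h>

struct node {
	struct node	**n_child;
	unsigned	n_children;	/* 0 means this node is a leaf */
};

typedef struct node *(*walk_cb_t)(struct node *, void *);

static struct node *
walk_tree(struct node *n, walk_cb_t func, void *arg)
{
	if (n->n_children == 0)
		return (func == NULL ? n : func(n, arg));

	for (unsigned c = 0; c < n->n_children; c++) {
		struct node *hit = walk_tree(n->n_child[c], func, arg);
		if (hit != NULL)
			return (hit);
	}
	return (NULL);
}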
online the vdev to create the new metaslabs + */ + if (vdev_walk_tree(tvd, grow_vdev, &newsize) != NULL || + vdev_walk_tree(tvd, online_vdev, NULL) != NULL || + tvd->vdev_state != VDEV_STATE_HEALTHY) { + if (zopt_verbose >= 5) { + (void) printf("Could not expand LUN because " + "the vdev configuration changed.\n"); } - (void) close(fd); + spa_config_exit(spa, SCL_STATE, spa); + VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); + return; + } + + spa_config_exit(spa, SCL_STATE, spa); + + /* + * Expanding the LUN will update the config asynchronously, + * thus we must wait for the async thread to complete any + * pending tasks before proceeding. + */ + for (;;) { + boolean_t done; + mutex_enter(&spa->spa_async_lock); + done = (spa->spa_async_thread == NULL && !spa->spa_async_tasks); + mutex_exit(&spa->spa_async_lock); + if (done) + break; + txg_wait_synced(spa_get_dsl(spa), 0); + (void) poll(NULL, 0, 100); + } + + spa_config_enter(spa, SCL_STATE, spa, RW_READER); + + tvd = spa->spa_root_vdev->vdev_child[top]; + new_ms_count = tvd->vdev_ms_count; + new_class_space = metaslab_class_get_space(mc); + + if (tvd->vdev_mg != mg || mg->mg_class != mc) { + if (zopt_verbose >= 5) { + (void) printf("Could not verify LUN expansion due to " + "intervening vdev offline or remove.\n"); + } + spa_config_exit(spa, SCL_STATE, spa); + VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); + return; + } + + /* + * Make sure we were able to grow the vdev. + */ + if (new_ms_count <= old_ms_count) + fatal(0, "LUN expansion failed: ms_count %llu <= %llu\n", + old_ms_count, new_ms_count); + + /* + * Make sure we were able to grow the pool. + */ + if (new_class_space <= old_class_space) + fatal(0, "LUN expansion failed: class_space %llu <= %llu\n", + old_class_space, new_class_space); + + if (zopt_verbose >= 5) { + char oldnumbuf[6], newnumbuf[6]; + + nicenum(old_class_space, oldnumbuf); + nicenum(new_class_space, newnumbuf); + (void) printf("%s grew from %s to %s\n", + spa->spa_name, oldnumbuf, newnumbuf); } - (void) mutex_unlock(&ztest_shared->zs_vdev_lock); + spa_config_exit(spa, SCL_STATE, spa); + VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); } +/* + * Verify that dmu_objset_{create,destroy,open,close} work as expected. + */ /* ARGSUSED */ static void -ztest_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) +ztest_objset_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) { /* - * Create the directory object. + * Create the objects common to all ztest datasets. */ - VERIFY(dmu_object_claim(os, ZTEST_DIROBJ, - DMU_OT_UINT64_OTHER, ZTEST_DIROBJ_BLOCKSIZE, - DMU_OT_UINT64_OTHER, 5 * sizeof (ztest_block_tag_t), tx) == 0); - - VERIFY(zap_create_claim(os, ZTEST_MICROZAP_OBJ, - DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx) == 0); - - VERIFY(zap_create_claim(os, ZTEST_FATZAP_OBJ, + VERIFY(zap_create_claim(os, ZTEST_DIROBJ, DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx) == 0); } +/* ARGSUSED */ static int -ztest_destroy_cb(char *name, void *arg) +ztest_objset_destroy_cb(const char *name, void *arg) { - ztest_args_t *za = arg; objset_t *os; - dmu_object_info_t *doi = &za->za_doi; + dmu_object_info_t doi; int error; /* * Verify that the dataset contains a directory object. 
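/*
 * Minimal sketch of the wait loop used above while the pool's async thread
 * drains: poll a completion predicate and sleep roughly 100 ms between
 * checks (poll() with no descriptors serves as a portable sub-second sleep).
 * The done_fn_t type and wait_for() name are hypothetical; ztest also nudges
 * the pool forward with txg_wait_synced() on each pass.
 */
#include <poll.h>

typedef int (*done_fn_t)(void *);

static void
wait_for(done_fn_t done, void *arg)
{
	while (!done(arg))
		(void) poll(NULL, 0, 100);	/* sleep roughly 100 ms */
}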
*/ - error = dmu_objset_open(name, DMU_OST_OTHER, - DS_MODE_USER | DS_MODE_READONLY, &os); - ASSERT3U(error, ==, 0); - error = dmu_object_info(os, ZTEST_DIROBJ, doi); + VERIFY3U(0, ==, dmu_objset_hold(name, FTAG, &os)); + error = dmu_object_info(os, ZTEST_DIROBJ, &doi); if (error != ENOENT) { /* We could have crashed in the middle of destroying it */ ASSERT3U(error, ==, 0); - ASSERT3U(doi->doi_type, ==, DMU_OT_UINT64_OTHER); - ASSERT3S(doi->doi_physical_blks, >=, 0); + ASSERT3U(doi.doi_type, ==, DMU_OT_ZAP_OTHER); + ASSERT3S(doi.doi_physical_blocks_512, >=, 0); } - dmu_objset_close(os); + dmu_objset_rele(os, FTAG); /* * Destroy the dataset. */ - error = dmu_objset_destroy(name); - if (error) { - (void) dmu_objset_open(name, DMU_OST_OTHER, - DS_MODE_USER | DS_MODE_READONLY, &os); - fatal(0, "dmu_objset_destroy(os=%p) = %d\n", &os, error); - } + VERIFY3U(0, ==, dmu_objset_destroy(name, B_FALSE)); return (0); } -/* - * Verify that dmu_objset_{create,destroy,open,close} work as expected. - */ -static uint64_t -ztest_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t object, int mode) +static boolean_t +ztest_snapshot_create(char *osname, uint64_t id) { - itx_t *itx; - lr_create_t *lr; - size_t namesize; - char name[24]; - - (void) sprintf(name, "ZOBJ_%llu", (u_longlong_t)object); - namesize = strlen(name) + 1; - - itx = zil_itx_create(TX_CREATE, sizeof (*lr) + namesize + - ztest_random(ZIL_MAX_BLKSZ)); - lr = (lr_create_t *)&itx->itx_lr; - bzero(lr + 1, lr->lr_common.lrc_reclen - sizeof (*lr)); - lr->lr_doid = object; - lr->lr_foid = 0; - lr->lr_mode = mode; - lr->lr_uid = 0; - lr->lr_gid = 0; - lr->lr_gen = dmu_tx_get_txg(tx); - lr->lr_crtime[0] = time(NULL); - lr->lr_crtime[1] = 0; - lr->lr_rdev = 0; - bcopy(name, (char *)(lr + 1), namesize); - - return (zil_itx_assign(zilog, itx, tx)); + char snapname[MAXNAMELEN]; + int error; + + (void) snprintf(snapname, MAXNAMELEN, "%s@%llu", osname, + (u_longlong_t)id); + + error = dmu_objset_snapshot(osname, strchr(snapname, '@') + 1, + NULL, B_FALSE); + if (error == ENOSPC) { + ztest_record_enospc(FTAG); + return (B_FALSE); + } + if (error != 0 && error != EEXIST) + fatal(0, "ztest_snapshot_create(%s) = %d", snapname, error); + return (B_TRUE); +} + +static boolean_t +ztest_snapshot_destroy(char *osname, uint64_t id) +{ + char snapname[MAXNAMELEN]; + int error; + + (void) snprintf(snapname, MAXNAMELEN, "%s@%llu", osname, + (u_longlong_t)id); + + error = dmu_objset_destroy(snapname, B_FALSE); + if (error != 0 && error != ENOENT) + fatal(0, "ztest_snapshot_destroy(%s) = %d", snapname, error); + return (B_TRUE); } +/* ARGSUSED */ void -ztest_dmu_objset_create_destroy(ztest_args_t *za) +ztest_dmu_objset_create_destroy(ztest_ds_t *zd, uint64_t id) { + ztest_shared_t *zs = ztest_shared; + ztest_ds_t zdtmp; + int iters; int error; objset_t *os, *os2; - char name[100]; - int basemode, expected_error; + char name[MAXNAMELEN]; zilog_t *zilog; - uint64_t seq; - uint64_t objects; - ztest_replay_t zr; - (void) rw_rdlock(&ztest_shared->zs_name_lock); - (void) snprintf(name, 100, "%s/%s_temp_%llu", za->za_pool, za->za_pool, - (u_longlong_t)za->za_instance); + (void) rw_rdlock(&zs->zs_name_lock); - basemode = DS_MODE_TYPE(za->za_instance); - if (basemode != DS_MODE_USER && basemode != DS_MODE_OWNER) - basemode = DS_MODE_USER; + (void) snprintf(name, MAXNAMELEN, "%s/temp_%llu", + zs->zs_pool, (u_longlong_t)id); /* * If this dataset exists from a previous run, process its replay log * half of the time. 
If we don't replay it, then dmu_objset_destroy() - * (invoked from ztest_destroy_cb() below) should just throw it away. + * (invoked from ztest_objset_destroy_cb()) should just throw it away. */ if (ztest_random(2) == 0 && - dmu_objset_open(name, DMU_OST_OTHER, DS_MODE_OWNER, &os) == 0) { - zr.zr_os = os; - zil_replay(os, &zr, &zr.zr_assign, ztest_replay_vector, NULL); - dmu_objset_close(os); + dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os) == 0) { + ztest_zd_init(&zdtmp, os); + zil_replay(os, &zdtmp, ztest_replay_vector); + ztest_zd_fini(&zdtmp); + dmu_objset_disown(os, FTAG); } /* @@ -1305,372 +2908,262 @@ ztest_dmu_objset_create_destroy(ztest_args_t *za) * create lying around from a previous run. If so, destroy it * and all of its snapshots. */ - (void) dmu_objset_find(name, ztest_destroy_cb, za, + (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); /* * Verify that the destroyed dataset is no longer in the namespace. */ - error = dmu_objset_open(name, DMU_OST_OTHER, basemode, &os); - if (error != ENOENT) - fatal(1, "dmu_objset_open(%s) found destroyed dataset %p", - name, os); + VERIFY3U(ENOENT, ==, dmu_objset_hold(name, FTAG, &os)); /* * Verify that we can create a new dataset. */ - error = dmu_objset_create(name, DMU_OST_OTHER, NULL, 0, - ztest_create_cb, NULL); + error = dmu_objset_create(name, DMU_OST_OTHER, 0, + ztest_objset_create_cb, NULL); if (error) { if (error == ENOSPC) { - ztest_record_enospc("dmu_objset_create"); - (void) rw_unlock(&ztest_shared->zs_name_lock); + ztest_record_enospc(FTAG); + (void) rw_unlock(&zs->zs_name_lock); return; } fatal(0, "dmu_objset_create(%s) = %d", name, error); } - error = dmu_objset_open(name, DMU_OST_OTHER, basemode, &os); - if (error) { - fatal(0, "dmu_objset_open(%s) = %d", name, error); - } + VERIFY3U(0, ==, + dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os)); + + ztest_zd_init(&zdtmp, os); /* * Open the intent log for it. */ - zilog = zil_open(os, NULL); + zilog = zil_open(os, ztest_get_data); /* - * Put a random number of objects in there. + * Put some objects in there, do a little I/O to them, + * and randomly take a couple of snapshots along the way. */ - objects = ztest_random(20); - seq = 0; - while (objects-- != 0) { - uint64_t object; - dmu_tx_t *tx = dmu_tx_create(os); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, sizeof (name)); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - } else { - object = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0, - DMU_OT_NONE, 0, tx); - ztest_set_random_blocksize(os, object, tx); - seq = ztest_log_create(zilog, tx, object, - DMU_OT_UINT64_OTHER); - dmu_write(os, object, 0, sizeof (name), name, tx); - dmu_tx_commit(tx); - } - if (ztest_random(5) == 0) { - zil_commit(zilog, seq, object); - } - if (ztest_random(100) == 0) { - error = zil_suspend(zilog); - if (error == 0) { - zil_resume(zilog); - } - } + iters = ztest_random(5); + for (int i = 0; i < iters; i++) { + ztest_dmu_object_alloc_free(&zdtmp, id); + if (ztest_random(iters) == 0) + (void) ztest_snapshot_create(name, i); } /* * Verify that we cannot create an existing dataset. */ - error = dmu_objset_create(name, DMU_OST_OTHER, NULL, 0, NULL, NULL); - if (error != EEXIST) - fatal(0, "created existing dataset, error = %d", error); + VERIFY3U(EEXIST, ==, + dmu_objset_create(name, DMU_OST_OTHER, 0, NULL, NULL)); /* - * Verify that multiple dataset holds are allowed, but only when - * the new access mode is compatible with the base mode. 
+ * Verify that we can hold an objset that is also owned. */ - if (basemode == DS_MODE_OWNER) { - error = dmu_objset_open(name, DMU_OST_OTHER, DS_MODE_USER, - &os2); - if (error) - fatal(0, "dmu_objset_open('%s') = %d", name, error); - else - dmu_objset_close(os2); - } - error = dmu_objset_open(name, DMU_OST_OTHER, DS_MODE_OWNER, &os2); - expected_error = (basemode == DS_MODE_OWNER) ? EBUSY : 0; - if (error != expected_error) - fatal(0, "dmu_objset_open('%s') = %d, expected %d", - name, error, expected_error); - if (error == 0) - dmu_objset_close(os2); + VERIFY3U(0, ==, dmu_objset_hold(name, FTAG, &os2)); + dmu_objset_rele(os2, FTAG); - zil_close(zilog); - dmu_objset_close(os); + /* + * Verify that we cannot own an objset that is already owned. + */ + VERIFY3U(EBUSY, ==, + dmu_objset_own(name, DMU_OST_OTHER, B_FALSE, FTAG, &os2)); - error = dmu_objset_destroy(name); - if (error) - fatal(0, "dmu_objset_destroy(%s) = %d", name, error); + zil_close(zilog); + dmu_objset_disown(os, FTAG); + ztest_zd_fini(&zdtmp); - (void) rw_unlock(&ztest_shared->zs_name_lock); + (void) rw_unlock(&zs->zs_name_lock); } /* * Verify that dmu_snapshot_{create,destroy,open,close} work as expected. */ void -ztest_dmu_snapshot_create_destroy(ztest_args_t *za) +ztest_dmu_snapshot_create_destroy(ztest_ds_t *zd, uint64_t id) { - int error; - objset_t *os = za->za_os; - char snapname[100]; - char osname[MAXNAMELEN]; + ztest_shared_t *zs = ztest_shared; - (void) rw_rdlock(&ztest_shared->zs_name_lock); - dmu_objset_name(os, osname); - (void) snprintf(snapname, 100, "%s@%llu", osname, - (u_longlong_t)za->za_instance); + (void) rw_rdlock(&zs->zs_name_lock); + (void) ztest_snapshot_destroy(zd->zd_name, id); + (void) ztest_snapshot_create(zd->zd_name, id); + (void) rw_unlock(&zs->zs_name_lock); +} - error = dmu_objset_destroy(snapname); - if (error != 0 && error != ENOENT) - fatal(0, "dmu_objset_destroy() = %d", error); - error = dmu_objset_snapshot(osname, strchr(snapname, '@')+1, FALSE); - if (error == ENOSPC) - ztest_record_enospc("dmu_take_snapshot"); - else if (error != 0 && error != EEXIST) - fatal(0, "dmu_take_snapshot() = %d", error); - (void) rw_unlock(&ztest_shared->zs_name_lock); +/* + * Cleanup non-standard snapshots and clones. 
+ */ +void +ztest_dsl_dataset_cleanup(char *osname, uint64_t id) +{ + char snap1name[MAXNAMELEN]; + char clone1name[MAXNAMELEN]; + char snap2name[MAXNAMELEN]; + char clone2name[MAXNAMELEN]; + char snap3name[MAXNAMELEN]; + int error; + + (void) snprintf(snap1name, MAXNAMELEN, "%s@s1_%llu", osname, id); + (void) snprintf(clone1name, MAXNAMELEN, "%s/c1_%llu", osname, id); + (void) snprintf(snap2name, MAXNAMELEN, "%s@s2_%llu", clone1name, id); + (void) snprintf(clone2name, MAXNAMELEN, "%s/c2_%llu", osname, id); + (void) snprintf(snap3name, MAXNAMELEN, "%s@s3_%llu", clone1name, id); + + error = dmu_objset_destroy(clone2name, B_FALSE); + if (error && error != ENOENT) + fatal(0, "dmu_objset_destroy(%s) = %d", clone2name, error); + error = dmu_objset_destroy(snap3name, B_FALSE); + if (error && error != ENOENT) + fatal(0, "dmu_objset_destroy(%s) = %d", snap3name, error); + error = dmu_objset_destroy(snap2name, B_FALSE); + if (error && error != ENOENT) + fatal(0, "dmu_objset_destroy(%s) = %d", snap2name, error); + error = dmu_objset_destroy(clone1name, B_FALSE); + if (error && error != ENOENT) + fatal(0, "dmu_objset_destroy(%s) = %d", clone1name, error); + error = dmu_objset_destroy(snap1name, B_FALSE); + if (error && error != ENOENT) + fatal(0, "dmu_objset_destroy(%s) = %d", snap1name, error); } /* - * Verify that dmu_object_{alloc,free} work as expected. + * Verify dsl_dataset_promote handles EBUSY */ void -ztest_dmu_object_alloc_free(ztest_args_t *za) +ztest_dsl_dataset_promote_busy(ztest_ds_t *zd, uint64_t id) { - objset_t *os = za->za_os; - dmu_buf_t *db; - dmu_tx_t *tx; - uint64_t batchobj, object, batchsize, endoff, temp; - int b, c, error, bonuslen; - dmu_object_info_t *doi = &za->za_doi; - char osname[MAXNAMELEN]; + ztest_shared_t *zs = ztest_shared; + objset_t *clone; + dsl_dataset_t *ds; + char snap1name[MAXNAMELEN]; + char clone1name[MAXNAMELEN]; + char snap2name[MAXNAMELEN]; + char clone2name[MAXNAMELEN]; + char snap3name[MAXNAMELEN]; + char *osname = zd->zd_name; + int error; - dmu_objset_name(os, osname); + (void) rw_rdlock(&zs->zs_name_lock); - endoff = -8ULL; - batchsize = 2; + ztest_dsl_dataset_cleanup(osname, id); - /* - * Create a batch object if necessary, and record it in the directory. - */ - VERIFY3U(0, ==, dmu_read(os, ZTEST_DIROBJ, za->za_diroff, - sizeof (uint64_t), &batchobj)); - if (batchobj == 0) { - tx = dmu_tx_create(os); - dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff, - sizeof (uint64_t)); - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - ztest_record_enospc("create a batch object"); - dmu_tx_abort(tx); - return; - } - batchobj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0, - DMU_OT_NONE, 0, tx); - ztest_set_random_blocksize(os, batchobj, tx); - dmu_write(os, ZTEST_DIROBJ, za->za_diroff, - sizeof (uint64_t), &batchobj, tx); - dmu_tx_commit(tx); - } + (void) snprintf(snap1name, MAXNAMELEN, "%s@s1_%llu", osname, id); + (void) snprintf(clone1name, MAXNAMELEN, "%s/c1_%llu", osname, id); + (void) snprintf(snap2name, MAXNAMELEN, "%s@s2_%llu", clone1name, id); + (void) snprintf(clone2name, MAXNAMELEN, "%s/c2_%llu", osname, id); + (void) snprintf(snap3name, MAXNAMELEN, "%s@s3_%llu", clone1name, id); - /* - * Destroy the previous batch of objects. - */ - for (b = 0; b < batchsize; b++) { - VERIFY3U(0, ==, dmu_read(os, batchobj, b * sizeof (uint64_t), - sizeof (uint64_t), &object)); - if (object == 0) - continue; - /* - * Read and validate contents. - * We expect the nth byte of the bonus buffer to be n. 
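/*
 * Sketch of the naming and teardown pattern in ztest_dsl_dataset_cleanup()
 * above: the five datasets form a snapshot/clone chain
 * (snap1 -> clone1 -> snap2/snap3 -> clone2), so they are destroyed in
 * reverse dependency order and a missing entry (ENOENT) is tolerated.
 * build_chain_names(), destroy_chain() and SKETCH_MAXNAMELEN are
 * hypothetical names, not part of the ztest code.
 */
#include <errno.h>
#include <stdio.h>

#define	SKETCH_MAXNAMELEN	256

static void
build_chain_names(const char *osname, unsigned long long id,
    char names[5][SKETCH_MAXNAMELEN])
{
	(void) snprintf(names[0], SKETCH_MAXNAMELEN, "%s@s1_%llu", osname, id);
	(void) snprintf(names[1], SKETCH_MAXNAMELEN, "%s/c1_%llu", osname, id);
	(void) snprintf(names[2], SKETCH_MAXNAMELEN, "%s@s2_%llu", names[1], id);
	(void) snprintf(names[3], SKETCH_MAXNAMELEN, "%s/c2_%llu", osname, id);
	(void) snprintf(names[4], SKETCH_MAXNAMELEN, "%s@s3_%llu", names[1], id);
}

/* Tear down clone2, snap3, snap2, clone1, snap1 -- children before parents. */
static void
destroy_chain(char names[5][SKETCH_MAXNAMELEN],
    int (*destroy_cb)(const char *))
{
	static const int order[] = { 3, 4, 2, 1, 0 };

	for (unsigned i = 0; i < 5; i++) {
		int error = destroy_cb(names[order[i]]);
		if (error != 0 && error != ENOENT)
			(void) fprintf(stderr, "destroy(%s) = %d\n",
			    names[order[i]], error);
	}
}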
- */ - VERIFY(0 == dmu_bonus_hold(os, object, FTAG, &db)); - za->za_dbuf = db; - - dmu_object_info_from_db(db, doi); - ASSERT(doi->doi_type == DMU_OT_UINT64_OTHER); - ASSERT(doi->doi_bonus_type == DMU_OT_PLAIN_OTHER); - ASSERT3S(doi->doi_physical_blks, >=, 0); - - bonuslen = doi->doi_bonus_size; - - for (c = 0; c < bonuslen; c++) { - if (((uint8_t *)db->db_data)[c] != - (uint8_t)(c + bonuslen)) { - fatal(0, - "bad bonus: %s, obj %llu, off %d: %u != %u", - osname, object, c, - ((uint8_t *)db->db_data)[c], - (uint8_t)(c + bonuslen)); - } + error = dmu_objset_snapshot(osname, strchr(snap1name, '@')+1, + NULL, B_FALSE); + if (error && error != EEXIST) { + if (error == ENOSPC) { + ztest_record_enospc(FTAG); + goto out; } + fatal(0, "dmu_take_snapshot(%s) = %d", snap1name, error); + } - dmu_buf_rele(db, FTAG); - za->za_dbuf = NULL; - - /* - * We expect the word at endoff to be our object number. - */ - VERIFY(0 == dmu_read(os, object, endoff, - sizeof (uint64_t), &temp)); + error = dmu_objset_hold(snap1name, FTAG, &clone); + if (error) + fatal(0, "dmu_open_snapshot(%s) = %d", snap1name, error); - if (temp != object) { - fatal(0, "bad data in %s, got %llu, expected %llu", - osname, temp, object); + error = dmu_objset_clone(clone1name, dmu_objset_ds(clone), 0); + dmu_objset_rele(clone, FTAG); + if (error) { + if (error == ENOSPC) { + ztest_record_enospc(FTAG); + goto out; } + fatal(0, "dmu_objset_create(%s) = %d", clone1name, error); + } - /* - * Destroy old object and clear batch entry. - */ - tx = dmu_tx_create(os); - dmu_tx_hold_write(tx, batchobj, - b * sizeof (uint64_t), sizeof (uint64_t)); - dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - ztest_record_enospc("free object"); - dmu_tx_abort(tx); - return; - } - error = dmu_object_free(os, object, tx); - if (error) { - fatal(0, "dmu_object_free('%s', %llu) = %d", - osname, object, error); + error = dmu_objset_snapshot(clone1name, strchr(snap2name, '@')+1, + NULL, B_FALSE); + if (error && error != EEXIST) { + if (error == ENOSPC) { + ztest_record_enospc(FTAG); + goto out; } - object = 0; - - dmu_object_set_checksum(os, batchobj, - ztest_random_checksum(), tx); - dmu_object_set_compress(os, batchobj, - ztest_random_compress(), tx); - - dmu_write(os, batchobj, b * sizeof (uint64_t), - sizeof (uint64_t), &object, tx); - - dmu_tx_commit(tx); + fatal(0, "dmu_open_snapshot(%s) = %d", snap2name, error); } - /* - * Before creating the new batch of objects, generate a bunch of churn. - */ - for (b = ztest_random(100); b > 0; b--) { - tx = dmu_tx_create(os); - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - ztest_record_enospc("churn objects"); - dmu_tx_abort(tx); - return; - } - object = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0, - DMU_OT_NONE, 0, tx); - ztest_set_random_blocksize(os, object, tx); - error = dmu_object_free(os, object, tx); - if (error) { - fatal(0, "dmu_object_free('%s', %llu) = %d", - osname, object, error); + error = dmu_objset_snapshot(clone1name, strchr(snap3name, '@')+1, + NULL, B_FALSE); + if (error && error != EEXIST) { + if (error == ENOSPC) { + ztest_record_enospc(FTAG); + goto out; } - dmu_tx_commit(tx); + fatal(0, "dmu_open_snapshot(%s) = %d", snap3name, error); } - /* - * Create a new batch of objects with randomly chosen - * blocksizes and record them in the batch directory. 
- */ - for (b = 0; b < batchsize; b++) { - uint32_t va_blksize; - u_longlong_t va_nblocks; + error = dmu_objset_hold(snap3name, FTAG, &clone); + if (error) + fatal(0, "dmu_open_snapshot(%s) = %d", snap3name, error); - tx = dmu_tx_create(os); - dmu_tx_hold_write(tx, batchobj, b * sizeof (uint64_t), - sizeof (uint64_t)); - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, endoff, - sizeof (uint64_t)); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - ztest_record_enospc("create batchobj"); - dmu_tx_abort(tx); - return; + error = dmu_objset_clone(clone2name, dmu_objset_ds(clone), 0); + dmu_objset_rele(clone, FTAG); + if (error) { + if (error == ENOSPC) { + ztest_record_enospc(FTAG); + goto out; } - bonuslen = (int)ztest_random(dmu_bonus_max()) + 1; - - object = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0, - DMU_OT_PLAIN_OTHER, bonuslen, tx); - - ztest_set_random_blocksize(os, object, tx); - - dmu_object_set_checksum(os, object, - ztest_random_checksum(), tx); - dmu_object_set_compress(os, object, - ztest_random_compress(), tx); - - dmu_write(os, batchobj, b * sizeof (uint64_t), - sizeof (uint64_t), &object, tx); + fatal(0, "dmu_objset_create(%s) = %d", clone2name, error); + } - /* - * Write to both the bonus buffer and the regular data. - */ - VERIFY(dmu_bonus_hold(os, object, FTAG, &db) == 0); - za->za_dbuf = db; - ASSERT3U(bonuslen, <=, db->db_size); + error = dsl_dataset_own(snap1name, B_FALSE, FTAG, &ds); + if (error) + fatal(0, "dsl_dataset_own(%s) = %d", snap1name, error); + error = dsl_dataset_promote(clone2name, NULL); + if (error != EBUSY) + fatal(0, "dsl_dataset_promote(%s), %d, not EBUSY", clone2name, + error); + dsl_dataset_disown(ds, FTAG); - dmu_object_size_from_db(db, &va_blksize, &va_nblocks); - ASSERT3S(va_nblocks, >=, 0); +out: + ztest_dsl_dataset_cleanup(osname, id); - dmu_buf_will_dirty(db, tx); + (void) rw_unlock(&zs->zs_name_lock); +} - /* - * See comments above regarding the contents of - * the bonus buffer and the word at endoff. - */ - for (c = 0; c < bonuslen; c++) - ((uint8_t *)db->db_data)[c] = (uint8_t)(c + bonuslen); +/* + * Verify that dmu_object_{alloc,free} work as expected. + */ +void +ztest_dmu_object_alloc_free(ztest_ds_t *zd, uint64_t id) +{ + ztest_od_t od[4]; + int batchsize = sizeof (od) / sizeof (od[0]); - dmu_buf_rele(db, FTAG); - za->za_dbuf = NULL; + for (int b = 0; b < batchsize; b++) + ztest_od_init(&od[b], id, FTAG, b, DMU_OT_UINT64_OTHER, 0, 0); - /* - * Write to a large offset to increase indirection. - */ - dmu_write(os, object, endoff, sizeof (uint64_t), &object, tx); + /* + * Destroy the previous batch of objects, create a new batch, + * and do some I/O on the new objects. + */ + if (ztest_object_init(zd, od, sizeof (od), B_TRUE) != 0) + return; - dmu_tx_commit(tx); - } + while (ztest_random(4 * batchsize) != 0) + ztest_io(zd, od[ztest_random(batchsize)].od_object, + ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); } /* * Verify that dmu_{read,write} work as expected. 
*/ -typedef struct bufwad { - uint64_t bw_index; - uint64_t bw_txg; - uint64_t bw_data; -} bufwad_t; - -typedef struct dmu_read_write_dir { - uint64_t dd_packobj; - uint64_t dd_bigobj; - uint64_t dd_chunk; -} dmu_read_write_dir_t; - void -ztest_dmu_read_write(ztest_args_t *za) +ztest_dmu_read_write(ztest_ds_t *zd, uint64_t id) { - objset_t *os = za->za_os; - dmu_read_write_dir_t dd; + objset_t *os = zd->zd_os; + ztest_od_t od[2]; dmu_tx_t *tx; int i, freeit, error; uint64_t n, s, txg; bufwad_t *packbuf, *bigbuf, *pack, *bigH, *bigT; - uint64_t packoff, packsize, bigoff, bigsize; + uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize; + uint64_t chunksize = (1000 + ztest_random(1000)) * sizeof (uint64_t); uint64_t regions = 997; uint64_t stride = 123456789ULL; uint64_t width = 40; @@ -1703,34 +3196,16 @@ ztest_dmu_read_write(ztest_args_t *za) /* * Read the directory info. If it's the first time, set things up. */ - VERIFY(0 == dmu_read(os, ZTEST_DIROBJ, za->za_diroff, - sizeof (dd), &dd)); - if (dd.dd_chunk == 0) { - ASSERT(dd.dd_packobj == 0); - ASSERT(dd.dd_bigobj == 0); - tx = dmu_tx_create(os); - dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff, sizeof (dd)); - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - ztest_record_enospc("create r/w directory"); - dmu_tx_abort(tx); - return; - } - - dd.dd_packobj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0, - DMU_OT_NONE, 0, tx); - dd.dd_bigobj = dmu_object_alloc(os, DMU_OT_UINT64_OTHER, 0, - DMU_OT_NONE, 0, tx); - dd.dd_chunk = (1000 + ztest_random(1000)) * sizeof (uint64_t); + ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, chunksize); + ztest_od_init(&od[1], id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, chunksize); - ztest_set_random_blocksize(os, dd.dd_packobj, tx); - ztest_set_random_blocksize(os, dd.dd_bigobj, tx); + if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) + return; - dmu_write(os, ZTEST_DIROBJ, za->za_diroff, sizeof (dd), &dd, - tx); - dmu_tx_commit(tx); - } + bigobj = od[0].od_object; + packobj = od[1].od_object; + chunksize = od[0].od_gen; + ASSERT(chunksize == od[1].od_gen); /* * Prefetch a random chunk of the big object. @@ -1740,7 +3215,7 @@ ztest_dmu_read_write(ztest_args_t *za) */ n = ztest_random(regions) * stride + ztest_random(width); s = 1 + ztest_random(2 * width - 1); - dmu_prefetch(os, dd.dd_bigobj, n * dd.dd_chunk, s * dd.dd_chunk); + dmu_prefetch(os, bigobj, n * chunksize, s * chunksize); /* * Pick a random index and compute the offsets into packobj and bigobj. @@ -1751,8 +3226,8 @@ ztest_dmu_read_write(ztest_args_t *za) packoff = n * sizeof (bufwad_t); packsize = s * sizeof (bufwad_t); - bigoff = n * dd.dd_chunk; - bigsize = s * dd.dd_chunk; + bigoff = n * chunksize; + bigsize = s * chunksize; packbuf = umem_alloc(packsize, UMEM_NOFAIL); bigbuf = umem_alloc(bigsize, UMEM_NOFAIL); @@ -1766,9 +3241,11 @@ ztest_dmu_read_write(ztest_args_t *za) /* * Read the current contents of our objects. 
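/*
 * Sketch of the index-to-offset arithmetic used just above: a run of s
 * consecutive bufwad indices starting at n maps to a dense byte range in
 * packobj and a chunk-per-index sparse range in bigobj.  The bufwad layout
 * matches the one ztest uses; pick_ranges() itself is a hypothetical helper
 * and rand() stands in for ztest_random().
 */
#include <stdint.h>
#include <stdlib.h>

typedef struct bufwad {
	uint64_t	bw_index;
	uint64_t	bw_txg;
	uint64_t	bw_data;
} bufwad_t;

static void
pick_ranges(uint64_t chunksize, uint64_t *packoff, uint64_t *packsize,
    uint64_t *bigoff, uint64_t *bigsize)
{
	const uint64_t regions = 997, stride = 123456789ULL, width = 40;
	uint64_t n = (rand() % regions) * stride + (rand() % width);
	uint64_t s = 1 + rand() % (2 * width - 1);

	*packoff = n * sizeof (bufwad_t);	/* dense array of bufwads */
	*packsize = s * sizeof (bufwad_t);
	*bigoff = n * chunksize;		/* one whole chunk per index */
	*bigsize = s * chunksize;
}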
*/ - error = dmu_read(os, dd.dd_packobj, packoff, packsize, packbuf); + error = dmu_read(os, packobj, packoff, packsize, packbuf, + DMU_READ_PREFETCH); ASSERT3U(error, ==, 0); - error = dmu_read(os, dd.dd_bigobj, bigoff, bigsize, bigbuf); + error = dmu_read(os, bigobj, bigoff, bigsize, bigbuf, + DMU_READ_PREFETCH); ASSERT3U(error, ==, 0); /* @@ -1776,24 +3253,25 @@ ztest_dmu_read_write(ztest_args_t *za) */ tx = dmu_tx_create(os); - dmu_tx_hold_write(tx, dd.dd_packobj, packoff, packsize); + dmu_tx_hold_write(tx, packobj, packoff, packsize); if (freeit) - dmu_tx_hold_free(tx, dd.dd_bigobj, bigoff, bigsize); + dmu_tx_hold_free(tx, bigobj, bigoff, bigsize); else - dmu_tx_hold_write(tx, dd.dd_bigobj, bigoff, bigsize); - - error = dmu_tx_assign(tx, TXG_WAIT); + dmu_tx_hold_write(tx, bigobj, bigoff, bigsize); - if (error) { - ztest_record_enospc("dmu r/w range"); - dmu_tx_abort(tx); + txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); + if (txg == 0) { umem_free(packbuf, packsize); umem_free(bigbuf, bigsize); return; } - txg = dmu_tx_get_txg(tx); + dmu_object_set_checksum(os, bigobj, + (enum zio_checksum)ztest_random_dsl_prop(ZFS_PROP_CHECKSUM), tx); + + dmu_object_set_compress(os, bigobj, + (enum zio_compress)ztest_random_dsl_prop(ZFS_PROP_COMPRESSION), tx); /* * For each index from n to n + s, verify that the existing bufwad @@ -1805,9 +3283,9 @@ ztest_dmu_read_write(ztest_args_t *za) /* LINTED */ pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t)); /* LINTED */ - bigH = (bufwad_t *)((char *)bigbuf + i * dd.dd_chunk); + bigH = (bufwad_t *)((char *)bigbuf + i * chunksize); /* LINTED */ - bigT = (bufwad_t *)((char *)bigH + dd.dd_chunk) - 1; + bigT = (bufwad_t *)((char *)bigH + chunksize) - 1; ASSERT((uintptr_t)bigH - (uintptr_t)bigbuf < bigsize); ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize); @@ -1841,27 +3319,26 @@ ztest_dmu_read_write(ztest_args_t *za) * We've verified all the old bufwads, and made new ones. * Now write them out. 
*/ - dmu_write(os, dd.dd_packobj, packoff, packsize, packbuf, tx); + dmu_write(os, packobj, packoff, packsize, packbuf, tx); if (freeit) { - if (zopt_verbose >= 6) { + if (zopt_verbose >= 7) { (void) printf("freeing offset %llx size %llx" " txg %llx\n", (u_longlong_t)bigoff, (u_longlong_t)bigsize, (u_longlong_t)txg); } - VERIFY(0 == dmu_free_range(os, dd.dd_bigobj, bigoff, - bigsize, tx)); + VERIFY(0 == dmu_free_range(os, bigobj, bigoff, bigsize, tx)); } else { - if (zopt_verbose >= 6) { + if (zopt_verbose >= 7) { (void) printf("writing offset %llx size %llx" " txg %llx\n", (u_longlong_t)bigoff, (u_longlong_t)bigsize, (u_longlong_t)txg); } - dmu_write(os, dd.dd_bigobj, bigoff, bigsize, bigbuf, tx); + dmu_write(os, bigobj, bigoff, bigsize, bigbuf, tx); } dmu_tx_commit(tx); @@ -1873,10 +3350,10 @@ ztest_dmu_read_write(ztest_args_t *za) void *packcheck = umem_alloc(packsize, UMEM_NOFAIL); void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); - VERIFY(0 == dmu_read(os, dd.dd_packobj, packoff, - packsize, packcheck)); - VERIFY(0 == dmu_read(os, dd.dd_bigobj, bigoff, - bigsize, bigcheck)); + VERIFY(0 == dmu_read(os, packobj, packoff, + packsize, packcheck, DMU_READ_PREFETCH)); + VERIFY(0 == dmu_read(os, bigobj, bigoff, + bigsize, bigcheck, DMU_READ_PREFETCH)); ASSERT(bcmp(packbuf, packcheck, packsize) == 0); ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0); @@ -1890,240 +3367,335 @@ ztest_dmu_read_write(ztest_args_t *za) } void -ztest_dmu_check_future_leak(ztest_args_t *za) +compare_and_update_pbbufs(uint64_t s, bufwad_t *packbuf, bufwad_t *bigbuf, + uint64_t bigsize, uint64_t n, uint64_t chunksize, uint64_t txg) { - objset_t *os = za->za_os; - dmu_buf_t *db; - ztest_block_tag_t *bt; - dmu_object_info_t *doi = &za->za_doi; - - /* - * Make sure that, if there is a write record in the bonus buffer - * of the ZTEST_DIROBJ, that the txg for this record is <= the - * last synced txg of the pool. - */ - VERIFY(dmu_bonus_hold(os, ZTEST_DIROBJ, FTAG, &db) == 0); - za->za_dbuf = db; - VERIFY(dmu_object_info(os, ZTEST_DIROBJ, doi) == 0); - ASSERT3U(doi->doi_bonus_size, >=, sizeof (*bt)); - ASSERT3U(doi->doi_bonus_size, <=, db->db_size); - ASSERT3U(doi->doi_bonus_size % sizeof (*bt), ==, 0); - bt = (void *)((char *)db->db_data + doi->doi_bonus_size - sizeof (*bt)); - if (bt->bt_objset != 0) { - ASSERT3U(bt->bt_objset, ==, dmu_objset_id(os)); - ASSERT3U(bt->bt_object, ==, ZTEST_DIROBJ); - ASSERT3U(bt->bt_offset, ==, -1ULL); - ASSERT3U(bt->bt_txg, <, spa_first_txg(za->za_spa)); + uint64_t i; + bufwad_t *pack; + bufwad_t *bigH; + bufwad_t *bigT; + + /* + * For each index from n to n + s, verify that the existing bufwad + * in packobj matches the bufwads at the head and tail of the + * corresponding chunk in bigobj. Then update all three bufwads + * with the new values we want to write out. 
+ */ + for (i = 0; i < s; i++) { + /* LINTED */ + pack = (bufwad_t *)((char *)packbuf + i * sizeof (bufwad_t)); + /* LINTED */ + bigH = (bufwad_t *)((char *)bigbuf + i * chunksize); + /* LINTED */ + bigT = (bufwad_t *)((char *)bigH + chunksize) - 1; + + ASSERT((uintptr_t)bigH - (uintptr_t)bigbuf < bigsize); + ASSERT((uintptr_t)bigT - (uintptr_t)bigbuf < bigsize); + + if (pack->bw_txg > txg) + fatal(0, "future leak: got %llx, open txg is %llx", + pack->bw_txg, txg); + + if (pack->bw_data != 0 && pack->bw_index != n + i) + fatal(0, "wrong index: got %llx, wanted %llx+%llx", + pack->bw_index, n, i); + + if (bcmp(pack, bigH, sizeof (bufwad_t)) != 0) + fatal(0, "pack/bigH mismatch in %p/%p", pack, bigH); + + if (bcmp(pack, bigT, sizeof (bufwad_t)) != 0) + fatal(0, "pack/bigT mismatch in %p/%p", pack, bigT); + + pack->bw_index = n + i; + pack->bw_txg = txg; + pack->bw_data = 1 + ztest_random(-2ULL); + + *bigH = *pack; + *bigT = *pack; } - dmu_buf_rele(db, FTAG); - za->za_dbuf = NULL; } void -ztest_dmu_write_parallel(ztest_args_t *za) +ztest_dmu_read_write_zcopy(ztest_ds_t *zd, uint64_t id) { - objset_t *os = za->za_os; - ztest_block_tag_t *rbt = &za->za_rbt; - ztest_block_tag_t *wbt = &za->za_wbt; - const size_t btsize = sizeof (ztest_block_tag_t); - dmu_buf_t *db; - int b, error; - int bs = ZTEST_DIROBJ_BLOCKSIZE; - int do_free = 0; - uint64_t off, txg, txg_how; - mutex_t *lp; - char osname[MAXNAMELEN]; - char iobuf[SPA_MAXBLOCKSIZE]; - blkptr_t blk = { 0 }; - uint64_t blkoff; - zbookmark_t zb; - dmu_tx_t *tx = dmu_tx_create(os); + objset_t *os = zd->zd_os; + ztest_od_t od[2]; + dmu_tx_t *tx; + uint64_t i; + int error; + uint64_t n, s, txg; + bufwad_t *packbuf, *bigbuf; + uint64_t packobj, packoff, packsize, bigobj, bigoff, bigsize; + uint64_t blocksize = ztest_random_blocksize(); + uint64_t chunksize = blocksize; + uint64_t regions = 997; + uint64_t stride = 123456789ULL; + uint64_t width = 9; + dmu_buf_t *bonus_db; + arc_buf_t **bigbuf_arcbufs; + dmu_object_info_t doi; - dmu_objset_name(os, osname); + /* + * This test uses two objects, packobj and bigobj, that are always + * updated together (i.e. in the same tx) so that their contents are + * in sync and can be compared. Their contents relate to each other + * in a simple way: packobj is a dense array of 'bufwad' structures, + * while bigobj is a sparse array of the same bufwads. Specifically, + * for any index n, there are three bufwads that should be identical: + * + * packobj, at offset n * sizeof (bufwad_t) + * bigobj, at the head of the nth chunk + * bigobj, at the tail of the nth chunk + * + * The chunk size is set equal to bigobj block size so that + * dmu_assign_arcbuf() can be tested for object updates. + */ /* - * Have multiple threads write to large offsets in ZTEST_DIROBJ - * to verify that having multiple threads writing to the same object - * in parallel doesn't cause any trouble. + * Read the directory info. If it's the first time, set things up. */ - if (ztest_random(4) == 0) { - /* - * Do the bonus buffer instead of a regular block. - * We need a lock to serialize resize vs. others, - * so we hash on the objset ID. 
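/*
 * Sketch of the invariant described in the comment above: for any index i
 * within the window, the bufwad stored densely in packobj must equal both
 * the bufwad at the head and the one at the tail of the i-th chunk of
 * bigobj.  This check operates on plain in-memory buffers;
 * bufwads_consistent() is a hypothetical helper, not a ztest function.
 */
#include <stdint.h>
#include <string.h>

typedef struct bufwad {
	uint64_t	bw_index;
	uint64_t	bw_txg;
	uint64_t	bw_data;
} bufwad_t;

static int
bufwads_consistent(const char *packbuf, const char *bigbuf,
    uint64_t chunksize, uint64_t i)
{
	const bufwad_t *pack =
	    (const bufwad_t *)(packbuf + i * sizeof (bufwad_t));
	const bufwad_t *head =
	    (const bufwad_t *)(bigbuf + i * chunksize);
	const bufwad_t *tail =
	    (const bufwad_t *)(bigbuf + (i + 1) * chunksize) - 1;

	return (memcmp(pack, head, sizeof (bufwad_t)) == 0 &&
	    memcmp(pack, tail, sizeof (bufwad_t)) == 0);
}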
- */ - b = dmu_objset_id(os) % ZTEST_SYNC_LOCKS; - off = -1ULL; - dmu_tx_hold_bonus(tx, ZTEST_DIROBJ); - } else { - b = ztest_random(ZTEST_SYNC_LOCKS); - off = za->za_diroff_shared + (b << SPA_MAXBLOCKSHIFT); - if (ztest_random(4) == 0) { - do_free = 1; - dmu_tx_hold_free(tx, ZTEST_DIROBJ, off, bs); - } else { - dmu_tx_hold_write(tx, ZTEST_DIROBJ, off, bs); - } - } + ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0); + ztest_od_init(&od[1], id, FTAG, 1, DMU_OT_UINT64_OTHER, 0, chunksize); - txg_how = ztest_random(2) == 0 ? TXG_WAIT : TXG_NOWAIT; - error = dmu_tx_assign(tx, txg_how); - if (error) { - if (error == ERESTART) { - ASSERT(txg_how == TXG_NOWAIT); - dmu_tx_wait(tx); - } else { - ztest_record_enospc("dmu write parallel"); - } - dmu_tx_abort(tx); + if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) return; - } - txg = dmu_tx_get_txg(tx); - lp = &ztest_shared->zs_sync_lock[b]; - (void) mutex_lock(lp); - - wbt->bt_objset = dmu_objset_id(os); - wbt->bt_object = ZTEST_DIROBJ; - wbt->bt_offset = off; - wbt->bt_txg = txg; - wbt->bt_thread = za->za_instance; - wbt->bt_seq = ztest_shared->zs_seq[b]++; /* protected by lp */ - - /* - * Occasionally, write an all-zero block to test the behavior - * of blocks that compress into holes. - */ - if (off != -1ULL && ztest_random(8) == 0) - bzero(wbt, btsize); - - if (off == -1ULL) { - dmu_object_info_t *doi = &za->za_doi; - char *dboff; - - VERIFY(dmu_bonus_hold(os, ZTEST_DIROBJ, FTAG, &db) == 0); - za->za_dbuf = db; - dmu_object_info_from_db(db, doi); - ASSERT3U(doi->doi_bonus_size, <=, db->db_size); - ASSERT3U(doi->doi_bonus_size, >=, btsize); - ASSERT3U(doi->doi_bonus_size % btsize, ==, 0); - dboff = (char *)db->db_data + doi->doi_bonus_size - btsize; - bcopy(dboff, rbt, btsize); - if (rbt->bt_objset != 0) { - ASSERT3U(rbt->bt_objset, ==, wbt->bt_objset); - ASSERT3U(rbt->bt_object, ==, wbt->bt_object); - ASSERT3U(rbt->bt_offset, ==, wbt->bt_offset); - ASSERT3U(rbt->bt_txg, <=, wbt->bt_txg); - } - if (ztest_random(10) == 0) { - int newsize = (ztest_random(db->db_size / - btsize) + 1) * btsize; - - ASSERT3U(newsize, >=, btsize); - ASSERT3U(newsize, <=, db->db_size); - VERIFY3U(dmu_set_bonus(db, newsize, tx), ==, 0); - dboff = (char *)db->db_data + newsize - btsize; - } - dmu_buf_will_dirty(db, tx); - bcopy(wbt, dboff, btsize); - dmu_buf_rele(db, FTAG); - za->za_dbuf = NULL; - } else if (do_free) { - VERIFY(dmu_free_range(os, ZTEST_DIROBJ, off, bs, tx) == 0); - } else { - dmu_write(os, ZTEST_DIROBJ, off, btsize, wbt, tx); - } + bigobj = od[0].od_object; + packobj = od[1].od_object; + blocksize = od[0].od_blocksize; + chunksize = blocksize; + ASSERT(chunksize == od[1].od_gen); + + VERIFY(dmu_object_info(os, bigobj, &doi) == 0); + VERIFY(ISP2(doi.doi_data_block_size)); + VERIFY(chunksize == doi.doi_data_block_size); + VERIFY(chunksize >= 2 * sizeof (bufwad_t)); + + /* + * Pick a random index and compute the offsets into packobj and bigobj. 
+ */ + n = ztest_random(regions) * stride + ztest_random(width); + s = 1 + ztest_random(width - 1); - (void) mutex_unlock(lp); + packoff = n * sizeof (bufwad_t); + packsize = s * sizeof (bufwad_t); - if (ztest_random(1000) == 0) - (void) poll(NULL, 0, 1); /* open dn_notxholds window */ + bigoff = n * chunksize; + bigsize = s * chunksize; - dmu_tx_commit(tx); + packbuf = umem_zalloc(packsize, UMEM_NOFAIL); + bigbuf = umem_zalloc(bigsize, UMEM_NOFAIL); - if (ztest_random(10000) == 0) - txg_wait_synced(dmu_objset_pool(os), txg); + VERIFY3U(0, ==, dmu_bonus_hold(os, bigobj, FTAG, &bonus_db)); - if (off == -1ULL || do_free) - return; + bigbuf_arcbufs = umem_zalloc(2 * s * sizeof (arc_buf_t *), UMEM_NOFAIL); - if (ztest_random(2) != 0) - return; + /* + * Iteration 0 test zcopy for DB_UNCACHED dbufs. + * Iteration 1 test zcopy to already referenced dbufs. + * Iteration 2 test zcopy to dirty dbuf in the same txg. + * Iteration 3 test zcopy to dbuf dirty in previous txg. + * Iteration 4 test zcopy when dbuf is no longer dirty. + * Iteration 5 test zcopy when it can't be done. + * Iteration 6 one more zcopy write. + */ + for (i = 0; i < 7; i++) { + uint64_t j; + uint64_t off; + + /* + * In iteration 5 (i == 5) use arcbufs + * that don't match bigobj blksz to test + * dmu_assign_arcbuf() when it can't directly + * assign an arcbuf to a dbuf. + */ + for (j = 0; j < s; j++) { + if (i != 5) { + bigbuf_arcbufs[j] = + dmu_request_arcbuf(bonus_db, chunksize); + } else { + bigbuf_arcbufs[2 * j] = + dmu_request_arcbuf(bonus_db, chunksize / 2); + bigbuf_arcbufs[2 * j + 1] = + dmu_request_arcbuf(bonus_db, chunksize / 2); + } + } + + /* + * Get a tx for the mods to both packobj and bigobj. + */ + tx = dmu_tx_create(os); + + dmu_tx_hold_write(tx, packobj, packoff, packsize); + dmu_tx_hold_write(tx, bigobj, bigoff, bigsize); + + txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); + if (txg == 0) { + umem_free(packbuf, packsize); + umem_free(bigbuf, bigsize); + for (j = 0; j < s; j++) { + if (i != 5) { + dmu_return_arcbuf(bigbuf_arcbufs[j]); + } else { + dmu_return_arcbuf( + bigbuf_arcbufs[2 * j]); + dmu_return_arcbuf( + bigbuf_arcbufs[2 * j + 1]); + } + } + umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *)); + dmu_buf_rele(bonus_db, FTAG); + return; + } + + /* + * 50% of the time don't read objects in the 1st iteration to + * test dmu_assign_arcbuf() for the case when there're no + * existing dbufs for the specified offsets. + */ + if (i != 0 || ztest_random(2) != 0) { + error = dmu_read(os, packobj, packoff, + packsize, packbuf, DMU_READ_PREFETCH); + ASSERT3U(error, ==, 0); + error = dmu_read(os, bigobj, bigoff, bigsize, + bigbuf, DMU_READ_PREFETCH); + ASSERT3U(error, ==, 0); + } + compare_and_update_pbbufs(s, packbuf, bigbuf, bigsize, + n, chunksize, txg); + + /* + * We've verified all the old bufwads, and made new ones. + * Now write them out. 
+ */ + dmu_write(os, packobj, packoff, packsize, packbuf, tx); + if (zopt_verbose >= 7) { + (void) printf("writing offset %llx size %llx" + " txg %llx\n", + (u_longlong_t)bigoff, + (u_longlong_t)bigsize, + (u_longlong_t)txg); + } + for (off = bigoff, j = 0; j < s; j++, off += chunksize) { + dmu_buf_t *dbt; + if (i != 5) { + bcopy((caddr_t)bigbuf + (off - bigoff), + bigbuf_arcbufs[j]->b_data, chunksize); + } else { + bcopy((caddr_t)bigbuf + (off - bigoff), + bigbuf_arcbufs[2 * j]->b_data, + chunksize / 2); + bcopy((caddr_t)bigbuf + (off - bigoff) + + chunksize / 2, + bigbuf_arcbufs[2 * j + 1]->b_data, + chunksize / 2); + } + + if (i == 1) { + VERIFY(dmu_buf_hold(os, bigobj, off, + FTAG, &dbt) == 0); + } + if (i != 5) { + dmu_assign_arcbuf(bonus_db, off, + bigbuf_arcbufs[j], tx); + } else { + dmu_assign_arcbuf(bonus_db, off, + bigbuf_arcbufs[2 * j], tx); + dmu_assign_arcbuf(bonus_db, + off + chunksize / 2, + bigbuf_arcbufs[2 * j + 1], tx); + } + if (i == 1) { + dmu_buf_rele(dbt, FTAG); + } + } + dmu_tx_commit(tx); - /* - * dmu_sync() the block we just wrote. - */ - (void) mutex_lock(lp); + /* + * Sanity check the stuff we just wrote. + */ + { + void *packcheck = umem_alloc(packsize, UMEM_NOFAIL); + void *bigcheck = umem_alloc(bigsize, UMEM_NOFAIL); - blkoff = P2ALIGN_TYPED(off, bs, uint64_t); - error = dmu_buf_hold(os, ZTEST_DIROBJ, blkoff, FTAG, &db); - za->za_dbuf = db; - if (error) { - dprintf("dmu_buf_hold(%s, %d, %llx) = %d\n", - osname, ZTEST_DIROBJ, blkoff, error); - (void) mutex_unlock(lp); - return; - } - blkoff = off - blkoff; - error = dmu_sync(NULL, db, &blk, txg, NULL, NULL); - dmu_buf_rele(db, FTAG); - za->za_dbuf = NULL; + VERIFY(0 == dmu_read(os, packobj, packoff, + packsize, packcheck, DMU_READ_PREFETCH)); + VERIFY(0 == dmu_read(os, bigobj, bigoff, + bigsize, bigcheck, DMU_READ_PREFETCH)); - (void) mutex_unlock(lp); + ASSERT(bcmp(packbuf, packcheck, packsize) == 0); + ASSERT(bcmp(bigbuf, bigcheck, bigsize) == 0); - if (error) { - dprintf("dmu_sync(%s, %d, %llx) = %d\n", - osname, ZTEST_DIROBJ, off, error); - return; + umem_free(packcheck, packsize); + umem_free(bigcheck, bigsize); + } + if (i == 2) { + txg_wait_open(dmu_objset_pool(os), 0); + } else if (i == 3) { + txg_wait_synced(dmu_objset_pool(os), 0); + } } - if (blk.blk_birth == 0) /* concurrent free */ - return; - - txg_suspend(dmu_objset_pool(os)); + dmu_buf_rele(bonus_db, FTAG); + umem_free(packbuf, packsize); + umem_free(bigbuf, bigsize); + umem_free(bigbuf_arcbufs, 2 * s * sizeof (arc_buf_t *)); +} - ASSERT(blk.blk_fill == 1); - ASSERT3U(BP_GET_TYPE(&blk), ==, DMU_OT_UINT64_OTHER); - ASSERT3U(BP_GET_LEVEL(&blk), ==, 0); - ASSERT3U(BP_GET_LSIZE(&blk), ==, bs); +/* ARGSUSED */ +void +ztest_dmu_write_parallel(ztest_ds_t *zd, uint64_t id) +{ + ztest_od_t od[1]; + uint64_t offset = (1ULL << (ztest_random(20) + 43)) + + (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); /* - * Read the block that dmu_sync() returned to make sure its contents - * match what we wrote. We do this while still txg_suspend()ed - * to ensure that the block can't be reused before we read it. + * Have multiple threads write to large offsets in an object + * to verify that parallel writes to an object -- even to the + * same blocks within the object -- doesn't cause any trouble. 
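/*
 * Sketch of how the parallel-write offset above is composed: a very large
 * power-of-two base (2^43 .. 2^62) plus a block-aligned slot drawn from a
 * small set, so concurrent writers frequently land on the same blocks.  The
 * constants below (64 range-lock slots, 128K max block shift) are assumed
 * values typical of ztest/SPA, and parallel_offset() is hypothetical.
 */
#include <stdint.h>
#include <stdlib.h>

#define	SKETCH_RANGE_LOCKS	64
#define	SKETCH_MAXBLOCKSHIFT	17	/* 128K blocks */

static uint64_t
parallel_offset(void)
{
	uint64_t base = 1ULL << (rand() % 20 + 43);
	uint64_t slot = (uint64_t)(rand() % SKETCH_RANGE_LOCKS);

	return (base + (slot << SKETCH_MAXBLOCKSHIFT));
}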
*/ - zb.zb_objset = dmu_objset_id(os); - zb.zb_object = ZTEST_DIROBJ; - zb.zb_level = 0; - zb.zb_blkid = off / bs; - error = zio_wait(zio_read(NULL, za->za_spa, &blk, iobuf, bs, - NULL, NULL, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_MUSTSUCCEED, &zb)); - ASSERT3U(error, ==, 0); + ztest_od_init(&od[0], ID_PARALLEL, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0); + + if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) + return; + + while (ztest_random(10) != 0) + ztest_io(zd, od[0].od_object, offset); +} - txg_resume(dmu_objset_pool(os)); +void +ztest_dmu_prealloc(ztest_ds_t *zd, uint64_t id) +{ + ztest_od_t od[1]; + uint64_t offset = (1ULL << (ztest_random(4) + SPA_MAXBLOCKSHIFT)) + + (ztest_random(ZTEST_RANGE_LOCKS) << SPA_MAXBLOCKSHIFT); + uint64_t count = ztest_random(20) + 1; + uint64_t blocksize = ztest_random_blocksize(); + void *data; - bcopy(&iobuf[blkoff], rbt, btsize); + ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0); - if (rbt->bt_objset == 0) /* concurrent free */ + if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0) return; - if (wbt->bt_objset == 0) /* all-zero overwrite */ + if (ztest_truncate(zd, od[0].od_object, offset, count * blocksize) != 0) return; - ASSERT3U(rbt->bt_objset, ==, wbt->bt_objset); - ASSERT3U(rbt->bt_object, ==, wbt->bt_object); - ASSERT3U(rbt->bt_offset, ==, wbt->bt_offset); + ztest_prealloc(zd, od[0].od_object, offset, count * blocksize); - /* - * The semantic of dmu_sync() is that we always push the most recent - * version of the data, so in the face of concurrent updates we may - * see a newer version of the block. That's OK. - */ - ASSERT3U(rbt->bt_txg, >=, wbt->bt_txg); - if (rbt->bt_thread == wbt->bt_thread) - ASSERT3U(rbt->bt_seq, ==, wbt->bt_seq); - else - ASSERT3U(rbt->bt_seq, >, wbt->bt_seq); + data = umem_zalloc(blocksize, UMEM_NOFAIL); + + while (ztest_random(count) != 0) { + uint64_t randoff = offset + (ztest_random(count) * blocksize); + if (ztest_write(zd, od[0].od_object, randoff, blocksize, + data) != 0) + break; + while (ztest_random(4) != 0) + ztest_io(zd, od[0].od_object, randoff); + } + + umem_free(data, blocksize); } /* @@ -2134,9 +3706,10 @@ ztest_dmu_write_parallel(ztest_args_t *za) #define ZTEST_ZAP_MAX_PROPS 1000 void -ztest_zap(ztest_args_t *za) +ztest_zap(ztest_ds_t *zd, uint64_t id) { - objset_t *os = za->za_os; + objset_t *os = zd->zd_os; + ztest_od_t od[1]; uint64_t object; uint64_t txg, last_txg; uint64_t value[ZTEST_ZAP_MAX_INTS]; @@ -2145,64 +3718,45 @@ ztest_zap(ztest_args_t *za) dmu_tx_t *tx; char propname[100], txgname[100]; int error; - char osname[MAXNAMELEN]; char *hc[2] = { "s.acl.h", ".s.open.h.hyLZlg" }; - dmu_objset_name(os, osname); + ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0); - /* - * Create a new object if necessary, and record it in the directory. 
- */ - VERIFY(0 == dmu_read(os, ZTEST_DIROBJ, za->za_diroff, - sizeof (uint64_t), &object)); + if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0) + return; - if (object == 0) { - tx = dmu_tx_create(os); - dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff, - sizeof (uint64_t)); - dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, TRUE, NULL); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - ztest_record_enospc("create zap test obj"); - dmu_tx_abort(tx); - return; - } - object = zap_create(os, DMU_OT_ZAP_OTHER, DMU_OT_NONE, 0, tx); - if (error) { - fatal(0, "zap_create('%s', %llu) = %d", - osname, object, error); - } - ASSERT(object != 0); - dmu_write(os, ZTEST_DIROBJ, za->za_diroff, - sizeof (uint64_t), &object, tx); - /* - * Generate a known hash collision, and verify that - * we can lookup and remove both entries. - */ - for (i = 0; i < 2; i++) { - value[i] = i; - error = zap_add(os, object, hc[i], sizeof (uint64_t), - 1, &value[i], tx); - ASSERT3U(error, ==, 0); - } - for (i = 0; i < 2; i++) { - error = zap_add(os, object, hc[i], sizeof (uint64_t), - 1, &value[i], tx); - ASSERT3U(error, ==, EEXIST); - error = zap_length(os, object, hc[i], - &zl_intsize, &zl_ints); - ASSERT3U(error, ==, 0); - ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); - ASSERT3U(zl_ints, ==, 1); - } - for (i = 0; i < 2; i++) { - error = zap_remove(os, object, hc[i], tx); - ASSERT3U(error, ==, 0); - } + object = od[0].od_object; - dmu_tx_commit(tx); + /* + * Generate a known hash collision, and verify that + * we can lookup and remove both entries. + */ + tx = dmu_tx_create(os); + dmu_tx_hold_zap(tx, object, B_TRUE, NULL); + txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); + if (txg == 0) + return; + for (i = 0; i < 2; i++) { + value[i] = i; + VERIFY3U(0, ==, zap_add(os, object, hc[i], sizeof (uint64_t), + 1, &value[i], tx)); + } + for (i = 0; i < 2; i++) { + VERIFY3U(EEXIST, ==, zap_add(os, object, hc[i], + sizeof (uint64_t), 1, &value[i], tx)); + VERIFY3U(0, ==, + zap_length(os, object, hc[i], &zl_intsize, &zl_ints)); + ASSERT3U(zl_intsize, ==, sizeof (uint64_t)); + ASSERT3U(zl_ints, ==, 1); + } + for (i = 0; i < 2; i++) { + VERIFY3U(0, ==, zap_remove(os, object, hc[i], tx)); } + dmu_tx_commit(tx); + /* + * Generate a buch of random entries. + */ ints = MAX(ZTEST_ZAP_MIN_INTS, object % ZTEST_ZAP_MAX_INTS); prop = ztest_random(ZTEST_ZAP_MAX_PROPS); @@ -2246,14 +3800,10 @@ ztest_zap(ztest_args_t *za) * should be txg + object + n. 
*/ tx = dmu_tx_create(os); - dmu_tx_hold_zap(tx, object, TRUE, NULL); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - ztest_record_enospc("create zap entry"); - dmu_tx_abort(tx); + dmu_tx_hold_zap(tx, object, B_TRUE, NULL); + txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); + if (txg == 0) return; - } - txg = dmu_tx_get_txg(tx); if (last_txg > txg) fatal(0, "zap future leak: old %llu new %llu", last_txg, txg); @@ -2261,16 +3811,10 @@ ztest_zap(ztest_args_t *za) for (i = 0; i < ints; i++) value[i] = txg + object + i; - error = zap_update(os, object, txgname, sizeof (uint64_t), 1, &txg, tx); - if (error) - fatal(0, "zap_update('%s', %llu, '%s') = %d", - osname, object, txgname, error); - - error = zap_update(os, object, propname, sizeof (uint64_t), - ints, value, tx); - if (error) - fatal(0, "zap_update('%s', %llu, '%s') = %d", - osname, object, propname, error); + VERIFY3U(0, ==, zap_update(os, object, txgname, sizeof (uint64_t), + 1, &txg, tx)); + VERIFY3U(0, ==, zap_update(os, object, propname, sizeof (uint64_t), + ints, value, tx)); dmu_tx_commit(tx); @@ -2289,231 +3833,558 @@ ztest_zap(ztest_args_t *za) ASSERT3U(error, ==, 0); tx = dmu_tx_create(os); - dmu_tx_hold_zap(tx, object, TRUE, NULL); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - ztest_record_enospc("remove zap entry"); - dmu_tx_abort(tx); + dmu_tx_hold_zap(tx, object, B_TRUE, NULL); + txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); + if (txg == 0) + return; + VERIFY3U(0, ==, zap_remove(os, object, txgname, tx)); + VERIFY3U(0, ==, zap_remove(os, object, propname, tx)); + dmu_tx_commit(tx); +} + +/* + * Testcase to test the upgrading of a microzap to fatzap. + */ +void +ztest_fzap(ztest_ds_t *zd, uint64_t id) +{ + objset_t *os = zd->zd_os; + ztest_od_t od[1]; + uint64_t object, txg; + + ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_ZAP_OTHER, 0, 0); + + if (ztest_object_init(zd, od, sizeof (od), !ztest_random(2)) != 0) return; + + object = od[0].od_object; + + /* + * Add entries to this ZAP and make sure it spills over + * and gets upgraded to a fatzap. Also, since we are adding + * 2050 entries we should see ptrtbl growth and leaf-block split. + */ + for (int i = 0; i < 2050; i++) { + char name[MAXNAMELEN]; + uint64_t value = i; + dmu_tx_t *tx; + int error; + + (void) snprintf(name, sizeof (name), "fzap-%llu-%llu", + id, value); + + tx = dmu_tx_create(os); + dmu_tx_hold_zap(tx, object, B_TRUE, name); + txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); + if (txg == 0) + return; + error = zap_add(os, object, name, sizeof (uint64_t), 1, + &value, tx); + ASSERT(error == 0 || error == EEXIST); + dmu_tx_commit(tx); } - error = zap_remove(os, object, txgname, tx); - if (error) - fatal(0, "zap_remove('%s', %llu, '%s') = %d", - osname, object, txgname, error); +} - error = zap_remove(os, object, propname, tx); - if (error) - fatal(0, "zap_remove('%s', %llu, '%s') = %d", - osname, object, propname, error); +/* ARGSUSED */ +void +ztest_zap_parallel(ztest_ds_t *zd, uint64_t id) +{ + objset_t *os = zd->zd_os; + ztest_od_t od[1]; + uint64_t txg, object, count, wsize, wc, zl_wsize, zl_wc; + dmu_tx_t *tx; + int i, namelen, error; + int micro = ztest_random(2); + char name[20], string_value[20]; + void *data; - dmu_tx_commit(tx); + ztest_od_init(&od[0], ID_PARALLEL, FTAG, micro, DMU_OT_ZAP_OTHER, 0, 0); + + if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) + return; + + object = od[0].od_object; + + /* + * Generate a random name of the form 'xxx.....' where each + * x is a random printable character and the dots are dots. 
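/*
 * Sketch of the naming scheme described in the surrounding comment: three
 * random printable characters ('!' through '~', 94 choices each) followed
 * by dots, with a total length of 6 to 20, giving 94^3 * 15 = 12,458,760
 * possible names.  random_zap_name() is a hypothetical helper and rand()
 * stands in for ztest_random().
 */
#include <stdlib.h>

static void
random_zap_name(char name[20])
{
	int namelen = rand() % 15 + 6;	/* 6 .. 20, including the NUL */
	int i;

	for (i = 0; i < 3; i++)
		name[i] = '!' + rand() % ('~' - '!' + 1);
	for (; i < namelen - 1; i++)
		name[i] = '.';
	name[i] = '\0';
}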
+ * There are 94 such characters, and the name length goes from + * 6 to 20, so there are 94^3 * 15 = 12,458,760 possible names. + */ + namelen = ztest_random(sizeof (name) - 5) + 5 + 1; + + for (i = 0; i < 3; i++) + name[i] = '!' + ztest_random('~' - '!' + 1); + for (; i < namelen - 1; i++) + name[i] = '.'; + name[i] = '\0'; + + if ((namelen & 1) || micro) { + wsize = sizeof (txg); + wc = 1; + data = &txg; + } else { + wsize = 1; + wc = namelen; + data = string_value; + } + + count = -1ULL; + VERIFY(zap_count(os, object, &count) == 0); + ASSERT(count != -1ULL); /* - * Once in a while, destroy the object. + * Select an operation: length, lookup, add, update, remove. */ - if (ztest_random(1000) != 0) + i = ztest_random(5); + + if (i >= 2) { + tx = dmu_tx_create(os); + dmu_tx_hold_zap(tx, object, B_TRUE, NULL); + txg = ztest_tx_assign(tx, TXG_MIGHTWAIT, FTAG); + if (txg == 0) + return; + bcopy(name, string_value, namelen); + } else { + tx = NULL; + txg = 0; + bzero(string_value, namelen); + } + + switch (i) { + + case 0: + error = zap_length(os, object, name, &zl_wsize, &zl_wc); + if (error == 0) { + ASSERT3U(wsize, ==, zl_wsize); + ASSERT3U(wc, ==, zl_wc); + } else { + ASSERT3U(error, ==, ENOENT); + } + break; + + case 1: + error = zap_lookup(os, object, name, wsize, wc, data); + if (error == 0) { + if (data == string_value && + bcmp(name, data, namelen) != 0) + fatal(0, "name '%s' != val '%s' len %d", + name, data, namelen); + } else { + ASSERT3U(error, ==, ENOENT); + } + break; + + case 2: + error = zap_add(os, object, name, wsize, wc, data, tx); + ASSERT(error == 0 || error == EEXIST); + break; + + case 3: + VERIFY(zap_update(os, object, name, wsize, wc, data, tx) == 0); + break; + + case 4: + error = zap_remove(os, object, name, tx); + ASSERT(error == 0 || error == ENOENT); + break; + } + + if (tx != NULL) + dmu_tx_commit(tx); +} + +/* + * Commit callback data. + */ +typedef struct ztest_cb_data { + list_node_t zcd_node; + uint64_t zcd_txg; + int zcd_expected_err; + boolean_t zcd_added; + boolean_t zcd_called; + spa_t *zcd_spa; +} ztest_cb_data_t; + +/* This is the actual commit callback function */ +static void +ztest_commit_callback(void *arg, int error) +{ + ztest_cb_data_t *data = arg; + uint64_t synced_txg; + + VERIFY(data != NULL); + VERIFY3S(data->zcd_expected_err, ==, error); + VERIFY(!data->zcd_called); + + synced_txg = spa_last_synced_txg(data->zcd_spa); + if (data->zcd_txg > synced_txg) + fatal(0, "commit callback of txg %" PRIu64 " called prematurely" + ", last synced txg = %" PRIu64 "\n", data->zcd_txg, + synced_txg); + + data->zcd_called = B_TRUE; + + if (error == ECANCELED) { + ASSERT3U(data->zcd_txg, ==, 0); + ASSERT(!data->zcd_added); + + /* + * The private callback data should be destroyed here, but + * since we are going to check the zcd_called field after + * dmu_tx_abort(), we will destroy it there. + */ + return; + } + + /* Was this callback added to the global callback list? 
*/ + if (!data->zcd_added) + goto out; + + ASSERT3U(data->zcd_txg, !=, 0); + + /* Remove our callback from the list */ + (void) mutex_lock(&zcl.zcl_callbacks_lock); + list_remove(&zcl.zcl_callbacks, data); + (void) mutex_unlock(&zcl.zcl_callbacks_lock); + +out: + umem_free(data, sizeof (ztest_cb_data_t)); +} + +/* Allocate and initialize callback data structure */ +static ztest_cb_data_t * +ztest_create_cb_data(objset_t *os, uint64_t txg) +{ + ztest_cb_data_t *cb_data; + + cb_data = umem_zalloc(sizeof (ztest_cb_data_t), UMEM_NOFAIL); + + cb_data->zcd_txg = txg; + cb_data->zcd_spa = dmu_objset_spa(os); + + return (cb_data); +} + +/* + * If a number of txgs equal to this threshold have been created after a commit + * callback has been registered but not called, then we assume there is an + * implementation bug. + */ +#define ZTEST_COMMIT_CALLBACK_THRESH (TXG_CONCURRENT_STATES + 2) + +/* + * Commit callback test. + */ +void +ztest_dmu_commit_callbacks(ztest_ds_t *zd, uint64_t id) +{ + objset_t *os = zd->zd_os; + ztest_od_t od[1]; + dmu_tx_t *tx; + ztest_cb_data_t *cb_data[3], *tmp_cb; + uint64_t old_txg, txg; + int i, error; + + ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, 0, 0); + + if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) return; tx = dmu_tx_create(os); - dmu_tx_hold_write(tx, ZTEST_DIROBJ, za->za_diroff, sizeof (uint64_t)); - dmu_tx_hold_free(tx, object, 0, DMU_OBJECT_END); - error = dmu_tx_assign(tx, TXG_WAIT); + + cb_data[0] = ztest_create_cb_data(os, 0); + dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[0]); + + dmu_tx_hold_write(tx, od[0].od_object, 0, sizeof (uint64_t)); + + /* Every once in a while, abort the transaction on purpose */ + if (ztest_random(100) == 0) + error = -1; + + if (!error) + error = dmu_tx_assign(tx, TXG_NOWAIT); + + txg = error ? 0 : dmu_tx_get_txg(tx); + + cb_data[0]->zcd_txg = txg; + cb_data[1] = ztest_create_cb_data(os, txg); + dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[1]); + if (error) { - ztest_record_enospc("destroy zap object"); + /* + * It's not a strict requirement to call the registered + * callbacks from inside dmu_tx_abort(), but that's what + * it's supposed to happen in the current implementation + * so we will check for that. + */ + for (i = 0; i < 2; i++) { + cb_data[i]->zcd_expected_err = ECANCELED; + VERIFY(!cb_data[i]->zcd_called); + } + dmu_tx_abort(tx); + + for (i = 0; i < 2; i++) { + VERIFY(cb_data[i]->zcd_called); + umem_free(cb_data[i], sizeof (ztest_cb_data_t)); + } + return; } - error = zap_destroy(os, object, tx); - if (error) - fatal(0, "zap_destroy('%s', %llu) = %d", - osname, object, error); - object = 0; - dmu_write(os, ZTEST_DIROBJ, za->za_diroff, sizeof (uint64_t), - &object, tx); - dmu_tx_commit(tx); -} -void -ztest_zap_parallel(ztest_args_t *za) -{ - objset_t *os = za->za_os; - uint64_t txg, object, count, wsize, wc, zl_wsize, zl_wc; - dmu_tx_t *tx; - int i, namelen, error; - char name[20], string_value[20]; - void *data; + cb_data[2] = ztest_create_cb_data(os, txg); + dmu_tx_callback_register(tx, ztest_commit_callback, cb_data[2]); /* - * Generate a random name of the form 'xxx.....' where each - * x is a random printable character and the dots are dots. - * There are 94 such characters, and the name length goes from - * 6 to 20, so there are 94^3 * 15 = 12,458,760 possible names. + * Read existing data to make sure there isn't a future leak. 
*/ - namelen = ztest_random(sizeof (name) - 5) + 5 + 1; + VERIFY(0 == dmu_read(os, od[0].od_object, 0, sizeof (uint64_t), + &old_txg, DMU_READ_PREFETCH)); - for (i = 0; i < 3; i++) - name[i] = '!' + ztest_random('~' - '!' + 1); - for (; i < namelen - 1; i++) - name[i] = '.'; - name[i] = '\0'; + if (old_txg > txg) + fatal(0, "future leak: got %" PRIu64 ", open txg is %" PRIu64, + old_txg, txg); - if (ztest_random(2) == 0) - object = ZTEST_MICROZAP_OBJ; - else - object = ZTEST_FATZAP_OBJ; + dmu_write(os, od[0].od_object, 0, sizeof (uint64_t), &txg, tx); - if ((namelen & 1) || object == ZTEST_MICROZAP_OBJ) { - wsize = sizeof (txg); - wc = 1; - data = &txg; - } else { - wsize = 1; - wc = namelen; - data = string_value; - } + (void) mutex_lock(&zcl.zcl_callbacks_lock); - count = -1ULL; - VERIFY(zap_count(os, object, &count) == 0); - ASSERT(count != -1ULL); + /* + * Since commit callbacks don't have any ordering requirement and since + * it is theoretically possible for a commit callback to be called + * after an arbitrary amount of time has elapsed since its txg has been + * synced, it is difficult to reliably determine whether a commit + * callback hasn't been called due to high load or due to a flawed + * implementation. + * + * In practice, we will assume that if after a certain number of txgs a + * commit callback hasn't been called, then most likely there's an + * implementation bug.. + */ + tmp_cb = list_head(&zcl.zcl_callbacks); + if (tmp_cb != NULL && + tmp_cb->zcd_txg > txg - ZTEST_COMMIT_CALLBACK_THRESH) { + fatal(0, "Commit callback threshold exceeded, oldest txg: %" + PRIu64 ", open txg: %" PRIu64 "\n", tmp_cb->zcd_txg, txg); + } /* - * Select an operation: length, lookup, add, update, remove. + * Let's find the place to insert our callbacks. + * + * Even though the list is ordered by txg, it is possible for the + * insertion point to not be the end because our txg may already be + * quiescing at this point and other callbacks in the open txg + * (from other objsets) may have sneaked in. 
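As an aside (not part of the change itself), the tail-first search that follows is a general pattern for inserting into a doubly-linked list kept sorted by a monotonically increasing key. A minimal stand-alone sketch in plain C, using hypothetical names rather than the ztest or list(9F) types, might look like this:

#include <stddef.h>
#include <stdint.h>

struct cb_entry {
	uint64_t	ce_txg;		/* sort key */
	struct cb_entry	*ce_prev;
	struct cb_entry	*ce_next;
};

struct cb_list {
	struct cb_entry	*cl_head;
	struct cb_entry	*cl_tail;
};

/*
 * Insert 'new' so the list stays sorted by ce_txg.  New keys are almost
 * always >= the current tail's key, so scanning backward from the tail
 * usually terminates after a step or two.
 */
static void
cb_insert_sorted(struct cb_list *l, struct cb_entry *new)
{
	struct cb_entry *p = l->cl_tail;

	while (p != NULL && p->ce_txg > new->ce_txg)
		p = p->ce_prev;

	new->ce_prev = p;
	if (p == NULL) {			/* becomes the new head */
		new->ce_next = l->cl_head;
		l->cl_head = new;
	} else {				/* goes right after p */
		new->ce_next = p->ce_next;
		p->ce_next = new;
	}
	if (new->ce_next == NULL)		/* becomes the new tail */
		l->cl_tail = new;
	else
		new->ce_next->ce_prev = new;
}

The actual ztest code below does the same thing with list_tail()/list_prev() and list_insert_head()/list_insert_after().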
*/ - i = ztest_random(5); + tmp_cb = list_tail(&zcl.zcl_callbacks); + while (tmp_cb != NULL && tmp_cb->zcd_txg > txg) + tmp_cb = list_prev(&zcl.zcl_callbacks, tmp_cb); + + /* Add the 3 callbacks to the list */ + for (i = 0; i < 3; i++) { + if (tmp_cb == NULL) + list_insert_head(&zcl.zcl_callbacks, cb_data[i]); + else + list_insert_after(&zcl.zcl_callbacks, tmp_cb, + cb_data[i]); - if (i >= 2) { - tx = dmu_tx_create(os); - dmu_tx_hold_zap(tx, object, TRUE, NULL); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - ztest_record_enospc("zap parallel"); - dmu_tx_abort(tx); - return; - } - txg = dmu_tx_get_txg(tx); - bcopy(name, string_value, namelen); - } else { - tx = NULL; - txg = 0; - bzero(string_value, namelen); + cb_data[i]->zcd_added = B_TRUE; + VERIFY(!cb_data[i]->zcd_called); + + tmp_cb = cb_data[i]; } - switch (i) { + (void) mutex_unlock(&zcl.zcl_callbacks_lock); - case 0: - error = zap_length(os, object, name, &zl_wsize, &zl_wc); - if (error == 0) { - ASSERT3U(wsize, ==, zl_wsize); - ASSERT3U(wc, ==, zl_wc); - } else { - ASSERT3U(error, ==, ENOENT); - } - break; + dmu_tx_commit(tx); +} - case 1: - error = zap_lookup(os, object, name, wsize, wc, data); - if (error == 0) { - if (data == string_value && - bcmp(name, data, namelen) != 0) - fatal(0, "name '%s' != val '%s' len %d", - name, data, namelen); - } else { - ASSERT3U(error, ==, ENOENT); - } - break; +/* ARGSUSED */ +void +ztest_dsl_prop_get_set(ztest_ds_t *zd, uint64_t id) +{ + zfs_prop_t proplist[] = { + ZFS_PROP_CHECKSUM, + ZFS_PROP_COMPRESSION, + ZFS_PROP_COPIES, + ZFS_PROP_DEDUP + }; + ztest_shared_t *zs = ztest_shared; - case 2: - error = zap_add(os, object, name, wsize, wc, data, tx); - ASSERT(error == 0 || error == EEXIST); - break; + (void) rw_rdlock(&zs->zs_name_lock); - case 3: - VERIFY(zap_update(os, object, name, wsize, wc, data, tx) == 0); - break; + for (int p = 0; p < sizeof (proplist) / sizeof (proplist[0]); p++) + (void) ztest_dsl_prop_set_uint64(zd->zd_name, proplist[p], + ztest_random_dsl_prop(proplist[p]), (int)ztest_random(2)); - case 4: - error = zap_remove(os, object, name, tx); - ASSERT(error == 0 || error == ENOENT); - break; - } + (void) rw_unlock(&zs->zs_name_lock); +} - if (tx != NULL) - dmu_tx_commit(tx); +/* ARGSUSED */ +void +ztest_spa_prop_get_set(ztest_ds_t *zd, uint64_t id) +{ + ztest_shared_t *zs = ztest_shared; + nvlist_t *props = NULL; + + (void) rw_rdlock(&zs->zs_name_lock); + + (void) ztest_spa_prop_set_uint64(zs, ZPOOL_PROP_DEDUPDITTO, + ZIO_DEDUPDITTO_MIN + ztest_random(ZIO_DEDUPDITTO_MIN)); + + VERIFY3U(spa_prop_get(zs->zs_spa, &props), ==, 0); + + if (zopt_verbose >= 6) + dump_nvlist(props, 4); + + nvlist_free(props); + + (void) rw_unlock(&zs->zs_name_lock); } +/* + * Test snapshot hold/release and deferred destroy. 
+ */ void -ztest_dsl_prop_get_set(ztest_args_t *za) +ztest_dmu_snapshot_hold(ztest_ds_t *zd, uint64_t id) { - objset_t *os = za->za_os; - int i, inherit; - uint64_t value; - const char *prop, *valname; - char setpoint[MAXPATHLEN]; - char osname[MAXNAMELEN]; int error; + objset_t *os = zd->zd_os; + objset_t *origin; + char snapname[100]; + char fullname[100]; + char clonename[100]; + char tag[100]; + char osname[MAXNAMELEN]; (void) rw_rdlock(&ztest_shared->zs_name_lock); dmu_objset_name(os, osname); - for (i = 0; i < 2; i++) { - if (i == 0) { - prop = "checksum"; - value = ztest_random_checksum(); - inherit = (value == ZIO_CHECKSUM_INHERIT); - } else { - prop = "compression"; - value = ztest_random_compress(); - inherit = (value == ZIO_COMPRESS_INHERIT); + (void) snprintf(snapname, 100, "sh1_%llu", id); + (void) snprintf(fullname, 100, "%s@%s", osname, snapname); + (void) snprintf(clonename, 100, "%s/ch1_%llu", osname, id); + (void) snprintf(tag, 100, "%tag_%llu", id); + + /* + * Clean up from any previous run. + */ + (void) dmu_objset_destroy(clonename, B_FALSE); + (void) dsl_dataset_user_release(osname, snapname, tag, B_FALSE); + (void) dmu_objset_destroy(fullname, B_FALSE); + + /* + * Create snapshot, clone it, mark snap for deferred destroy, + * destroy clone, verify snap was also destroyed. + */ + error = dmu_objset_snapshot(osname, snapname, NULL, FALSE); + if (error) { + if (error == ENOSPC) { + ztest_record_enospc("dmu_objset_snapshot"); + goto out; } + fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error); + } - error = dsl_prop_set(osname, prop, sizeof (value), - !inherit, &value); + error = dmu_objset_hold(fullname, FTAG, &origin); + if (error) + fatal(0, "dmu_objset_hold(%s) = %d", fullname, error); + error = dmu_objset_clone(clonename, dmu_objset_ds(origin), 0); + dmu_objset_rele(origin, FTAG); + if (error) { if (error == ENOSPC) { - ztest_record_enospc("dsl_prop_set"); - break; + ztest_record_enospc("dmu_objset_clone"); + goto out; } + fatal(0, "dmu_objset_clone(%s) = %d", clonename, error); + } - ASSERT3U(error, ==, 0); + error = dmu_objset_destroy(fullname, B_TRUE); + if (error) { + fatal(0, "dmu_objset_destroy(%s, B_TRUE) = %d", + fullname, error); + } - VERIFY3U(dsl_prop_get(osname, prop, sizeof (value), - 1, &value, setpoint), ==, 0); + error = dmu_objset_destroy(clonename, B_FALSE); + if (error) + fatal(0, "dmu_objset_destroy(%s) = %d", clonename, error); - if (i == 0) - valname = zio_checksum_table[value].ci_name; - else - valname = zio_compress_table[value].ci_name; + error = dmu_objset_hold(fullname, FTAG, &origin); + if (error != ENOENT) + fatal(0, "dmu_objset_hold(%s) = %d", fullname, error); - if (zopt_verbose >= 6) { - (void) printf("%s %s = %s for '%s'\n", - osname, prop, valname, setpoint); + /* + * Create snapshot, add temporary hold, verify that we can't + * destroy a held snapshot, mark for deferred destroy, + * release hold, verify snapshot was destroyed. 
+ */ + error = dmu_objset_snapshot(osname, snapname, NULL, FALSE); + if (error) { + if (error == ENOSPC) { + ztest_record_enospc("dmu_objset_snapshot"); + goto out; } + fatal(0, "dmu_objset_snapshot(%s) = %d", fullname, error); + } + + error = dsl_dataset_user_hold(osname, snapname, tag, B_FALSE, B_TRUE); + if (error) + fatal(0, "dsl_dataset_user_hold(%s)", fullname, tag); + + error = dmu_objset_destroy(fullname, B_FALSE); + if (error != EBUSY) { + fatal(0, "dmu_objset_destroy(%s, B_FALSE) = %d", + fullname, error); + } + + error = dmu_objset_destroy(fullname, B_TRUE); + if (error) { + fatal(0, "dmu_objset_destroy(%s, B_TRUE) = %d", + fullname, error); } + error = dsl_dataset_user_release(osname, snapname, tag, B_FALSE); + if (error) + fatal(0, "dsl_dataset_user_release(%s)", fullname, tag); + + VERIFY(dmu_objset_hold(fullname, FTAG, &origin) == ENOENT); + +out: (void) rw_unlock(&ztest_shared->zs_name_lock); } /* * Inject random faults into the on-disk data. */ +/* ARGSUSED */ void -ztest_fault_inject(ztest_args_t *za) +ztest_fault_inject(ztest_ds_t *zd, uint64_t id) { + ztest_shared_t *zs = ztest_shared; + spa_t *spa = zs->zs_spa; int fd; uint64_t offset; - uint64_t leaves = MAX(zopt_mirrors, 1) * zopt_raidz; + uint64_t leaves; uint64_t bad = 0x1990c0ffeedecade; uint64_t top, leaf; char path0[MAXPATHLEN]; char pathrand[MAXPATHLEN]; size_t fsize; - spa_t *spa = za->za_spa; int bshift = SPA_MAXBLOCKSHIFT + 2; /* don't scrog all labels */ int iters = 1000; - int maxfaults = zopt_maxfaults; + int maxfaults; + int mirror_save; vdev_t *vd0 = NULL; uint64_t guid0 = 0; + boolean_t islog = B_FALSE; + + VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0); + maxfaults = MAXFAULTS(); + leaves = MAX(zs->zs_mirrors, 1) * zopt_raidz; + mirror_save = zs->zs_mirrors; + VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); ASSERT(leaves >= 1); @@ -2524,10 +4395,10 @@ ztest_fault_inject(ztest_args_t *za) if (ztest_random(2) == 0) { /* - * Inject errors on a normal data device. + * Inject errors on a normal data device or slog device. */ - top = ztest_random(spa->spa_root_vdev->vdev_children); - leaf = ztest_random(leaves); + top = ztest_random_vdev_top(spa, B_TRUE); + leaf = ztest_random(leaves) + zs->zs_splits; /* * Generate paths to the first leaf in this top-level vdev, @@ -2536,11 +4407,14 @@ ztest_fault_inject(ztest_args_t *za) * and we'll write random garbage to the randomly chosen leaf. */ (void) snprintf(path0, sizeof (path0), ztest_dev_template, - zopt_dir, zopt_pool, top * leaves + 0); + zopt_dir, zopt_pool, top * leaves + zs->zs_splits); (void) snprintf(pathrand, sizeof (pathrand), ztest_dev_template, zopt_dir, zopt_pool, top * leaves + leaf); vd0 = vdev_lookup_by_path(spa->spa_root_vdev, path0); + if (vd0 != NULL && vd0->vdev_top->vdev_islog) + islog = B_TRUE; + if (vd0 != NULL && maxfaults != 1) { /* * Make vd0 explicitly claim to be unreadable, @@ -2584,23 +4458,40 @@ ztest_fault_inject(ztest_args_t *za) maxfaults = INT_MAX; /* no limit on cache devices */ } - dprintf("damaging %s and %s\n", path0, pathrand); - spa_config_exit(spa, SCL_STATE, FTAG); - if (maxfaults == 0) - return; - /* - * If we can tolerate two or more faults, randomly online/offline vd0. + * If we can tolerate two or more faults, or we're dealing + * with a slog, randomly online/offline vd0. 
*/ - if (maxfaults >= 2 && guid0 != 0) { - if (ztest_random(10) < 6) - (void) vdev_offline(spa, guid0, B_TRUE); - else - (void) vdev_online(spa, guid0, B_FALSE, NULL); + if ((maxfaults >= 2 || islog) && guid0 != 0) { + if (ztest_random(10) < 6) { + int flags = (ztest_random(2) == 0 ? + ZFS_OFFLINE_TEMPORARY : 0); + + /* + * We have to grab the zs_name_lock as writer to + * prevent a race between offlining a slog and + * destroying a dataset. Offlining the slog will + * grab a reference on the dataset which may cause + * dmu_objset_destroy() to fail with EBUSY thus + * leaving the dataset in an inconsistent state. + */ + if (islog) + (void) rw_wrlock(&ztest_shared->zs_name_lock); + + VERIFY(vdev_offline(spa, guid0, flags) != EBUSY); + + if (islog) + (void) rw_unlock(&ztest_shared->zs_name_lock); + } else { + (void) vdev_online(spa, guid0, 0, NULL); + } } + if (maxfaults == 0) + return; + /* * We have at least single-fault tolerance, so inject data corruption. */ @@ -2619,173 +4510,196 @@ ztest_fault_inject(ztest_args_t *za) if (offset >= fsize) continue; - if (zopt_verbose >= 6) - (void) printf("injecting bad word into %s," - " offset 0x%llx\n", pathrand, (u_longlong_t)offset); + VERIFY(mutex_lock(&zs->zs_vdev_lock) == 0); + if (mirror_save != zs->zs_mirrors) { + VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); + (void) close(fd); + return; + } if (pwrite(fd, &bad, sizeof (bad), offset) != sizeof (bad)) fatal(1, "can't inject bad word at 0x%llx in %s", offset, pathrand); + + VERIFY(mutex_unlock(&zs->zs_vdev_lock) == 0); + + if (zopt_verbose >= 7) + (void) printf("injected bad word into %s," + " offset 0x%llx\n", pathrand, (u_longlong_t)offset); } (void) close(fd); } /* - * Scrub the pool. + * Verify that DDT repair works as expected. */ void -ztest_scrub(ztest_args_t *za) +ztest_ddt_repair(ztest_ds_t *zd, uint64_t id) { - spa_t *spa = za->za_spa; - - (void) spa_scrub(spa, POOL_SCRUB_EVERYTHING); - (void) poll(NULL, 0, 1000); /* wait a second, then force a restart */ - (void) spa_scrub(spa, POOL_SCRUB_EVERYTHING); -} + ztest_shared_t *zs = ztest_shared; + spa_t *spa = zs->zs_spa; + objset_t *os = zd->zd_os; + ztest_od_t od[1]; + uint64_t object, blocksize, txg, pattern, psize; + enum zio_checksum checksum = spa_dedup_checksum(spa); + dmu_buf_t *db; + dmu_tx_t *tx; + void *buf; + blkptr_t blk; + int copies = 2 * ZIO_DEDUPDITTO_MIN; -/* - * Rename the pool to a different name and then rename it back. - */ -void -ztest_spa_rename(ztest_args_t *za) -{ - char *oldname, *newname; - int error; - spa_t *spa; + blocksize = ztest_random_blocksize(); + blocksize = MIN(blocksize, 2048); /* because we write so many */ - (void) rw_wrlock(&ztest_shared->zs_name_lock); + ztest_od_init(&od[0], id, FTAG, 0, DMU_OT_UINT64_OTHER, blocksize, 0); - oldname = za->za_pool; - newname = umem_alloc(strlen(oldname) + 5, UMEM_NOFAIL); - (void) strcpy(newname, oldname); - (void) strcat(newname, "_tmp"); + if (ztest_object_init(zd, od, sizeof (od), B_FALSE) != 0) + return; /* - * Do the rename + * Take the name lock as writer to prevent anyone else from changing + * the pool and dataset properies we need to maintain during this test. 
*/ - error = spa_rename(oldname, newname); - if (error) - fatal(0, "spa_rename('%s', '%s') = %d", oldname, - newname, error); + (void) rw_wrlock(&zs->zs_name_lock); - /* - * Try to open it under the old name, which shouldn't exist - */ - error = spa_open(oldname, &spa, FTAG); - if (error != ENOENT) - fatal(0, "spa_open('%s') = %d", oldname, error); + if (ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_DEDUP, checksum, + B_FALSE) != 0 || + ztest_dsl_prop_set_uint64(zd->zd_name, ZFS_PROP_COPIES, 1, + B_FALSE) != 0) { + (void) rw_unlock(&zs->zs_name_lock); + return; + } + + object = od[0].od_object; + blocksize = od[0].od_blocksize; + pattern = spa_guid(spa) ^ dmu_objset_fsid_guid(os); + + ASSERT(object != 0); + + tx = dmu_tx_create(os); + dmu_tx_hold_write(tx, object, 0, copies * blocksize); + txg = ztest_tx_assign(tx, TXG_WAIT, FTAG); + if (txg == 0) { + (void) rw_unlock(&zs->zs_name_lock); + return; + } /* - * Open it under the new name and make sure it's still the same spa_t. + * Write all the copies of our block. */ - error = spa_open(newname, &spa, FTAG); - if (error != 0) - fatal(0, "spa_open('%s') = %d", newname, error); + for (int i = 0; i < copies; i++) { + uint64_t offset = i * blocksize; + VERIFY(dmu_buf_hold(os, object, offset, FTAG, &db) == 0); + ASSERT(db->db_offset == offset); + ASSERT(db->db_size == blocksize); + ASSERT(ztest_pattern_match(db->db_data, db->db_size, pattern) || + ztest_pattern_match(db->db_data, db->db_size, 0ULL)); + dmu_buf_will_fill(db, tx); + ztest_pattern_set(db->db_data, db->db_size, pattern); + dmu_buf_rele(db, FTAG); + } - ASSERT(spa == za->za_spa); - spa_close(spa, FTAG); + dmu_tx_commit(tx); + txg_wait_synced(spa_get_dsl(spa), txg); /* - * Rename it back to the original + * Find out what block we got. */ - error = spa_rename(newname, oldname); - if (error) - fatal(0, "spa_rename('%s', '%s') = %d", newname, - oldname, error); + VERIFY(dmu_buf_hold(os, object, 0, FTAG, &db) == 0); + blk = *((dmu_buf_impl_t *)db)->db_blkptr; + dmu_buf_rele(db, FTAG); /* - * Make sure it can still be opened + * Damage the block. Dedup-ditto will save us when we read it later. */ - error = spa_open(oldname, &spa, FTAG); - if (error != 0) - fatal(0, "spa_open('%s') = %d", oldname, error); + psize = BP_GET_PSIZE(&blk); + buf = zio_buf_alloc(psize); + ztest_pattern_set(buf, psize, ~pattern); - ASSERT(spa == za->za_spa); - spa_close(spa, FTAG); + (void) zio_wait(zio_rewrite(NULL, spa, 0, &blk, + buf, psize, NULL, NULL, ZIO_PRIORITY_SYNC_WRITE, + ZIO_FLAG_CANFAIL | ZIO_FLAG_INDUCE_DAMAGE, NULL)); - umem_free(newname, strlen(newname) + 1); + zio_buf_free(buf, psize); - (void) rw_unlock(&ztest_shared->zs_name_lock); + (void) rw_unlock(&zs->zs_name_lock); } - /* - * Completely obliterate one disk. + * Scrub the pool. */ -static void -ztest_obliterate_one_disk(uint64_t vdev) +/* ARGSUSED */ +void +ztest_scrub(ztest_ds_t *zd, uint64_t id) { - int fd; - char dev_name[MAXPATHLEN], copy_name[MAXPATHLEN]; - size_t fsize; + ztest_shared_t *zs = ztest_shared; + spa_t *spa = zs->zs_spa; - if (zopt_maxfaults < 2) - return; + (void) spa_scrub(spa, POOL_SCRUB_EVERYTHING); + (void) poll(NULL, 0, 100); /* wait a moment, then force a restart */ + (void) spa_scrub(spa, POOL_SCRUB_EVERYTHING); +} - (void) sprintf(dev_name, ztest_dev_template, zopt_dir, zopt_pool, vdev); - (void) snprintf(copy_name, MAXPATHLEN, "%s.old", dev_name); +/* + * Rename the pool to a different name and then rename it back. 
+ */ +/* ARGSUSED */ +void +ztest_spa_rename(ztest_ds_t *zd, uint64_t id) +{ + ztest_shared_t *zs = ztest_shared; + char *oldname, *newname; + spa_t *spa; - fd = open(dev_name, O_RDWR); + (void) rw_wrlock(&zs->zs_name_lock); - if (fd == -1) - fatal(1, "can't open %s", dev_name); + oldname = zs->zs_pool; + newname = umem_alloc(strlen(oldname) + 5, UMEM_NOFAIL); + (void) strcpy(newname, oldname); + (void) strcat(newname, "_tmp"); /* - * Determine the size. + * Do the rename */ - fsize = lseek(fd, 0, SEEK_END); - - (void) close(fd); + VERIFY3U(0, ==, spa_rename(oldname, newname)); /* - * Rename the old device to dev_name.old (useful for debugging). + * Try to open it under the old name, which shouldn't exist */ - VERIFY(rename(dev_name, copy_name) == 0); + VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG)); /* - * Create a new one. + * Open it under the new name and make sure it's still the same spa_t. */ - VERIFY((fd = open(dev_name, O_RDWR | O_CREAT | O_TRUNC, 0666)) >= 0); - VERIFY(ftruncate(fd, fsize) == 0); - (void) close(fd); -} + VERIFY3U(0, ==, spa_open(newname, &spa, FTAG)); -static void -ztest_replace_one_disk(spa_t *spa, uint64_t vdev) -{ - char dev_name[MAXPATHLEN]; - nvlist_t *root; - int error; - uint64_t guid; - vdev_t *vd; + ASSERT(spa == zs->zs_spa); + spa_close(spa, FTAG); - (void) sprintf(dev_name, ztest_dev_template, zopt_dir, zopt_pool, vdev); + /* + * Rename it back to the original + */ + VERIFY3U(0, ==, spa_rename(newname, oldname)); /* - * Build the nvlist describing dev_name. + * Make sure it can still be opened */ - root = make_vdev_root(dev_name, NULL, 0, 0, 0, 0, 0, 1); + VERIFY3U(0, ==, spa_open(oldname, &spa, FTAG)); - spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); - if ((vd = vdev_lookup_by_path(spa->spa_root_vdev, dev_name)) == NULL) - guid = 0; - else - guid = vd->vdev_guid; - spa_config_exit(spa, SCL_VDEV, FTAG); - error = spa_vdev_attach(spa, guid, root, B_TRUE); - if (error != 0 && - error != EBUSY && - error != ENOTSUP && - error != ENODEV && - error != EDOM) - fatal(0, "spa_vdev_attach(in-place) = %d", error); + ASSERT(spa == zs->zs_spa); + spa_close(spa, FTAG); - nvlist_free(root); + umem_free(newname, strlen(newname) + 1); + + (void) rw_unlock(&zs->zs_name_lock); } +/* + * Verify pool integrity by running zdb. + */ static void -ztest_verify_blocks(char *pool) +ztest_run_zdb(char *pool) { int status; char zdb[MAXPATHLEN + MAXNAMELEN + 20]; @@ -2806,7 +4720,7 @@ ztest_verify_blocks(char *pool) isa = strdup(isa); /* LINTED */ (void) sprintf(bin, - "/usr/sbin%.*s/zdb -bc%s%s -U /tmp/zpool.cache %s", + "/usr/sbin%.*s/zdb -bcc%s%s -U /tmp/zpool.cache %s", isalen, isa, zopt_verbose >= 3 ? "s" : "", @@ -2853,10 +4767,9 @@ ztest_walk_pool_directory(char *header) static void ztest_spa_import_export(char *oldname, char *newname) { - nvlist_t *config; + nvlist_t *config, *newconfig; uint64_t pool_guid; spa_t *spa; - int error; if (zopt_verbose >= 4) { (void) printf("import/export: old = %s, new = %s\n", @@ -2871,9 +4784,13 @@ ztest_spa_import_export(char *oldname, char *newname) /* * Get the pool's configuration and guid. */ - error = spa_open(oldname, &spa, FTAG); - if (error) - fatal(0, "spa_open('%s') = %d", oldname, error); + VERIFY3U(0, ==, spa_open(oldname, &spa, FTAG)); + + /* + * Kick off a scrub to tickle scrub/export races. + */ + if (ztest_random(2) == 0) + (void) spa_scrub(spa, POOL_SCRUB_EVERYTHING); pool_guid = spa_guid(spa); spa_close(spa, FTAG); @@ -2883,225 +4800,337 @@ ztest_spa_import_export(char *oldname, char *newname) /* * Export it. 
*/ - error = spa_export(oldname, &config, B_FALSE); - if (error) - fatal(0, "spa_export('%s') = %d", oldname, error); + VERIFY3U(0, ==, spa_export(oldname, &config, B_FALSE, B_FALSE)); ztest_walk_pool_directory("pools after export"); + /* + * Try to import it. + */ + newconfig = spa_tryimport(config); + ASSERT(newconfig != NULL); + nvlist_free(newconfig); + /* * Import it under the new name. */ - error = spa_import(newname, config, NULL); - if (error) - fatal(0, "spa_import('%s') = %d", newname, error); + VERIFY3U(0, ==, spa_import(newname, config, NULL)); ztest_walk_pool_directory("pools after import"); /* * Try to import it again -- should fail with EEXIST. */ - error = spa_import(newname, config, NULL); - if (error != EEXIST) - fatal(0, "spa_import('%s') twice", newname); + VERIFY3U(EEXIST, ==, spa_import(newname, config, NULL)); /* * Try to import it under a different name -- should fail with EEXIST. */ - error = spa_import(oldname, config, NULL); - if (error != EEXIST) - fatal(0, "spa_import('%s') under multiple names", newname); + VERIFY3U(EEXIST, ==, spa_import(oldname, config, NULL)); /* * Verify that the pool is no longer visible under the old name. */ - error = spa_open(oldname, &spa, FTAG); - if (error != ENOENT) - fatal(0, "spa_open('%s') = %d", newname, error); + VERIFY3U(ENOENT, ==, spa_open(oldname, &spa, FTAG)); /* * Verify that we can open and close the pool using the new name. */ - error = spa_open(newname, &spa, FTAG); - if (error) - fatal(0, "spa_open('%s') = %d", newname, error); + VERIFY3U(0, ==, spa_open(newname, &spa, FTAG)); ASSERT(pool_guid == spa_guid(spa)); spa_close(spa, FTAG); nvlist_free(config); } +static void +ztest_resume(spa_t *spa) +{ + if (spa_suspended(spa) && zopt_verbose >= 6) + (void) printf("resuming from suspended state\n"); + spa_vdev_state_enter(spa, SCL_NONE); + vdev_clear(spa, NULL); + (void) spa_vdev_state_exit(spa, NULL, 0); + (void) zio_resume(spa); +} + static void * -ztest_resume(void *arg) +ztest_resume_thread(void *arg) { spa_t *spa = arg; while (!ztest_exiting) { - (void) poll(NULL, 0, 1000); + if (spa_suspended(spa)) + ztest_resume(spa); + (void) poll(NULL, 0, 100); + } + return (NULL); +} - if (!spa_suspended(spa)) - continue; +static void * +ztest_deadman_thread(void *arg) +{ + ztest_shared_t *zs = arg; + int grace = 300; + hrtime_t delta; - spa_vdev_state_enter(spa); - vdev_clear(spa, NULL); - (void) spa_vdev_state_exit(spa, NULL, 0); + delta = (zs->zs_thread_stop - zs->zs_thread_start) / NANOSEC + grace; + + (void) poll(NULL, 0, (int)(1000 * delta)); + + fatal(0, "failed to complete within %d seconds of deadline", grace); - zio_resume(spa); - } return (NULL); } +static void +ztest_execute(ztest_info_t *zi, uint64_t id) +{ + ztest_shared_t *zs = ztest_shared; + ztest_ds_t *zd = &zs->zs_zd[id % zopt_datasets]; + hrtime_t functime = gethrtime(); + + for (int i = 0; i < zi->zi_iters; i++) + zi->zi_func(zd, id); + + functime = gethrtime() - functime; + + atomic_add_64(&zi->zi_call_count, 1); + atomic_add_64(&zi->zi_call_time, functime); + + if (zopt_verbose >= 4) { + Dl_info dli; + (void) dladdr((void *)zi->zi_func, &dli); + (void) printf("%6.2f sec in %s\n", + (double)functime / NANOSEC, dli.dli_sname); + } +} + static void * ztest_thread(void *arg) { - ztest_args_t *za = arg; + uint64_t id = (uintptr_t)arg; ztest_shared_t *zs = ztest_shared; - hrtime_t now, functime; + uint64_t call_next; + hrtime_t now; ztest_info_t *zi; - int f, i; - while ((now = gethrtime()) < za->za_stop) { + while ((now = gethrtime()) < zs->zs_thread_stop) { /* 
* See if it's time to force a crash. */ - if (now > za->za_kill) { - zs->zs_alloc = spa_get_alloc(za->za_spa); - zs->zs_space = spa_get_space(za->za_spa); - (void) kill(getpid(), SIGKILL); - } + if (now > zs->zs_thread_kill) + ztest_kill(zs); /* - * Pick a random function. + * If we're getting ENOSPC with some regularity, stop. */ - f = ztest_random(ZTEST_FUNCS); - zi = &zs->zs_info[f]; + if (zs->zs_enospc_count > 10) + break; /* - * Decide whether to call it, based on the requested frequency. + * Pick a random function to execute. */ - if (zi->zi_call_target == 0 || - (double)zi->zi_call_total / zi->zi_call_target > - (double)(now - zs->zs_start_time) / (zopt_time * NANOSEC)) - continue; + zi = &zs->zs_info[ztest_random(ZTEST_FUNCS)]; + call_next = zi->zi_call_next; - atomic_add_64(&zi->zi_calls, 1); - atomic_add_64(&zi->zi_call_total, 1); + if (now >= call_next && + atomic_cas_64(&zi->zi_call_next, call_next, call_next + + ztest_random(2 * zi->zi_interval[0] + 1)) == call_next) + ztest_execute(zi, id); + } - za->za_diroff = (za->za_instance * ZTEST_FUNCS + f) * - ZTEST_DIRSIZE; - za->za_diroff_shared = (1ULL << 63); + return (NULL); +} - for (i = 0; i < zi->zi_iters; i++) - zi->zi_func(za); +static void +ztest_dataset_name(char *dsname, char *pool, int d) +{ + (void) snprintf(dsname, MAXNAMELEN, "%s/ds_%d", pool, d); +} - functime = gethrtime() - now; +static void +ztest_dataset_destroy(ztest_shared_t *zs, int d) +{ + char name[MAXNAMELEN]; - atomic_add_64(&zi->zi_call_time, functime); + ztest_dataset_name(name, zs->zs_pool, d); - if (zopt_verbose >= 4) { - Dl_info dli; - (void) dladdr((void *)zi->zi_func, &dli); - (void) printf("%6.2f sec in %s\n", - (double)functime / NANOSEC, dli.dli_sname); - } + if (zopt_verbose >= 3) + (void) printf("Destroying %s to free up space\n", name); - /* - * If we're getting ENOSPC with some regularity, stop. - */ - if (zs->zs_enospc_count > 10) - break; + /* + * Cleanup any non-standard clones and snapshots. In general, + * ztest thread t operates on dataset (t % zopt_datasets), + * so there may be more than one thing to clean up. + */ + for (int t = d; t < zopt_threads; t += zopt_datasets) + ztest_dsl_dataset_cleanup(name, t); + + (void) dmu_objset_find(name, ztest_objset_destroy_cb, NULL, + DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); +} + +static void +ztest_dataset_dirobj_verify(ztest_ds_t *zd) +{ + uint64_t usedobjs, dirobjs, scratch; + + /* + * ZTEST_DIROBJ is the object directory for the entire dataset. + * Therefore, the number of objects in use should equal the + * number of ZTEST_DIROBJ entries, +1 for ZTEST_DIROBJ itself. + * If not, we have an object leak. + * + * Note that we can only check this in ztest_dataset_open(), + * when the open-context and syncing-context values agree. + * That's because zap_count() returns the open-context value, + * while dmu_objset_space() returns the rootbp fill count. 
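As an aside on the ztest_thread() loop above: the atomic_cas_64() on zi_call_next is what lets many threads share one schedule without a lock, since only the thread whose compare-and-swap succeeds gets to execute the function for that due time. A stand-alone sketch of the same claim-and-reschedule pattern, using C11 atomics and hypothetical names (this is an illustration, not the ztest code), could look like:

#include <stdatomic.h>
#include <stdint.h>

typedef struct sched_slot {
	_Atomic uint64_t ss_next_due;	/* absolute time of the next call */
	uint64_t ss_interval;		/* nominal spacing between calls */
} sched_slot_t;

/*
 * Return 1 if the calling thread won the right to run this slot now,
 * 0 otherwise.  Only one CAS against the observed due time can succeed,
 * so each due slot is executed by exactly one thread.
 */
static int
sched_try_claim(sched_slot_t *ss, uint64_t now, uint64_t jitter)
{
	uint64_t due = atomic_load(&ss->ss_next_due);

	if (now < due)
		return (0);		/* not due yet */

	/* Claim the slot and push the next due time forward atomically. */
	return (atomic_compare_exchange_strong(&ss->ss_next_due, &due,
	    due + ss->ss_interval + jitter));
}

The random jitter plays the role of ztest_random(2 * zi_interval[0] + 1) above, keeping the average call rate near the requested interval while spreading the calls out in time.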
+ */ + VERIFY3U(0, ==, zap_count(zd->zd_os, ZTEST_DIROBJ, &dirobjs)); + dmu_objset_space(zd->zd_os, &scratch, &scratch, &usedobjs, &scratch); + ASSERT3U(dirobjs + 1, ==, usedobjs); +} + +static int +ztest_dataset_open(ztest_shared_t *zs, int d) +{ + ztest_ds_t *zd = &zs->zs_zd[d]; + uint64_t committed_seq = zd->zd_seq; + objset_t *os; + zilog_t *zilog; + char name[MAXNAMELEN]; + int error; + + ztest_dataset_name(name, zs->zs_pool, d); + + (void) rw_rdlock(&zs->zs_name_lock); + + error = dmu_objset_create(name, DMU_OST_OTHER, 0, + ztest_objset_create_cb, NULL); + if (error == ENOSPC) { + (void) rw_unlock(&zs->zs_name_lock); + ztest_record_enospc(FTAG); + return (error); } + ASSERT(error == 0 || error == EEXIST); - return (NULL); + VERIFY3U(dmu_objset_hold(name, zd, &os), ==, 0); + (void) rw_unlock(&zs->zs_name_lock); + + ztest_zd_init(zd, os); + + zilog = zd->zd_zilog; + + if (zilog->zl_header->zh_claim_lr_seq != 0 && + zilog->zl_header->zh_claim_lr_seq < committed_seq) + fatal(0, "missing log records: claimed %llu < committed %llu", + zilog->zl_header->zh_claim_lr_seq, committed_seq); + + ztest_dataset_dirobj_verify(zd); + + zil_replay(os, zd, ztest_replay_vector); + + ztest_dataset_dirobj_verify(zd); + + if (zopt_verbose >= 6) + (void) printf("%s replay %llu blocks, %llu records, seq %llu\n", + zd->zd_name, + (u_longlong_t)zilog->zl_parse_blk_count, + (u_longlong_t)zilog->zl_parse_lr_count, + (u_longlong_t)zilog->zl_replaying_seq); + + zilog = zil_open(os, ztest_get_data); + + if (zilog->zl_replaying_seq != 0 && + zilog->zl_replaying_seq < committed_seq) + fatal(0, "missing log records: replayed %llu < committed %llu", + zilog->zl_replaying_seq, committed_seq); + + return (0); +} + +static void +ztest_dataset_close(ztest_shared_t *zs, int d) +{ + ztest_ds_t *zd = &zs->zs_zd[d]; + + zil_close(zd->zd_zilog); + dmu_objset_rele(zd->zd_os, zd); + + ztest_zd_fini(zd); } /* * Kick off threads to run tests on all datasets in parallel. */ static void -ztest_run(char *pool) +ztest_run(ztest_shared_t *zs) { - int t, d, error; - ztest_shared_t *zs = ztest_shared; - ztest_args_t *za; + thread_t *tid; spa_t *spa; - char name[100]; thread_t resume_tid; + int error; ztest_exiting = B_FALSE; - (void) _mutex_init(&zs->zs_vdev_lock, USYNC_THREAD, NULL); - (void) rwlock_init(&zs->zs_name_lock, USYNC_THREAD, NULL); - - for (t = 0; t < ZTEST_SYNC_LOCKS; t++) - (void) _mutex_init(&zs->zs_sync_lock[t], USYNC_THREAD, NULL); - /* - * Destroy one disk before we even start. - * It's mirrored, so everything should work just fine. - * This makes us exercise fault handling very early in spa_load(). + * Initialize parent/child shared state. */ - ztest_obliterate_one_disk(0); + VERIFY(_mutex_init(&zs->zs_vdev_lock, USYNC_THREAD, NULL) == 0); + VERIFY(rwlock_init(&zs->zs_name_lock, USYNC_THREAD, NULL) == 0); - /* - * Verify that the sum of the sizes of all blocks in the pool - * equals the SPA's allocated space total. - */ - ztest_verify_blocks(pool); + zs->zs_thread_start = gethrtime(); + zs->zs_thread_stop = zs->zs_thread_start + zopt_passtime * NANOSEC; + zs->zs_thread_stop = MIN(zs->zs_thread_stop, zs->zs_proc_stop); + zs->zs_thread_kill = zs->zs_thread_stop; + if (ztest_random(100) < zopt_killrate) + zs->zs_thread_kill -= ztest_random(zopt_passtime * NANOSEC); - /* - * Kick off a replacement of the disk we just obliterated. 
- */ - kernel_init(FREAD | FWRITE); - VERIFY(spa_open(pool, &spa, FTAG) == 0); - ztest_replace_one_disk(spa, 0); - if (zopt_verbose >= 5) - show_pool_stats(spa); - spa_close(spa, FTAG); - kernel_fini(); + (void) _mutex_init(&zcl.zcl_callbacks_lock, USYNC_THREAD, NULL); - kernel_init(FREAD | FWRITE); + list_create(&zcl.zcl_callbacks, sizeof (ztest_cb_data_t), + offsetof(ztest_cb_data_t, zcd_node)); /* - * Verify that we can export the pool and reimport it under a - * different name. + * Open our pool. */ - if (ztest_random(2) == 0) { - (void) snprintf(name, 100, "%s_import", pool); - ztest_spa_import_export(pool, name); - ztest_spa_import_export(name, pool); - } + kernel_init(FREAD | FWRITE); + VERIFY(spa_open(zs->zs_pool, &spa, FTAG) == 0); + zs->zs_spa = spa; - /* - * Verify that we can loop over all pools. - */ - mutex_enter(&spa_namespace_lock); - for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa)) { - if (zopt_verbose > 3) { - (void) printf("spa_next: found %s\n", spa_name(spa)); - } - } - mutex_exit(&spa_namespace_lock); + spa->spa_dedup_ditto = 2 * ZIO_DEDUPDITTO_MIN; /* - * Open our pool. + * We don't expect the pool to suspend unless maxfaults == 0, + * in which case ztest_fault_inject() temporarily takes away + * the only valid replica. */ - VERIFY(spa_open(pool, &spa, FTAG) == 0); + if (MAXFAULTS() == 0) + spa->spa_failmode = ZIO_FAILURE_MODE_WAIT; + else + spa->spa_failmode = ZIO_FAILURE_MODE_PANIC; /* * Create a thread to periodically resume suspended I/O. */ - VERIFY(thr_create(0, 0, ztest_resume, spa, THR_BOUND, + VERIFY(thr_create(0, 0, ztest_resume_thread, spa, THR_BOUND, &resume_tid) == 0); + /* + * Create a deadman thread to abort() if we hang. + */ + VERIFY(thr_create(0, 0, ztest_deadman_thread, zs, THR_BOUND, + NULL) == 0); + /* * Verify that we can safely inquire about about any object, * whether it's allocated or not. To make it interesting, * we probe a 5-wide window around each power of two. * This hits all edge cases, including zero and the max. */ - for (t = 0; t < 64; t++) { - for (d = -5; d <= 5; d++) { + for (int t = 0; t < 64; t++) { + for (int d = -5; d <= 5; d++) { error = dmu_object_info(spa->spa_meta_objset, (1ULL << t) + d, NULL); ASSERT(error == 0 || error == ENOENT || @@ -3110,118 +5139,156 @@ ztest_run(char *pool) } /* - * Now kick off all the tests that run in parallel. + * If we got any ENOSPC errors on the previous run, destroy something. 
*/ + if (zs->zs_enospc_count != 0) { + int d = ztest_random(zopt_datasets); + ztest_dataset_destroy(zs, d); + } zs->zs_enospc_count = 0; - za = umem_zalloc(zopt_threads * sizeof (ztest_args_t), UMEM_NOFAIL); + tid = umem_zalloc(zopt_threads * sizeof (thread_t), UMEM_NOFAIL); if (zopt_verbose >= 4) (void) printf("starting main threads...\n"); - za[0].za_start = gethrtime(); - za[0].za_stop = za[0].za_start + zopt_passtime * NANOSEC; - za[0].za_stop = MIN(za[0].za_stop, zs->zs_stop_time); - za[0].za_kill = za[0].za_stop; - if (ztest_random(100) < zopt_killrate) - za[0].za_kill -= ztest_random(zopt_passtime * NANOSEC); - - for (t = 0; t < zopt_threads; t++) { - d = t % zopt_datasets; - - (void) strcpy(za[t].za_pool, pool); - za[t].za_os = za[d].za_os; - za[t].za_spa = spa; - za[t].za_zilog = za[d].za_zilog; - za[t].za_instance = t; - za[t].za_random = ztest_random(-1ULL); - za[t].za_start = za[0].za_start; - za[t].za_stop = za[0].za_stop; - za[t].za_kill = za[0].za_kill; - - if (t < zopt_datasets) { - ztest_replay_t zr; - int test_future = FALSE; - (void) rw_rdlock(&ztest_shared->zs_name_lock); - (void) snprintf(name, 100, "%s/%s_%d", pool, pool, d); - error = dmu_objset_create(name, DMU_OST_OTHER, NULL, 0, - ztest_create_cb, NULL); - if (error == EEXIST) { - test_future = TRUE; - } else if (error == ENOSPC) { - zs->zs_enospc_count++; - (void) rw_unlock(&ztest_shared->zs_name_lock); - break; - } else if (error != 0) { - fatal(0, "dmu_objset_create(%s) = %d", - name, error); - } - error = dmu_objset_open(name, DMU_OST_OTHER, - DS_MODE_USER, &za[d].za_os); - if (error) - fatal(0, "dmu_objset_open('%s') = %d", - name, error); - (void) rw_unlock(&ztest_shared->zs_name_lock); - if (test_future) - ztest_dmu_check_future_leak(&za[t]); - zr.zr_os = za[d].za_os; - zil_replay(zr.zr_os, &zr, &zr.zr_assign, - ztest_replay_vector, NULL); - za[d].za_zilog = zil_open(za[d].za_os, NULL); - } + /* + * Kick off all the tests that run in parallel. + */ + for (int t = 0; t < zopt_threads; t++) { + if (t < zopt_datasets && ztest_dataset_open(zs, t) != 0) + return; + VERIFY(thr_create(0, 0, ztest_thread, (void *)(uintptr_t)t, + THR_BOUND, &tid[t]) == 0); + } - VERIFY(thr_create(0, 0, ztest_thread, &za[t], THR_BOUND, - &za[t].za_thread) == 0); + /* + * Wait for all of the tests to complete. We go in reverse order + * so we don't close datasets while threads are still using them. + */ + for (int t = zopt_threads - 1; t >= 0; t--) { + VERIFY(thr_join(tid[t], NULL, NULL) == 0); + if (t < zopt_datasets) + ztest_dataset_close(zs, t); } - while (--t >= 0) { - VERIFY(thr_join(za[t].za_thread, NULL, NULL) == 0); - if (t < zopt_datasets) { - zil_close(za[t].za_zilog); - dmu_objset_close(za[t].za_os); - } + txg_wait_synced(spa_get_dsl(spa), 0); + + zs->zs_alloc = metaslab_class_get_alloc(spa_normal_class(spa)); + zs->zs_space = metaslab_class_get_space(spa_normal_class(spa)); + + umem_free(tid, zopt_threads * sizeof (thread_t)); + + /* Kill the resume thread */ + ztest_exiting = B_TRUE; + VERIFY(thr_join(resume_tid, NULL, NULL) == 0); + ztest_resume(spa); + + /* + * Right before closing the pool, kick off a bunch of async I/O; + * spa_close() should wait for it to complete. + */ + for (uint64_t object = 1; object < 50; object++) + dmu_prefetch(spa->spa_meta_objset, object, 0, 1ULL << 20); + + spa_close(spa, FTAG); + + /* + * Verify that we can loop over all pools. 
+ */ + mutex_enter(&spa_namespace_lock); + for (spa = spa_next(NULL); spa != NULL; spa = spa_next(spa)) + if (zopt_verbose > 3) + (void) printf("spa_next: found %s\n", spa_name(spa)); + mutex_exit(&spa_namespace_lock); + + /* + * Verify that we can export the pool and reimport it under a + * different name. + */ + if (ztest_random(2) == 0) { + char name[MAXNAMELEN]; + (void) snprintf(name, MAXNAMELEN, "%s_import", zs->zs_pool); + ztest_spa_import_export(zs->zs_pool, name); + ztest_spa_import_export(name, zs->zs_pool); } - if (zopt_verbose >= 3) - show_pool_stats(spa); + kernel_fini(); +} - txg_wait_synced(spa_get_dsl(spa), 0); +static void +ztest_freeze(ztest_shared_t *zs) +{ + ztest_ds_t *zd = &zs->zs_zd[0]; + spa_t *spa; + + if (zopt_verbose >= 3) + (void) printf("testing spa_freeze()...\n"); - zs->zs_alloc = spa_get_alloc(spa); - zs->zs_space = spa_get_space(spa); + kernel_init(FREAD | FWRITE); + VERIFY3U(0, ==, spa_open(zs->zs_pool, &spa, FTAG)); + VERIFY3U(0, ==, ztest_dataset_open(zs, 0)); /* - * If we had out-of-space errors, destroy a random objset. + * Force the first log block to be transactionally allocated. + * We have to do this before we freeze the pool -- otherwise + * the log chain won't be anchored. */ - if (zs->zs_enospc_count != 0) { - (void) rw_rdlock(&ztest_shared->zs_name_lock); - d = (int)ztest_random(zopt_datasets); - (void) snprintf(name, 100, "%s/%s_%d", pool, pool, d); - if (zopt_verbose >= 3) - (void) printf("Destroying %s to free up space\n", name); - (void) dmu_objset_find(name, ztest_destroy_cb, &za[d], - DS_FIND_SNAPSHOTS | DS_FIND_CHILDREN); - (void) rw_unlock(&ztest_shared->zs_name_lock); + while (BP_IS_HOLE(&zd->zd_zilog->zl_header->zh_log)) { + ztest_dmu_object_alloc_free(zd, 0); + zil_commit(zd->zd_zilog, UINT64_MAX, 0); } txg_wait_synced(spa_get_dsl(spa), 0); - umem_free(za, zopt_threads * sizeof (ztest_args_t)); + /* + * Freeze the pool. This stops spa_sync() from doing anything, + * so that the only way to record changes from now on is the ZIL. + */ + spa_freeze(spa); - /* Kill the resume thread */ - ztest_exiting = B_TRUE; - VERIFY(thr_join(resume_tid, NULL, NULL) == 0); + /* + * Run tests that generate log records but don't alter the pool config + * or depend on DSL sync tasks (snapshots, objset create/destroy, etc). + * We do a txg_wait_synced() after each iteration to force the txg + * to increase well beyond the last synced value in the uberblock. + * The ZIL should be OK with that. + */ + while (ztest_random(20) != 0) { + ztest_dmu_write_parallel(zd, 0); + ztest_dmu_object_alloc_free(zd, 0); + txg_wait_synced(spa_get_dsl(spa), 0); + } /* - * Right before closing the pool, kick off a bunch of async I/O; - * spa_close() should wait for it to complete. + * Commit all of the changes we just generated. */ - for (t = 1; t < 50; t++) - dmu_prefetch(spa->spa_meta_objset, t, 0, 1 << 15); + zil_commit(zd->zd_zilog, UINT64_MAX, 0); + txg_wait_synced(spa_get_dsl(spa), 0); + /* + * Close our dataset and close the pool. + */ + ztest_dataset_close(zs, 0); spa_close(spa, FTAG); + kernel_fini(); + /* + * Open and close the pool and dataset to induce log replay. 
+ */ + kernel_init(FREAD | FWRITE); + VERIFY3U(0, ==, spa_open(zs->zs_pool, &spa, FTAG)); + VERIFY3U(0, ==, ztest_dataset_open(zs, 0)); + ztest_dataset_close(zs, 0); + spa_close(spa, FTAG); kernel_fini(); + + list_destroy(&zcl.zcl_callbacks); + + (void) _mutex_destroy(&zcl.zcl_callbacks_lock); + + (void) rwlock_destroy(&zs->zs_name_lock); + (void) _mutex_destroy(&zs->zs_vdev_lock); } void @@ -3249,41 +5316,62 @@ print_time(hrtime_t t, char *timebuf) (void) sprintf(timebuf, "%llus", s); } +static nvlist_t * +make_random_props() +{ + nvlist_t *props; + + if (ztest_random(2) == 0) + return (NULL); + + VERIFY(nvlist_alloc(&props, NV_UNIQUE_NAME, 0) == 0); + VERIFY(nvlist_add_uint64(props, "autoreplace", 1) == 0); + + (void) printf("props:\n"); + dump_nvlist(props, 4); + + return (props); +} + /* * Create a storage pool with the given name and initial vdev size. - * Then create the specified number of datasets in the pool. + * Then test spa_freeze() functionality. */ static void -ztest_init(char *pool) +ztest_init(ztest_shared_t *zs) { spa_t *spa; - int error; - nvlist_t *nvroot; + nvlist_t *nvroot, *props; + + VERIFY(_mutex_init(&zs->zs_vdev_lock, USYNC_THREAD, NULL) == 0); + VERIFY(rwlock_init(&zs->zs_name_lock, USYNC_THREAD, NULL) == 0); kernel_init(FREAD | FWRITE); /* * Create the storage pool. */ - (void) spa_destroy(pool); - ztest_shared->zs_vdev_primaries = 0; + (void) spa_destroy(zs->zs_pool); + ztest_shared->zs_vdev_next_leaf = 0; + zs->zs_splits = 0; + zs->zs_mirrors = zopt_mirrors; nvroot = make_vdev_root(NULL, NULL, zopt_vdev_size, 0, - 0, zopt_raidz, zopt_mirrors, 1); - error = spa_create(pool, nvroot, NULL, NULL, NULL); + 0, zopt_raidz, zs->zs_mirrors, 1); + props = make_random_props(); + VERIFY3U(0, ==, spa_create(zs->zs_pool, nvroot, props, NULL, NULL)); nvlist_free(nvroot); - if (error) - fatal(0, "spa_create() = %d", error); - error = spa_open(pool, &spa, FTAG); - if (error) - fatal(0, "spa_open() = %d", error); - - if (zopt_verbose >= 3) - show_pool_stats(spa); - + VERIFY3U(0, ==, spa_open(zs->zs_pool, &spa, FTAG)); + metaslab_sz = 1ULL << spa->spa_root_vdev->vdev_child[0]->vdev_ms_shift; spa_close(spa, FTAG); kernel_fini(); + + ztest_run_zdb(zs->zs_pool); + + ztest_freeze(zs); + + ztest_run_zdb(zs->zs_pool); } int @@ -3291,11 +5379,12 @@ main(int argc, char **argv) { int kills = 0; int iters = 0; - int i, f; ztest_shared_t *zs; + size_t shared_size; ztest_info_t *zi; char timebuf[100]; char numbuf[6]; + spa_t *spa; (void) setvbuf(stdout, NULL, _IOLBF, 0); @@ -3306,19 +5395,16 @@ main(int argc, char **argv) process_options(argc, argv); - argc -= optind; - argv += optind; - - dprintf_setup(&argc, argv); - /* * Blow away any existing copy of zpool.cache */ if (zopt_init != 0) (void) remove("/tmp/zpool.cache"); + shared_size = sizeof (*zs) + zopt_datasets * sizeof (ztest_ds_t); + zs = ztest_shared = (void *)mmap(0, - P2ROUNDUP(sizeof (ztest_shared_t), getpagesize()), + P2ROUNDUP(shared_size, getpagesize()), PROT_READ | PROT_WRITE, MAP_SHARED | MAP_ANON, -1, 0); if (zopt_verbose >= 1) { @@ -3331,49 +5417,49 @@ main(int argc, char **argv) /* * Create and initialize our storage pool. */ - for (i = 1; i <= zopt_init; i++) { + for (int i = 1; i <= zopt_init; i++) { bzero(zs, sizeof (ztest_shared_t)); if (zopt_verbose >= 3 && zopt_init != 1) (void) printf("ztest_init(), pass %d\n", i); - ztest_init(zopt_pool); + zs->zs_pool = zopt_pool; + ztest_init(zs); } - /* - * Initialize the call targets for each function. 
- */ - for (f = 0; f < ZTEST_FUNCS; f++) { - zi = &zs->zs_info[f]; + zs->zs_pool = zopt_pool; + zs->zs_proc_start = gethrtime(); + zs->zs_proc_stop = zs->zs_proc_start + zopt_time * NANOSEC; + for (int f = 0; f < ZTEST_FUNCS; f++) { + zi = &zs->zs_info[f]; *zi = ztest_info[f]; - - if (*zi->zi_interval == 0) - zi->zi_call_target = UINT64_MAX; + if (zs->zs_proc_start + zi->zi_interval[0] > zs->zs_proc_stop) + zi->zi_call_next = UINT64_MAX; else - zi->zi_call_target = zopt_time / *zi->zi_interval; + zi->zi_call_next = zs->zs_proc_start + + ztest_random(2 * zi->zi_interval[0] + 1); } - zs->zs_start_time = gethrtime(); - zs->zs_stop_time = zs->zs_start_time + zopt_time * NANOSEC; - /* * Run the tests in a loop. These tests include fault injection * to verify that self-healing data works, and forced crashes * to verify that we never lose on-disk consistency. */ - while (gethrtime() < zs->zs_stop_time) { + while (gethrtime() < zs->zs_proc_stop) { int status; pid_t pid; - char *tmp; /* * Initialize the workload counters for each function. */ - for (f = 0; f < ZTEST_FUNCS; f++) { + for (int f = 0; f < ZTEST_FUNCS; f++) { zi = &zs->zs_info[f]; - zi->zi_calls = 0; + zi->zi_call_count = 0; zi->zi_call_time = 0; } + /* Set the allocation switch size */ + metaslab_df_alloc_threshold = ztest_random(metaslab_sz / 4) + 1; + pid = fork(); if (pid == -1) @@ -3383,7 +5469,7 @@ main(int argc, char **argv) struct rlimit rl = { 1024, 1024 }; (void) setrlimit(RLIMIT_NOFILE, &rl); (void) enable_extended_FILE_stdio(-1, -1); - ztest_run(zopt_pool); + ztest_run(zs); exit(0); } @@ -3416,8 +5502,8 @@ main(int argc, char **argv) if (zopt_verbose >= 1) { hrtime_t now = gethrtime(); - now = MIN(now, zs->zs_stop_time); - print_time(zs->zs_stop_time - now, timebuf); + now = MIN(now, zs->zs_proc_stop); + print_time(zs->zs_proc_stop - now, timebuf); nicenum(zs->zs_space, numbuf); (void) printf("Pass %3d, %8s, %3llu ENOSPC, " @@ -3427,7 +5513,7 @@ main(int argc, char **argv) (u_longlong_t)zs->zs_enospc_count, 100.0 * zs->zs_alloc / zs->zs_space, numbuf, - 100.0 * (now - zs->zs_start_time) / + 100.0 * (now - zs->zs_proc_start) / (zopt_time * NANOSEC), timebuf); } @@ -3437,34 +5523,39 @@ main(int argc, char **argv) "Calls", "Time", "Function"); (void) printf("%7s %9s %s\n", "-----", "----", "--------"); - for (f = 0; f < ZTEST_FUNCS; f++) { + for (int f = 0; f < ZTEST_FUNCS; f++) { Dl_info dli; zi = &zs->zs_info[f]; print_time(zi->zi_call_time, timebuf); (void) dladdr((void *)zi->zi_func, &dli); (void) printf("%7llu %9s %s\n", - (u_longlong_t)zi->zi_calls, timebuf, + (u_longlong_t)zi->zi_call_count, timebuf, dli.dli_sname); } (void) printf("\n"); } /* - * It's possible that we killed a child during a rename test, in - * which case we'll have a 'ztest_tmp' pool lying around instead - * of 'ztest'. Do a blind rename in case this happened. + * It's possible that we killed a child during a rename test, + * in which case we'll have a 'ztest_tmp' pool lying around + * instead of 'ztest'. Do a blind rename in case this happened. 
*/ - tmp = umem_alloc(strlen(zopt_pool) + 5, UMEM_NOFAIL); - (void) strcpy(tmp, zopt_pool); - (void) strcat(tmp, "_tmp"); - kernel_init(FREAD | FWRITE); - (void) spa_rename(tmp, zopt_pool); + kernel_init(FREAD); + if (spa_open(zopt_pool, &spa, FTAG) == 0) { + spa_close(spa, FTAG); + } else { + char tmpname[MAXNAMELEN]; + kernel_fini(); + kernel_init(FREAD | FWRITE); + (void) snprintf(tmpname, sizeof (tmpname), "%s_tmp", + zopt_pool); + (void) spa_rename(tmpname, zopt_pool); + } kernel_fini(); - umem_free(tmp, strlen(tmp) + 1); - } - ztest_verify_blocks(zopt_pool); + ztest_run_zdb(zopt_pool); + } if (zopt_verbose >= 1) { (void) printf("%d killed, %d completed, %.0f%% kill rate\n", diff --git a/external/cddl/osnet/dist/common/avl/avl.c b/external/cddl/osnet/dist/common/avl/avl.c index c9727c643b962..dd39c12d215e9 100644 --- a/external/cddl/osnet/dist/common/avl/avl.c +++ b/external/cddl/osnet/dist/common/avl/avl.c @@ -19,13 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - - /* * AVL - generic AVL tree implementation for kernel use * @@ -243,7 +240,7 @@ avl_nearest(avl_tree_t *tree, avl_index_t where, int direction) * "void *" of the found tree node */ void * -avl_find(avl_tree_t *tree, void *value, avl_index_t *where) +avl_find(avl_tree_t *tree, const void *value, avl_index_t *where) { avl_node_t *node; avl_node_t *prev = NULL; diff --git a/external/cddl/osnet/dist/common/nvpair/nvpair.c b/external/cddl/osnet/dist/common/nvpair/nvpair.c index 77891bf776445..8115091ab9a9a 100644 --- a/external/cddl/osnet/dist/common/nvpair/nvpair.c +++ b/external/cddl/osnet/dist/common/nvpair/nvpair.c @@ -20,12 +20,10 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -692,6 +690,18 @@ nvlist_remove(nvlist_t *nvl, const char *name, data_type_t type) return (ENOENT); } +int +nvlist_remove_nvpair(nvlist_t *nvl, nvpair_t *nvp) +{ + if (nvl == NULL || nvp == NULL) + return (EINVAL); + + nvp_buf_unlink(nvl, nvp); + nvpair_free(nvp); + nvp_buf_free(nvl, nvp); + return (0); +} + /* * This function calculates the size of an nvpair value. * @@ -1162,6 +1172,42 @@ nvlist_next_nvpair(nvlist_t *nvl, nvpair_t *nvp) return (curr != NULL ? &curr->nvi_nvp : NULL); } +nvpair_t * +nvlist_prev_nvpair(nvlist_t *nvl, nvpair_t *nvp) +{ + nvpriv_t *priv; + i_nvp_t *curr; + + if (nvl == NULL || + (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) + return (NULL); + + curr = NVPAIR2I_NVP(nvp); + + if (nvp == NULL) + curr = priv->nvp_last; + else if (priv->nvp_curr == curr || nvlist_contains_nvp(nvl, nvp)) + curr = curr->nvi_prev; + else + curr = NULL; + + priv->nvp_curr = curr; + + return (curr != NULL ? 
&curr->nvi_nvp : NULL); +} + +boolean_t +nvlist_empty(nvlist_t *nvl) +{ + nvpriv_t *priv; + + if (nvl == NULL || + (priv = (nvpriv_t *)(uintptr_t)nvl->nvl_priv) == NULL) + return (B_TRUE); + + return (priv->nvp_list == NULL); +} + char * nvpair_name(nvpair_t *nvp) { diff --git a/external/cddl/osnet/dist/common/zfs/zfs_comutil.c b/external/cddl/osnet/dist/common/zfs/zfs_comutil.c index 74517a3f6920d..53f485c0b6663 100644 --- a/external/cddl/osnet/dist/common/zfs/zfs_comutil.c +++ b/external/cddl/osnet/dist/common/zfs/zfs_comutil.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * This file is intended for functions that ought to be common between user * land (libzfs) and the kernel. When many common routines need to be shared @@ -33,10 +31,13 @@ #if defined(_KERNEL) #include +#else +#include #endif #include #include +#include #include /* @@ -63,3 +64,42 @@ zfs_allocatable_devs(nvlist_t *nv) } return (B_FALSE); } + +void +zpool_get_rewind_policy(nvlist_t *nvl, zpool_rewind_policy_t *zrpp) +{ + nvlist_t *policy; + nvpair_t *elem; + char *nm; + + /* Defaults */ + zrpp->zrp_request = ZPOOL_NO_REWIND; + zrpp->zrp_maxmeta = 0; + zrpp->zrp_maxdata = UINT64_MAX; + zrpp->zrp_txg = UINT64_MAX; + + if (nvl == NULL) + return; + + elem = NULL; + while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) { + nm = nvpair_name(elem); + if (strcmp(nm, ZPOOL_REWIND_POLICY) == 0) { + if (nvpair_value_nvlist(elem, &policy) == 0) + zpool_get_rewind_policy(policy, zrpp); + return; + } else if (strcmp(nm, ZPOOL_REWIND_REQUEST) == 0) { + if (nvpair_value_uint32(elem, &zrpp->zrp_request) == 0) + if (zrpp->zrp_request & ~ZPOOL_REWIND_POLICIES) + zrpp->zrp_request = ZPOOL_NO_REWIND; + } else if (strcmp(nm, ZPOOL_REWIND_REQUEST_TXG) == 0) { + (void) nvpair_value_uint64(elem, &zrpp->zrp_txg); + } else if (strcmp(nm, ZPOOL_REWIND_META_THRESH) == 0) { + (void) nvpair_value_uint64(elem, &zrpp->zrp_maxmeta); + } else if (strcmp(nm, ZPOOL_REWIND_DATA_THRESH) == 0) { + (void) nvpair_value_uint64(elem, &zrpp->zrp_maxdata); + } + } + if (zrpp->zrp_request == 0) + zrpp->zrp_request = ZPOOL_NO_REWIND; +} diff --git a/external/cddl/osnet/dist/common/zfs/zfs_comutil.h b/external/cddl/osnet/dist/common/zfs/zfs_comutil.h index f517044a80a00..748a79a5c9818 100644 --- a/external/cddl/osnet/dist/common/zfs/zfs_comutil.h +++ b/external/cddl/osnet/dist/common/zfs/zfs_comutil.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _ZFS_COMUTIL_H #define _ZFS_COMUTIL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include @@ -35,7 +33,8 @@ extern "C" { #endif -extern boolean_t zfs_allocatable_devs(nvlist_t *nv); +extern boolean_t zfs_allocatable_devs(nvlist_t *); +extern void zpool_get_rewind_policy(nvlist_t *, zpool_rewind_policy_t *); #ifdef __cplusplus } diff --git a/external/cddl/osnet/dist/common/zfs/zfs_deleg.c b/external/cddl/osnet/dist/common/zfs/zfs_deleg.c index 0fd5800a84dc5..35f81b584641a 100644 --- a/external/cddl/osnet/dist/common/zfs/zfs_deleg.c +++ b/external/cddl/osnet/dist/common/zfs/zfs_deleg.c @@ -19,13 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 
* Use is subject to license terms. */ - -#pragma ident "%Z%%M% %I% %E% SMI" - #if defined(_KERNEL) #include #include @@ -66,6 +63,12 @@ zfs_deleg_perm_tab_t zfs_deleg_perm_tab[] = { {ZFS_DELEG_PERM_SHARE, ZFS_DELEG_NOTE_SHARE }, {ZFS_DELEG_PERM_SEND, ZFS_DELEG_NOTE_NONE }, {ZFS_DELEG_PERM_USERPROP, ZFS_DELEG_NOTE_USERPROP }, + {ZFS_DELEG_PERM_USERQUOTA, ZFS_DELEG_NOTE_USERQUOTA }, + {ZFS_DELEG_PERM_GROUPQUOTA, ZFS_DELEG_NOTE_GROUPQUOTA }, + {ZFS_DELEG_PERM_USERUSED, ZFS_DELEG_NOTE_USERUSED }, + {ZFS_DELEG_PERM_GROUPUSED, ZFS_DELEG_NOTE_GROUPUSED }, + {ZFS_DELEG_PERM_HOLD, ZFS_DELEG_NOTE_HOLD }, + {ZFS_DELEG_PERM_RELEASE, ZFS_DELEG_NOTE_RELEASE }, {NULL, ZFS_DELEG_NOTE_NONE } }; diff --git a/external/cddl/osnet/dist/common/zfs/zfs_deleg.h b/external/cddl/osnet/dist/common/zfs/zfs_deleg.h index 561b73e63df4a..e90cd0d5f4ba9 100644 --- a/external/cddl/osnet/dist/common/zfs/zfs_deleg.h +++ b/external/cddl/osnet/dist/common/zfs/zfs_deleg.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _ZFS_DELEG_H #define _ZFS_DELEG_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #ifdef __cplusplus @@ -59,6 +57,12 @@ typedef enum { ZFS_DELEG_NOTE_USERPROP, ZFS_DELEG_NOTE_MOUNT, ZFS_DELEG_NOTE_SHARE, + ZFS_DELEG_NOTE_USERQUOTA, + ZFS_DELEG_NOTE_GROUPQUOTA, + ZFS_DELEG_NOTE_USERUSED, + ZFS_DELEG_NOTE_GROUPUSED, + ZFS_DELEG_NOTE_HOLD, + ZFS_DELEG_NOTE_RELEASE, ZFS_DELEG_NOTE_NONE } zfs_deleg_note_t; diff --git a/external/cddl/osnet/dist/common/zfs/zfs_fletcher.c b/external/cddl/osnet/dist/common/zfs/zfs_fletcher.c new file mode 100644 index 0000000000000..fa43ce6bdb5dd --- /dev/null +++ b/external/cddl/osnet/dist/common/zfs/zfs_fletcher.c @@ -0,0 +1,246 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Fletcher Checksums + * ------------------ + * + * ZFS's 2nd and 4th order Fletcher checksums are defined by the following + * recurrence relations: + * + * a = a + f + * i i-1 i-1 + * + * b = b + a + * i i-1 i + * + * c = c + b (fletcher-4 only) + * i i-1 i + * + * d = d + c (fletcher-4 only) + * i i-1 i + * + * Where + * a_0 = b_0 = c_0 = d_0 = 0 + * and + * f_0 .. f_(n-1) are the input data. 
+ * + * Using standard techniques, these translate into the following series: + * + * __n_ __n_ + * \ | \ | + * a = > f b = > i * f + * n /___| n - i n /___| n - i + * i = 1 i = 1 + * + * + * __n_ __n_ + * \ | i*(i+1) \ | i*(i+1)*(i+2) + * c = > ------- f d = > ------------- f + * n /___| 2 n - i n /___| 6 n - i + * i = 1 i = 1 + * + * For fletcher-2, the f_is are 64-bit, and [ab]_i are 64-bit accumulators. + * Since the additions are done mod (2^64), errors in the high bits may not + * be noticed. For this reason, fletcher-2 is deprecated. + * + * For fletcher-4, the f_is are 32-bit, and [abcd]_i are 64-bit accumulators. + * A conservative estimate of how big the buffer can get before we overflow + * can be estimated using f_i = 0xffffffff for all i: + * + * % bc + * f=2^32-1;d=0; for (i = 1; d<2^64; i++) { d += f*i*(i+1)*(i+2)/6 }; (i-1)*4 + * 2264 + * quit + * % + * + * So blocks of up to 2k will not overflow. Our largest block size is + * 128k, which has 32k 4-byte words, so we can compute the largest possible + * accumulators, then divide by 2^64 to figure the max amount of overflow: + * + * % bc + * a=b=c=d=0; f=2^32-1; for (i=1; i<=32*1024; i++) { a+=f; b+=a; c+=b; d+=c } + * a/2^64;b/2^64;c/2^64;d/2^64 + * 0 + * 0 + * 1365 + * 11186858 + * quit + * % + * + * So a and b cannot overflow. To make sure each bit of input has some + * effect on the contents of c and d, we can look at what the factors of + * the coefficients in the equations for c_n and d_n are. The number of 2s + * in the factors determines the lowest set bit in the multiplier. Running + * through the cases for n*(n+1)/2 reveals that the highest power of 2 is + * 2^14, and for n*(n+1)*(n+2)/6 it is 2^15. So while some data may overflow + * the 64-bit accumulators, every bit of every f_i effects every accumulator, + * even for 128k blocks. + * + * If we wanted to make a stronger version of fletcher4 (fletcher4c?), + * we could do our calculations mod (2^32 - 1) by adding in the carries + * periodically, and store the number of carries in the top 32-bits. + * + * -------------------- + * Checksum Performance + * -------------------- + * + * There are two interesting components to checksum performance: cached and + * uncached performance. With cached data, fletcher-2 is about four times + * faster than fletcher-4. With uncached data, the performance difference is + * negligible, since the cost of a cache fill dominates the processing time. + * Even though fletcher-4 is slower than fletcher-2, it is still a pretty + * efficient pass over the data. + * + * In normal operation, the data which is being checksummed is in a buffer + * which has been filled either by: + * + * 1. a compression step, which will be mostly cached, or + * 2. a bcopy() or copyin(), which will be uncached (because the + * copy is cache-bypassing). + * + * For both cached and uncached data, both fletcher checksums are much faster + * than sha-256, and slower than 'off', which doesn't touch the data at all. 
+ */ + +#include +#include +#include +#include +#include + +void +fletcher_2_native(const void *buf, uint64_t size, zio_cksum_t *zcp) +{ + const uint64_t *ip = buf; + const uint64_t *ipend = ip + (size / sizeof (uint64_t)); + uint64_t a0, b0, a1, b1; + + for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) { + a0 += ip[0]; + a1 += ip[1]; + b0 += a0; + b1 += a1; + } + + ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1); +} + +void +fletcher_2_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp) +{ + const uint64_t *ip = buf; + const uint64_t *ipend = ip + (size / sizeof (uint64_t)); + uint64_t a0, b0, a1, b1; + + for (a0 = b0 = a1 = b1 = 0; ip < ipend; ip += 2) { + a0 += BSWAP_64(ip[0]); + a1 += BSWAP_64(ip[1]); + b0 += a0; + b1 += a1; + } + + ZIO_SET_CHECKSUM(zcp, a0, a1, b0, b1); +} + +void +fletcher_4_native(const void *buf, uint64_t size, zio_cksum_t *zcp) +{ + const uint32_t *ip = buf; + const uint32_t *ipend = ip + (size / sizeof (uint32_t)); + uint64_t a, b, c, d; + + for (a = b = c = d = 0; ip < ipend; ip++) { + a += ip[0]; + b += a; + c += b; + d += c; + } + + ZIO_SET_CHECKSUM(zcp, a, b, c, d); +} + +void +fletcher_4_byteswap(const void *buf, uint64_t size, zio_cksum_t *zcp) +{ + const uint32_t *ip = buf; + const uint32_t *ipend = ip + (size / sizeof (uint32_t)); + uint64_t a, b, c, d; + + for (a = b = c = d = 0; ip < ipend; ip++) { + a += BSWAP_32(ip[0]); + b += a; + c += b; + d += c; + } + + ZIO_SET_CHECKSUM(zcp, a, b, c, d); +} + +void +fletcher_4_incremental_native(const void *buf, uint64_t size, + zio_cksum_t *zcp) +{ + const uint32_t *ip = buf; + const uint32_t *ipend = ip + (size / sizeof (uint32_t)); + uint64_t a, b, c, d; + + a = zcp->zc_word[0]; + b = zcp->zc_word[1]; + c = zcp->zc_word[2]; + d = zcp->zc_word[3]; + + for (; ip < ipend; ip++) { + a += ip[0]; + b += a; + c += b; + d += c; + } + + ZIO_SET_CHECKSUM(zcp, a, b, c, d); +} + +void +fletcher_4_incremental_byteswap(const void *buf, uint64_t size, + zio_cksum_t *zcp) +{ + const uint32_t *ip = buf; + const uint32_t *ipend = ip + (size / sizeof (uint32_t)); + uint64_t a, b, c, d; + + a = zcp->zc_word[0]; + b = zcp->zc_word[1]; + c = zcp->zc_word[2]; + d = zcp->zc_word[3]; + + for (; ip < ipend; ip++) { + a += BSWAP_32(ip[0]); + b += a; + c += b; + d += c; + } + + ZIO_SET_CHECKSUM(zcp, a, b, c, d); +} diff --git a/external/cddl/osnet/dist/common/zfs/zfs_fletcher.h b/external/cddl/osnet/dist/common/zfs/zfs_fletcher.h new file mode 100644 index 0000000000000..b49df0cf4f0fd --- /dev/null +++ b/external/cddl/osnet/dist/common/zfs/zfs_fletcher.h @@ -0,0 +1,53 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
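The block comment at the top of zfs_fletcher.c derives the worst-case accumulator growth with bc. The same figures can be reproduced with a standalone C program; this sketch is not part of the patch and assumes a compiler providing the unsigned __int128 extension (gcc or clang):

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	unsigned __int128 a = 0, b = 0, c = 0, d = 0;
	const uint64_t f = 0xffffffffULL;	/* worst-case 32-bit input word */
	int i;

	/* a 128k block is 32k 4-byte words; feed the maximum value each time */
	for (i = 1; i <= 32 * 1024; i++) {
		a += f;
		b += a;
		c += b;
		d += c;
	}

	/* how many times each accumulator wraps a 64-bit register */
	(void) printf("a: %llu\n", (unsigned long long)(a >> 64));	/* 0 */
	(void) printf("b: %llu\n", (unsigned long long)(b >> 64));	/* 0 */
	(void) printf("c: %llu\n", (unsigned long long)(c >> 64));	/* 1365 */
	(void) printf("d: %llu\n", (unsigned long long)(d >> 64));	/* 11186858 */
	return (0);
}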
+ */ + +#ifndef _ZFS_FLETCHER_H +#define _ZFS_FLETCHER_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * fletcher checksum functions + */ + +void fletcher_2_native(const void *, uint64_t, zio_cksum_t *); +void fletcher_2_byteswap(const void *, uint64_t, zio_cksum_t *); +void fletcher_4_native(const void *, uint64_t, zio_cksum_t *); +void fletcher_4_byteswap(const void *, uint64_t, zio_cksum_t *); +void fletcher_4_incremental_native(const void *, uint64_t, + zio_cksum_t *); +void fletcher_4_incremental_byteswap(const void *, uint64_t, + zio_cksum_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _ZFS_FLETCHER_H */ diff --git a/external/cddl/osnet/dist/common/zfs/zfs_namecheck.c b/external/cddl/osnet/dist/common/zfs/zfs_namecheck.c index a9d109be20ab7..5cfafea471b3d 100644 --- a/external/cddl/osnet/dist/common/zfs/zfs_namecheck.c +++ b/external/cddl/osnet/dist/common/zfs/zfs_namecheck.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Common name validation routines for ZFS. These routines are shared by the * userland code as well as the ioctl() layer to ensure that we don't @@ -61,7 +59,7 @@ valid_char(char c) * Snapshot names must be made up of alphanumeric characters plus the following * characters: * - * [-_.:] + * [-_.: ] */ int snapshot_namecheck(const char *path, namecheck_err_t *why, char *what) @@ -345,19 +343,3 @@ pool_namecheck(const char *pool, namecheck_err_t *why, char *what) return (0); } - -/* - * Check if the dataset name is private for internal usage. - * '$' is reserved for internal dataset names. e.g. "$MOS" - * - * Return 1 if the given name is used internally. - * Return 0 if it is not. - */ -int -dataset_name_hidden(const char *name) -{ - if (strchr(name, '$') != NULL) - return (1); - - return (0); -} diff --git a/external/cddl/osnet/dist/common/zfs/zfs_namecheck.h b/external/cddl/osnet/dist/common/zfs/zfs_namecheck.h index ec85e62f72e81..7711da099be98 100644 --- a/external/cddl/osnet/dist/common/zfs/zfs_namecheck.h +++ b/external/cddl/osnet/dist/common/zfs/zfs_namecheck.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _ZFS_NAMECHECK_H #define _ZFS_NAMECHECK_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -50,7 +48,6 @@ typedef enum { int pool_namecheck(const char *, namecheck_err_t *, char *); int dataset_namecheck(const char *, namecheck_err_t *, char *); int mountpoint_namecheck(const char *, namecheck_err_t *); -int dataset_name_hidden(const char *); int snapshot_namecheck(const char *, namecheck_err_t *, char *); int permset_namecheck(const char *, namecheck_err_t *, char *); diff --git a/external/cddl/osnet/dist/common/zfs/zfs_prop.c b/external/cddl/osnet/dist/common/zfs/zfs_prop.c index effd2dba70922..b6f80614f8faf 100644 --- a/external/cddl/osnet/dist/common/zfs/zfs_prop.c +++ b/external/cddl/osnet/dist/common/zfs/zfs_prop.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
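zfs_fletcher.h above exposes both one-shot and incremental entry points; the incremental variants expect the caller to zero the zio_cksum_t once and then carry it across calls. A hedged usage sketch (the sys/spa.h location of zio_cksum_t and ZIO_SET_CHECKSUM is assumed, and the helper below is invented for illustration):

#include <sys/spa.h>		/* zio_cksum_t, ZIO_SET_CHECKSUM (header assumed) */
#include <zfs_fletcher.h>

/*
 * Checksumming a buffer in two pieces with the incremental interface yields
 * the same digest as a single fletcher_4_native() call over the whole buffer,
 * provided the split point (and total size) stay 4-byte aligned, since the
 * loops consume whole 32-bit words.
 */
static void
checksum_in_two_parts(const char *buf, uint64_t size, zio_cksum_t *zcp)
{
	uint64_t half = (size / 2) & ~(uint64_t)3;	/* keep 4-byte alignment */

	ZIO_SET_CHECKSUM(zcp, 0, 0, 0, 0);		/* start from the all-zero state */
	fletcher_4_incremental_native(buf, half, zcp);
	fletcher_4_incremental_native(buf + half, size - half, zcp);
}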
*/ @@ -43,6 +43,14 @@ static zprop_desc_t zfs_prop_table[ZFS_NUM_PROPS]; +/* Note this is indexed by zfs_userquota_prop_t, keep the order the same */ +const char *zfs_userquota_prop_prefixes[] = { + "userused@", + "userquota@", + "groupused@", + "groupquota@" +}; + zprop_desc_t * zfs_prop_get_table(void) { @@ -61,6 +69,16 @@ zfs_prop_init(void) { NULL } }; + static zprop_index_t dedup_table[] = { + { "on", ZIO_CHECKSUM_ON }, + { "off", ZIO_CHECKSUM_OFF }, + { "verify", ZIO_CHECKSUM_ON | ZIO_CHECKSUM_VERIFY }, + { "sha256", ZIO_CHECKSUM_SHA256 }, + { "sha256,verify", + ZIO_CHECKSUM_SHA256 | ZIO_CHECKSUM_VERIFY }, + { NULL } + }; + static zprop_index_t compress_table[] = { { "on", ZIO_COMPRESS_ON }, { "off", ZIO_COMPRESS_OFF }, @@ -75,6 +93,7 @@ zfs_prop_init(void) { "gzip-7", ZIO_COMPRESS_GZIP_7 }, { "gzip-8", ZIO_COMPRESS_GZIP_8 }, { "gzip-9", ZIO_COMPRESS_GZIP_9 }, + { "zle", ZIO_COMPRESS_ZLE }, { NULL } }; @@ -133,6 +152,7 @@ zfs_prop_init(void) { "1", 1 }, { "2", 2 }, { "3", 3 }, + { "4", 4 }, { "current", ZPL_VERSION }, { NULL } }; @@ -143,6 +163,12 @@ zfs_prop_init(void) { NULL } }; + static zprop_index_t logbias_table[] = { + { "latency", ZFS_LOGBIAS_LATENCY }, + { "throughput", ZFS_LOGBIAS_THROUGHPUT }, + { NULL } + }; + static zprop_index_t canmount_table[] = { { "off", ZFS_CANMOUNT_OFF }, { "on", ZFS_CANMOUNT_ON }, @@ -162,10 +188,15 @@ zfs_prop_init(void) PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "on | off | fletcher2 | fletcher4 | sha256", "CHECKSUM", checksum_table); + register_index(ZFS_PROP_DEDUP, "dedup", ZIO_CHECKSUM_OFF, + PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, + "on | off | verify | sha256[,verify]", "DEDUP", + dedup_table); register_index(ZFS_PROP_COMPRESSION, "compression", ZIO_COMPRESS_DEFAULT, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, - "on | off | lzjb | gzip | gzip-[1-9]", "COMPRESS", compress_table); + "on | off | lzjb | gzip | gzip-[1-9] | zle", "COMPRESS", + compress_table); register_index(ZFS_PROP_SNAPDIR, "snapdir", ZFS_SNAPDIR_HIDDEN, PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "hidden | visible", "SNAPDIR", snapdir_table); @@ -187,6 +218,9 @@ zfs_prop_init(void) ZFS_CACHE_ALL, PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT | ZFS_TYPE_VOLUME, "all | none | metadata", "SECONDARYCACHE", cache_table); + register_index(ZFS_PROP_LOGBIAS, "logbias", ZFS_LOGBIAS_LATENCY, + PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, + "latency | throughput", "LOGBIAS", logbias_table); /* inherit index (boolean) properties */ register_index(ZFS_PROP_ATIME, "atime", 1, PROP_INHERIT, @@ -218,7 +252,7 @@ zfs_prop_init(void) /* default index properties */ register_index(ZFS_PROP_VERSION, "version", 0, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_SNAPSHOT, - "1 | 2 | 3 | current", "VERSION", version_table); + "1 | 2 | 3 | 4 | current", "VERSION", version_table); register_index(ZFS_PROP_CANMOUNT, "canmount", ZFS_CANMOUNT_ON, PROP_DEFAULT, ZFS_TYPE_FILESYSTEM, "on | off | noauto", "CANMOUNT", canmount_table); @@ -226,6 +260,9 @@ zfs_prop_init(void) /* readonly index (boolean) properties */ register_index(ZFS_PROP_MOUNTED, "mounted", 0, PROP_READONLY, ZFS_TYPE_FILESYSTEM, "yes | no", "MOUNTED", boolean_table); + register_index(ZFS_PROP_DEFER_DESTROY, "defer_destroy", 0, + PROP_READONLY, ZFS_TYPE_SNAPSHOT, "yes | no", "DEFER_DESTROY", + boolean_table); /* set once index properties */ register_index(ZFS_PROP_NORMALIZE, "normalization", 0, @@ -254,6 +291,8 @@ zfs_prop_init(void) ZFS_TYPE_DATASET, "filesystem | volume | snapshot", "TYPE"); 
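The new dedup_table above feeds the same string-to-index machinery as the existing checksum and compression tables, so a composite setting such as "sha256,verify" round-trips through a single table entry. An illustrative fragment, assuming the property tables have already been initialized via zfs_prop_init() and that the ZIO_CHECKSUM_* constants come from the usual zio header:

#include <stdio.h>
#include <sys/fs/zfs.h>		/* zfs_prop_index_to_string(), ZFS_PROP_DEDUP (header assumed) */
#include <sys/zio.h>		/* ZIO_CHECKSUM_SHA256, ZIO_CHECKSUM_VERIFY (header assumed) */

static void
show_dedup_name(void)
{
	const char *name;

	/* the first dedup_table entry whose value matches the index is returned */
	if (zfs_prop_index_to_string(ZFS_PROP_DEDUP,
	    ZIO_CHECKSUM_SHA256 | ZIO_CHECKSUM_VERIFY, &name) == 0)
		(void) printf("dedup=%s\n", name);	/* prints "sha256,verify" */
}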
register_string(ZFS_PROP_SHARESMB, "sharesmb", "off", PROP_INHERIT, ZFS_TYPE_FILESYSTEM, "on | off | sharemgr(1M) options", "SHARESMB"); + register_string(ZFS_PROP_MLSLABEL, "mlslabel", ZFS_MLSLABEL_DEFAULT, + PROP_INHERIT, ZFS_TYPE_DATASET, "", "MLSLABEL"); /* readonly number properties */ register_number(ZFS_PROP_USED, "used", 0, PROP_READONLY, @@ -265,8 +304,8 @@ zfs_prop_init(void) register_number(ZFS_PROP_COMPRESSRATIO, "compressratio", 0, PROP_READONLY, ZFS_TYPE_DATASET, "<1.00x or higher if compressed>", "RATIO"); - register_number(ZFS_PROP_VOLBLOCKSIZE, "volblocksize", 8192, - PROP_ONETIME, + register_number(ZFS_PROP_VOLBLOCKSIZE, "volblocksize", + ZVOL_DEFAULT_BLOCKSIZE, PROP_ONETIME, ZFS_TYPE_VOLUME, "512 to 128k, power of 2", "VOLBLOCK"); register_number(ZFS_PROP_USEDSNAP, "usedbysnapshots", 0, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "USEDSNAP"); @@ -277,6 +316,8 @@ zfs_prop_init(void) register_number(ZFS_PROP_USEDREFRESERV, "usedbyrefreservation", 0, PROP_READONLY, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME, "", "USEDREFRESERV"); + register_number(ZFS_PROP_USERREFS, "userrefs", 0, PROP_READONLY, + ZFS_TYPE_SNAPSHOT, "", "USERREFS"); /* default number properties */ register_number(ZFS_PROP_QUOTA, "quota", 0, PROP_DEFAULT, @@ -298,15 +339,25 @@ zfs_prop_init(void) /* hidden properties */ register_hidden(ZFS_PROP_CREATETXG, "createtxg", PROP_TYPE_NUMBER, - PROP_READONLY, ZFS_TYPE_DATASET, NULL); + PROP_READONLY, ZFS_TYPE_DATASET, "CREATETXG"); register_hidden(ZFS_PROP_NUMCLONES, "numclones", PROP_TYPE_NUMBER, - PROP_READONLY, ZFS_TYPE_SNAPSHOT, NULL); + PROP_READONLY, ZFS_TYPE_SNAPSHOT, "NUMCLONES"); register_hidden(ZFS_PROP_NAME, "name", PROP_TYPE_STRING, PROP_READONLY, ZFS_TYPE_DATASET, "NAME"); register_hidden(ZFS_PROP_ISCSIOPTIONS, "iscsioptions", PROP_TYPE_STRING, PROP_INHERIT, ZFS_TYPE_VOLUME, "ISCSIOPTIONS"); + register_hidden(ZFS_PROP_STMF_SHAREINFO, "stmf_sbd_lu", + PROP_TYPE_STRING, PROP_INHERIT, ZFS_TYPE_VOLUME, + "STMF_SBD_LU"); register_hidden(ZFS_PROP_GUID, "guid", PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, "GUID"); + register_hidden(ZFS_PROP_USERACCOUNTING, "useraccounting", + PROP_TYPE_NUMBER, PROP_READONLY, ZFS_TYPE_DATASET, + "USERACCOUNTING"); + register_hidden(ZFS_PROP_UNIQUE, "unique", PROP_TYPE_NUMBER, + PROP_READONLY, ZFS_TYPE_DATASET, "UNIQUE"); + register_hidden(ZFS_PROP_OBJSETID, "objsetid", PROP_TYPE_NUMBER, + PROP_READONLY, ZFS_TYPE_DATASET, "OBJSETID"); /* oddball properties */ register_impl(ZFS_PROP_CREATION, "creation", PROP_TYPE_NUMBER, 0, NULL, @@ -318,6 +369,11 @@ boolean_t zfs_prop_delegatable(zfs_prop_t prop) { zprop_desc_t *pd = &zfs_prop_table[prop]; + + /* The mlslabel property is never delegatable. */ + if (prop == ZFS_PROP_MLSLABEL) + return (B_FALSE); + return (pd->pd_attr != PROP_READONLY); } @@ -330,7 +386,6 @@ zfs_name_to_prop(const char *propname) return (zprop_name_to_prop(propname, ZFS_TYPE_DATASET)); } - /* * For user property names, we allow all lowercase alphanumeric characters, plus * a few useful punctuation characters. @@ -367,6 +422,26 @@ zfs_prop_user(const char *name) return (B_TRUE); } +/* + * Returns true if this is a valid userspace-type property (one with a '@'). + * Note that after the @, any character is valid (eg, another @, for SID + * user@domain). 
+ */ +boolean_t +zfs_prop_userquota(const char *name) +{ + zfs_userquota_prop_t prop; + + for (prop = 0; prop < ZFS_NUM_USERQUOTA_PROPS; prop++) { + if (strncmp(name, zfs_userquota_prop_prefixes[prop], + strlen(zfs_userquota_prop_prefixes[prop])) == 0) { + return (B_TRUE); + } + } + + return (B_FALSE); +} + /* * Tables of index types, plus functions to convert between the user view * (strings) and internal representation (uint64_t). @@ -383,6 +458,12 @@ zfs_prop_index_to_string(zfs_prop_t prop, uint64_t index, const char **string) return (zprop_index_to_string(prop, index, string, ZFS_TYPE_DATASET)); } +uint64_t +zfs_prop_random_value(zfs_prop_t prop, uint64_t seed) +{ + return (zprop_random_value(prop, seed, ZFS_TYPE_DATASET)); +} + /* * Returns TRUE if the property applies to any of the given dataset types. */ diff --git a/external/cddl/osnet/dist/common/zfs/zfs_prop.h b/external/cddl/osnet/dist/common/zfs/zfs_prop.h index da5ae43093e54..38d429aa84c90 100644 --- a/external/cddl/osnet/dist/common/zfs/zfs_prop.h +++ b/external/cddl/osnet/dist/common/zfs/zfs_prop.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _ZFS_PROP_H #define _ZFS_PROP_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include @@ -79,6 +77,7 @@ typedef struct { /* "zfs get" help message */ const zprop_index_t *pd_table; /* for index properties, a table */ /* defining the possible values */ + size_t pd_table_size; /* number of entries in pd_table[] */ } zprop_desc_t; /* @@ -118,6 +117,7 @@ int zprop_iter_common(zprop_func, void *, boolean_t, boolean_t, zfs_type_t); int zprop_name_to_prop(const char *, zfs_type_t); int zprop_string_to_index(int, const char *, uint64_t *, zfs_type_t); int zprop_index_to_string(int, uint64_t, const char **, zfs_type_t); +uint64_t zprop_random_value(int, uint64_t, zfs_type_t); const char *zprop_values(int, zfs_type_t); size_t zprop_width(int, boolean_t *, zfs_type_t); boolean_t zprop_valid_for_type(int, zfs_type_t); diff --git a/external/cddl/osnet/dist/common/zfs/zpool_prop.c b/external/cddl/osnet/dist/common/zfs/zpool_prop.c index f5efe18d248b1..c8a3ca205f42a 100644 --- a/external/cddl/osnet/dist/common/zfs/zpool_prop.c +++ b/external/cddl/osnet/dist/common/zfs/zpool_prop.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
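The zfs_prop_userquota() check above is a plain prefix match against zfs_userquota_prop_prefixes[], so everything after the '@' is accepted. A small illustrative test (the sys/fs/zfs.h prototype location is assumed):

#include <assert.h>
#include <sys/fs/zfs.h>		/* zfs_prop_userquota() prototype (header assumed) */

int
main(void)
{
	assert(zfs_prop_userquota("userquota@alice") == B_TRUE);
	/* anything after the '@' is legal, including SID-style user@domain */
	assert(zfs_prop_userquota("groupused@staff@example.com") == B_TRUE);
	assert(zfs_prop_userquota("quota") == B_FALSE);	/* no recognized prefix */
	return (0);
}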
*/ @@ -74,20 +74,24 @@ zpool_prop_init(void) /* readonly number properties */ register_number(ZPOOL_PROP_SIZE, "size", 0, PROP_READONLY, ZFS_TYPE_POOL, "", "SIZE"); - register_number(ZPOOL_PROP_USED, "used", 0, PROP_READONLY, - ZFS_TYPE_POOL, "", "USED"); - register_number(ZPOOL_PROP_AVAILABLE, "available", 0, PROP_READONLY, - ZFS_TYPE_POOL, "", "AVAIL"); + register_number(ZPOOL_PROP_FREE, "free", 0, PROP_READONLY, + ZFS_TYPE_POOL, "", "FREE"); + register_number(ZPOOL_PROP_ALLOCATED, "allocated", 0, PROP_READONLY, + ZFS_TYPE_POOL, "", "ALLOC"); register_number(ZPOOL_PROP_CAPACITY, "capacity", 0, PROP_READONLY, ZFS_TYPE_POOL, "", "CAP"); register_number(ZPOOL_PROP_GUID, "guid", 0, PROP_READONLY, ZFS_TYPE_POOL, "", "GUID"); register_number(ZPOOL_PROP_HEALTH, "health", 0, PROP_READONLY, ZFS_TYPE_POOL, "", "HEALTH"); + register_number(ZPOOL_PROP_DEDUPRATIO, "dedupratio", 0, PROP_READONLY, + ZFS_TYPE_POOL, "<1.00x or higher if deduped>", "DEDUP"); /* default number properties */ register_number(ZPOOL_PROP_VERSION, "version", SPA_VERSION, PROP_DEFAULT, ZFS_TYPE_POOL, "", "VERSION"); + register_number(ZPOOL_PROP_DEDUPDITTO, "dedupditto", 0, + PROP_DEFAULT, ZFS_TYPE_POOL, "", "DEDUPDITTO"); /* default index (boolean) properties */ register_index(ZPOOL_PROP_DELEGATION, "delegation", 1, PROP_DEFAULT, @@ -96,6 +100,8 @@ zpool_prop_init(void) ZFS_TYPE_POOL, "on | off", "REPLACE", boolean_table); register_index(ZPOOL_PROP_LISTSNAPS, "listsnapshots", 0, PROP_DEFAULT, ZFS_TYPE_POOL, "on | off", "LISTSNAPS", boolean_table); + register_index(ZPOOL_PROP_AUTOEXPAND, "autoexpand", 0, PROP_DEFAULT, + ZFS_TYPE_POOL, "on | off", "EXPAND", boolean_table); /* default index properties */ register_index(ZPOOL_PROP_FAILUREMODE, "failmode", @@ -164,6 +170,12 @@ zpool_prop_index_to_string(zpool_prop_t prop, uint64_t index, return (zprop_index_to_string(prop, index, string, ZFS_TYPE_POOL)); } +uint64_t +zpool_prop_random_value(zpool_prop_t prop, uint64_t seed) +{ + return (zprop_random_value(prop, seed, ZFS_TYPE_POOL)); +} + #ifndef _KERNEL const char * diff --git a/external/cddl/osnet/dist/common/zfs/zprop_common.c b/external/cddl/osnet/dist/common/zfs/zprop_common.c index bd267e2e61cac..992fe5e71603f 100644 --- a/external/cddl/osnet/dist/common/zfs/zprop_common.c +++ b/external/cddl/osnet/dist/common/zfs/zprop_common.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Common routines used by zfs and zpool property management. 
*/ @@ -78,6 +76,8 @@ register_impl(int prop, const char *name, zprop_type_t type, pd = &prop_tbl[prop]; ASSERT(pd->pd_name == NULL || pd->pd_name == name); + ASSERT(name != NULL); + ASSERT(colname != NULL); pd->pd_name = name; pd->pd_propnum = prop; @@ -91,6 +91,9 @@ register_impl(int prop, const char *name, zprop_type_t type, pd->pd_rightalign = rightalign; pd->pd_visible = visible; pd->pd_table = idx_tbl; + pd->pd_table_size = 0; + while (idx_tbl && (idx_tbl++)->pi_name != NULL) + pd->pd_table_size++; } void @@ -205,9 +208,6 @@ propname_match(const char *p, size_t len, zprop_desc_t *prop_entry) #ifndef _KERNEL const char *colname = prop_entry->pd_colname; int c; - - if (colname == NULL) - return (B_FALSE); #endif if (len == strlen(propname) && @@ -215,7 +215,7 @@ propname_match(const char *p, size_t len, zprop_desc_t *prop_entry) return (B_TRUE); #ifndef _KERNEL - if (len != strlen(colname)) + if (colname == NULL || len != strlen(colname)) return (B_FALSE); for (c = 0; c < len; c++) @@ -312,6 +312,25 @@ zprop_index_to_string(int prop, uint64_t index, const char **string, return (-1); } +/* + * Return a random valid property value. Used by ztest. + */ +uint64_t +zprop_random_value(int prop, uint64_t seed, zfs_type_t type) +{ + zprop_desc_t *prop_tbl; + const zprop_index_t *idx_tbl; + + ASSERT((uint_t)prop < zprop_get_numprops(type)); + prop_tbl = zprop_get_proptable(type); + idx_tbl = prop_tbl[prop].pd_table; + + if (idx_tbl == NULL) + return (seed); + + return (idx_tbl[seed % prop_tbl[prop].pd_table_size].pi_value); +} + const char * zprop_values(int prop, zfs_type_t type) { diff --git a/external/cddl/osnet/dist/lib/libdtrace/common/drti.c b/external/cddl/osnet/dist/lib/libdtrace/common/drti.c index f8570e686f5b3..3b5f0cbbdf306 100644 --- a/external/cddl/osnet/dist/lib/libdtrace/common/drti.c +++ b/external/cddl/osnet/dist/lib/libdtrace/common/drti.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -61,13 +58,14 @@ static const char *olddevname = "/devices/pseudo/dtrace@0:helper"; static const char *modname; /* Name of this load object */ static int gen; /* DOF helper generation */ extern dof_hdr_t __SUNW_dof; /* DOF defined in the .SUNW_dof section */ +static boolean_t dof_init_debug = B_FALSE; /* From DTRACE_DOF_INIT_DEBUG */ static void dprintf(int debug, const char *fmt, ...) 
{ va_list ap; - if (debug && getenv("DTRACE_DOF_INIT_DEBUG") == NULL) + if (debug && !dof_init_debug) return; va_start(ap, fmt); @@ -104,6 +102,9 @@ dtrace_dof_init(void) if (getenv("DTRACE_DOF_INIT_DISABLE") != NULL) return; + if (getenv("DTRACE_DOF_INIT_DEBUG") != NULL) + dof_init_debug = B_TRUE; + if (dlinfo(RTLD_SELF, RTLD_DI_LINKMAP, &lmp) == -1 || lmp == NULL) { dprintf(1, "couldn't discover module name or address\n"); return; diff --git a/external/cddl/osnet/dist/lib/libdtrace/common/dt_consume.c b/external/cddl/osnet/dist/lib/libdtrace/common/dt_consume.c index 62d39e07dd416..564189a000adb 100644 --- a/external/cddl/osnet/dist/lib/libdtrace/common/dt_consume.c +++ b/external/cddl/osnet/dist/lib/libdtrace/common/dt_consume.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -1063,7 +1061,7 @@ dt_print_usym(dtrace_hdl_t *dtp, FILE *fp, caddr_t addr, dtrace_actkind_t act) do { n = len; s = alloca(n); - } while ((len = dtrace_uaddr2str(dtp, pid, pc, s, n)) >= n); + } while ((len = dtrace_uaddr2str(dtp, pid, pc, s, n)) > n); return (dt_printf(dtp, fp, format, s)); } diff --git a/external/cddl/osnet/dist/lib/libdtrace/common/dt_error.c b/external/cddl/osnet/dist/lib/libdtrace/common/dt_error.c index 5005f593a43da..0bfabc919c857 100644 --- a/external/cddl/osnet/dist/lib/libdtrace/common/dt_error.c +++ b/external/cddl/osnet/dist/lib/libdtrace/common/dt_error.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include @@ -105,7 +103,8 @@ static const struct { { EDT_BADSETOPT, "Invalid setopt() library action" }, { EDT_BADSTACKPC, "Invalid stack program counter size" }, { EDT_BADAGGVAR, "Invalid aggregation variable identifier" }, - { EDT_OVERSION, "Client requested deprecated version of library" } + { EDT_OVERSION, "Client requested deprecated version of library" }, + { EDT_ENABLING_ERR, "Failed to enable probe" } }; static const int _dt_nerr = sizeof (_dt_errlist) / sizeof (_dt_errlist[0]); diff --git a/external/cddl/osnet/dist/lib/libdtrace/common/dt_impl.h b/external/cddl/osnet/dist/lib/libdtrace/common/dt_impl.h index 9b22dfbb641a1..1937ce06474de 100644 --- a/external/cddl/osnet/dist/lib/libdtrace/common/dt_impl.h +++ b/external/cddl/osnet/dist/lib/libdtrace/common/dt_impl.h @@ -20,21 +20,20 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _DT_IMPL_H #define _DT_IMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include #include #include #include +#include #ifdef __cplusplus extern "C" { @@ -498,7 +497,8 @@ enum { EDT_BADSETOPT, /* invalid setopt library action */ EDT_BADSTACKPC, /* invalid stack program counter size */ EDT_BADAGGVAR, /* invalid aggregation variable identifier */ - EDT_OVERSION /* client is requesting deprecated version */ + EDT_OVERSION, /* client is requesting deprecated version */ + EDT_ENABLING_ERR /* failed to enable probe */ }; /* @@ -568,17 +568,8 @@ extern int dt_buffered_flush(dtrace_hdl_t *, dtrace_probedata_t *, extern void dt_buffered_disable(dtrace_hdl_t *); extern void dt_buffered_destroy(dtrace_hdl_t *); -extern int dt_rw_read_held(pthread_rwlock_t *); -extern int dt_rw_write_held(pthread_rwlock_t *); -extern int dt_mutex_held(pthread_mutex_t *); - extern uint64_t dt_stddev(uint64_t *, uint64_t); -#define DT_RW_READ_HELD(x) dt_rw_read_held(x) -#define DT_RW_WRITE_HELD(x) dt_rw_write_held(x) -#define DT_RW_LOCK_HELD(x) (DT_RW_READ_HELD(x) || DT_RW_WRITE_HELD(x)) -#define DT_MUTEX_HELD(x) dt_mutex_held(x) - extern int dt_options_load(dtrace_hdl_t *); extern void dt_dprintf(const char *, ...); diff --git a/external/cddl/osnet/dist/lib/libdtrace/common/dt_module.c b/external/cddl/osnet/dist/lib/libdtrace/common/dt_module.c index 25197031ce112..f8fdc4edbeb2a 100644 --- a/external/cddl/osnet/dist/lib/libdtrace/common/dt_module.c +++ b/external/cddl/osnet/dist/lib/libdtrace/common/dt_module.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -66,6 +64,10 @@ dt_module_symhash_insert(dt_module_t *dmp, const char *name, uint_t id) static uint_t dt_module_syminit32(dt_module_t *dmp) { +#if STT_NUM != (STT_TLS + 1) +#error "STT_NUM has grown. update dt_module_syminit32()" +#endif + const Elf32_Sym *sym = dmp->dm_symtab.cts_data; const char *base = dmp->dm_strtab.cts_data; size_t ss_size = dmp->dm_strtab.cts_size; @@ -95,6 +97,10 @@ dt_module_syminit32(dt_module_t *dmp) static uint_t dt_module_syminit64(dt_module_t *dmp) { +#if STT_NUM != (STT_TLS + 1) +#error "STT_NUM has grown. update dt_module_syminit64()" +#endif + const Elf64_Sym *sym = dmp->dm_symtab.cts_data; const char *base = dmp->dm_strtab.cts_data; size_t ss_size = dmp->dm_strtab.cts_size; @@ -468,7 +474,7 @@ dt_module_load_sect(dtrace_hdl_t *dtp, dt_module_t *dmp, ctf_sect_t *ctsp) Elf_Data *dp; Elf_Scn *sp; - if (elf_getshstrndx(dmp->dm_elf, &shstrs) == 0) + if (elf_getshdrstrndx(dmp->dm_elf, &shstrs) == -1) return (dt_set_errno(dtp, EDT_NOTLOADED)); for (sp = NULL; (sp = elf_nextscn(dmp->dm_elf, sp)) != NULL; ) { @@ -817,7 +823,7 @@ dt_module_update(dtrace_hdl_t *dtp, const char *name) (void) close(fd); if (dmp->dm_elf == NULL || err == -1 || - elf_getshstrndx(dmp->dm_elf, &shstrs) == 0) { + elf_getshdrstrndx(dmp->dm_elf, &shstrs) == -1) { dt_dprintf("failed to load %s: %s\n", fname, elf_errmsg(elf_errno())); dt_module_destroy(dtp, dmp); diff --git a/external/cddl/osnet/dist/lib/libdtrace/common/dt_pid.c b/external/cddl/osnet/dist/lib/libdtrace/common/dt_pid.c index 091772405ffd1..241805154adcf 100644 --- a/external/cddl/osnet/dist/lib/libdtrace/common/dt_pid.c +++ b/external/cddl/osnet/dist/lib/libdtrace/common/dt_pid.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. 
All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -576,7 +576,7 @@ dt_pid_create_usdt_probes(dtrace_probedesc_t *pdp, dtrace_hdl_t *dtp, struct ps_prochandle *P = dpr->dpr_proc; int ret = 0; - assert(DT_MUTEX_HELD(&dpr->dpr_lock)); + assert(MUTEX_HELD(&dpr->dpr_lock)); (void) Pupdate_maps(P); if (Pobject_iter(P, dt_pid_usdt_mapping, P) != 0) { diff --git a/external/cddl/osnet/dist/lib/libdtrace/common/dt_printf.c b/external/cddl/osnet/dist/lib/libdtrace/common/dt_printf.c index 953511b1d029c..4400771214c9b 100644 --- a/external/cddl/osnet/dist/lib/libdtrace/common/dt_printf.c +++ b/external/cddl/osnet/dist/lib/libdtrace/common/dt_printf.c @@ -20,12 +20,10 @@ */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -330,7 +328,7 @@ pfprint_addr(dtrace_hdl_t *dtp, FILE *fp, const char *format, do { n = len; s = alloca(n); - } while ((len = dtrace_addr2str(dtp, val, s, n)) >= n); + } while ((len = dtrace_addr2str(dtp, val, s, n)) > n); return (dt_printf(dtp, fp, format, s)); } @@ -383,7 +381,7 @@ pfprint_uaddr(dtrace_hdl_t *dtp, FILE *fp, const char *format, do { n = len; s = alloca(n); - } while ((len = dtrace_uaddr2str(dtp, pid, val, s, n)) >= n); + } while ((len = dtrace_uaddr2str(dtp, pid, val, s, n)) > n); return (dt_printf(dtp, fp, format, s)); } @@ -1223,6 +1221,20 @@ pfprint_average(dtrace_hdl_t *dtp, FILE *fp, const char *format, data[0] ? data[1] / normal / data[0] : 0)); } +/*ARGSUSED*/ +static int +pfprint_stddev(dtrace_hdl_t *dtp, FILE *fp, const char *format, + const dt_pfargd_t *pfd, const void *addr, size_t size, uint64_t normal) +{ + const uint64_t *data = addr; + + if (size != sizeof (uint64_t) * 4) + return (dt_set_errno(dtp, EDT_DMISMATCH)); + + return (dt_printf(dtp, fp, format, + dt_stddev((uint64_t *)data, normal))); +} + /*ARGSUSED*/ static int pfprint_quantize(dtrace_hdl_t *dtp, FILE *fp, const char *format, @@ -1415,6 +1427,9 @@ dt_printf_format(dtrace_hdl_t *dtp, FILE *fp, const dt_pfargv_t *pfv, case DTRACEAGG_AVG: func = pfprint_average; break; + case DTRACEAGG_STDDEV: + func = pfprint_stddev; + break; case DTRACEAGG_QUANTIZE: func = pfprint_quantize; break; diff --git a/external/cddl/osnet/dist/lib/libdtrace/common/dt_proc.c b/external/cddl/osnet/dist/lib/libdtrace/common/dt_proc.c index 419f13b8474ca..001534163bd04 100644 --- a/external/cddl/osnet/dist/lib/libdtrace/common/dt_proc.c +++ b/external/cddl/osnet/dist/lib/libdtrace/common/dt_proc.c @@ -20,12 +20,10 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * DTrace Process Control * @@ -99,7 +97,7 @@ dt_proc_bpcreate(dt_proc_t *dpr, uintptr_t addr, dt_bkpt_f *func, void *data) struct ps_prochandle *P = dpr->dpr_proc; dt_bkpt_t *dbp; - assert(DT_MUTEX_HELD(&dpr->dpr_lock)); + assert(MUTEX_HELD(&dpr->dpr_lock)); if ((dbp = dt_zalloc(dpr->dpr_hdl, sizeof (dt_bkpt_t))) != NULL) { dbp->dbp_func = func; @@ -121,7 +119,7 @@ dt_proc_bpdestroy(dt_proc_t *dpr, int delbkpts) int state = Pstate(dpr->dpr_proc); dt_bkpt_t *dbp, *nbp; - assert(DT_MUTEX_HELD(&dpr->dpr_lock)); + assert(MUTEX_HELD(&dpr->dpr_lock)); for (dbp = dt_list_next(&dpr->dpr_bps); dbp != NULL; dbp = nbp) { if (delbkpts && dbp->dbp_active && @@ -141,7 +139,7 @@ dt_proc_bpmatch(dtrace_hdl_t *dtp, dt_proc_t *dpr) const lwpstatus_t *psp = &Pstatus(dpr->dpr_proc)->pr_lwp; dt_bkpt_t *dbp; - assert(DT_MUTEX_HELD(&dpr->dpr_lock)); + assert(MUTEX_HELD(&dpr->dpr_lock)); for (dbp = dt_list_next(&dpr->dpr_bps); dbp != NULL; dbp = dt_list_next(dbp)) { @@ -167,7 +165,7 @@ dt_proc_bpenable(dt_proc_t *dpr) { dt_bkpt_t *dbp; - assert(DT_MUTEX_HELD(&dpr->dpr_lock)); + assert(MUTEX_HELD(&dpr->dpr_lock)); for (dbp = dt_list_next(&dpr->dpr_bps); dbp != NULL; dbp = dt_list_next(dbp)) { @@ -184,7 +182,7 @@ dt_proc_bpdisable(dt_proc_t *dpr) { dt_bkpt_t *dbp; - assert(DT_MUTEX_HELD(&dpr->dpr_lock)); + assert(MUTEX_HELD(&dpr->dpr_lock)); for (dbp = dt_list_next(&dpr->dpr_bps); dbp != NULL; dbp = dt_list_next(dbp)) { @@ -232,7 +230,7 @@ dt_proc_notify(dtrace_hdl_t *dtp, dt_proc_hash_t *dph, dt_proc_t *dpr, static void dt_proc_stop(dt_proc_t *dpr, uint8_t why) { - assert(DT_MUTEX_HELD(&dpr->dpr_lock)); + assert(MUTEX_HELD(&dpr->dpr_lock)); assert(why != DT_PROC_STOP_IDLE); if (dpr->dpr_stop & why) { @@ -333,7 +331,7 @@ dt_proc_attach(dt_proc_t *dpr, int exec) rd_err_e err; GElf_Sym sym; - assert(DT_MUTEX_HELD(&dpr->dpr_lock)); + assert(MUTEX_HELD(&dpr->dpr_lock)); if (exec) { if (psp->pr_lwp.pr_errno != 0) @@ -399,7 +397,7 @@ dt_proc_waitrun(dt_proc_t *dpr) const long wstop = PCWSTOP; int pfd = Pctlfd(P); - assert(DT_MUTEX_HELD(&dpr->dpr_lock)); + assert(MUTEX_HELD(&dpr->dpr_lock)); assert(psp->pr_flags & PR_STOPPED); assert(Pstate(P) == PS_STOP); @@ -712,9 +710,12 @@ dt_proc_destroy(dtrace_hdl_t *dtp, struct ps_prochandle *P) if (!(Pstatus(dpr->dpr_proc)->pr_flags & (PR_KLC | PR_RLC))) { dt_dprintf("abandoning pid %d\n", (int)dpr->dpr_pid); rflag = PRELEASE_HANG; + } else if (Pstatus(dpr->dpr_proc)->pr_flags & PR_KLC) { + dt_dprintf("killing pid %d\n", (int)dpr->dpr_pid); + rflag = PRELEASE_KILL; /* apply kill-on-last-close */ } else { dt_dprintf("releasing pid %d\n", (int)dpr->dpr_pid); - rflag = 0; /* apply kill or run-on-last-close */ + rflag = 0; /* apply run-on-last-close */ } if (dpr->dpr_tid) { diff --git a/external/cddl/osnet/dist/lib/libdtrace/common/dt_program.c b/external/cddl/osnet/dist/lib/libdtrace/common/dt_program.c index 29d883aca4d24..8105df0737d01 100644 --- a/external/cddl/osnet/dist/lib/libdtrace/common/dt_program.c +++ b/external/cddl/osnet/dist/lib/libdtrace/common/dt_program.c @@ -20,12 +20,10 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -173,6 +171,9 @@ dtrace_program_exec(dtrace_hdl_t *dtp, dtrace_prog_t *pgp, case E2BIG: err = EDT_DIFSIZE; break; + case EBUSY: + err = EDT_ENABLING_ERR; + break; default: err = errno; } diff --git a/external/cddl/osnet/dist/lib/libdtrace/common/dt_subr.c b/external/cddl/osnet/dist/lib/libdtrace/common/dt_subr.c index b2163e69e9a65..97221c84d6cc0 100644 --- a/external/cddl/osnet/dist/lib/libdtrace/common/dt_subr.c +++ b/external/cddl/osnet/dist/lib/libdtrace/common/dt_subr.c @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -19,13 +18,12 @@ * * CDDL HEADER END */ + /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include @@ -57,8 +55,8 @@ int dtrace_xstr2desc(dtrace_hdl_t *dtp, dtrace_probespec_t spec, const char *s, int argc, char *const argv[], dtrace_probedesc_t *pdp) { - size_t off, len, vlen; - const char *p, *q, *v; + size_t off, len, vlen, wlen; + const char *p, *q, *v, *w; char buf[32]; /* for id_t as %d (see below) */ @@ -74,6 +72,8 @@ dtrace_xstr2desc(dtrace_hdl_t *dtp, dtrace_probespec_t spec, q = p + 1; vlen = 0; + w = NULL; + wlen = 0; if ((v = strchr(q, '$')) != NULL && v < q + len) { /* @@ -98,14 +98,14 @@ dtrace_xstr2desc(dtrace_hdl_t *dtp, dtrace_probespec_t spec, } if (isdigit(v[1])) { - char *end; long i; errno = 0; - i = strtol(v + 1, &end, 10); + i = strtol(v + 1, (char **)&w, 10); + + wlen = vlen - (w - v); - if (i < 0 || i >= argc || - errno != 0 || end != v + vlen) + if (i < 0 || i >= argc || errno != 0) return (dt_set_errno(dtp, EDT_BADSPCV)); v = argv[i]; @@ -141,7 +141,7 @@ dtrace_xstr2desc(dtrace_hdl_t *dtp, dtrace_probespec_t spec, off = dtrace_probespecs[spec--].dtps_offset; bcopy(q, (char *)pdp + off, len); bcopy(v, (char *)pdp + off + len, vlen); - + bcopy(w, (char *)pdp + off + len + vlen, wlen); } while (--p >= s); pdp->dtpd_id = DTRACE_IDNONE; @@ -803,30 +803,6 @@ dt_popcb(const ulong_t *bp, ulong_t n) return (popc + dt_popc(bp[maxw] & ((1UL << maxb) - 1))); } -struct _rwlock; -struct _lwp_mutex; - -int -dt_rw_read_held(pthread_rwlock_t *lock) -{ - extern int _rw_read_held(struct _rwlock *); - return (_rw_read_held((struct _rwlock *)lock)); -} - -int -dt_rw_write_held(pthread_rwlock_t *lock) -{ - extern int _rw_write_held(struct _rwlock *); - return (_rw_write_held((struct _rwlock *)lock)); -} - -int -dt_mutex_held(pthread_mutex_t *lock) -{ - extern int _mutex_held(struct _lwp_mutex *); - return (_mutex_held((struct _lwp_mutex *)lock)); -} - static int dt_string2str(char *s, char *str, int nbytes) { diff --git a/external/cddl/osnet/dist/lib/libnvpair/libnvpair.c b/external/cddl/osnet/dist/lib/libnvpair/libnvpair.c index 0845cb08cf8d8..57915cd7373e2 100644 --- a/external/cddl/osnet/dist/lib/libnvpair/libnvpair.c +++ b/external/cddl/osnet/dist/lib/libnvpair/libnvpair.c @@ -19,14 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 
+ * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include +#include #include #include #include "libnvpair.h" @@ -272,6 +271,156 @@ nvlist_print(FILE *fp, nvlist_t *nvl) nvlist_print_with_indent(fp, nvl, 0); } + +#define NVP(elem, type, vtype, ptype, format) { \ + vtype value; \ +\ + (void) nvpair_value_##type(elem, &value); \ + (void) printf("%*s%s: " format "\n", indent, "", \ + nvpair_name(elem), (ptype)value); \ +} + +#define NVPA(elem, type, vtype, ptype, format) { \ + uint_t i, count; \ + vtype *value; \ +\ + (void) nvpair_value_##type(elem, &value, &count); \ + for (i = 0; i < count; i++) { \ + (void) printf("%*s%s[%d]: " format "\n", indent, "", \ + nvpair_name(elem), i, (ptype)value[i]); \ + } \ +} + +/* + * Similar to nvlist_print() but handles arrays slightly differently. + */ +void +dump_nvlist(nvlist_t *list, int indent) +{ + nvpair_t *elem = NULL; + boolean_t bool_value; + nvlist_t *nvlist_value; + nvlist_t **nvlist_array_value; + uint_t i, count; + + if (list == NULL) { + return; + } + + while ((elem = nvlist_next_nvpair(list, elem)) != NULL) { + switch (nvpair_type(elem)) { + case DATA_TYPE_BOOLEAN_VALUE: + (void) nvpair_value_boolean_value(elem, &bool_value); + (void) printf("%*s%s: %s\n", indent, "", + nvpair_name(elem), bool_value ? "true" : "false"); + break; + + case DATA_TYPE_BYTE: + NVP(elem, byte, uchar_t, int, "%u"); + break; + + case DATA_TYPE_INT8: + NVP(elem, int8, int8_t, int, "%d"); + break; + + case DATA_TYPE_UINT8: + NVP(elem, uint8, uint8_t, int, "%u"); + break; + + case DATA_TYPE_INT16: + NVP(elem, int16, int16_t, int, "%d"); + break; + + case DATA_TYPE_UINT16: + NVP(elem, uint16, uint16_t, int, "%u"); + break; + + case DATA_TYPE_INT32: + NVP(elem, int32, int32_t, long, "%ld"); + break; + + case DATA_TYPE_UINT32: + NVP(elem, uint32, uint32_t, ulong_t, "%lu"); + break; + + case DATA_TYPE_INT64: + NVP(elem, int64, int64_t, longlong_t, "%lld"); + break; + + case DATA_TYPE_UINT64: + NVP(elem, uint64, uint64_t, u_longlong_t, "%llu"); + break; + + case DATA_TYPE_STRING: + NVP(elem, string, char *, char *, "'%s'"); + break; + + case DATA_TYPE_BYTE_ARRAY: + NVPA(elem, byte_array, uchar_t, int, "%u"); + break; + + case DATA_TYPE_INT8_ARRAY: + NVPA(elem, int8_array, int8_t, int, "%d"); + break; + + case DATA_TYPE_UINT8_ARRAY: + NVPA(elem, uint8_array, uint8_t, int, "%u"); + break; + + case DATA_TYPE_INT16_ARRAY: + NVPA(elem, int16_array, int16_t, int, "%d"); + break; + + case DATA_TYPE_UINT16_ARRAY: + NVPA(elem, uint16_array, uint16_t, int, "%u"); + break; + + case DATA_TYPE_INT32_ARRAY: + NVPA(elem, int32_array, int32_t, long, "%ld"); + break; + + case DATA_TYPE_UINT32_ARRAY: + NVPA(elem, uint32_array, uint32_t, ulong_t, "%lu"); + break; + + case DATA_TYPE_INT64_ARRAY: + NVPA(elem, int64_array, int64_t, longlong_t, "%lld"); + break; + + case DATA_TYPE_UINT64_ARRAY: + NVPA(elem, uint64_array, uint64_t, u_longlong_t, + "%llu"); + break; + + case DATA_TYPE_STRING_ARRAY: + NVPA(elem, string_array, char *, char *, "'%s'"); + break; + + case DATA_TYPE_NVLIST: + (void) nvpair_value_nvlist(elem, &nvlist_value); + (void) printf("%*s%s:\n", indent, "", + nvpair_name(elem)); + dump_nvlist(nvlist_value, indent + 4); + break; + + case DATA_TYPE_NVLIST_ARRAY: + (void) nvpair_value_nvlist_array(elem, + &nvlist_array_value, &count); + for (i = 0; i < count; i++) { + (void) printf("%*s%s[%u]:\n", indent, "", + nvpair_name(elem), i); + dump_nvlist(nvlist_array_value[i], indent + 4); + } 
+ break; + + default: + (void) printf(dgettext(TEXT_DOMAIN, "bad config type " + "%d for %s\n"), nvpair_type(elem), + nvpair_name(elem)); + } + } +} + /* * Determine if string 'value' matches 'nvp' value. The 'value' string is * converted, depending on the type of 'nvp', prior to match. For numeric diff --git a/external/cddl/osnet/dist/lib/libnvpair/libnvpair.h b/external/cddl/osnet/dist/lib/libnvpair/libnvpair.h index e655e0d4069dc..15c1c781679f8 100644 --- a/external/cddl/osnet/dist/lib/libnvpair/libnvpair.h +++ b/external/cddl/osnet/dist/lib/libnvpair/libnvpair.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _LIBNVPAIR_H #define _LIBNVPAIR_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -40,6 +38,7 @@ extern "C" { void nvlist_print(FILE *, nvlist_t *); int nvpair_value_match(nvpair_t *, int, char *, char **); int nvpair_value_match_regex(nvpair_t *, int, char *, regex_t *, char **); +void dump_nvlist(nvlist_t *, int); #ifdef __cplusplus } diff --git a/external/cddl/osnet/dist/lib/libshare/common/libshare.h b/external/cddl/osnet/dist/lib/libshare/common/libshare.h index a560b7731fcdb..e733ea4d10bdb 100644 --- a/external/cddl/osnet/dist/lib/libshare/common/libshare.h +++ b/external/cddl/osnet/dist/lib/libshare/common/libshare.h @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -31,8 +31,6 @@ #ifndef _LIBSHARE_H #define _LIBSHARE_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif diff --git a/external/cddl/osnet/dist/lib/libzfs/common/libzfs.h b/external/cddl/osnet/dist/lib/libzfs/common/libzfs.h index c650865f30adb..b5630534749fd 100644 --- a/external/cddl/osnet/dist/lib/libzfs/common/libzfs.h +++ b/external/cddl/osnet/dist/lib/libzfs/common/libzfs.h @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
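dump_nvlist(), newly exported through libnvpair.h above, differs from nvlist_print() mainly in how it expands arrays and nested lists: one element per line, with each nesting level indented by four more spaces. A minimal illustrative caller (not part of the patch):

#include <libnvpair.h>

int
main(void)
{
	nvlist_t *nvl, *child;

	if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0 ||
	    nvlist_alloc(&child, NV_UNIQUE_NAME, 0) != 0)
		return (1);

	(void) nvlist_add_string(nvl, "pool", "tank");
	(void) nvlist_add_uint64(child, "ashift", 9);
	(void) nvlist_add_nvlist(nvl, "vdev", child);

	/* prints: pool: 'tank' / vdev: / (indented) ashift: 9 */
	dump_nvlist(nvl, 0);

	nvlist_free(child);
	nvlist_free(nvl);
	return (0);
}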
*/ @@ -29,6 +29,7 @@ #include #include +#include #include #include #include @@ -65,7 +66,6 @@ enum { EZFS_BADSTREAM, /* bad backup stream */ EZFS_DSREADONLY, /* dataset is readonly */ EZFS_VOLTOOBIG, /* volume is too large for 32-bit system */ - EZFS_VOLHASDATA, /* volume already contains data */ EZFS_INVALIDNAME, /* invalid dataset name */ EZFS_BADRESTORE, /* unable to restore to destination */ EZFS_BADBACKUP, /* backup failed */ @@ -84,7 +84,6 @@ enum { EZFS_UMOUNTFAILED, /* failed to unmount dataset */ EZFS_UNSHARENFSFAILED, /* unshare(1M) failed */ EZFS_SHARENFSFAILED, /* share(1M) failed */ - EZFS_DEVLINKS, /* failed to create zvol links */ EZFS_PERM, /* permission denied */ EZFS_NOSPC, /* out of space */ EZFS_IO, /* I/O error */ @@ -115,6 +114,13 @@ enum { EZFS_VDEVNOTSUP, /* unsupported vdev type */ EZFS_NOTSUP, /* ops not supported on this dataset */ EZFS_ACTIVE_SPARE, /* pool has active shared spare devices */ + EZFS_UNPLAYED_LOGS, /* log device has unplayed logs */ + EZFS_REFTAG_RELE, /* snapshot release: tag not found */ + EZFS_REFTAG_HOLD, /* snapshot hold: tag already exists */ + EZFS_TAGTOOLONG, /* snapshot hold/rele: tag too long */ + EZFS_PIPEFAILED, /* pipe create failed */ + EZFS_THREADCREATEFAILED, /* thread create failed */ + EZFS_POSTSPLIT_ONLINE, /* onlining a disk after splitting it */ EZFS_UNKNOWN }; @@ -175,6 +181,14 @@ extern void libzfs_print_on_error(libzfs_handle_t *, boolean_t); extern int libzfs_errno(libzfs_handle_t *); extern const char *libzfs_error_action(libzfs_handle_t *); extern const char *libzfs_error_description(libzfs_handle_t *); +extern void libzfs_mnttab_init(libzfs_handle_t *); +extern void libzfs_mnttab_fini(libzfs_handle_t *); +extern void libzfs_mnttab_cache(libzfs_handle_t *, boolean_t); +extern int libzfs_mnttab_find(libzfs_handle_t *, const char *, + struct mnttab *); +extern void libzfs_mnttab_add(libzfs_handle_t *, const char *, + const char *, const char *); +extern void libzfs_mnttab_remove(libzfs_handle_t *, const char *); /* * Basic handle functions @@ -201,11 +215,19 @@ extern int zpool_create(libzfs_handle_t *, const char *, nvlist_t *, extern int zpool_destroy(zpool_handle_t *); extern int zpool_add(zpool_handle_t *, nvlist_t *); +typedef struct splitflags { + /* do not split, but return the config that would be split off */ + int dryrun : 1; + + /* after splitting, import the pool */ + int import : 1; +} splitflags_t; + /* * Functions to manipulate pool and vdev state */ extern int zpool_scrub(zpool_handle_t *, pool_scrub_type_t); -extern int zpool_clear(zpool_handle_t *, const char *); +extern int zpool_clear(zpool_handle_t *, const char *, nvlist_t *); extern int zpool_vdev_online(zpool_handle_t *, const char *, int, vdev_state_t *); @@ -214,13 +236,17 @@ extern int zpool_vdev_attach(zpool_handle_t *, const char *, const char *, nvlist_t *, int); extern int zpool_vdev_detach(zpool_handle_t *, const char *); extern int zpool_vdev_remove(zpool_handle_t *, const char *); +extern int zpool_vdev_split(zpool_handle_t *, char *, nvlist_t **, nvlist_t *, + splitflags_t); -extern int zpool_vdev_fault(zpool_handle_t *, uint64_t); -extern int zpool_vdev_degrade(zpool_handle_t *, uint64_t); +extern int zpool_vdev_fault(zpool_handle_t *, uint64_t, vdev_aux_t); +extern int zpool_vdev_degrade(zpool_handle_t *, uint64_t, vdev_aux_t); extern int zpool_vdev_clear(zpool_handle_t *, uint64_t); extern nvlist_t *zpool_find_vdev(zpool_handle_t *, const char *, boolean_t *, boolean_t *, boolean_t *); +extern nvlist_t 
*zpool_find_vdev_by_physpath(zpool_handle_t *, const char *, + boolean_t *, boolean_t *, boolean_t *); extern int zpool_label_disk(libzfs_handle_t *, zpool_handle_t *, char *); /* @@ -256,9 +282,15 @@ typedef enum { ZPOOL_STATUS_HOSTID_MISMATCH, /* last accessed by another system */ ZPOOL_STATUS_IO_FAILURE_WAIT, /* failed I/O, failmode 'wait' */ ZPOOL_STATUS_IO_FAILURE_CONTINUE, /* failed I/O, failmode 'continue' */ + ZPOOL_STATUS_BAD_LOG, /* cannot read log chain(s) */ + + /* + * These faults have no corresponding message ID. At the time we are + * checking the status, the original reason for the FMA fault (I/O or + * checksum errors) has been lost. + */ ZPOOL_STATUS_FAULTED_DEV_R, /* faulted device with replicas */ ZPOOL_STATUS_FAULTED_DEV_NR, /* faulted device with no replicas */ - ZPOOL_STATUS_BAD_LOG, /* cannot read log chain(s) */ /* * The following are not faults per se, but still an error possibly @@ -268,6 +300,7 @@ typedef enum { ZPOOL_STATUS_VERSION_OLDER, /* older on-disk version */ ZPOOL_STATUS_RESILVERING, /* device being resilvered */ ZPOOL_STATUS_OFFLINE_DEV, /* device online */ + ZPOOL_STATUS_REMOVED_DEV, /* removed device */ /* * Finally, the following indicates a healthy pool. @@ -277,6 +310,7 @@ typedef enum { extern zpool_status_t zpool_get_status(zpool_handle_t *, char **); extern zpool_status_t zpool_import_status(nvlist_t *, char **); +extern void zpool_dump_ddt(const ddt_stat_t *dds, const ddt_histogram_t *ddh); /* * Statistics and configuration functions. @@ -289,6 +323,7 @@ extern int zpool_get_errlog(zpool_handle_t *, nvlist_t **); * Import and export functions */ extern int zpool_export(zpool_handle_t *, boolean_t); +extern int zpool_export_force(zpool_handle_t *); extern int zpool_import(libzfs_handle_t *, nvlist_t *, const char *, char *altroot); extern int zpool_import_props(libzfs_handle_t *, nvlist_t *, const char *, @@ -297,30 +332,48 @@ extern int zpool_import_props(libzfs_handle_t *, nvlist_t *, const char *, /* * Search for pools to import */ + +typedef struct importargs { + char **path; /* a list of paths to search */ + int paths; /* number of paths to search */ + char *poolname; /* name of a pool to find */ + uint64_t guid; /* guid of a pool to find */ + char *cachefile; /* cachefile to use for import */ + int can_be_active : 1; /* can the pool be active? */ + int unique : 1; /* does 'poolname' already exist? 
*/ + int exists : 1; /* set on return if pool already exists */ +} importargs_t; + +extern nvlist_t *zpool_search_import(libzfs_handle_t *, importargs_t *); + +/* legacy pool search routines */ extern nvlist_t *zpool_find_import(libzfs_handle_t *, int, char **); extern nvlist_t *zpool_find_import_cached(libzfs_handle_t *, const char *, char *, uint64_t); -extern nvlist_t *zpool_find_import_byname(libzfs_handle_t *, int, char **, - char *); -extern nvlist_t *zpool_find_import_byguid(libzfs_handle_t *, int, char **, - uint64_t); -extern nvlist_t *zpool_find_import_activeok(libzfs_handle_t *, int, char **); /* * Miscellaneous pool functions */ struct zfs_cmd; -extern char *zpool_vdev_name(libzfs_handle_t *, zpool_handle_t *, nvlist_t *); +extern const char *hist_event_table[LOG_END]; + +extern char *zpool_vdev_name(libzfs_handle_t *, zpool_handle_t *, nvlist_t *, + boolean_t verbose); extern int zpool_upgrade(zpool_handle_t *, uint64_t); extern int zpool_get_history(zpool_handle_t *, nvlist_t **); +extern int zpool_history_unpack(char *, uint64_t, uint64_t *, + nvlist_t ***, uint_t *); extern void zpool_set_history_str(const char *subcommand, int argc, char **argv, char *history_str); extern int zpool_stage_history(libzfs_handle_t *, const char *); extern void zpool_obj_to_path(zpool_handle_t *, uint64_t, uint64_t, char *, size_t len); extern int zfs_ioctl(libzfs_handle_t *, int, struct zfs_cmd *); -extern int zpool_get_physpath(zpool_handle_t *, char *); +extern int zpool_get_physpath(zpool_handle_t *, char *, size_t); +extern void zpool_explain_recover(libzfs_handle_t *, const char *, int, + nvlist_t *); + /* * Basic handle manipulations. These functions do not create or destroy the * underlying datasets, only the references to them. @@ -351,13 +404,20 @@ extern const char *zfs_prop_to_name(zfs_prop_t); extern int zfs_prop_set(zfs_handle_t *, const char *, const char *); extern int zfs_prop_get(zfs_handle_t *, zfs_prop_t, char *, size_t, zprop_source_t *, char *, size_t, boolean_t); +extern int zfs_prop_get_recvd(zfs_handle_t *, const char *, char *, size_t, + boolean_t); extern int zfs_prop_get_numeric(zfs_handle_t *, zfs_prop_t, uint64_t *, zprop_source_t *, char *, size_t); +extern int zfs_prop_get_userquota_int(zfs_handle_t *zhp, const char *propname, + uint64_t *propvalue); +extern int zfs_prop_get_userquota(zfs_handle_t *zhp, const char *propname, + char *propbuf, int proplen, boolean_t literal); extern uint64_t zfs_prop_get_int(zfs_handle_t *, zfs_prop_t); -extern int zfs_prop_inherit(zfs_handle_t *, const char *); +extern int zfs_prop_inherit(zfs_handle_t *, const char *, boolean_t); extern const char *zfs_prop_values(zfs_prop_t); extern int zfs_prop_is_string(zfs_prop_t prop); extern nvlist_t *zfs_get_user_props(zfs_handle_t *); +extern nvlist_t *zfs_get_recvd_props(zfs_handle_t *); typedef struct zprop_list { int pl_prop; @@ -365,10 +425,12 @@ typedef struct zprop_list { struct zprop_list *pl_next; boolean_t pl_all; size_t pl_width; + size_t pl_recvd_width; boolean_t pl_fixed; } zprop_list_t; -extern int zfs_expand_proplist(zfs_handle_t *, zprop_list_t **); +extern int zfs_expand_proplist(zfs_handle_t *, zprop_list_t **, boolean_t); +extern void zfs_prune_proplist(zfs_handle_t *, uint8_t *); #define ZFS_MOUNTPOINT_NONE "none" #define ZFS_MOUNTPOINT_LEGACY "legacy" @@ -391,13 +453,24 @@ extern int zprop_get_list(libzfs_handle_t *, char *, zprop_list_t **, zfs_type_t); extern void zprop_free_list(zprop_list_t *); +#define ZFS_GET_NCOLS 5 + +typedef enum { + GET_COL_NONE, + 
GET_COL_NAME, + GET_COL_PROPERTY, + GET_COL_VALUE, + GET_COL_RECVD, + GET_COL_SOURCE +} zfs_get_column_t; + /* * Functions for printing zfs or zpool properties */ typedef struct zprop_get_cbdata { int cb_sources; - int cb_columns[4]; - int cb_colwidths[5]; + zfs_get_column_t cb_columns[ZFS_GET_NCOLS]; + int cb_colwidths[ZFS_GET_NCOLS + 1]; boolean_t cb_scripted; boolean_t cb_literal; boolean_t cb_first; @@ -406,12 +479,8 @@ typedef struct zprop_get_cbdata { } zprop_get_cbdata_t; void zprop_print_one_property(const char *, zprop_get_cbdata_t *, - const char *, const char *, zprop_source_t, const char *); - -#define GET_COL_NAME 1 -#define GET_COL_PROPERTY 2 -#define GET_COL_VALUE 3 -#define GET_COL_SOURCE 4 + const char *, const char *, zprop_source_t, const char *, + const char *); /* * Iterator functions. @@ -422,6 +491,7 @@ extern int zfs_iter_children(zfs_handle_t *, zfs_iter_f, void *); extern int zfs_iter_dependents(zfs_handle_t *, boolean_t, zfs_iter_f, void *); extern int zfs_iter_filesystems(zfs_handle_t *, zfs_iter_f, void *); extern int zfs_iter_snapshots(zfs_handle_t *, zfs_iter_f, void *); +extern int zfs_iter_snapshots_sorted(zfs_handle_t *, zfs_iter_f, void *); /* * Functions to create and destroy datasets. @@ -429,15 +499,53 @@ extern int zfs_iter_snapshots(zfs_handle_t *, zfs_iter_f, void *); extern int zfs_create(libzfs_handle_t *, const char *, zfs_type_t, nvlist_t *); extern int zfs_create_ancestors(libzfs_handle_t *, const char *); -extern int zfs_destroy(zfs_handle_t *); -extern int zfs_destroy_snaps(zfs_handle_t *, char *); +extern int zfs_destroy(zfs_handle_t *, boolean_t); +extern int zfs_destroy_snaps(zfs_handle_t *, char *, boolean_t); extern int zfs_clone(zfs_handle_t *, const char *, nvlist_t *); extern int zfs_snapshot(libzfs_handle_t *, const char *, boolean_t, nvlist_t *); extern int zfs_rollback(zfs_handle_t *, zfs_handle_t *, boolean_t); extern int zfs_rename(zfs_handle_t *, const char *, boolean_t); + +typedef struct sendflags { + /* print informational messages (ie, -v was specified) */ + int verbose : 1; + + /* recursive send (ie, -R) */ + int replicate : 1; + + /* for incrementals, do all intermediate snapshots */ + int doall : 1; /* (ie, -I) */ + + /* if dataset is a clone, do incremental from its origin */ + int fromorigin : 1; + + /* do deduplication */ + int dedup : 1; + + /* send properties (ie, -p) */ + int props : 1; +} sendflags_t; + +typedef boolean_t (snapfilter_cb_t)(zfs_handle_t *, void *); + extern int zfs_send(zfs_handle_t *, const char *, const char *, - boolean_t, boolean_t, boolean_t, boolean_t, int); + sendflags_t, int, snapfilter_cb_t, void *); + extern int zfs_promote(zfs_handle_t *); +extern int zfs_hold(zfs_handle_t *, const char *, const char *, boolean_t, + boolean_t, boolean_t); +extern int zfs_hold_range(zfs_handle_t *, const char *, const char *, + const char *, boolean_t, boolean_t); +extern int zfs_release(zfs_handle_t *, const char *, const char *, boolean_t); +extern int zfs_release_range(zfs_handle_t *, const char *, const char *, + const char *, boolean_t); +extern uint64_t zvol_volsize_to_reservation(uint64_t, nvlist_t *); + +typedef int (*zfs_userspace_cb_t)(void *arg, const char *domain, + uid_t rid, uint64_t space); + +extern int zfs_userspace(zfs_handle_t *zhp, zfs_userquota_prop_t type, + zfs_userspace_cb_t func, void *arg); typedef struct recvflags { /* print informational messages (ie, -v was specified) */ @@ -446,6 +554,12 @@ typedef struct recvflags { /* the destination is a prefix, not the exact fs (ie, -d) */ 
int isprefix : 1; + /* + * Only the tail of the sent snapshot path is appended to the + * destination to determine the received snapshot name (ie, -e). + */ + int istail : 1; + /* do not actually do the recv, just check if it would work (ie, -n) */ int dryrun : 1; @@ -457,6 +571,9 @@ typedef struct recvflags { /* byteswap flag is used internally; callers need not specify */ int byteswap : 1; + + /* do not mount file systems as they are extracted (private) */ + int nomount : 1; } recvflags_t; extern int zfs_receive(libzfs_handle_t *, const char *, recvflags_t, @@ -473,17 +590,6 @@ extern boolean_t zfs_dataset_exists(libzfs_handle_t *, const char *, zfs_type_t); extern int zfs_spa_version(zfs_handle_t *, int *); -/* - * dataset permission functions. - */ -extern int zfs_perm_set(zfs_handle_t *, nvlist_t *); -extern int zfs_perm_remove(zfs_handle_t *, nvlist_t *); -extern int zfs_build_perms(zfs_handle_t *, char *, char *, - zfs_deleg_who_type_t, zfs_deleg_inherit_t, nvlist_t **nvlist_t); -extern int zfs_perm_get(zfs_handle_t *, zfs_allow_t **); -extern void zfs_free_allows(zfs_allow_t *); -extern void zfs_deleg_permissions(void); - /* * Mount support functions. */ @@ -518,7 +624,7 @@ extern boolean_t zfs_is_shared_iscsi(zfs_handle_t *); extern int zfs_share_iscsi(zfs_handle_t *); extern int zfs_unshare_iscsi(zfs_handle_t *); extern int zfs_iscsi_perm_check(libzfs_handle_t *, char *, ucred_t *); -extern int zfs_deleg_share_nfs(libzfs_handle_t *, char *, char *, +extern int zfs_deleg_share_nfs(libzfs_handle_t *, char *, char *, char *, void *, void *, int, zfs_share_op_t); /* @@ -543,18 +649,22 @@ extern int zpool_in_use(libzfs_handle_t *, int, pool_state_t *, char **, boolean_t *); /* - * ftyp special. Read the label from a given device. + * Label manipulation. */ extern int zpool_read_label(int, nvlist_t **); +extern int zpool_clear_label(int); + +/* is this zvol valid for use as a dump device? */ +extern int zvol_check_dump_config(char *); /* - * Create and remove zvol /dev links. + * Management interfaces for SMB ACL files */ -extern int zpool_create_zvol_links(zpool_handle_t *); -extern int zpool_remove_zvol_links(zpool_handle_t *); -/* is this zvol valid for use as a dump device? */ -extern int zvol_check_dump_config(char *); +int zfs_smb_acl_add(libzfs_handle_t *, char *, char *, char *); +int zfs_smb_acl_remove(libzfs_handle_t *, char *, char *, char *); +int zfs_smb_acl_purge(libzfs_handle_t *, char *, char *); +int zfs_smb_acl_rename(libzfs_handle_t *, char *, char *, char *, char *); /* * Enable and disable datasets within a pool by mounting/unmounting and @@ -563,6 +673,17 @@ extern int zvol_check_dump_config(char *); extern int zpool_enable_datasets(zpool_handle_t *, const char *, int); extern int zpool_disable_datasets(zpool_handle_t *, boolean_t); +/* + * Mappings between vdev and FRU. 
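As a usage sketch (not part of this change), the reworked zfs_send() now takes a sendflags_t plus an output descriptor and an optional snapshot filter; assuming the two string arguments are the from- and to-snapshot names, as in the OpenSolaris implementation, a full replicated send to stdout might look like this (the snapshot name is illustrative):

#include <unistd.h>
#include <libzfs.h>

/* Illustrative only: stream snapshot "snap" of the dataset zhp to stdout. */
static int
send_full_stream(zfs_handle_t *zhp)
{
	sendflags_t flags = { 0 };

	flags.replicate = 1;	/* like zfs send -R */
	flags.props = 1;	/* like zfs send -p */

	/* NULL from-snapshot: a full, non-incremental stream. */
	return (zfs_send(zhp, NULL, "snap", flags, STDOUT_FILENO, NULL, NULL));
}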
+ */ +extern void libzfs_fru_refresh(libzfs_handle_t *); +extern const char *libzfs_fru_lookup(libzfs_handle_t *, const char *); +extern const char *libzfs_fru_devpath(libzfs_handle_t *, const char *); +extern boolean_t libzfs_fru_compare(libzfs_handle_t *, const char *, + const char *); +extern boolean_t libzfs_fru_notself(libzfs_handle_t *, const char *); +extern int zpool_fru_set(zpool_handle_t *, uint64_t, const char *); + #ifdef __cplusplus } #endif diff --git a/external/cddl/osnet/dist/lib/libzfs/common/libzfs_changelist.c b/external/cddl/osnet/dist/lib/libzfs/common/libzfs_changelist.c index b905bc6cb6afc..c970d1e488d1b 100644 --- a/external/cddl/osnet/dist/lib/libzfs/common/libzfs_changelist.c +++ b/external/cddl/osnet/dist/lib/libzfs/common/libzfs_changelist.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. * * Portions Copyright 2007 Ramprakash Jelari @@ -119,18 +119,8 @@ changelist_prefix(prop_changelist_t *clp) if (ZFS_IS_VOLUME(cn->cn_handle)) { switch (clp->cl_realprop) { case ZFS_PROP_NAME: - /* - * If this was a rename, unshare the zvol, and - * remove the /dev/zvol links. - */ + /* If this was a rename, unshare the zvol */ (void) zfs_unshare_iscsi(cn->cn_handle); - - if (zvol_remove_link(cn->cn_handle->zfs_hdl, - cn->cn_handle->zfs_name) != 0) { - ret = -1; - cn->cn_needpost = B_FALSE; - (void) zfs_share_iscsi(cn->cn_handle); - } break; case ZFS_PROP_VOLSIZE: @@ -218,6 +208,7 @@ changelist_postfix(prop_changelist_t *clp) boolean_t sharenfs; boolean_t sharesmb; + boolean_t mounted; /* * If we are in the global zone, but this dataset is exported @@ -234,15 +225,7 @@ changelist_postfix(prop_changelist_t *clp) zfs_refresh_properties(cn->cn_handle); if (ZFS_IS_VOLUME(cn->cn_handle)) { - /* - * If we're doing a rename, recreate the /dev/zvol - * links. - */ - if (clp->cl_realprop == ZFS_PROP_NAME && - zvol_create_link(cn->cn_handle->zfs_hdl, - cn->cn_handle->zfs_name) != 0) { - errors++; - } else if (cn->cn_shared || + if (cn->cn_shared || clp->cl_prop == ZFS_PROP_SHAREISCSI) { if (zfs_prop_get(cn->cn_handle, ZFS_PROP_SHAREISCSI, shareopts, @@ -272,20 +255,29 @@ changelist_postfix(prop_changelist_t *clp) shareopts, sizeof (shareopts), NULL, NULL, 0, B_FALSE) == 0) && (strcmp(shareopts, "off") != 0)); - if ((cn->cn_mounted || clp->cl_waslegacy || sharenfs || - sharesmb) && !zfs_is_mounted(cn->cn_handle, NULL) && - zfs_mount(cn->cn_handle, NULL, 0) != 0) - errors++; + mounted = zfs_is_mounted(cn->cn_handle, NULL); + + if (!mounted && (cn->cn_mounted || + ((sharenfs || sharesmb || clp->cl_waslegacy) && + (zfs_prop_get_int(cn->cn_handle, + ZFS_PROP_CANMOUNT) == ZFS_CANMOUNT_ON)))) { + + if (zfs_mount(cn->cn_handle, NULL, 0) != 0) + errors++; + else + mounted = TRUE; + } /* - * We always re-share even if the filesystem is currently - * shared, so that we can adopt any new options. + * If the file system is mounted we always re-share even + * if the filesystem is currently shared, so that we can + * adopt any new options. 
*/ - if (sharenfs) + if (sharenfs && mounted) errors += zfs_share_nfs(cn->cn_handle); else if (cn->cn_shared || clp->cl_waslegacy) errors += zfs_unshare_nfs(cn->cn_handle, NULL); - if (sharesmb) + if (sharesmb && mounted) errors += zfs_share_smb(cn->cn_handle); else if (cn->cn_shared || clp->cl_waslegacy) errors += zfs_unshare_smb(cn->cn_handle, NULL); @@ -498,6 +490,14 @@ change_one(zfs_handle_t *zhp, void *data) &idx); uu_list_insert(clp->cl_list, cn, idx); } else { + /* + * Add this child to beginning of the list. Children + * below this one in the hierarchy will get added above + * this one in the list. This produces a list in + * reverse dataset name order. + * This is necessary when the original mountpoint + * is legacy or none. + */ ASSERT(!clp->cl_alldependents); verify(uu_list_insert_before(clp->cl_list, uu_list_first(clp->cl_list), cn) == 0); @@ -564,6 +564,7 @@ changelist_gather(zfs_handle_t *zhp, zfs_prop_t prop, int gather_flags, zfs_handle_t *temp; char property[ZFS_MAXPROPLEN]; uu_compare_fn_t *compare = NULL; + boolean_t legacy = B_FALSE; if ((clp = zfs_alloc(zhp->zfs_hdl, sizeof (prop_changelist_t))) == NULL) return (NULL); @@ -576,8 +577,19 @@ changelist_gather(zfs_handle_t *zhp, zfs_prop_t prop, int gather_flags, if (prop == ZFS_PROP_NAME || prop == ZFS_PROP_ZONED || prop == ZFS_PROP_MOUNTPOINT || prop == ZFS_PROP_SHARENFS || prop == ZFS_PROP_SHARESMB) { - compare = compare_mountpoints; - clp->cl_sorted = B_TRUE; + + if (zfs_prop_get(zhp, ZFS_PROP_MOUNTPOINT, + property, sizeof (property), + NULL, NULL, 0, B_FALSE) == 0 && + (strcmp(property, "legacy") == 0 || + strcmp(property, "none") == 0)) { + + legacy = B_TRUE; + } + if (!legacy) { + compare = compare_mountpoints; + clp->cl_sorted = B_TRUE; + } } clp->cl_pool = uu_list_pool_create("changelist_pool", @@ -621,8 +633,6 @@ changelist_gather(zfs_handle_t *zhp, zfs_prop_t prop, int gather_flags, clp->cl_prop = ZFS_PROP_MOUNTPOINT; } else if (prop == ZFS_PROP_VOLSIZE) { clp->cl_prop = ZFS_PROP_MOUNTPOINT; - } else if (prop == ZFS_PROP_VERSION) { - clp->cl_prop = ZFS_PROP_MOUNTPOINT; } else { clp->cl_prop = prop; } @@ -687,6 +697,12 @@ changelist_gather(zfs_handle_t *zhp, zfs_prop_t prop, int gather_flags, (void) uu_list_find(clp->cl_list, cn, NULL, &idx); uu_list_insert(clp->cl_list, cn, idx); } else { + /* + * Add the target dataset to the end of the list. + * The list is not really unsorted. The list will be + * in reverse dataset name order. This is necessary + * when the original mountpoint is legacy or none. + */ verify(uu_list_insert_after(clp->cl_list, uu_list_last(clp->cl_list), cn) == 0); } @@ -695,11 +711,7 @@ changelist_gather(zfs_handle_t *zhp, zfs_prop_t prop, int gather_flags, * If the mountpoint property was previously 'legacy', or 'none', * record it as the behavior of changelist_postfix() will be different. 
*/ - if ((clp->cl_prop == ZFS_PROP_MOUNTPOINT) && - (zfs_prop_get(zhp, prop, property, sizeof (property), - NULL, NULL, 0, B_FALSE) == 0 && - (strcmp(property, "legacy") == 0 || - strcmp(property, "none") == 0))) { + if ((clp->cl_prop == ZFS_PROP_MOUNTPOINT) && legacy) { /* * do not automatically mount ex-legacy datasets if * we specifically set canmount to noauto diff --git a/external/cddl/osnet/dist/lib/libzfs/common/libzfs_config.c b/external/cddl/osnet/dist/lib/libzfs/common/libzfs_config.c index 94640d1b128cf..dc27238c9cf37 100644 --- a/external/cddl/osnet/dist/lib/libzfs/common/libzfs_config.c +++ b/external/cddl/osnet/dist/lib/libzfs/common/libzfs_config.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * The pool configuration repository is stored in /etc/zfs/zpool.cache as a * single packed nvlist. While it would be nice to just read in this @@ -313,21 +311,33 @@ zpool_iter(libzfs_handle_t *hdl, zpool_iter_f func, void *data) zpool_handle_t *zhp; int ret; - if (namespace_reload(hdl) != 0) + /* + * If someone makes a recursive call to zpool_iter(), we want to avoid + * refreshing the namespace because that will invalidate the parent + * context. We allow recursive calls, but simply re-use the same + * namespace AVL tree. + */ + if (!hdl->libzfs_pool_iter && namespace_reload(hdl) != 0) return (-1); + hdl->libzfs_pool_iter++; for (cn = uu_avl_first(hdl->libzfs_ns_avl); cn != NULL; cn = uu_avl_next(hdl->libzfs_ns_avl, cn)) { - if (zpool_open_silent(hdl, cn->cn_name, &zhp) != 0) + if (zpool_open_silent(hdl, cn->cn_name, &zhp) != 0) { + hdl->libzfs_pool_iter--; return (-1); + } if (zhp == NULL) continue; - if ((ret = func(zhp, data)) != 0) + if ((ret = func(zhp, data)) != 0) { + hdl->libzfs_pool_iter--; return (ret); + } } + hdl->libzfs_pool_iter--; return (0); } diff --git a/external/cddl/osnet/dist/lib/libzfs/common/libzfs_dataset.c b/external/cddl/osnet/dist/lib/libzfs/common/libzfs_dataset.c index a8005ffc0cf53..bd63372301d53 100644 --- a/external/cddl/osnet/dist/lib/libzfs/common/libzfs_dataset.c +++ b/external/cddl/osnet/dist/lib/libzfs/common/libzfs_dataset.c @@ -20,14 +20,12 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#include #include #include -#include #include #include #include @@ -38,15 +36,17 @@ #include #include #include -#include #include -#include #include #include #include #include #include +#include +#include +#include +#include #include #include #include @@ -56,7 +56,8 @@ #include "libzfs_impl.h" #include "zfs_deleg.h" -static int zvol_create_link_common(libzfs_handle_t *, const char *, int); +static int userquota_propname_decode(const char *propname, boolean_t zoned, + zfs_userquota_prop_t *typep, char *domain, int domainlen, uint64_t *ridp); /* * Given a single type (not a mask of types), return the type in a human @@ -108,7 +109,6 @@ path_to_str(const char *path, int types) return (path_to_str(path, types & ~ZFS_TYPE_SNAPSHOT)); } - /* * The user has requested either filesystems or volumes. * We have no way of knowing a priori what type this would be, so always @@ -123,8 +123,8 @@ path_to_str(const char *path, int types) /* * Validate a ZFS path. 
This is used even before trying to open the dataset, to - * provide a more meaningful error message. We place a more useful message in - * 'buf' detailing exactly why the name was not valid. + * provide a more meaningful error message. We call zfs_error_aux() to + * explain exactly why the name was not valid. */ static int zfs_validate_name(libzfs_handle_t *hdl, const char *path, int type, @@ -319,21 +319,43 @@ zpool_free_handles(libzfs_handle_t *hdl) * Utility function to gather stats (objset and zpl) for the given object. */ static int -get_stats(zfs_handle_t *zhp) +get_stats_ioctl(zfs_handle_t *zhp, zfs_cmd_t *zc) { - zfs_cmd_t zc = { 0 }; libzfs_handle_t *hdl = zhp->zfs_hdl; - nvlist_t *allprops, *userprops; - (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + (void) strlcpy(zc->zc_name, zhp->zfs_name, sizeof (zc->zc_name)); + + while (ioctl(hdl->libzfs_fd, ZFS_IOC_OBJSET_STATS, zc) != 0) { + if (errno == ENOMEM) { + if (zcmd_expand_dst_nvlist(hdl, zc) != 0) { + return (-1); + } + } else { + return (-1); + } + } + return (0); +} + +/* + * Utility function to get the received properties of the given object. + */ +static int +get_recvd_props_ioctl(zfs_handle_t *zhp) +{ + libzfs_handle_t *hdl = zhp->zfs_hdl; + nvlist_t *recvdprops; + zfs_cmd_t zc = { 0 }; + int err; if (zcmd_alloc_dst_nvlist(hdl, &zc, 0) != 0) return (-1); - while (ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_OBJSET_STATS, &zc) != 0) { + (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + + while (ioctl(hdl->libzfs_fd, ZFS_IOC_OBJSET_RECVD_PROPS, &zc) != 0) { if (errno == ENOMEM) { if (zcmd_expand_dst_nvlist(hdl, &zc) != 0) { - zcmd_free_nvlists(&zc); return (-1); } } else { @@ -342,15 +364,32 @@ get_stats(zfs_handle_t *zhp) } } - zhp->zfs_dmustats = zc.zc_objset_stats; /* structure assignment */ + err = zcmd_read_dst_nvlist(zhp->zfs_hdl, &zc, &recvdprops); + zcmd_free_nvlists(&zc); + if (err != 0) + return (-1); - if (zcmd_read_dst_nvlist(hdl, &zc, &allprops) != 0) { - zcmd_free_nvlists(&zc); + nvlist_free(zhp->zfs_recvd_props); + zhp->zfs_recvd_props = recvdprops; + + return (0); +} + +static int +put_stats_zhdl(zfs_handle_t *zhp, zfs_cmd_t *zc) +{ + nvlist_t *allprops, *userprops; + + zhp->zfs_dmustats = zc->zc_objset_stats; /* structure assignment */ + + if (zcmd_read_dst_nvlist(zhp->zfs_hdl, zc, &allprops) != 0) { return (-1); } - zcmd_free_nvlists(&zc); - + /* + * XXX Why do we store the user props separately, in addition to + * storing them in zfs_props? + */ if ((userprops = process_user_props(zhp, allprops)) == NULL) { nvlist_free(allprops); return (-1); @@ -365,6 +404,22 @@ get_stats(zfs_handle_t *zhp) return (0); } +static int +get_stats(zfs_handle_t *zhp) +{ + int rc = 0; + zfs_cmd_t zc = { 0 }; + + if (zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0) != 0) + return (-1); + if (get_stats_ioctl(zhp, &zc) != 0) + rc = -1; + else if (put_stats_zhdl(zhp, &zc) != 0) + rc = -1; + zcmd_free_nvlists(&zc); + return (rc); +} + /* * Refresh the properties currently stored in the handle. */ @@ -378,74 +433,11 @@ zfs_refresh_properties(zfs_handle_t *zhp) * Makes a handle from the given dataset name. Used by zfs_open() and * zfs_iter_* to create child handles on the fly. */ -zfs_handle_t * -make_dataset_handle(libzfs_handle_t *hdl, const char *path) +static int +make_dataset_handle_common(zfs_handle_t *zhp, zfs_cmd_t *zc) { - zfs_handle_t *zhp = calloc(sizeof (zfs_handle_t), 1); - char *logstr; - - if (zhp == NULL) - return (NULL); - - zhp->zfs_hdl = hdl; - - /* - * Preserve history log string. 
- * any changes performed here will be - * logged as an internal event. - */ - logstr = zhp->zfs_hdl->libzfs_log_str; - zhp->zfs_hdl->libzfs_log_str = NULL; -top: - (void) strlcpy(zhp->zfs_name, path, sizeof (zhp->zfs_name)); - - if (get_stats(zhp) != 0) { - zhp->zfs_hdl->libzfs_log_str = logstr; - free(zhp); - return (NULL); - } - - if (zhp->zfs_dmustats.dds_inconsistent) { - zfs_cmd_t zc = { 0 }; - - /* - * If it is dds_inconsistent, then we've caught it in - * the middle of a 'zfs receive' or 'zfs destroy', and - * it is inconsistent from the ZPL's point of view, so - * can't be mounted. However, it could also be that we - * have crashed in the middle of one of those - * operations, in which case we need to get rid of the - * inconsistent state. We do that by either rolling - * back to the previous snapshot (which will fail if - * there is none), or destroying the filesystem. Note - * that if we are still in the middle of an active - * 'receive' or 'destroy', then the rollback and destroy - * will fail with EBUSY and we will drive on as usual. - */ - - (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); - - if (zhp->zfs_dmustats.dds_type == DMU_OST_ZVOL) { - (void) zvol_remove_link(hdl, zhp->zfs_name); - zc.zc_objset_type = DMU_OST_ZVOL; - } else { - zc.zc_objset_type = DMU_OST_ZFS; - } - - /* - * If we can successfully destroy it, pretend that it - * never existed. - */ - if (ioctl(hdl->libzfs_fd, ZFS_IOC_DESTROY, &zc) == 0) { - zhp->zfs_hdl->libzfs_log_str = logstr; - free(zhp); - errno = ENOENT; - return (NULL); - } - /* If we can successfully roll it back, reget the stats */ - if (ioctl(hdl->libzfs_fd, ZFS_IOC_ROLLBACK, &zc) == 0) - goto top; - } + if (put_stats_zhdl(zhp, zc) != 0) + return (-1); /* * We've managed to open the dataset and gather statistics. 
Determine @@ -467,8 +459,53 @@ make_dataset_handle(libzfs_handle_t *hdl, const char *path) else abort(); /* we should never see any other types */ - zhp->zfs_hdl->libzfs_log_str = logstr; zhp->zpool_hdl = zpool_handle(zhp); + return (0); +} + +zfs_handle_t * +make_dataset_handle(libzfs_handle_t *hdl, const char *path) +{ + zfs_cmd_t zc = { 0 }; + + zfs_handle_t *zhp = calloc(sizeof (zfs_handle_t), 1); + + if (zhp == NULL) + return (NULL); + + zhp->zfs_hdl = hdl; + (void) strlcpy(zhp->zfs_name, path, sizeof (zhp->zfs_name)); + if (zcmd_alloc_dst_nvlist(hdl, &zc, 0) != 0) { + free(zhp); + return (NULL); + } + if (get_stats_ioctl(zhp, &zc) == -1) { + zcmd_free_nvlists(&zc); + free(zhp); + return (NULL); + } + if (make_dataset_handle_common(zhp, &zc) == -1) { + free(zhp); + zhp = NULL; + } + zcmd_free_nvlists(&zc); + return (zhp); +} + +static zfs_handle_t * +make_dataset_handle_zc(libzfs_handle_t *hdl, zfs_cmd_t *zc) +{ + zfs_handle_t *zhp = calloc(sizeof (zfs_handle_t), 1); + + if (zhp == NULL) + return (NULL); + + zhp->zfs_hdl = hdl; + (void) strlcpy(zhp->zfs_name, zc->zc_name, sizeof (zhp->zfs_name)); + if (make_dataset_handle_common(zhp, zc) == -1) { + free(zhp); + return (NULL); + } return (zhp); } @@ -524,9 +561,145 @@ zfs_close(zfs_handle_t *zhp) free(zhp->zfs_mntopts); nvlist_free(zhp->zfs_props); nvlist_free(zhp->zfs_user_props); + nvlist_free(zhp->zfs_recvd_props); free(zhp); } +typedef struct mnttab_node { + struct mnttab mtn_mt; + avl_node_t mtn_node; +} mnttab_node_t; + +static int +libzfs_mnttab_cache_compare(const void *arg1, const void *arg2) +{ + const mnttab_node_t *mtn1 = arg1; + const mnttab_node_t *mtn2 = arg2; + int rv; + + rv = strcmp(mtn1->mtn_mt.mnt_special, mtn2->mtn_mt.mnt_special); + + if (rv == 0) + return (0); + return (rv > 0 ? 
1 : -1); +} + +void +libzfs_mnttab_init(libzfs_handle_t *hdl) +{ + assert(avl_numnodes(&hdl->libzfs_mnttab_cache) == 0); + avl_create(&hdl->libzfs_mnttab_cache, libzfs_mnttab_cache_compare, + sizeof (mnttab_node_t), offsetof(mnttab_node_t, mtn_node)); +} + +void +libzfs_mnttab_update(libzfs_handle_t *hdl) +{ + struct mnttab entry; + + rewind(hdl->libzfs_mnttab); + while (getmntent(hdl->libzfs_mnttab, &entry) == 0) { + mnttab_node_t *mtn; + + if (strcmp(entry.mnt_fstype, MNTTYPE_ZFS) != 0) + continue; + mtn = zfs_alloc(hdl, sizeof (mnttab_node_t)); + mtn->mtn_mt.mnt_special = zfs_strdup(hdl, entry.mnt_special); + mtn->mtn_mt.mnt_mountp = zfs_strdup(hdl, entry.mnt_mountp); + mtn->mtn_mt.mnt_fstype = zfs_strdup(hdl, entry.mnt_fstype); + mtn->mtn_mt.mnt_mntopts = zfs_strdup(hdl, entry.mnt_mntopts); + avl_add(&hdl->libzfs_mnttab_cache, mtn); + } +} + +void +libzfs_mnttab_fini(libzfs_handle_t *hdl) +{ + void *cookie = NULL; + mnttab_node_t *mtn; + + while (mtn = avl_destroy_nodes(&hdl->libzfs_mnttab_cache, &cookie)) { + free(mtn->mtn_mt.mnt_special); + free(mtn->mtn_mt.mnt_mountp); + free(mtn->mtn_mt.mnt_fstype); + free(mtn->mtn_mt.mnt_mntopts); + free(mtn); + } + avl_destroy(&hdl->libzfs_mnttab_cache); +} + +void +libzfs_mnttab_cache(libzfs_handle_t *hdl, boolean_t enable) +{ + hdl->libzfs_mnttab_enable = enable; +} + +int +libzfs_mnttab_find(libzfs_handle_t *hdl, const char *fsname, + struct mnttab *entry) +{ + mnttab_node_t find; + mnttab_node_t *mtn; + + if (!hdl->libzfs_mnttab_enable) { + struct mnttab srch = { 0 }; + + if (avl_numnodes(&hdl->libzfs_mnttab_cache)) + libzfs_mnttab_fini(hdl); + rewind(hdl->libzfs_mnttab); + srch.mnt_special = (char *)fsname; + srch.mnt_fstype = MNTTYPE_ZFS; + if (getmntany(hdl->libzfs_mnttab, entry, &srch) == 0) + return (0); + else + return (ENOENT); + } + + if (avl_numnodes(&hdl->libzfs_mnttab_cache) == 0) + libzfs_mnttab_update(hdl); + + find.mtn_mt.mnt_special = (char *)fsname; + mtn = avl_find(&hdl->libzfs_mnttab_cache, &find, NULL); + if (mtn) { + *entry = mtn->mtn_mt; + return (0); + } + return (ENOENT); +} + +void +libzfs_mnttab_add(libzfs_handle_t *hdl, const char *special, + const char *mountp, const char *mntopts) +{ + mnttab_node_t *mtn; + + if (avl_numnodes(&hdl->libzfs_mnttab_cache) == 0) + return; + mtn = zfs_alloc(hdl, sizeof (mnttab_node_t)); + mtn->mtn_mt.mnt_special = zfs_strdup(hdl, special); + mtn->mtn_mt.mnt_mountp = zfs_strdup(hdl, mountp); + mtn->mtn_mt.mnt_fstype = zfs_strdup(hdl, MNTTYPE_ZFS); + mtn->mtn_mt.mnt_mntopts = zfs_strdup(hdl, mntopts); + avl_add(&hdl->libzfs_mnttab_cache, mtn); +} + +void +libzfs_mnttab_remove(libzfs_handle_t *hdl, const char *fsname) +{ + mnttab_node_t find; + mnttab_node_t *ret; + + find.mtn_mt.mnt_special = (char *)fsname; + if (ret = avl_find(&hdl->libzfs_mnttab_cache, (void *)&find, NULL)) { + avl_remove(&hdl->libzfs_mnttab_cache, ret); + free(ret->mtn_mt.mnt_special); + free(ret->mtn_mt.mnt_mountp); + free(ret->mtn_mt.mnt_fstype); + free(ret->mtn_mt.mnt_mntopts); + free(ret); + } +} + int zfs_spa_version(zfs_handle_t *zhp, int *spa_version) { @@ -581,23 +754,18 @@ zfs_valid_proplist(libzfs_handle_t *hdl, zfs_type_t type, nvlist_t *nvl, return (NULL); } + /* + * Make sure this property is valid and applies to this type. + */ + elem = NULL; while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) { const char *propname = nvpair_name(elem); - /* - * Make sure this property is valid and applies to this type. 
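As a usage sketch (not part of this change), the libzfs_mnttab_*() cache above lets bulk operations avoid rescanning /etc/mnttab for every dataset; the cache is enabled explicitly and filled lazily on the first lookup. The dataset name below is illustrative:

#include <sys/mnttab.h>
#include <libzfs.h>

/* Illustrative only: check whether "tank/home" is mounted via the cache. */
static boolean_t
is_mounted_cached(libzfs_handle_t *hdl)
{
	struct mnttab entry;
	boolean_t mounted;

	libzfs_mnttab_cache(hdl, B_TRUE);	/* use the AVL cache for lookups */
	mounted = (libzfs_mnttab_find(hdl, "tank/home", &entry) == 0);
	libzfs_mnttab_cache(hdl, B_FALSE);	/* revert to per-call mnttab scans */

	return (mounted);
}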
- */ - if ((prop = zfs_name_to_prop(propname)) == ZPROP_INVAL) { - if (!zfs_prop_user(propname)) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "invalid property '%s'"), propname); - (void) zfs_error(hdl, EZFS_BADPROP, errbuf); - goto error; - } - + prop = zfs_name_to_prop(propname); + if (prop == ZPROP_INVAL && zfs_prop_user(propname)) { /* - * If this is a user property, make sure it's a + * This is a user property: make sure it's a * string, and that it's less than ZAP_MAXNAMELEN. */ if (nvpair_type(elem) != DATA_TYPE_STRING) { @@ -623,6 +791,10 @@ zfs_valid_proplist(libzfs_handle_t *hdl, zfs_type_t type, nvlist_t *nvl, continue; } + /* + * Currently, only user properties can be modified on + * snapshots. + */ if (type == ZFS_TYPE_SNAPSHOT) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "this property can not be modified for snapshots")); @@ -630,6 +802,85 @@ zfs_valid_proplist(libzfs_handle_t *hdl, zfs_type_t type, nvlist_t *nvl, goto error; } + if (prop == ZPROP_INVAL && zfs_prop_userquota(propname)) { + zfs_userquota_prop_t uqtype; + char newpropname[128]; + char domain[128]; + uint64_t rid; + uint64_t valary[3]; + + if (userquota_propname_decode(propname, zoned, + &uqtype, domain, sizeof (domain), &rid) != 0) { + zfs_error_aux(hdl, + dgettext(TEXT_DOMAIN, + "'%s' has an invalid user/group name"), + propname); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + + if (uqtype != ZFS_PROP_USERQUOTA && + uqtype != ZFS_PROP_GROUPQUOTA) { + zfs_error_aux(hdl, + dgettext(TEXT_DOMAIN, "'%s' is readonly"), + propname); + (void) zfs_error(hdl, EZFS_PROPREADONLY, + errbuf); + goto error; + } + + if (nvpair_type(elem) == DATA_TYPE_STRING) { + (void) nvpair_value_string(elem, &strval); + if (strcmp(strval, "none") == 0) { + intval = 0; + } else if (zfs_nicestrtonum(hdl, + strval, &intval) != 0) { + (void) zfs_error(hdl, + EZFS_BADPROP, errbuf); + goto error; + } + } else if (nvpair_type(elem) == + DATA_TYPE_UINT64) { + (void) nvpair_value_uint64(elem, &intval); + if (intval == 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "use 'none' to disable " + "userquota/groupquota")); + goto error; + } + } else { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "'%s' must be a number"), propname); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + + /* + * Encode the prop name as + * userquota@-domain, to make it easy + * for the kernel to decode. + */ + (void) snprintf(newpropname, sizeof (newpropname), + "%s%llx-%s", zfs_userquota_prop_prefixes[uqtype], + (longlong_t)rid, domain); + valary[0] = uqtype; + valary[1] = rid; + valary[2] = intval; + if (nvlist_add_uint64_array(ret, newpropname, + valary, 3) != 0) { + (void) no_memory(hdl); + goto error; + } + continue; + } + + if (prop == ZPROP_INVAL) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "invalid property '%s'"), propname); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + goto error; + } + if (!zfs_prop_valid_for_type(prop, type)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' does not " @@ -700,6 +951,60 @@ zfs_valid_proplist(libzfs_handle_t *hdl, zfs_type_t type, nvlist_t *nvl, break; + case ZFS_PROP_MLSLABEL: + { + /* + * Verify the mlslabel string and convert to + * internal hex label string. + */ + + m_label_t *new_sl; + char *hex = NULL; /* internal label string */ + + /* Default value is already OK. 
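As a usage sketch (not part of this change), the userquota handling above lets a {user,group}quota property pass through the normal property-setting path and be read back with the new accessor; assuming zfs_prop_set() accepts these names as the validation code suggests, and with an illustrative user and size:

#include <libzfs.h>

/* Illustrative only: give user "alice" a 10G quota and read it back. */
static int
set_alice_quota(zfs_handle_t *zhp)
{
	char buf[64];

	if (zfs_prop_set(zhp, "userquota@alice", "10G") != 0)
		return (-1);

	/* Human-readable form; literal == B_FALSE. */
	return (zfs_prop_get_userquota(zhp, "userquota@alice",
	    buf, sizeof (buf), B_FALSE));
}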
*/ + if (strcasecmp(strval, ZFS_MLSLABEL_DEFAULT) == 0) + break; + + /* Verify the label can be converted to binary form */ + if (((new_sl = m_label_alloc(MAC_LABEL)) == NULL) || + (str_to_label(strval, &new_sl, MAC_LABEL, + L_NO_CORRECTION, NULL) == -1)) { + goto badlabel; + } + + /* Now translate to hex internal label string */ + if (label_to_str(new_sl, &hex, M_INTERNAL, + DEF_NAMES) != 0) { + if (hex) + free(hex); + goto badlabel; + } + m_label_free(new_sl); + + /* If string is already in internal form, we're done. */ + if (strcmp(strval, hex) == 0) { + free(hex); + break; + } + + /* Replace the label string with the internal form. */ + (void) nvlist_remove(ret, zfs_prop_to_name(prop), + DATA_TYPE_STRING); + verify(nvlist_add_string(ret, zfs_prop_to_name(prop), + hex) == 0); + free(hex); + + break; + +badlabel: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "invalid mlslabel '%s'"), strval); + (void) zfs_error(hdl, EZFS_BADPROP, errbuf); + m_label_free(new_sl); /* OK if null */ + goto error; + + } + case ZFS_PROP_MOUNTPOINT: { namecheck_err_t why; @@ -769,7 +1074,7 @@ zfs_valid_proplist(libzfs_handle_t *hdl, zfs_type_t type, nvlist_t *nvl, } else if (getzoneid() != GLOBAL_ZONEID) { /* * If zoned property is 'off', this must be in - * a globle zone. If not, something is wrong. + * a global zone. If not, something is wrong. */ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "'%s' cannot be set while dataset " @@ -953,808 +1258,82 @@ zfs_valid_proplist(libzfs_handle_t *hdl, zfs_type_t type, nvlist_t *nvl, return (NULL); } -static int -zfs_get_perm_who(const char *who, zfs_deleg_who_type_t *who_type, - uint64_t *ret_who) +void +zfs_setprop_error(libzfs_handle_t *hdl, zfs_prop_t prop, int err, + char *errbuf) { - struct passwd *pwd; - struct group *grp; - uid_t id; - - if (*who_type == ZFS_DELEG_EVERYONE || *who_type == ZFS_DELEG_CREATE || - *who_type == ZFS_DELEG_NAMED_SET) { - *ret_who = -1; - return (0); - } - if (who == NULL && !(*who_type == ZFS_DELEG_EVERYONE)) - return (EZFS_BADWHO); - - if (*who_type == ZFS_DELEG_WHO_UNKNOWN && - strcmp(who, "everyone") == 0) { - *ret_who = -1; - *who_type = ZFS_DELEG_EVERYONE; - return (0); - } - - pwd = getpwnam(who); - grp = getgrnam(who); - - if ((*who_type == ZFS_DELEG_USER) && pwd) { - *ret_who = pwd->pw_uid; - } else if ((*who_type == ZFS_DELEG_GROUP) && grp) { - *ret_who = grp->gr_gid; - } else if (pwd) { - *ret_who = pwd->pw_uid; - *who_type = ZFS_DELEG_USER; - } else if (grp) { - *ret_who = grp->gr_gid; - *who_type = ZFS_DELEG_GROUP; - } else { - char *end; - - id = strtol(who, &end, 10); - if (errno != 0 || *end != '\0') { - return (EZFS_BADWHO); - } else { - *ret_who = id; - if (*who_type == ZFS_DELEG_WHO_UNKNOWN) - *who_type = ZFS_DELEG_USER; - } - } - - return (0); -} - -static void -zfs_perms_add_to_nvlist(nvlist_t *who_nvp, char *name, nvlist_t *perms_nvp) -{ - if (perms_nvp != NULL) { - verify(nvlist_add_nvlist(who_nvp, - name, perms_nvp) == 0); - } else { - verify(nvlist_add_boolean(who_nvp, name) == 0); - } -} - -static void -helper(zfs_deleg_who_type_t who_type, uint64_t whoid, char *whostr, - zfs_deleg_inherit_t inherit, nvlist_t *who_nvp, nvlist_t *perms_nvp, - nvlist_t *sets_nvp) -{ - boolean_t do_perms, do_sets; - char name[ZFS_MAX_DELEG_NAME]; - - do_perms = (nvlist_next_nvpair(perms_nvp, NULL) != NULL); - do_sets = (nvlist_next_nvpair(sets_nvp, NULL) != NULL); - - if (!do_perms && !do_sets) - do_perms = do_sets = B_TRUE; - - if (do_perms) { - zfs_deleg_whokey(name, who_type, inherit, - (who_type == ZFS_DELEG_NAMED_SET) ? 
- whostr : (void *)&whoid); - zfs_perms_add_to_nvlist(who_nvp, name, perms_nvp); - } - if (do_sets) { - zfs_deleg_whokey(name, toupper(who_type), inherit, - (who_type == ZFS_DELEG_NAMED_SET) ? - whostr : (void *)&whoid); - zfs_perms_add_to_nvlist(who_nvp, name, sets_nvp); - } -} - -static void -zfs_perms_add_who_nvlist(nvlist_t *who_nvp, uint64_t whoid, void *whostr, - nvlist_t *perms_nvp, nvlist_t *sets_nvp, - zfs_deleg_who_type_t who_type, zfs_deleg_inherit_t inherit) -{ - if (who_type == ZFS_DELEG_NAMED_SET || who_type == ZFS_DELEG_CREATE) { - helper(who_type, whoid, whostr, 0, - who_nvp, perms_nvp, sets_nvp); - } else { - if (inherit & ZFS_DELEG_PERM_LOCAL) { - helper(who_type, whoid, whostr, ZFS_DELEG_LOCAL, - who_nvp, perms_nvp, sets_nvp); - } - if (inherit & ZFS_DELEG_PERM_DESCENDENT) { - helper(who_type, whoid, whostr, ZFS_DELEG_DESCENDENT, - who_nvp, perms_nvp, sets_nvp); - } - } -} - -/* - * Construct nvlist to pass down to kernel for setting/removing permissions. - * - * The nvlist is constructed as a series of nvpairs with an optional embedded - * nvlist of permissions to remove or set. The topmost nvpairs are the actual - * base attribute named stored in the dsl. - * Arguments: - * - * whostr: is a comma separated list of users, groups, or a single set name. - * whostr may be null for everyone or create perms. - * who_type: is the type of entry in whostr. Typically this will be - * ZFS_DELEG_WHO_UNKNOWN. - * perms: common separated list of permissions. May be null if user - * is requested to remove permissions by who. - * inherit: Specifies the inheritance of the permissions. Will be either - * ZFS_DELEG_PERM_LOCAL and/or ZFS_DELEG_PERM_DESCENDENT. - * nvp The constructed nvlist to pass to zfs_perm_set(). - * The output nvp will look something like this. - * ul$1234 -> {create ; destroy } - * Ul$1234 -> { @myset } - * s-$@myset - { snapshot; checksum; compression } - */ -int -zfs_build_perms(zfs_handle_t *zhp, char *whostr, char *perms, - zfs_deleg_who_type_t who_type, zfs_deleg_inherit_t inherit, nvlist_t **nvp) -{ - nvlist_t *who_nvp; - nvlist_t *perms_nvp = NULL; - nvlist_t *sets_nvp = NULL; - char errbuf[1024]; - char *who_tok, *perm; - int error; - - *nvp = NULL; - - if (perms) { - if ((error = nvlist_alloc(&perms_nvp, - NV_UNIQUE_NAME, 0)) != 0) { - return (1); - } - if ((error = nvlist_alloc(&sets_nvp, - NV_UNIQUE_NAME, 0)) != 0) { - nvlist_free(perms_nvp); - return (1); - } - } - - if ((error = nvlist_alloc(&who_nvp, NV_UNIQUE_NAME, 0)) != 0) { - if (perms_nvp) - nvlist_free(perms_nvp); - if (sets_nvp) - nvlist_free(sets_nvp); - return (1); - } - - if (who_type == ZFS_DELEG_NAMED_SET) { - namecheck_err_t why; - char what; - - if ((error = permset_namecheck(whostr, &why, &what)) != 0) { - nvlist_free(who_nvp); - if (perms_nvp) - nvlist_free(perms_nvp); - if (sets_nvp) - nvlist_free(sets_nvp); - - switch (why) { - case NAME_ERR_NO_AT: - zfs_error_aux(zhp->zfs_hdl, - dgettext(TEXT_DOMAIN, - "set definition must begin with an '@' " - "character")); - } - return (zfs_error(zhp->zfs_hdl, - EZFS_BADPERMSET, whostr)); - } - } - - /* - * Build up nvlist(s) of permissions. Two nvlists are maintained. - * The first nvlist perms_nvp will have normal permissions and the - * other sets_nvp will have only permssion set names in it. 
- */ - for (perm = strtok(perms, ","); perm; perm = strtok(NULL, ",")) { - const char *perm_canonical = zfs_deleg_canonicalize_perm(perm); - - if (perm_canonical) { - verify(nvlist_add_boolean(perms_nvp, - perm_canonical) == 0); - } else if (perm[0] == '@') { - verify(nvlist_add_boolean(sets_nvp, perm) == 0); - } else { - nvlist_free(who_nvp); - nvlist_free(perms_nvp); - nvlist_free(sets_nvp); - return (zfs_error(zhp->zfs_hdl, EZFS_BADPERM, perm)); - } - } - - if (whostr && who_type != ZFS_DELEG_CREATE) { - who_tok = strtok(whostr, ","); - if (who_tok == NULL) { - nvlist_free(who_nvp); - if (perms_nvp) - nvlist_free(perms_nvp); - if (sets_nvp) - nvlist_free(sets_nvp); - (void) snprintf(errbuf, sizeof (errbuf), - dgettext(TEXT_DOMAIN, "Who string is NULL"), - whostr); - return (zfs_error(zhp->zfs_hdl, EZFS_BADWHO, errbuf)); - } - } - - /* - * Now create the nvlist(s) - */ - do { - uint64_t who_id; - - error = zfs_get_perm_who(who_tok, &who_type, - &who_id); - if (error) { - nvlist_free(who_nvp); - if (perms_nvp) - nvlist_free(perms_nvp); - if (sets_nvp) - nvlist_free(sets_nvp); - (void) snprintf(errbuf, sizeof (errbuf), - dgettext(TEXT_DOMAIN, - "Unable to determine uid/gid for " - "%s "), who_tok); - return (zfs_error(zhp->zfs_hdl, EZFS_BADWHO, errbuf)); - } + switch (err) { + case ENOSPC: /* - * add entries for both local and descendent when required + * For quotas and reservations, ENOSPC indicates + * something different; setting a quota or reservation + * doesn't use any disk space. */ - zfs_perms_add_who_nvlist(who_nvp, who_id, who_tok, - perms_nvp, sets_nvp, who_type, inherit); - - } while (who_tok = strtok(NULL, ",")); - *nvp = who_nvp; - return (0); -} - -static int -zfs_perm_set_common(zfs_handle_t *zhp, nvlist_t *nvp, boolean_t unset) -{ - zfs_cmd_t zc = { 0 }; - int error; - char errbuf[1024]; - - (void) snprintf(errbuf, sizeof (errbuf), - dgettext(TEXT_DOMAIN, "Cannot update 'allows' for '%s'"), - zhp->zfs_name); - - if (zcmd_write_src_nvlist(zhp->zfs_hdl, &zc, nvp)) - return (-1); - - (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); - zc.zc_perm_action = unset; - - error = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_SET_FSACL, &zc); - if (error && errno == ENOTSUP) { - (void) snprintf(errbuf, sizeof (errbuf), - gettext("Pool must be upgraded to use 'allow/unallow'")); - zcmd_free_nvlists(&zc); - return (zfs_error(zhp->zfs_hdl, EZFS_BADVERSION, errbuf)); - } else if (error) { - return (zfs_standard_error(zhp->zfs_hdl, errno, errbuf)); - } - zcmd_free_nvlists(&zc); - - return (error); -} - -int -zfs_perm_set(zfs_handle_t *zhp, nvlist_t *nvp) -{ - return (zfs_perm_set_common(zhp, nvp, B_FALSE)); -} - -int -zfs_perm_remove(zfs_handle_t *zhp, nvlist_t *perms) -{ - return (zfs_perm_set_common(zhp, perms, B_TRUE)); -} - -static int -perm_compare(const void *arg1, const void *arg2) -{ - const zfs_perm_node_t *node1 = arg1; - const zfs_perm_node_t *node2 = arg2; - int ret; - - ret = strcmp(node1->z_pname, node2->z_pname); - - if (ret > 0) - return (1); - if (ret < 0) - return (-1); - else - return (0); -} - -static void -zfs_destroy_perm_tree(avl_tree_t *tree) -{ - zfs_perm_node_t *permnode; - void *cookie = NULL; - - while ((permnode = avl_destroy_nodes(tree, &cookie)) != NULL) - free(permnode); - avl_destroy(tree); -} - -static void -zfs_destroy_tree(avl_tree_t *tree) -{ - zfs_allow_node_t *allownode; - void *cookie = NULL; - - while ((allownode = avl_destroy_nodes(tree, &cookie)) != NULL) { - zfs_destroy_perm_tree(&allownode->z_localdescend); - 
zfs_destroy_perm_tree(&allownode->z_local); - zfs_destroy_perm_tree(&allownode->z_descend); - free(allownode); - } - avl_destroy(tree); -} - -void -zfs_free_allows(zfs_allow_t *allow) -{ - zfs_allow_t *allownext; - zfs_allow_t *freeallow; - - allownext = allow; - while (allownext) { - zfs_destroy_tree(&allownext->z_sets); - zfs_destroy_tree(&allownext->z_crperms); - zfs_destroy_tree(&allownext->z_user); - zfs_destroy_tree(&allownext->z_group); - zfs_destroy_tree(&allownext->z_everyone); - freeallow = allownext; - allownext = allownext->z_next; - free(freeallow); - } -} - -static zfs_allow_t * -zfs_alloc_perm_tree(zfs_handle_t *zhp, zfs_allow_t *prev, char *setpoint) -{ - zfs_allow_t *ptree; - - if ((ptree = zfs_alloc(zhp->zfs_hdl, - sizeof (zfs_allow_t))) == NULL) { - return (NULL); - } - - (void) strlcpy(ptree->z_setpoint, setpoint, sizeof (ptree->z_setpoint)); - avl_create(&ptree->z_sets, - perm_compare, sizeof (zfs_allow_node_t), - offsetof(zfs_allow_node_t, z_node)); - avl_create(&ptree->z_crperms, - perm_compare, sizeof (zfs_allow_node_t), - offsetof(zfs_allow_node_t, z_node)); - avl_create(&ptree->z_user, - perm_compare, sizeof (zfs_allow_node_t), - offsetof(zfs_allow_node_t, z_node)); - avl_create(&ptree->z_group, - perm_compare, sizeof (zfs_allow_node_t), - offsetof(zfs_allow_node_t, z_node)); - avl_create(&ptree->z_everyone, - perm_compare, sizeof (zfs_allow_node_t), - offsetof(zfs_allow_node_t, z_node)); - - if (prev) - prev->z_next = ptree; - ptree->z_next = NULL; - return (ptree); -} + switch (prop) { + case ZFS_PROP_QUOTA: + case ZFS_PROP_REFQUOTA: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "size is less than current used or " + "reserved space")); + (void) zfs_error(hdl, EZFS_PROPSPACE, errbuf); + break; -/* - * Add permissions to the appropriate AVL permission tree. - * The appropriate tree may not be the requested tree. - * For example if ld indicates a local permission, but - * same permission also exists as a descendent permission - * then the permission will be removed from the descendent - * tree and add the the local+descendent tree. - */ -static int -zfs_coalesce_perm(zfs_handle_t *zhp, zfs_allow_node_t *allownode, - char *perm, char ld) -{ - zfs_perm_node_t pnode, *permnode, *permnode2; - zfs_perm_node_t *newnode; - avl_index_t where, where2; - avl_tree_t *tree, *altree; - - (void) strlcpy(pnode.z_pname, perm, sizeof (pnode.z_pname)); - - if (ld == ZFS_DELEG_NA) { - tree = &allownode->z_localdescend; - altree = &allownode->z_descend; - } else if (ld == ZFS_DELEG_LOCAL) { - tree = &allownode->z_local; - altree = &allownode->z_descend; - } else { - tree = &allownode->z_descend; - altree = &allownode->z_local; - } - permnode = avl_find(tree, &pnode, &where); - permnode2 = avl_find(altree, &pnode, &where2); - - if (permnode2) { - avl_remove(altree, permnode2); - free(permnode2); - if (permnode == NULL) { - tree = &allownode->z_localdescend; - } - } + case ZFS_PROP_RESERVATION: + case ZFS_PROP_REFRESERVATION: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "size is greater than available space")); + (void) zfs_error(hdl, EZFS_PROPSPACE, errbuf); + break; - /* - * Now insert new permission in either requested location - * local/descendent or into ld when perm will exist in both. 
- */ - if (permnode == NULL) { - if ((newnode = zfs_alloc(zhp->zfs_hdl, - sizeof (zfs_perm_node_t))) == NULL) { - return (-1); + default: + (void) zfs_standard_error(hdl, err, errbuf); + break; } - *newnode = pnode; - avl_add(tree, newnode); - } - return (0); -} + break; -/* - * Uggh, this is going to be a bit complicated. - * we have an nvlist coming out of the kernel that - * will indicate where the permission is set and then - * it will contain allow of the various "who's", and what - * their permissions are. To further complicate this - * we will then have to coalesce the local,descendent - * and local+descendent permissions where appropriate. - * The kernel only knows about a permission as being local - * or descendent, but not both. - * - * In order to make this easier for zfs_main to deal with - * a series of AVL trees will be used to maintain - * all of this, primarily for sorting purposes as well - * as the ability to quickly locate a specific entry. - * - * What we end up with are tree's for sets, create perms, - * user, groups and everyone. With each of those trees - * we have subtrees for local, descendent and local+descendent - * permissions. - */ -int -zfs_perm_get(zfs_handle_t *zhp, zfs_allow_t **zfs_perms) -{ - zfs_cmd_t zc = { 0 }; - int error; - nvlist_t *nvlist; - nvlist_t *permnv, *sourcenv; - nvpair_t *who_pair, *source_pair; - nvpair_t *perm_pair; - char errbuf[1024]; - zfs_allow_t *zallowp, *newallowp; - char ld; - char *nvpname; - uid_t uid; - gid_t gid; - avl_tree_t *tree; - avl_index_t where; + case EBUSY: + (void) zfs_standard_error(hdl, EBUSY, errbuf); + break; - (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + case EROFS: + (void) zfs_error(hdl, EZFS_DSREADONLY, errbuf); + break; - if (zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0) != 0) - return (-1); + case ENOTSUP: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "pool and or dataset must be upgraded to set this " + "property or value")); + (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); + break; - while (ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_GET_FSACL, &zc) != 0) { - if (errno == ENOMEM) { - if (zcmd_expand_dst_nvlist(zhp->zfs_hdl, &zc) != 0) { - zcmd_free_nvlists(&zc); - return (-1); - } - } else if (errno == ENOTSUP) { - zcmd_free_nvlists(&zc); - (void) snprintf(errbuf, sizeof (errbuf), - gettext("Pool must be upgraded to use 'allow'")); - return (zfs_error(zhp->zfs_hdl, - EZFS_BADVERSION, errbuf)); + case ERANGE: + if (prop == ZFS_PROP_COMPRESSION) { + (void) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "property setting is not allowed on " + "bootable datasets")); + (void) zfs_error(hdl, EZFS_NOTSUP, errbuf); } else { - zcmd_free_nvlists(&zc); - return (-1); + (void) zfs_standard_error(hdl, err, errbuf); } - } - - if (zcmd_read_dst_nvlist(zhp->zfs_hdl, &zc, &nvlist) != 0) { - zcmd_free_nvlists(&zc); - return (-1); - } - - zcmd_free_nvlists(&zc); - - source_pair = nvlist_next_nvpair(nvlist, NULL); - - if (source_pair == NULL) { - *zfs_perms = NULL; - return (0); - } - - *zfs_perms = zfs_alloc_perm_tree(zhp, NULL, nvpair_name(source_pair)); - if (*zfs_perms == NULL) { - return (0); - } - - zallowp = *zfs_perms; - - for (;;) { - struct passwd *pwd; - struct group *grp; - zfs_allow_node_t *allownode; - zfs_allow_node_t findallownode; - zfs_allow_node_t *newallownode; - - (void) strlcpy(zallowp->z_setpoint, - nvpair_name(source_pair), - sizeof (zallowp->z_setpoint)); - - if ((error = nvpair_value_nvlist(source_pair, &sourcenv)) != 0) - goto abort; + break; + case EOVERFLOW: /* - * Make sure nvlist 
is composed correctly + * This platform can't address a volume this big. */ - if (zfs_deleg_verify_nvlist(sourcenv)) { - goto abort; - } - - who_pair = nvlist_next_nvpair(sourcenv, NULL); - if (who_pair == NULL) { - goto abort; - } - - do { - error = nvpair_value_nvlist(who_pair, &permnv); - if (error) { - goto abort; - } - - /* - * First build up the key to use - * for looking up in the various - * who trees. - */ - ld = nvpair_name(who_pair)[1]; - nvpname = nvpair_name(who_pair); - switch (nvpair_name(who_pair)[0]) { - case ZFS_DELEG_USER: - case ZFS_DELEG_USER_SETS: - tree = &zallowp->z_user; - uid = atol(&nvpname[3]); - pwd = getpwuid(uid); - (void) snprintf(findallownode.z_key, - sizeof (findallownode.z_key), "user %s", - (pwd) ? pwd->pw_name : - &nvpair_name(who_pair)[3]); - break; - case ZFS_DELEG_GROUP: - case ZFS_DELEG_GROUP_SETS: - tree = &zallowp->z_group; - gid = atol(&nvpname[3]); - grp = getgrgid(gid); - (void) snprintf(findallownode.z_key, - sizeof (findallownode.z_key), "group %s", - (grp) ? grp->gr_name : - &nvpair_name(who_pair)[3]); - break; - case ZFS_DELEG_CREATE: - case ZFS_DELEG_CREATE_SETS: - tree = &zallowp->z_crperms; - (void) strlcpy(findallownode.z_key, "", - sizeof (findallownode.z_key)); - break; - case ZFS_DELEG_EVERYONE: - case ZFS_DELEG_EVERYONE_SETS: - (void) snprintf(findallownode.z_key, - sizeof (findallownode.z_key), "everyone"); - tree = &zallowp->z_everyone; - break; - case ZFS_DELEG_NAMED_SET: - case ZFS_DELEG_NAMED_SET_SETS: - (void) snprintf(findallownode.z_key, - sizeof (findallownode.z_key), "%s", - &nvpair_name(who_pair)[3]); - tree = &zallowp->z_sets; - break; - } - - /* - * Place who in tree - */ - allownode = avl_find(tree, &findallownode, &where); - if (allownode == NULL) { - if ((newallownode = zfs_alloc(zhp->zfs_hdl, - sizeof (zfs_allow_node_t))) == NULL) { - goto abort; - } - avl_create(&newallownode->z_localdescend, - perm_compare, - sizeof (zfs_perm_node_t), - offsetof(zfs_perm_node_t, z_node)); - avl_create(&newallownode->z_local, - perm_compare, - sizeof (zfs_perm_node_t), - offsetof(zfs_perm_node_t, z_node)); - avl_create(&newallownode->z_descend, - perm_compare, - sizeof (zfs_perm_node_t), - offsetof(zfs_perm_node_t, z_node)); - (void) strlcpy(newallownode->z_key, - findallownode.z_key, - sizeof (findallownode.z_key)); - avl_insert(tree, newallownode, where); - allownode = newallownode; - } - - /* - * Now iterate over the permissions and - * place them in the appropriate local, - * descendent or local+descendent tree. - * - * The permissions are added to the tree - * via zfs_coalesce_perm(). 
- */ - perm_pair = nvlist_next_nvpair(permnv, NULL); - if (perm_pair == NULL) - goto abort; - do { - if (zfs_coalesce_perm(zhp, allownode, - nvpair_name(perm_pair), ld) != 0) - goto abort; - } while (perm_pair = nvlist_next_nvpair(permnv, - perm_pair)); - } while (who_pair = nvlist_next_nvpair(sourcenv, who_pair)); - - source_pair = nvlist_next_nvpair(nvlist, source_pair); - if (source_pair == NULL) +#ifdef _ILP32 + if (prop == ZFS_PROP_VOLSIZE) { + (void) zfs_error(hdl, EZFS_VOLTOOBIG, errbuf); break; - - /* - * allocate another node from the link list of - * zfs_allow_t structures - */ - newallowp = zfs_alloc_perm_tree(zhp, zallowp, - nvpair_name(source_pair)); - if (newallowp == NULL) { - goto abort; } - zallowp = newallowp; - } - nvlist_free(nvlist); - return (0); -abort: - zfs_free_allows(*zfs_perms); - nvlist_free(nvlist); - return (-1); -} - -static char * -zfs_deleg_perm_note(zfs_deleg_note_t note) -{ - /* - * Don't put newlines on end of lines - */ - switch (note) { - case ZFS_DELEG_NOTE_CREATE: - return (dgettext(TEXT_DOMAIN, - "Must also have the 'mount' ability")); - case ZFS_DELEG_NOTE_DESTROY: - return (dgettext(TEXT_DOMAIN, - "Must also have the 'mount' ability")); - case ZFS_DELEG_NOTE_SNAPSHOT: - return (dgettext(TEXT_DOMAIN, - "Must also have the 'mount' ability")); - case ZFS_DELEG_NOTE_ROLLBACK: - return (dgettext(TEXT_DOMAIN, - "Must also have the 'mount' ability")); - case ZFS_DELEG_NOTE_CLONE: - return (dgettext(TEXT_DOMAIN, "Must also have the 'create' " - "ability and 'mount'\n" - "\t\t\t\tability in the origin file system")); - case ZFS_DELEG_NOTE_PROMOTE: - return (dgettext(TEXT_DOMAIN, "Must also have the 'mount'\n" - "\t\t\t\tand 'promote' ability in the origin file system")); - case ZFS_DELEG_NOTE_RENAME: - return (dgettext(TEXT_DOMAIN, "Must also have the 'mount' " - "and 'create' \n\t\t\t\tability in the new parent")); - case ZFS_DELEG_NOTE_RECEIVE: - return (dgettext(TEXT_DOMAIN, "Must also have the 'mount'" - " and 'create' ability")); - case ZFS_DELEG_NOTE_USERPROP: - return (dgettext(TEXT_DOMAIN, - "Allows changing any user property")); - case ZFS_DELEG_NOTE_ALLOW: - return (dgettext(TEXT_DOMAIN, - "Must also have the permission that is being\n" - "\t\t\t\tallowed")); - case ZFS_DELEG_NOTE_MOUNT: - return (dgettext(TEXT_DOMAIN, - "Allows mount/umount of ZFS datasets")); - case ZFS_DELEG_NOTE_SHARE: - return (dgettext(TEXT_DOMAIN, - "Allows sharing file systems over NFS or SMB\n" - "\t\t\t\tprotocols")); - case ZFS_DELEG_NOTE_NONE: +#endif + /* FALLTHROUGH */ default: - return (dgettext(TEXT_DOMAIN, "")); + (void) zfs_standard_error(hdl, err, errbuf); } } -typedef enum { - ZFS_DELEG_SUBCOMMAND, - ZFS_DELEG_PROP, - ZFS_DELEG_OTHER -} zfs_deleg_perm_type_t; - -/* - * is the permission a subcommand or other? 
- */ -zfs_deleg_perm_type_t -zfs_deleg_perm_type(const char *perm) -{ - if (strcmp(perm, "userprop") == 0) - return (ZFS_DELEG_OTHER); - else - return (ZFS_DELEG_SUBCOMMAND); -} - -static char * -zfs_deleg_perm_type_str(zfs_deleg_perm_type_t type) -{ - switch (type) { - case ZFS_DELEG_SUBCOMMAND: - return (dgettext(TEXT_DOMAIN, "subcommand")); - case ZFS_DELEG_PROP: - return (dgettext(TEXT_DOMAIN, "property")); - case ZFS_DELEG_OTHER: - return (dgettext(TEXT_DOMAIN, "other")); - } - return (""); -} - -/*ARGSUSED*/ -static int -zfs_deleg_prop_cb(int prop, void *cb) -{ - if (zfs_prop_delegatable(prop)) - (void) fprintf(stderr, "%-15s %-15s\n", zfs_prop_to_name(prop), - zfs_deleg_perm_type_str(ZFS_DELEG_PROP)); - - return (ZPROP_CONT); -} - -void -zfs_deleg_permissions(void) -{ - int i; - - (void) fprintf(stderr, "\n%-15s %-15s\t%s\n\n", "NAME", - "TYPE", "NOTES"); - - /* - * First print out the subcommands - */ - for (i = 0; zfs_deleg_perm_tab[i].z_perm != NULL; i++) { - (void) fprintf(stderr, "%-15s %-15s\t%s\n", - zfs_deleg_perm_tab[i].z_perm, - zfs_deleg_perm_type_str( - zfs_deleg_perm_type(zfs_deleg_perm_tab[i].z_perm)), - zfs_deleg_perm_note(zfs_deleg_perm_tab[i].z_note)); - } - - (void) zprop_iter(zfs_deleg_prop_cb, NULL, B_FALSE, B_TRUE, - ZFS_TYPE_DATASET|ZFS_TYPE_VOLUME); -} - /* * Given a property name and value, set the property for the given dataset. */ @@ -1821,80 +1400,9 @@ zfs_prop_set(zfs_handle_t *zhp, const char *propname, const char *propval) goto error; ret = zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc); - if (ret != 0) { - switch (errno) { - - case ENOSPC: - /* - * For quotas and reservations, ENOSPC indicates - * something different; setting a quota or reservation - * doesn't use any disk space. - */ - switch (prop) { - case ZFS_PROP_QUOTA: - case ZFS_PROP_REFQUOTA: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "size is less than current used or " - "reserved space")); - (void) zfs_error(hdl, EZFS_PROPSPACE, errbuf); - break; - - case ZFS_PROP_RESERVATION: - case ZFS_PROP_REFRESERVATION: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "size is greater than available space")); - (void) zfs_error(hdl, EZFS_PROPSPACE, errbuf); - break; - - default: - (void) zfs_standard_error(hdl, errno, errbuf); - break; - } - break; - - case EBUSY: - if (prop == ZFS_PROP_VOLBLOCKSIZE) - (void) zfs_error(hdl, EZFS_VOLHASDATA, errbuf); - else - (void) zfs_standard_error(hdl, EBUSY, errbuf); - break; - - case EROFS: - (void) zfs_error(hdl, EZFS_DSREADONLY, errbuf); - break; - - case ENOTSUP: - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "pool and or dataset must be upgraded to set this " - "property or value")); - (void) zfs_error(hdl, EZFS_BADVERSION, errbuf); - break; - case ERANGE: - if (prop == ZFS_PROP_COMPRESSION) { - (void) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "property setting is not allowed on " - "bootable datasets")); - (void) zfs_error(hdl, EZFS_NOTSUP, errbuf); - } else { - (void) zfs_standard_error(hdl, errno, errbuf); - } - break; - - case EOVERFLOW: - /* - * This platform can't address a volume this big. 
- */ -#ifdef _ILP32 - if (prop == ZFS_PROP_VOLSIZE) { - (void) zfs_error(hdl, EZFS_VOLTOOBIG, errbuf); - break; - } -#endif - /* FALLTHROUGH */ - default: - (void) zfs_standard_error(hdl, errno, errbuf); - } + if (ret != 0) { + zfs_setprop_error(hdl, prop, errno, errbuf); } else { if (do_prefix) ret = changelist_postfix(cl); @@ -1916,10 +1424,11 @@ zfs_prop_set(zfs_handle_t *zhp, const char *propname, const char *propval) } /* - * Given a property, inherit the value from the parent dataset. + * Given a property, inherit the value from the parent dataset, or if received + * is TRUE, revert to the received value, if any. */ int -zfs_prop_inherit(zfs_handle_t *zhp, const char *propname) +zfs_prop_inherit(zfs_handle_t *zhp, const char *propname, boolean_t received) { zfs_cmd_t zc = { 0 }; int ret; @@ -1931,6 +1440,7 @@ zfs_prop_inherit(zfs_handle_t *zhp, const char *propname) (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot inherit %s for '%s'"), propname, zhp->zfs_name); + zc.zc_cookie = received; if ((prop = zfs_name_to_prop(propname)) == ZPROP_INVAL) { /* * For user properties, the amount of work we have to do is very @@ -1957,7 +1467,7 @@ zfs_prop_inherit(zfs_handle_t *zhp, const char *propname) if (zfs_prop_readonly(prop)) return (zfs_error(hdl, EZFS_PROPREADONLY, errbuf)); - if (!zfs_prop_inheritable(prop)) + if (!zfs_prop_inheritable(prop) && !received) return (zfs_error(hdl, EZFS_PROPNONINHERIT, errbuf)); /* @@ -2031,6 +1541,8 @@ getprop_uint64(zfs_handle_t *zhp, zfs_prop_t prop, char **source) verify(nvlist_lookup_uint64(nv, ZPROP_VALUE, &value) == 0); (void) nvlist_lookup_string(nv, ZPROP_SOURCE, source); } else { + verify(!zhp->zfs_props_table || + zhp->zfs_props_table[prop] == B_TRUE); value = zfs_prop_default_numeric(prop); *source = ""; } @@ -2050,6 +1562,8 @@ getprop_string(zfs_handle_t *zhp, zfs_prop_t prop, char **source) verify(nvlist_lookup_string(nv, ZPROP_VALUE, &value) == 0); (void) nvlist_lookup_string(nv, ZPROP_SOURCE, source); } else { + verify(!zhp->zfs_props_table || + zhp->zfs_props_table[prop] == B_TRUE); if ((value = (char *)zfs_prop_default_string(prop)) == NULL) value = ""; *source = ""; @@ -2058,6 +1572,26 @@ getprop_string(zfs_handle_t *zhp, zfs_prop_t prop, char **source) return (value); } +static boolean_t +zfs_is_recvd_props_mode(zfs_handle_t *zhp) +{ + return (zhp->zfs_props == zhp->zfs_recvd_props); +} + +static void +zfs_set_recvd_props_mode(zfs_handle_t *zhp, uint64_t *cookie) +{ + *cookie = (uint64_t)(uintptr_t)zhp->zfs_props; + zhp->zfs_props = zhp->zfs_recvd_props; +} + +static void +zfs_unset_recvd_props_mode(zfs_handle_t *zhp, uint64_t *cookie) +{ + zhp->zfs_props = (nvlist_t *)(uintptr_t)*cookie; + *cookie = 0; +} + /* * Internal function for getting a numeric property. Both zfs_prop_get() and * zfs_prop_get_int() are built using this interface. 
@@ -2076,6 +1610,7 @@ get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zprop_source_t *src, struct mnttab mnt; char *mntopt_on = NULL; char *mntopt_off = NULL; + boolean_t received = zfs_is_recvd_props_mode(zhp); *source = NULL; @@ -2123,15 +1658,11 @@ get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zprop_source_t *src, */ if (!zhp->zfs_mntcheck && (mntopt_on != NULL || prop == ZFS_PROP_MOUNTED)) { - struct mnttab entry, search = { 0 }; - FILE *mnttab = zhp->zfs_hdl->libzfs_mnttab; + libzfs_handle_t *hdl = zhp->zfs_hdl; + struct mnttab entry; - search.mnt_special = (char *)zhp->zfs_name; - search.mnt_fstype = MNTTYPE_ZFS; - rewind(mnttab); - - if (getmntany(mnttab, &entry, &search) == 0) { - zhp->zfs_mntopts = zfs_strdup(zhp->zfs_hdl, + if (libzfs_mnttab_find(hdl, zhp->zfs_name, &entry) == 0) { + zhp->zfs_mntopts = zfs_strdup(hdl, entry.mnt_mntopts); if (zhp->zfs_mntopts == NULL) return (-1); @@ -2155,6 +1686,9 @@ get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zprop_source_t *src, case ZFS_PROP_NBMAND: *val = getprop_uint64(zhp, prop, source); + if (received) + break; + if (hasmntopt(&mnt, mntopt_on) && !*val) { *val = B_TRUE; if (src) @@ -2167,22 +1701,17 @@ get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zprop_source_t *src, break; case ZFS_PROP_CANMOUNT: - *val = getprop_uint64(zhp, prop, source); - if (*val != ZFS_CANMOUNT_ON) - *source = zhp->zfs_name; - else - *source = ""; /* default */ - break; - + case ZFS_PROP_VOLSIZE: case ZFS_PROP_QUOTA: case ZFS_PROP_REFQUOTA: case ZFS_PROP_RESERVATION: case ZFS_PROP_REFRESERVATION: *val = getprop_uint64(zhp, prop, source); - if (*val == 0) - *source = ""; /* default */ - else + + if (*source == NULL) { + /* not default, must be local */ *source = zhp->zfs_name; + } break; case ZFS_PROP_MOUNTED: @@ -2203,21 +1732,13 @@ get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zprop_source_t *src, (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_OBJSET_ZPLPROPS, &zc)) { zcmd_free_nvlists(&zc); - zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, - "unable to get %s property"), - zfs_prop_to_name(prop)); - return (zfs_error(zhp->zfs_hdl, EZFS_BADVERSION, - dgettext(TEXT_DOMAIN, "internal error"))); + return (-1); } if (zcmd_read_dst_nvlist(zhp->zfs_hdl, &zc, &zplprops) != 0 || nvlist_lookup_uint64(zplprops, zfs_prop_to_name(prop), val) != 0) { zcmd_free_nvlists(&zc); - zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, - "unable to get %s property"), - zfs_prop_to_name(prop)); - return (zfs_error(zhp->zfs_hdl, EZFS_NOMEM, - dgettext(TEXT_DOMAIN, "internal error"))); + return (-1); } if (zplprops) nvlist_free(zplprops); @@ -2230,13 +1751,13 @@ get_numeric_property(zfs_handle_t *zhp, zfs_prop_t prop, zprop_source_t *src, case PROP_TYPE_INDEX: *val = getprop_uint64(zhp, prop, source); /* - * If we tried to use a defalut value for a + * If we tried to use a default value for a * readonly property, it means that it was not - * present; return an error. + * present. 
*/ if (zfs_prop_readonly(prop) && - *source && (*source)[0] == '\0') { - return (-1); + *source != NULL && (*source)[0] == '\0') { + *source = NULL; } break; @@ -2266,6 +1787,8 @@ get_source(zfs_handle_t *zhp, zprop_source_t *srctype, char *source, *srctype = ZPROP_SRC_NONE; } else if (source[0] == '\0') { *srctype = ZPROP_SRC_DEFAULT; + } else if (strstr(source, ZPROP_SOURCE_VAL_RECVD) != NULL) { + *srctype = ZPROP_SRC_RECEIVED; } else { if (strcmp(source, zhp->zfs_name) == 0) { *srctype = ZPROP_SRC_LOCAL; @@ -2277,6 +1800,43 @@ get_source(zfs_handle_t *zhp, zprop_source_t *srctype, char *source, } +int +zfs_prop_get_recvd(zfs_handle_t *zhp, const char *propname, char *propbuf, + size_t proplen, boolean_t literal) +{ + zfs_prop_t prop; + int err = 0; + + if (zhp->zfs_recvd_props == NULL) + if (get_recvd_props_ioctl(zhp) != 0) + return (-1); + + prop = zfs_name_to_prop(propname); + + if (prop != ZPROP_INVAL) { + uint64_t cookie; + if (!nvlist_exists(zhp->zfs_recvd_props, propname)) + return (-1); + zfs_set_recvd_props_mode(zhp, &cookie); + err = zfs_prop_get(zhp, prop, propbuf, proplen, + NULL, NULL, 0, literal); + zfs_unset_recvd_props_mode(zhp, &cookie); + } else if (zfs_prop_userquota(propname)) { + return (-1); + } else { + nvlist_t *propval; + char *recvdval; + if (nvlist_lookup_nvlist(zhp->zfs_recvd_props, + propname, &propval) != 0) + return (-1); + verify(nvlist_lookup_string(propval, ZPROP_VALUE, + &recvdval) == 0); + (void) strlcpy(propbuf, recvdval, proplen); + } + + return (err == 0 ? 0 : -1); +} + /* * Retrieve a property from the given object. If 'literal' is specified, then * numbers are left as exact values. Otherwise, numbers are converted to a @@ -2292,6 +1852,7 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen, uint64_t val; char *str; const char *strval; + boolean_t received = zfs_is_recvd_props_mode(zhp); /* * Check to see if this property applies to our object @@ -2299,6 +1860,9 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen, if (!zfs_prop_valid_for_type(prop, zhp->zfs_type)) return (-1); + if (received && zfs_prop_readonly(prop)) + return (-1); + if (src) *src = ZPROP_SRC_NONE; @@ -2338,10 +1902,22 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen, if (str[0] == '/') { char buf[MAXPATHLEN]; char *root = buf; - const char *relpath = zhp->zfs_name + strlen(source); + const char *relpath; - if (relpath[0] == '/') - relpath++; + /* + * If we inherit the mountpoint, even from a dataset + * with a received value, the source will be the path of + * the dataset we inherit from. If source is + * ZPROP_SOURCE_VAL_RECVD, the received value is not + * inherited. 
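/*
 * Editorial sketch, not part of the original patch: zfs_prop_get_recvd()
 * added in this change exposes the received value of a property next to
 * its effective value, in the spirit of 'zfs get -o value,received'.
 * ZFS_MAXPROPLEN-sized buffers, an open handle 'zhp', and minimal error
 * handling are assumed.
 */
	char curval[ZFS_MAXPROPLEN], rcvval[ZFS_MAXPROPLEN];
	zprop_source_t src;

	(void) zfs_prop_get(zhp, ZFS_PROP_QUOTA, curval, sizeof (curval),
	    &src, NULL, 0, B_FALSE);
	if (zfs_prop_get_recvd(zhp, "quota", rcvval, sizeof (rcvval),
	    B_FALSE) != 0)
		(void) strlcpy(rcvval, "-", sizeof (rcvval));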
+ */ + if (strcmp(source, ZPROP_SOURCE_VAL_RECVD) == 0) { + relpath = ""; + } else { + relpath = zhp->zfs_name + strlen(source); + if (relpath[0] == '/') + relpath++; + } if ((zpool_get_prop(zhp->zpool_hdl, ZPOOL_PROP_ALTROOT, buf, MAXPATHLEN, NULL)) || @@ -2420,8 +1996,9 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen, case ZFS_PROP_COMPRESSRATIO: if (get_numeric_property(zhp, prop, src, &source, &val) != 0) return (-1); - (void) snprintf(propbuf, proplen, "%lld.%02lldx", (longlong_t) - val / 100, (longlong_t)val % 100); + (void) snprintf(propbuf, proplen, "%llu.%02llux", + (u_longlong_t)(val / 100), + (u_longlong_t)(val % 100)); break; case ZFS_PROP_TYPE: @@ -2466,6 +2043,44 @@ zfs_prop_get(zfs_handle_t *zhp, zfs_prop_t prop, char *propbuf, size_t proplen, (void) strlcpy(propbuf, zhp->zfs_name, proplen); break; + case ZFS_PROP_MLSLABEL: + { + m_label_t *new_sl = NULL; + char *ascii = NULL; /* human readable label */ + + (void) strlcpy(propbuf, + getprop_string(zhp, prop, &source), proplen); + + if (literal || (strcasecmp(propbuf, + ZFS_MLSLABEL_DEFAULT) == 0)) + break; + + /* + * Try to translate the internal hex string to + * human-readable output. If there are any + * problems just use the hex string. + */ + + if (str_to_label(propbuf, &new_sl, MAC_LABEL, + L_NO_CORRECTION, NULL) == -1) { + m_label_free(new_sl); + break; + } + + if (label_to_str(new_sl, &ascii, M_LABEL, + DEF_NAMES) != 0) { + if (ascii) + free(ascii); + m_label_free(new_sl); + break; + } + m_label_free(new_sl); + + (void) strlcpy(propbuf, ascii, proplen); + free(ascii); + } + break; + default: switch (zfs_prop_get_type(prop)) { case PROP_TYPE_NUMBER: @@ -2520,40 +2135,249 @@ zfs_prop_get_int(zfs_handle_t *zhp, zfs_prop_t prop) } int -zfs_prop_set_int(zfs_handle_t *zhp, zfs_prop_t prop, uint64_t val) +zfs_prop_set_int(zfs_handle_t *zhp, zfs_prop_t prop, uint64_t val) +{ + char buf[64]; + + (void) snprintf(buf, sizeof (buf), "%llu", (longlong_t)val); + return (zfs_prop_set(zhp, zfs_prop_to_name(prop), buf)); +} + +/* + * Similar to zfs_prop_get(), but returns the value as an integer. 
+ */ +int +zfs_prop_get_numeric(zfs_handle_t *zhp, zfs_prop_t prop, uint64_t *value, + zprop_source_t *src, char *statbuf, size_t statlen) +{ + char *source; + + /* + * Check to see if this property applies to our object + */ + if (!zfs_prop_valid_for_type(prop, zhp->zfs_type)) { + return (zfs_error_fmt(zhp->zfs_hdl, EZFS_PROPTYPE, + dgettext(TEXT_DOMAIN, "cannot get property '%s'"), + zfs_prop_to_name(prop))); + } + + if (src) + *src = ZPROP_SRC_NONE; + + if (get_numeric_property(zhp, prop, src, &source, value) != 0) + return (-1); + + get_source(zhp, src, source, statbuf, statlen); + + return (0); +} + +static int +idmap_id_to_numeric_domain_rid(uid_t id, boolean_t isuser, + char **domainp, idmap_rid_t *ridp) +{ + idmap_handle_t *idmap_hdl = NULL; + idmap_get_handle_t *get_hdl = NULL; + idmap_stat status; + int err = EINVAL; + + if (idmap_init(&idmap_hdl) != IDMAP_SUCCESS) + goto out; + if (idmap_get_create(idmap_hdl, &get_hdl) != IDMAP_SUCCESS) + goto out; + + if (isuser) { + err = idmap_get_sidbyuid(get_hdl, id, + IDMAP_REQ_FLG_USE_CACHE, domainp, ridp, &status); + } else { + err = idmap_get_sidbygid(get_hdl, id, + IDMAP_REQ_FLG_USE_CACHE, domainp, ridp, &status); + } + if (err == IDMAP_SUCCESS && + idmap_get_mappings(get_hdl) == IDMAP_SUCCESS && + status == IDMAP_SUCCESS) + err = 0; + else + err = EINVAL; +out: + if (get_hdl) + idmap_get_destroy(get_hdl); + if (idmap_hdl) + (void) idmap_fini(idmap_hdl); + return (err); +} + +/* + * convert the propname into parameters needed by kernel + * Eg: userquota@ahrens -> ZFS_PROP_USERQUOTA, "", 126829 + * Eg: userused@matt@domain -> ZFS_PROP_USERUSED, "S-1-123-456", 789 + */ +static int +userquota_propname_decode(const char *propname, boolean_t zoned, + zfs_userquota_prop_t *typep, char *domain, int domainlen, uint64_t *ridp) +{ + zfs_userquota_prop_t type; + char *cp, *end; + char *numericsid = NULL; + boolean_t isuser; + + domain[0] = '\0'; + + /* Figure out the property type ({user|group}{quota|space}) */ + for (type = 0; type < ZFS_NUM_USERQUOTA_PROPS; type++) { + if (strncmp(propname, zfs_userquota_prop_prefixes[type], + strlen(zfs_userquota_prop_prefixes[type])) == 0) + break; + } + if (type == ZFS_NUM_USERQUOTA_PROPS) + return (EINVAL); + *typep = type; + + isuser = (type == ZFS_PROP_USERQUOTA || + type == ZFS_PROP_USERUSED); + + cp = strchr(propname, '@') + 1; + + if (strchr(cp, '@')) { + /* + * It's a SID name (eg "user@domain") that needs to be + * turned into S-1-domainID-RID. 
+ */ + directory_error_t e; + if (zoned && getzoneid() == GLOBAL_ZONEID) + return (ENOENT); + if (isuser) { + e = directory_sid_from_user_name(NULL, + cp, &numericsid); + } else { + e = directory_sid_from_group_name(NULL, + cp, &numericsid); + } + if (e != NULL) { + directory_error_free(e); + return (ENOENT); + } + if (numericsid == NULL) + return (ENOENT); + cp = numericsid; + /* will be further decoded below */ + } + + if (strncmp(cp, "S-1-", 4) == 0) { + /* It's a numeric SID (eg "S-1-234-567-89") */ + (void) strlcpy(domain, cp, domainlen); + cp = strrchr(domain, '-'); + *cp = '\0'; + cp++; + + errno = 0; + *ridp = strtoull(cp, &end, 10); + if (numericsid) { + free(numericsid); + numericsid = NULL; + } + if (errno != 0 || *end != '\0') + return (EINVAL); + } else if (!isdigit(*cp)) { + /* + * It's a user/group name (eg "user") that needs to be + * turned into a uid/gid + */ + if (zoned && getzoneid() == GLOBAL_ZONEID) + return (ENOENT); + if (isuser) { + struct passwd *pw; + pw = getpwnam(cp); + if (pw == NULL) + return (ENOENT); + *ridp = pw->pw_uid; + } else { + struct group *gr; + gr = getgrnam(cp); + if (gr == NULL) + return (ENOENT); + *ridp = gr->gr_gid; + } + } else { + /* It's a user/group ID (eg "12345"). */ + uid_t id = strtoul(cp, &end, 10); + idmap_rid_t rid; + char *mapdomain; + + if (*end != '\0') + return (EINVAL); + if (id > MAXUID) { + /* It's an ephemeral ID. */ + if (idmap_id_to_numeric_domain_rid(id, isuser, + &mapdomain, &rid) != 0) + return (ENOENT); + (void) strlcpy(domain, mapdomain, domainlen); + *ridp = rid; + } else { + *ridp = id; + } + } + + ASSERT3P(numericsid, ==, NULL); + return (0); +} + +static int +zfs_prop_get_userquota_common(zfs_handle_t *zhp, const char *propname, + uint64_t *propvalue, zfs_userquota_prop_t *typep) +{ + int err; + zfs_cmd_t zc = { 0 }; + + (void) strncpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + + err = userquota_propname_decode(propname, + zfs_prop_get_int(zhp, ZFS_PROP_ZONED), + typep, zc.zc_value, sizeof (zc.zc_value), &zc.zc_guid); + zc.zc_objset_type = *typep; + if (err) + return (err); + + err = ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_USERSPACE_ONE, &zc); + if (err) + return (err); + + *propvalue = zc.zc_cookie; + return (0); +} + +int +zfs_prop_get_userquota_int(zfs_handle_t *zhp, const char *propname, + uint64_t *propvalue) { - char buf[64]; + zfs_userquota_prop_t type; - zfs_nicenum(val, buf, sizeof (buf)); - return (zfs_prop_set(zhp, zfs_prop_to_name(prop), buf)); + return (zfs_prop_get_userquota_common(zhp, propname, propvalue, + &type)); } -/* - * Similar to zfs_prop_get(), but returns the value as an integer. 
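/*
 * Editorial sketch, not part of the original patch: the
 * userquota@/userused@ property names decoded by
 * userquota_propname_decode() accept a user or group name, a numeric
 * id, or a SID, for example
 *   userquota@ahrens, userused@12345, groupquota@S-1-5-21-...-513
 * and the value can be read back with zfs_prop_get_userquota_int().
 * The user name below is hypothetical.
 */
	uint64_t used;

	if (zfs_prop_get_userquota_int(zhp, "userused@ahrens", &used) == 0)
		(void) printf("ahrens uses %llu bytes\n", (u_longlong_t)used);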
- */ int -zfs_prop_get_numeric(zfs_handle_t *zhp, zfs_prop_t prop, uint64_t *value, - zprop_source_t *src, char *statbuf, size_t statlen) +zfs_prop_get_userquota(zfs_handle_t *zhp, const char *propname, + char *propbuf, int proplen, boolean_t literal) { - char *source; - - /* - * Check to see if this property applies to our object - */ - if (!zfs_prop_valid_for_type(prop, zhp->zfs_type)) { - return (zfs_error_fmt(zhp->zfs_hdl, EZFS_PROPTYPE, - dgettext(TEXT_DOMAIN, "cannot get property '%s'"), - zfs_prop_to_name(prop))); - } - - if (src) - *src = ZPROP_SRC_NONE; + int err; + uint64_t propvalue; + zfs_userquota_prop_t type; - if (get_numeric_property(zhp, prop, src, &source, value) != 0) - return (-1); + err = zfs_prop_get_userquota_common(zhp, propname, &propvalue, + &type); - get_source(zhp, src, source, statbuf, statlen); + if (err) + return (err); + if (literal) { + (void) snprintf(propbuf, proplen, "%llu", propvalue); + } else if (propvalue == 0 && + (type == ZFS_PROP_USERQUOTA || type == ZFS_PROP_GROUPQUOTA)) { + (void) strlcpy(propbuf, "none", proplen); + } else { + zfs_nicenum(propvalue, propbuf, proplen); + } return (0); } @@ -2575,6 +2399,46 @@ zfs_get_type(const zfs_handle_t *zhp) return (zhp->zfs_type); } +static int +zfs_do_list_ioctl(zfs_handle_t *zhp, int arg, zfs_cmd_t *zc) +{ + int rc; + uint64_t orig_cookie; + + orig_cookie = zc->zc_cookie; +top: + (void) strlcpy(zc->zc_name, zhp->zfs_name, sizeof (zc->zc_name)); + rc = ioctl(zhp->zfs_hdl->libzfs_fd, arg, zc); + + if (rc == -1) { + switch (errno) { + case ENOMEM: + /* expand nvlist memory and try again */ + if (zcmd_expand_dst_nvlist(zhp->zfs_hdl, zc) != 0) { + zcmd_free_nvlists(zc); + return (-1); + } + zc->zc_cookie = orig_cookie; + goto top; + /* + * An errno value of ESRCH indicates normal completion. + * If ENOENT is returned, then the underlying dataset + * has been removed since we obtained the handle. + */ + case ESRCH: + case ENOENT: + rc = 1; + break; + default: + rc = zfs_standard_error(zhp->zfs_hdl, errno, + dgettext(TEXT_DOMAIN, + "cannot iterate filesystems")); + break; + } + } + return (rc); +} + /* * Iterate over all child filesystems */ @@ -2588,37 +2452,27 @@ zfs_iter_filesystems(zfs_handle_t *zhp, zfs_iter_f func, void *data) if (zhp->zfs_type != ZFS_TYPE_FILESYSTEM) return (0); - for ((void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); - ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_DATASET_LIST_NEXT, &zc) == 0; - (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name))) { - /* - * Ignore private dataset names. - */ - if (dataset_name_hidden(zc.zc_name)) - continue; + if (zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0) != 0) + return (-1); + while ((ret = zfs_do_list_ioctl(zhp, ZFS_IOC_DATASET_LIST_NEXT, + &zc)) == 0) { /* * Silently ignore errors, as the only plausible explanation is * that the pool has since been removed. */ - if ((nzhp = make_dataset_handle(zhp->zfs_hdl, - zc.zc_name)) == NULL) + if ((nzhp = make_dataset_handle_zc(zhp->zfs_hdl, + &zc)) == NULL) { continue; + } - if ((ret = func(nzhp, data)) != 0) + if ((ret = func(nzhp, data)) != 0) { + zcmd_free_nvlists(&zc); return (ret); + } } - - /* - * An errno value of ESRCH indicates normal completion. If ENOENT is - * returned, then the underlying dataset has been removed since we - * obtained the handle. - */ - if (errno != ESRCH && errno != ENOENT) - return (zfs_standard_error(zhp->zfs_hdl, errno, - dgettext(TEXT_DOMAIN, "cannot iterate filesystems"))); - - return (0); + zcmd_free_nvlists(&zc); + return ((ret < 0) ? 
ret : 0); } /* @@ -2634,29 +2488,23 @@ zfs_iter_snapshots(zfs_handle_t *zhp, zfs_iter_f func, void *data) if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) return (0); - for ((void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); - ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_SNAPSHOT_LIST_NEXT, - &zc) == 0; - (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name))) { + if (zcmd_alloc_dst_nvlist(zhp->zfs_hdl, &zc, 0) != 0) + return (-1); + while ((ret = zfs_do_list_ioctl(zhp, ZFS_IOC_SNAPSHOT_LIST_NEXT, + &zc)) == 0) { - if ((nzhp = make_dataset_handle(zhp->zfs_hdl, - zc.zc_name)) == NULL) + if ((nzhp = make_dataset_handle_zc(zhp->zfs_hdl, + &zc)) == NULL) { continue; + } - if ((ret = func(nzhp, data)) != 0) + if ((ret = func(nzhp, data)) != 0) { + zcmd_free_nvlists(&zc); return (ret); + } } - - /* - * An errno value of ESRCH indicates normal completion. If ENOENT is - * returned, then the underlying dataset has been removed since we - * obtained the handle. Silently ignore this case, and return success. - */ - if (errno != ESRCH && errno != ENOENT) - return (zfs_standard_error(zhp->zfs_hdl, errno, - dgettext(TEXT_DOMAIN, "cannot iterate filesystems"))); - - return (0); + zcmd_free_nvlists(&zc); + return ((ret < 0) ? ret : 0); } /* @@ -2673,6 +2521,27 @@ zfs_iter_children(zfs_handle_t *zhp, zfs_iter_f func, void *data) return (zfs_iter_snapshots(zhp, func, data)); } +/* + * Is one dataset name a child dataset of another? + * + * Needs to handle these cases: + * Dataset 1 "a/foo" "a/foo" "a/foo" "a/foo" + * Dataset 2 "a/fo" "a/foobar" "a/bar/baz" "a/foo/bar" + * Descendant? No. No. No. Yes. + */ +static boolean_t +is_descendant(const char *ds1, const char *ds2) +{ + size_t d1len = strlen(ds1); + + /* ds2 can't be a descendant if it's smaller */ + if (strlen(ds2) < d1len) + return (B_FALSE); + + /* otherwise, compare strings and verify that there's a '/' char */ + return (ds2[d1len] == '/' && (strncmp(ds1, ds2, d1len) == 0)); +} + /* * Given a complete name, return just the portion that refers to the parent. * Can return NULL if this is a pool. 
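/*
 * Editorial sketch, not part of the original patch: the is_descendant()
 * helper added above requires a '/' right at the boundary, so sharing a
 * name prefix is not enough and a dataset is not its own descendant.
 * assert() from <assert.h> is used here only for illustration within
 * this file (the helper is static).
 */
	assert(is_descendant("a/foo", "a/foo/bar") == B_TRUE);
	assert(is_descendant("a/foo", "a/foobar") == B_FALSE);	/* no '/' */
	assert(is_descendant("a/foo", "a/foo") == B_FALSE);	/* same name */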
@@ -2708,9 +2577,10 @@ check_parents(libzfs_handle_t *hdl, const char *path, uint64_t *zoned, char *slash; zfs_handle_t *zhp; char errbuf[1024]; + uint64_t is_zoned; - (void) snprintf(errbuf, sizeof (errbuf), "cannot create '%s'", - path); + (void) snprintf(errbuf, sizeof (errbuf), + dgettext(TEXT_DOMAIN, "cannot create '%s'"), path); /* get parent, and check to see if this is just a pool */ if (parent_name(path, parent, sizeof (parent)) != 0) { @@ -2750,9 +2620,12 @@ check_parents(libzfs_handle_t *hdl, const char *path, uint64_t *zoned, return (zfs_standard_error(hdl, errno, errbuf)); } - *zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED); + is_zoned = zfs_prop_get_int(zhp, ZFS_PROP_ZONED); + if (zoned != NULL) + *zoned = is_zoned; + /* we are in a non-global zone, but parent is in the global zone */ - if (getzoneid() != GLOBAL_ZONEID && !(*zoned)) { + if (getzoneid() != GLOBAL_ZONEID && !is_zoned) { (void) zfs_standard_error(hdl, EPERM, errbuf); zfs_close(zhp); return (-1); @@ -2884,11 +2757,10 @@ int zfs_create_ancestors(libzfs_handle_t *hdl, const char *path) { int prefix; - uint64_t zoned; char *path_copy; int rc; - if (check_parents(hdl, path, &zoned, B_TRUE, &prefix) != 0) + if (check_parents(hdl, path, NULL, B_TRUE, &prefix) != 0) return (-1); if ((path_copy = strdup(path)) != NULL) { @@ -3002,18 +2874,6 @@ zfs_create(libzfs_handle_t *hdl, const char *path, zfs_type_t type, /* create the dataset */ ret = zfs_ioctl(hdl, ZFS_IOC_CREATE, &zc); - if (ret == 0 && type == ZFS_TYPE_VOLUME) { - ret = zvol_create_link(hdl, path); - if (ret) { - (void) zfs_standard_error(hdl, errno, - dgettext(TEXT_DOMAIN, - "Volume successfully created, but device links " - "were not created")); - zcmd_free_nvlists(&zc); - return (-1); - } - } - zcmd_free_nvlists(&zc); /* check for failure */ @@ -3069,7 +2929,7 @@ zfs_create(libzfs_handle_t *hdl, const char *path, zfs_type_t type, * isn't mounted, and that there are no active dependents. */ int -zfs_destroy(zfs_handle_t *zhp) +zfs_destroy(zfs_handle_t *zhp, boolean_t defer) { zfs_cmd_t zc = { 0 }; @@ -3085,14 +2945,12 @@ zfs_destroy(zfs_handle_t *zhp) return (-1); } - if (zvol_remove_link(zhp->zfs_hdl, zhp->zfs_name) != 0) - return (-1); - zc.zc_objset_type = DMU_OST_ZVOL; } else { zc.zc_objset_type = DMU_OST_ZFS; } + zc.zc_defer_destroy = defer; if (zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_DESTROY, &zc) != 0) { return (zfs_standard_error_fmt(zhp->zfs_hdl, errno, dgettext(TEXT_DOMAIN, "cannot destroy '%s'"), @@ -3111,13 +2969,13 @@ struct destroydata { }; static int -zfs_remove_link_cb(zfs_handle_t *zhp, void *arg) +zfs_check_snap_cb(zfs_handle_t *zhp, void *arg) { struct destroydata *dd = arg; zfs_handle_t *szhp; char name[ZFS_MAXNAMELEN]; boolean_t closezhp = dd->closezhp; - int rv; + int rv = 0; (void) strlcpy(name, zhp->zfs_name, sizeof (name)); (void) strlcat(name, "@", sizeof (name)); @@ -3129,17 +2987,9 @@ zfs_remove_link_cb(zfs_handle_t *zhp, void *arg) zfs_close(szhp); } - if (zhp->zfs_type == ZFS_TYPE_VOLUME) { - (void) zvol_remove_link(zhp->zfs_hdl, name); - /* - * NB: this is simply a best-effort. We don't want to - * return an error, because then we wouldn't visit all - * the volumes. - */ - } - dd->closezhp = B_TRUE; - rv = zfs_iter_filesystems(zhp, zfs_remove_link_cb, arg); + if (!dd->gotone) + rv = zfs_iter_filesystems(zhp, zfs_check_snap_cb, arg); if (closezhp) zfs_close(zhp); return (rv); @@ -3149,14 +2999,14 @@ zfs_remove_link_cb(zfs_handle_t *zhp, void *arg) * Destroys all snapshots with the given name in zhp & descendants. 
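/*
 * Editorial sketch, not part of the original patch: the 'defer' argument
 * added to zfs_destroy() and zfs_destroy_snaps() sets zc_defer_destroy,
 * which is what 'zfs destroy -d' uses to mark a held snapshot for
 * deferred destruction instead of failing outright. The snapshot name
 * is hypothetical and error handling is minimal.
 */
	if (zfs_destroy_snaps(zhp, "daily-2010-01-01", B_TRUE) != 0)
		(void) fprintf(stderr, "deferred destroy failed\n");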
*/ int -zfs_destroy_snaps(zfs_handle_t *zhp, char *snapname) +zfs_destroy_snaps(zfs_handle_t *zhp, char *snapname, boolean_t defer) { zfs_cmd_t zc = { 0 }; int ret; struct destroydata dd = { 0 }; dd.snapname = snapname; - (void) zfs_remove_link_cb(zhp, &dd); + (void) zfs_check_snap_cb(zhp, &dd); if (!dd.gotone) { return (zfs_standard_error_fmt(zhp->zfs_hdl, ENOENT, @@ -3166,6 +3016,7 @@ zfs_destroy_snaps(zfs_handle_t *zhp, char *snapname) (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); (void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value)); + zc.zc_defer_destroy = defer; ret = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_DESTROY_SNAPS, &zc); if (ret != 0) { @@ -3273,70 +3124,11 @@ zfs_clone(zfs_handle_t *zhp, const char *target, nvlist_t *props) return (zfs_standard_error(zhp->zfs_hdl, errno, errbuf)); } - } else if (ZFS_IS_VOLUME(zhp)) { - ret = zvol_create_link(zhp->zfs_hdl, target); } return (ret); } -typedef struct promote_data { - char cb_mountpoint[MAXPATHLEN]; - const char *cb_target; - const char *cb_errbuf; - uint64_t cb_pivot_txg; -} promote_data_t; - -static int -promote_snap_cb(zfs_handle_t *zhp, void *data) -{ - promote_data_t *pd = data; - zfs_handle_t *szhp; - char snapname[MAXPATHLEN]; - int rv = 0; - - /* We don't care about snapshots after the pivot point */ - if (zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) > pd->cb_pivot_txg) { - zfs_close(zhp); - return (0); - } - - /* Remove the device link if it's a zvol. */ - if (ZFS_IS_VOLUME(zhp)) - (void) zvol_remove_link(zhp->zfs_hdl, zhp->zfs_name); - - /* Check for conflicting names */ - (void) strlcpy(snapname, pd->cb_target, sizeof (snapname)); - (void) strlcat(snapname, strchr(zhp->zfs_name, '@'), sizeof (snapname)); - szhp = make_dataset_handle(zhp->zfs_hdl, snapname); - if (szhp != NULL) { - zfs_close(szhp); - zfs_error_aux(zhp->zfs_hdl, dgettext(TEXT_DOMAIN, - "snapshot name '%s' from origin \n" - "conflicts with '%s' from target"), - zhp->zfs_name, snapname); - rv = zfs_error(zhp->zfs_hdl, EZFS_EXISTS, pd->cb_errbuf); - } - zfs_close(zhp); - return (rv); -} - -static int -promote_snap_done_cb(zfs_handle_t *zhp, void *data) -{ - promote_data_t *pd = data; - - /* We don't care about snapshots after the pivot point */ - if (zfs_prop_get_int(zhp, ZFS_PROP_CREATETXG) <= pd->cb_pivot_txg) { - /* Create the device link if it's a zvol. */ - if (ZFS_IS_VOLUME(zhp)) - (void) zvol_create_link(zhp->zfs_hdl, zhp->zfs_name); - } - - zfs_close(zhp); - return (0); -} - /* * Promotes the given clone fs to be the clone parent. 
*/ @@ -3346,10 +3138,7 @@ zfs_promote(zfs_handle_t *zhp) libzfs_handle_t *hdl = zhp->zfs_hdl; zfs_cmd_t zc = { 0 }; char parent[MAXPATHLEN]; - char *cp; int ret; - zfs_handle_t *pzhp; - promote_data_t pd; char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, @@ -3367,29 +3156,7 @@ zfs_promote(zfs_handle_t *zhp) "not a cloned filesystem")); return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); } - cp = strchr(parent, '@'); - *cp = '\0'; - - /* Walk the snapshots we will be moving */ - pzhp = zfs_open(hdl, zhp->zfs_dmustats.dds_origin, ZFS_TYPE_SNAPSHOT); - if (pzhp == NULL) - return (-1); - pd.cb_pivot_txg = zfs_prop_get_int(pzhp, ZFS_PROP_CREATETXG); - zfs_close(pzhp); - pd.cb_target = zhp->zfs_name; - pd.cb_errbuf = errbuf; - pzhp = zfs_open(hdl, parent, ZFS_TYPE_DATASET); - if (pzhp == NULL) - return (-1); - (void) zfs_prop_get(pzhp, ZFS_PROP_MOUNTPOINT, pd.cb_mountpoint, - sizeof (pd.cb_mountpoint), NULL, NULL, 0, FALSE); - ret = zfs_iter_snapshots(pzhp, promote_snap_cb, &pd); - if (ret != 0) { - zfs_close(pzhp); - return (-1); - } - /* issue the ioctl */ (void) strlcpy(zc.zc_value, zhp->zfs_dmustats.dds_origin, sizeof (zc.zc_value)); (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); @@ -3398,62 +3165,18 @@ zfs_promote(zfs_handle_t *zhp) if (ret != 0) { int save_errno = errno; - (void) zfs_iter_snapshots(pzhp, promote_snap_done_cb, &pd); - zfs_close(pzhp); - switch (save_errno) { case EEXIST: - /* - * There is a conflicting snapshot name. We - * should have caught this above, but they could - * have renamed something in the mean time. - */ + /* There is a conflicting snapshot name. */ zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "conflicting snapshot name from parent '%s'"), - parent); + "conflicting snapshot '%s' from parent '%s'"), + zc.zc_string, parent); return (zfs_error(hdl, EZFS_EXISTS, errbuf)); default: return (zfs_standard_error(hdl, save_errno, errbuf)); } - } else { - (void) zfs_iter_snapshots(zhp, promote_snap_done_cb, &pd); - } - - zfs_close(pzhp); - return (ret); -} - -struct createdata { - const char *cd_snapname; - int cd_ifexists; -}; - -static int -zfs_create_link_cb(zfs_handle_t *zhp, void *arg) -{ - struct createdata *cd = arg; - int ret; - - if (zhp->zfs_type == ZFS_TYPE_VOLUME) { - char name[MAXPATHLEN]; - - (void) strlcpy(name, zhp->zfs_name, sizeof (name)); - (void) strlcat(name, "@", sizeof (name)); - (void) strlcat(name, cd->cd_snapname, sizeof (name)); - (void) zvol_create_link_common(zhp->zfs_hdl, name, - cd->cd_ifexists); - /* - * NB: this is simply a best-effort. We don't want to - * return an error, because then we wouldn't visit all - * the volumes. - */ } - - ret = zfs_iter_filesystems(zhp, zfs_create_link_cb, cd); - - zfs_close(zhp); - return (ret); } @@ -3517,31 +3240,11 @@ zfs_snapshot(libzfs_handle_t *hdl, const char *path, boolean_t recursive, * if it was recursive, the one that actually failed will be in * zc.zc_name. 
*/ - if (ret != 0) + if (ret != 0) { (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot create snapshot '%s@%s'"), zc.zc_name, zc.zc_value); - - if (ret == 0 && recursive) { - struct createdata cd; - - cd.cd_snapname = delim + 1; - cd.cd_ifexists = B_FALSE; - (void) zfs_iter_filesystems(zhp, zfs_create_link_cb, &cd); - } - if (ret == 0 && zhp->zfs_type == ZFS_TYPE_VOLUME) { - ret = zvol_create_link(zhp->zfs_hdl, path); - if (ret != 0) { - (void) zfs_standard_error(hdl, errno, - dgettext(TEXT_DOMAIN, - "Volume successfully snapshotted, but device links " - "were not created")); - zfs_close(zhp); - return (-1); - } - } - - if (ret != 0) (void) zfs_standard_error(hdl, errno, errbuf); + } zfs_close(zhp); @@ -3581,7 +3284,7 @@ rollback_destroy(zfs_handle_t *zhp, void *data) logstr = zhp->zfs_hdl->libzfs_log_str; zhp->zfs_hdl->libzfs_log_str = NULL; - cbp->cb_error |= zfs_destroy(zhp); + cbp->cb_error |= zfs_destroy(zhp, B_FALSE); zhp->zfs_hdl->libzfs_log_str = logstr; } } else { @@ -3595,7 +3298,7 @@ rollback_destroy(zfs_handle_t *zhp, void *data) zfs_close(zhp); return (0); } - if (zfs_destroy(zhp) != 0) + if (zfs_destroy(zhp, B_FALSE) != 0) cbp->cb_error = B_TRUE; else changelist_remove(clp, zhp->zfs_name); @@ -3644,8 +3347,6 @@ zfs_rollback(zfs_handle_t *zhp, zfs_handle_t *snap, boolean_t force) */ if (zhp->zfs_type == ZFS_TYPE_VOLUME) { - if (zvol_remove_link(zhp->zfs_hdl, zhp->zfs_name) != 0) - return (-1); if (zfs_which_resv_prop(zhp, &resv_prop) < 0) return (-1); old_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE); @@ -3683,10 +3384,6 @@ zfs_rollback(zfs_handle_t *zhp, zfs_handle_t *snap, boolean_t force) */ if ((zhp->zfs_type == ZFS_TYPE_VOLUME) && (zhp = make_dataset_handle(zhp->zfs_hdl, zhp->zfs_name))) { - if (err = zvol_create_link(zhp->zfs_hdl, zhp->zfs_name)) { - zfs_close(zhp); - return (err); - } if (restore_resv) { new_volsize = zfs_prop_get_int(zhp, ZFS_PROP_VOLSIZE); if (old_volsize != new_volsize) @@ -3801,14 +3498,11 @@ zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive) if (!zfs_validate_name(hdl, target, zhp->zfs_type, B_TRUE)) return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); - uint64_t unused; /* validate parents */ - if (check_parents(hdl, target, &unused, B_FALSE, NULL) != 0) + if (check_parents(hdl, target, NULL, B_FALSE, NULL) != 0) return (-1); - (void) parent_name(target, parent, sizeof (parent)); - /* make sure we're in the same pool */ verify((delim = strchr(target, '/')) != NULL); if (strncmp(zhp->zfs_name, target, delim - target) != 0 || @@ -3819,10 +3513,9 @@ zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive) } /* new name cannot be a child of the current dataset name */ - if (strncmp(parent, zhp->zfs_name, - strlen(zhp->zfs_name)) == 0) { + if (is_descendant(zhp->zfs_name, target)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "New dataset name cannot be a descendent of " + "New dataset name cannot be a descendant of " "current dataset name")); return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); } @@ -3839,7 +3532,6 @@ zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive) } if (recursive) { - struct destroydata dd; parentname = zfs_strdup(zhp->zfs_hdl, zhp->zfs_name); if (parentname == NULL) { @@ -3854,15 +3546,6 @@ zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive) goto error; } - dd.snapname = delim + 1; - dd.gotone = B_FALSE; - dd.closezhp = B_TRUE; - - /* We remove any zvol links prior to renaming them */ - ret = zfs_iter_filesystems(zhrp, zfs_remove_link_cb, 
&dd); - if (ret) { - goto error; - } } else { if ((cl = changelist_gather(zhp, ZFS_PROP_NAME, 0, 0)) == NULL) return (-1); @@ -3884,202 +3567,52 @@ zfs_rename(zfs_handle_t *zhp, const char *target, boolean_t recursive) else zc.zc_objset_type = DMU_OST_ZFS; - (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); - (void) strlcpy(zc.zc_value, target, sizeof (zc.zc_value)); - - zc.zc_cookie = recursive; - - if ((ret = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_RENAME, &zc)) != 0) { - /* - * if it was recursive, the one that actually failed will - * be in zc.zc_name - */ - (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, - "cannot rename '%s'"), zc.zc_name); - - if (recursive && errno == EEXIST) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "a child dataset already has a snapshot " - "with the new name")); - (void) zfs_error(hdl, EZFS_EXISTS, errbuf); - } else { - (void) zfs_standard_error(zhp->zfs_hdl, errno, errbuf); - } - - /* - * On failure, we still want to remount any filesystems that - * were previously mounted, so we don't alter the system state. - */ - if (recursive) { - struct createdata cd; - - /* only create links for datasets that had existed */ - cd.cd_snapname = delim + 1; - cd.cd_ifexists = B_TRUE; - (void) zfs_iter_filesystems(zhrp, zfs_create_link_cb, - &cd); - } else { - (void) changelist_postfix(cl); - } - } else { - if (recursive) { - struct createdata cd; - - /* only create links for datasets that had existed */ - cd.cd_snapname = strchr(target, '@') + 1; - cd.cd_ifexists = B_TRUE; - ret = zfs_iter_filesystems(zhrp, zfs_create_link_cb, - &cd); - } else { - changelist_rename(cl, zfs_get_name(zhp), target); - ret = changelist_postfix(cl); - } - } - -error: - if (parentname) { - free(parentname); - } - if (zhrp) { - zfs_close(zhrp); - } - if (cl) { - changelist_free(cl); - } - return (ret); -} - -/* - * Given a zvol dataset, issue the ioctl to create the appropriate minor node, - * poke devfsadm to create the /dev link, and then wait for the link to appear. - */ -int -zvol_create_link(libzfs_handle_t *hdl, const char *dataset) -{ - return (zvol_create_link_common(hdl, dataset, B_FALSE)); -} - -static int -zvol_create_link_common(libzfs_handle_t *hdl, const char *dataset, int ifexists) -{ - zfs_cmd_t zc = { 0 }; - di_devlink_handle_t dhdl; - priv_set_t *priv_effective; - int privileged; - - (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name)); - - /* - * Issue the appropriate ioctl. - */ - if (ioctl(hdl->libzfs_fd, ZFS_IOC_CREATE_MINOR, &zc) != 0) { - switch (errno) { - case EEXIST: - /* - * Silently ignore the case where the link already - * exists. This allows 'zfs volinit' to be run multiple - * times without errors. - */ - return (0); - - case ENOENT: - /* - * Dataset does not exist in the kernel. If we - * don't care (see zfs_rename), then ignore the - * error quietly. - */ - if (ifexists) { - return (0); - } - - /* FALLTHROUGH */ + (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + (void) strlcpy(zc.zc_value, target, sizeof (zc.zc_value)); - default: - return (zfs_standard_error_fmt(hdl, errno, - dgettext(TEXT_DOMAIN, "cannot create device links " - "for '%s'"), dataset)); - } - } + zc.zc_cookie = recursive; - /* - * If privileged call devfsadm and wait for the links to - * magically appear. - * Otherwise, print out an informational message. 
- */ + if ((ret = zfs_ioctl(zhp->zfs_hdl, ZFS_IOC_RENAME, &zc)) != 0) { + /* + * if it was recursive, the one that actually failed will + * be in zc.zc_name + */ + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot rename '%s'"), zc.zc_name); - priv_effective = priv_allocset(); - (void) getppriv(PRIV_EFFECTIVE, priv_effective); - privileged = (priv_isfullset(priv_effective) == B_TRUE); - priv_freeset(priv_effective); - - if (privileged) { - if ((dhdl = di_devlink_init(ZFS_DRIVER, - DI_MAKE_LINK)) == NULL) { - zfs_error_aux(hdl, strerror(errno)); - (void) zfs_error_fmt(hdl, errno, - dgettext(TEXT_DOMAIN, "cannot create device links " - "for '%s'"), dataset); - (void) ioctl(hdl->libzfs_fd, ZFS_IOC_REMOVE_MINOR, &zc); - return (-1); + if (recursive && errno == EEXIST) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "a child dataset already has a snapshot " + "with the new name")); + (void) zfs_error(hdl, EZFS_EXISTS, errbuf); } else { - (void) di_devlink_fini(&dhdl); + (void) zfs_standard_error(zhp->zfs_hdl, errno, errbuf); } - } else { - char pathname[MAXPATHLEN]; - struct stat64 statbuf; - int i; - -#define MAX_WAIT 10 /* - * This is the poor mans way of waiting for the link - * to show up. If after 10 seconds we still don't - * have it, then print out a message. + * On failure, we still want to remount any filesystems that + * were previously mounted, so we don't alter the system state. */ - (void) snprintf(pathname, sizeof (pathname), "/dev/zvol/dsk/%s", - dataset); - - for (i = 0; i != MAX_WAIT; i++) { - if (stat64(pathname, &statbuf) == 0) - break; - (void) sleep(1); + if (!recursive) + (void) changelist_postfix(cl); + } else { + if (!recursive) { + changelist_rename(cl, zfs_get_name(zhp), target); + ret = changelist_postfix(cl); } - if (i == MAX_WAIT) - (void) printf(gettext("%s may not be immediately " - "available\n"), pathname); } - return (0); -} - -/* - * Remove a minor node for the given zvol and the associated /dev links. - */ -int -zvol_remove_link(libzfs_handle_t *hdl, const char *dataset) -{ - zfs_cmd_t zc = { 0 }; - - (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name)); - - if (ioctl(hdl->libzfs_fd, ZFS_IOC_REMOVE_MINOR, &zc) != 0) { - switch (errno) { - case ENXIO: - /* - * Silently ignore the case where the link no longer - * exists, so that 'zfs volfini' can be run multiple - * times without errors. - */ - return (0); - - default: - return (zfs_standard_error_fmt(hdl, errno, - dgettext(TEXT_DOMAIN, "cannot remove device " - "links for '%s'"), dataset)); - } +error: + if (parentname) { + free(parentname); } - - return (0); + if (zhrp) { + zfs_close(zhrp); + } + if (cl) { + changelist_free(cl); + } + return (ret); } nvlist_t * @@ -4088,6 +3621,15 @@ zfs_get_user_props(zfs_handle_t *zhp) return (zhp->zfs_user_props); } +nvlist_t * +zfs_get_recvd_props(zfs_handle_t *zhp) +{ + if (zhp->zfs_recvd_props == NULL) + if (get_recvd_props_ioctl(zhp) != 0) + return (NULL); + return (zhp->zfs_recvd_props); +} + /* * This function is used by 'zfs list' to determine the exact set of columns to * display, and their maximum widths. This does two main things: @@ -4097,10 +3639,12 @@ zfs_get_user_props(zfs_handle_t *zhp) * for new unique user properties and add them to the list. * * - For non fixed-width properties, keep track of the maximum width seen - * so that we can size the column appropriately. + * so that we can size the column appropriately. If the user has + * requested received property values, we also need to compute the width + * of the RECEIVED column. 
*/ int -zfs_expand_proplist(zfs_handle_t *zhp, zprop_list_t **plp) +zfs_expand_proplist(zfs_handle_t *zhp, zprop_list_t **plp, boolean_t received) { libzfs_handle_t *hdl = zhp->zfs_hdl; zprop_list_t *entry; @@ -4171,12 +3715,24 @@ zfs_expand_proplist(zfs_handle_t *zhp, zprop_list_t **plp) if (strlen(buf) > entry->pl_width) entry->pl_width = strlen(buf); } - } else if (nvlist_lookup_nvlist(userprops, - entry->pl_user_prop, &propval) == 0) { - verify(nvlist_lookup_string(propval, - ZPROP_VALUE, &strval) == 0); - if (strlen(strval) > entry->pl_width) - entry->pl_width = strlen(strval); + if (received && zfs_prop_get_recvd(zhp, + zfs_prop_to_name(entry->pl_prop), + buf, sizeof (buf), B_FALSE) == 0) + if (strlen(buf) > entry->pl_recvd_width) + entry->pl_recvd_width = strlen(buf); + } else { + if (nvlist_lookup_nvlist(userprops, entry->pl_user_prop, + &propval) == 0) { + verify(nvlist_lookup_string(propval, + ZPROP_VALUE, &strval) == 0); + if (strlen(strval) > entry->pl_width) + entry->pl_width = strlen(strval); + } + if (received && zfs_prop_get_recvd(zhp, + entry->pl_user_prop, + buf, sizeof (buf), B_FALSE) == 0) + if (strlen(buf) > entry->pl_recvd_width) + entry->pl_recvd_width = strlen(buf); } } @@ -4231,18 +3787,406 @@ zfs_iscsi_perm_check(libzfs_handle_t *hdl, char *dataset, ucred_t *cred) int zfs_deleg_share_nfs(libzfs_handle_t *hdl, char *dataset, char *path, - void *export, void *sharetab, int sharemax, zfs_share_op_t operation) + char *resource, void *export, void *sharetab, + int sharemax, zfs_share_op_t operation) { zfs_cmd_t zc = { 0 }; int error; (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name)); (void) strlcpy(zc.zc_value, path, sizeof (zc.zc_value)); + if (resource) + (void) strlcpy(zc.zc_string, resource, sizeof (zc.zc_string)); zc.zc_share.z_sharedata = (uint64_t)(uintptr_t)sharetab; zc.zc_share.z_exportdata = (uint64_t)(uintptr_t)export; zc.zc_share.z_sharetype = operation; zc.zc_share.z_sharemax = sharemax; - error = ioctl(hdl->libzfs_fd, ZFS_IOC_SHARE, &zc); return (error); } + +void +zfs_prune_proplist(zfs_handle_t *zhp, uint8_t *props) +{ + nvpair_t *curr; + + /* + * Keep a reference to the props-table against which we prune the + * properties. + */ + zhp->zfs_props_table = props; + + curr = nvlist_next_nvpair(zhp->zfs_props, NULL); + + while (curr) { + zfs_prop_t zfs_prop = zfs_name_to_prop(nvpair_name(curr)); + nvpair_t *next = nvlist_next_nvpair(zhp->zfs_props, curr); + + /* + * User properties will result in ZPROP_INVAL, and since we + * only know how to prune standard ZFS properties, we always + * leave these in the list. This can also happen if we + * encounter an unknown DSL property (when running older + * software, for example). 
+ */ + if (zfs_prop != ZPROP_INVAL && props[zfs_prop] == B_FALSE) + (void) nvlist_remove(zhp->zfs_props, + nvpair_name(curr), nvpair_type(curr)); + curr = next; + } +} + +static int +zfs_smb_acl_mgmt(libzfs_handle_t *hdl, char *dataset, char *path, + zfs_smb_acl_op_t cmd, char *resource1, char *resource2) +{ + zfs_cmd_t zc = { 0 }; + nvlist_t *nvlist = NULL; + int error; + + (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name)); + (void) strlcpy(zc.zc_value, path, sizeof (zc.zc_value)); + zc.zc_cookie = (uint64_t)cmd; + + if (cmd == ZFS_SMB_ACL_RENAME) { + if (nvlist_alloc(&nvlist, NV_UNIQUE_NAME, 0) != 0) { + (void) no_memory(hdl); + return (NULL); + } + } + + switch (cmd) { + case ZFS_SMB_ACL_ADD: + case ZFS_SMB_ACL_REMOVE: + (void) strlcpy(zc.zc_string, resource1, sizeof (zc.zc_string)); + break; + case ZFS_SMB_ACL_RENAME: + if (nvlist_add_string(nvlist, ZFS_SMB_ACL_SRC, + resource1) != 0) { + (void) no_memory(hdl); + return (-1); + } + if (nvlist_add_string(nvlist, ZFS_SMB_ACL_TARGET, + resource2) != 0) { + (void) no_memory(hdl); + return (-1); + } + if (zcmd_write_src_nvlist(hdl, &zc, nvlist) != 0) { + nvlist_free(nvlist); + return (-1); + } + break; + case ZFS_SMB_ACL_PURGE: + break; + default: + return (-1); + } + error = ioctl(hdl->libzfs_fd, ZFS_IOC_SMB_ACL, &zc); + if (nvlist) + nvlist_free(nvlist); + return (error); +} + +int +zfs_smb_acl_add(libzfs_handle_t *hdl, char *dataset, + char *path, char *resource) +{ + return (zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_ADD, + resource, NULL)); +} + +int +zfs_smb_acl_remove(libzfs_handle_t *hdl, char *dataset, + char *path, char *resource) +{ + return (zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_REMOVE, + resource, NULL)); +} + +int +zfs_smb_acl_purge(libzfs_handle_t *hdl, char *dataset, char *path) +{ + return (zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_PURGE, + NULL, NULL)); +} + +int +zfs_smb_acl_rename(libzfs_handle_t *hdl, char *dataset, char *path, + char *oldname, char *newname) +{ + return (zfs_smb_acl_mgmt(hdl, dataset, path, ZFS_SMB_ACL_RENAME, + oldname, newname)); +} + +int +zfs_userspace(zfs_handle_t *zhp, zfs_userquota_prop_t type, + zfs_userspace_cb_t func, void *arg) +{ + zfs_cmd_t zc = { 0 }; + int error; + zfs_useracct_t buf[100]; + + (void) strncpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + + zc.zc_objset_type = type; + zc.zc_nvlist_dst = (uintptr_t)buf; + + /* CONSTCOND */ + while (1) { + zfs_useracct_t *zua = buf; + + zc.zc_nvlist_dst_size = sizeof (buf); + error = ioctl(zhp->zfs_hdl->libzfs_fd, + ZFS_IOC_USERSPACE_MANY, &zc); + if (error || zc.zc_nvlist_dst_size == 0) + break; + + while (zc.zc_nvlist_dst_size > 0) { + error = func(arg, zua->zu_domain, zua->zu_rid, + zua->zu_space); + if (error != 0) + return (error); + zua++; + zc.zc_nvlist_dst_size -= sizeof (zfs_useracct_t); + } + } + + return (error); +} + +int +zfs_hold(zfs_handle_t *zhp, const char *snapname, const char *tag, + boolean_t recursive, boolean_t temphold, boolean_t enoent_ok) +{ + zfs_cmd_t zc = { 0 }; + libzfs_handle_t *hdl = zhp->zfs_hdl; + + (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + (void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value)); + if (strlcpy(zc.zc_string, tag, sizeof (zc.zc_string)) + >= sizeof (zc.zc_string)) + return (zfs_error(hdl, EZFS_TAGTOOLONG, tag)); + zc.zc_cookie = recursive; + zc.zc_temphold = temphold; + + if (zfs_ioctl(hdl, ZFS_IOC_HOLD, &zc) != 0) { + char errbuf[ZFS_MAXNAMELEN+32]; + + /* + * if it was recursive, the one that actually failed will be in + * 
zc.zc_name. + */ + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot hold '%s@%s'"), zc.zc_name, snapname); + switch (errno) { + case E2BIG: + /* + * Temporary tags wind up having the ds object id + * prepended. So even if we passed the length check + * above, it's still possible for the tag to wind + * up being slightly too long. + */ + return (zfs_error(hdl, EZFS_TAGTOOLONG, errbuf)); + case ENOTSUP: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "pool must be upgraded")); + return (zfs_error(hdl, EZFS_BADVERSION, errbuf)); + case EINVAL: + return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); + case EEXIST: + return (zfs_error(hdl, EZFS_REFTAG_HOLD, errbuf)); + case ENOENT: + if (enoent_ok) + return (0); + /* FALLTHROUGH */ + default: + return (zfs_standard_error_fmt(hdl, errno, errbuf)); + } + } + + return (0); +} + +struct hold_range_arg { + zfs_handle_t *origin; + const char *fromsnap; + const char *tosnap; + char lastsnapheld[ZFS_MAXNAMELEN]; + const char *tag; + boolean_t temphold; + boolean_t seento; + boolean_t seenfrom; + boolean_t holding; + boolean_t recursive; +}; + +static int +zfs_hold_range_one(zfs_handle_t *zhp, void *arg) +{ + struct hold_range_arg *hra = arg; + const char *thissnap; + int error; + + thissnap = strchr(zfs_get_name(zhp), '@') + 1; + + if (hra->fromsnap && !hra->seenfrom && + strcmp(hra->fromsnap, thissnap) == 0) + hra->seenfrom = B_TRUE; + + /* snap is older or newer than the desired range, ignore it */ + if (hra->seento || !hra->seenfrom) { + zfs_close(zhp); + return (0); + } + + if (hra->holding) { + /* We could be racing with destroy, so ignore ENOENT. */ + error = zfs_hold(hra->origin, thissnap, hra->tag, + hra->recursive, hra->temphold, B_TRUE); + if (error == 0) { + (void) strlcpy(hra->lastsnapheld, zfs_get_name(zhp), + sizeof (hra->lastsnapheld)); + } + } else { + error = zfs_release(hra->origin, thissnap, hra->tag, + hra->recursive); + } + + if (!hra->seento && strcmp(hra->tosnap, thissnap) == 0) + hra->seento = B_TRUE; + + zfs_close(zhp); + return (error); +} + +/* + * Add a user hold on the set of snapshots starting with fromsnap up to + * and including tosnap. If we're unable to to acquire a particular hold, + * undo any holds up to that point. + */ +int +zfs_hold_range(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, + const char *tag, boolean_t recursive, boolean_t temphold) +{ + struct hold_range_arg arg = { 0 }; + int error; + + arg.origin = zhp; + arg.fromsnap = fromsnap; + arg.tosnap = tosnap; + arg.tag = tag; + arg.temphold = temphold; + arg.holding = B_TRUE; + arg.recursive = recursive; + arg.seenfrom = (fromsnap == NULL); + + error = zfs_iter_snapshots_sorted(zhp, zfs_hold_range_one, &arg); + + /* + * Make sure we either hold the entire range or none. 
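/*
 * Editorial sketch, not part of the original patch: taking a temporary,
 * recursive user hold on a snapshot with zfs_hold(). The tag length is
 * checked against zc_string, so an over-long tag fails with
 * EZFS_TAGTOOLONG. Snapshot and tag names are hypothetical.
 */
	if (zfs_hold(zhp, "daily-2010-01-01", ".send-hold", B_TRUE,
	    B_TRUE, B_FALSE) != 0)
		(void) fprintf(stderr, "hold failed\n");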
+ */ + if (error && arg.lastsnapheld[0] != '\0') { + (void) zfs_release_range(zhp, fromsnap, + (const char *)arg.lastsnapheld, tag, recursive); + } + return (error); +} + +int +zfs_release(zfs_handle_t *zhp, const char *snapname, const char *tag, + boolean_t recursive) +{ + zfs_cmd_t zc = { 0 }; + libzfs_handle_t *hdl = zhp->zfs_hdl; + + (void) strlcpy(zc.zc_name, zhp->zfs_name, sizeof (zc.zc_name)); + (void) strlcpy(zc.zc_value, snapname, sizeof (zc.zc_value)); + if (strlcpy(zc.zc_string, tag, sizeof (zc.zc_string)) + >= sizeof (zc.zc_string)) + return (zfs_error(hdl, EZFS_TAGTOOLONG, tag)); + zc.zc_cookie = recursive; + + if (zfs_ioctl(hdl, ZFS_IOC_RELEASE, &zc) != 0) { + char errbuf[ZFS_MAXNAMELEN+32]; + + /* + * if it was recursive, the one that actually failed will be in + * zc.zc_name. + */ + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot release '%s' from '%s@%s'"), tag, zc.zc_name, + snapname); + switch (errno) { + case ESRCH: + return (zfs_error(hdl, EZFS_REFTAG_RELE, errbuf)); + case ENOTSUP: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "pool must be upgraded")); + return (zfs_error(hdl, EZFS_BADVERSION, errbuf)); + case EINVAL: + return (zfs_error(hdl, EZFS_BADTYPE, errbuf)); + default: + return (zfs_standard_error_fmt(hdl, errno, errbuf)); + } + } + + return (0); +} + +/* + * Release a user hold from the set of snapshots starting with fromsnap + * up to and including tosnap. + */ +int +zfs_release_range(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, + const char *tag, boolean_t recursive) +{ + struct hold_range_arg arg = { 0 }; + + arg.origin = zhp; + arg.fromsnap = fromsnap; + arg.tosnap = tosnap; + arg.tag = tag; + arg.recursive = recursive; + arg.seenfrom = (fromsnap == NULL); + + return (zfs_iter_snapshots_sorted(zhp, zfs_hold_range_one, &arg)); +} + +uint64_t +zvol_volsize_to_reservation(uint64_t volsize, nvlist_t *props) +{ + uint64_t numdb; + uint64_t nblocks, volblocksize; + int ncopies; + char *strval; + + if (nvlist_lookup_string(props, + zfs_prop_to_name(ZFS_PROP_COPIES), &strval) == 0) + ncopies = atoi(strval); + else + ncopies = 1; + if (nvlist_lookup_uint64(props, + zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), + &volblocksize) != 0) + volblocksize = ZVOL_DEFAULT_BLOCKSIZE; + nblocks = volsize/volblocksize; + /* start with metadnode L0-L6 */ + numdb = 7; + /* calculate number of indirects */ + while (nblocks > 1) { + nblocks += DNODES_PER_LEVEL - 1; + nblocks /= DNODES_PER_LEVEL; + numdb += nblocks; + } + numdb *= MIN(SPA_DVAS_PER_BP, ncopies + 1); + volsize *= ncopies; + /* + * this is exactly DN_MAX_INDBLKSHIFT when metadata isn't + * compressed, but in practice they compress down to about + * 1100 bytes + */ + numdb *= 1ULL << DN_MAX_INDBLKSHIFT; + volsize += numdb; + return (volsize); +} diff --git a/external/cddl/osnet/dist/lib/libzfs/common/libzfs_fru.c b/external/cddl/osnet/dist/lib/libzfs/common/libzfs_fru.c new file mode 100644 index 0000000000000..788fa2cfb763d --- /dev/null +++ b/external/cddl/osnet/dist/lib/libzfs/common/libzfs_fru.c @@ -0,0 +1,452 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. 
+ * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include + +#include "libzfs_impl.h" + +/* + * This file is responsible for determining the relationship between I/O + * devices paths and physical locations. In the world of MPxIO and external + * enclosures, the device path is not synonymous with the physical location. + * If you remove a drive and insert it into a different slot, it will end up + * with the same path under MPxIO. If you recable storage enclosures, the + * device paths may change. All of this makes it difficult to implement the + * 'autoreplace' property, which is supposed to automatically manage disk + * replacement based on physical slot. + * + * In order to work around these limitations, we have a per-vdev FRU property + * that is the libtopo path (minus disk-specific authority information) to the + * physical location of the device on the system. This is an optional + * property, and is only needed when using the 'autoreplace' property or when + * generating FMA faults against vdevs. + */ + +/* + * Because the FMA packages depend on ZFS, we have to dlopen() libtopo in case + * it is not present. We only need this once per library instance, so it is + * not part of the libzfs handle. + */ +static void *_topo_dlhandle; +static topo_hdl_t *(*_topo_open)(int, const char *, int *); +static void (*_topo_close)(topo_hdl_t *); +static char *(*_topo_snap_hold)(topo_hdl_t *, const char *, int *); +static void (*_topo_snap_release)(topo_hdl_t *); +static topo_walk_t *(*_topo_walk_init)(topo_hdl_t *, const char *, + topo_walk_cb_t, void *, int *); +static int (*_topo_walk_step)(topo_walk_t *, int); +static void (*_topo_walk_fini)(topo_walk_t *); +static void (*_topo_hdl_strfree)(topo_hdl_t *, char *); +static char *(*_topo_node_name)(tnode_t *); +static int (*_topo_prop_get_string)(tnode_t *, const char *, const char *, + char **, int *); +static int (*_topo_node_fru)(tnode_t *, nvlist_t **, nvlist_t *, int *); +static int (*_topo_fmri_nvl2str)(topo_hdl_t *, nvlist_t *, char **, int *); +static int (*_topo_fmri_strcmp_noauth)(topo_hdl_t *, const char *, + const char *); + +#define ZFS_FRU_HASH_SIZE 257 + +static size_t +fru_strhash(const char *key) +{ + ulong_t g, h = 0; + const char *p; + + for (p = key; *p != '\0'; p++) { + h = (h << 4) + *p; + + if ((g = (h & 0xf0000000)) != 0) { + h ^= (g >> 24); + h ^= g; + } + } + + return (h % ZFS_FRU_HASH_SIZE); +} + +static int +libzfs_fru_gather(topo_hdl_t *thp, tnode_t *tn, void *arg) +{ + libzfs_handle_t *hdl = arg; + nvlist_t *fru; + char *devpath, *frustr; + int err; + libzfs_fru_t *frup; + size_t idx; + + /* + * If this is the chassis node, and we don't yet have the system + * chassis ID, then fill in this value now. 
+ */ + if (hdl->libzfs_chassis_id[0] == '\0' && + strcmp(_topo_node_name(tn), "chassis") == 0) { + if (_topo_prop_get_string(tn, FM_FMRI_AUTHORITY, + FM_FMRI_AUTH_CHASSIS, &devpath, &err) == 0) + (void) strlcpy(hdl->libzfs_chassis_id, devpath, + sizeof (hdl->libzfs_chassis_id)); + } + + /* + * Skip non-disk nodes. + */ + if (strcmp(_topo_node_name(tn), "disk") != 0) + return (TOPO_WALK_NEXT); + + /* + * Get the devfs path and FRU. + */ + if (_topo_prop_get_string(tn, "io", "devfs-path", &devpath, &err) != 0) + return (TOPO_WALK_NEXT); + + if (libzfs_fru_lookup(hdl, devpath) != NULL) { + _topo_hdl_strfree(thp, devpath); + return (TOPO_WALK_NEXT); + } + + if (_topo_node_fru(tn, &fru, NULL, &err) != 0) { + _topo_hdl_strfree(thp, devpath); + return (TOPO_WALK_NEXT); + } + + /* + * Convert the FRU into a string. + */ + if (_topo_fmri_nvl2str(thp, fru, &frustr, &err) != 0) { + nvlist_free(fru); + _topo_hdl_strfree(thp, devpath); + return (TOPO_WALK_NEXT); + } + + nvlist_free(fru); + + /* + * Finally, we have a FRU string and device path. Add it to the hash. + */ + if ((frup = calloc(sizeof (libzfs_fru_t), 1)) == NULL) { + _topo_hdl_strfree(thp, devpath); + _topo_hdl_strfree(thp, frustr); + return (TOPO_WALK_NEXT); + } + + if ((frup->zf_device = strdup(devpath)) == NULL || + (frup->zf_fru = strdup(frustr)) == NULL) { + free(frup->zf_device); + free(frup); + _topo_hdl_strfree(thp, devpath); + _topo_hdl_strfree(thp, frustr); + return (TOPO_WALK_NEXT); + } + + _topo_hdl_strfree(thp, devpath); + _topo_hdl_strfree(thp, frustr); + + idx = fru_strhash(frup->zf_device); + frup->zf_chain = hdl->libzfs_fru_hash[idx]; + hdl->libzfs_fru_hash[idx] = frup; + frup->zf_next = hdl->libzfs_fru_list; + hdl->libzfs_fru_list = frup; + + return (TOPO_WALK_NEXT); +} + +/* + * Called during initialization to setup the dynamic libtopo connection. 
+ */ +#pragma init(libzfs_init_fru) +static void +libzfs_init_fru(void) +{ + char path[MAXPATHLEN]; + char isa[257]; + +#if defined(_LP64) + if (sysinfo(SI_ARCHITECTURE_64, isa, sizeof (isa)) < 0) + isa[0] = '\0'; +#else + isa[0] = '\0'; +#endif + (void) snprintf(path, sizeof (path), + "/usr/lib/fm/%s/libtopo.so", isa); + + if ((_topo_dlhandle = dlopen(path, RTLD_LAZY)) == NULL) + return; + + _topo_open = (topo_hdl_t *(*)()) + dlsym(_topo_dlhandle, "topo_open"); + _topo_close = (void (*)()) + dlsym(_topo_dlhandle, "topo_close"); + _topo_snap_hold = (char *(*)()) + dlsym(_topo_dlhandle, "topo_snap_hold"); + _topo_snap_release = (void (*)()) + dlsym(_topo_dlhandle, "topo_snap_release"); + _topo_walk_init = (topo_walk_t *(*)()) + dlsym(_topo_dlhandle, "topo_walk_init"); + _topo_walk_step = (int (*)()) + dlsym(_topo_dlhandle, "topo_walk_step"); + _topo_walk_fini = (void (*)()) + dlsym(_topo_dlhandle, "topo_walk_fini"); + _topo_hdl_strfree = (void (*)()) + dlsym(_topo_dlhandle, "topo_hdl_strfree"); + _topo_node_name = (char *(*)()) + dlsym(_topo_dlhandle, "topo_node_name"); + _topo_prop_get_string = (int (*)()) + dlsym(_topo_dlhandle, "topo_prop_get_string"); + _topo_node_fru = (int (*)()) + dlsym(_topo_dlhandle, "topo_node_fru"); + _topo_fmri_nvl2str = (int (*)()) + dlsym(_topo_dlhandle, "topo_fmri_nvl2str"); + _topo_fmri_strcmp_noauth = (int (*)()) + dlsym(_topo_dlhandle, "topo_fmri_strcmp_noauth"); + + if (_topo_open == NULL || _topo_close == NULL || + _topo_snap_hold == NULL || _topo_snap_release == NULL || + _topo_walk_init == NULL || _topo_walk_step == NULL || + _topo_walk_fini == NULL || _topo_hdl_strfree == NULL || + _topo_node_name == NULL || _topo_prop_get_string == NULL || + _topo_node_fru == NULL || _topo_fmri_nvl2str == NULL || + _topo_fmri_strcmp_noauth == NULL) { + (void) dlclose(_topo_dlhandle); + _topo_dlhandle = NULL; + } +} + +/* + * Refresh the mappings from device path -> FMRI. We do this by walking the + * hc topology looking for disk nodes, and recording the io/devfs-path and FRU. + * Note that we strip out the disk-specific authority information (serial, + * part, revision, etc) so that we are left with only the identifying + * characteristics of the slot (hc path and chassis-id). + */ +void +libzfs_fru_refresh(libzfs_handle_t *hdl) +{ + int err; + char *uuid; + topo_hdl_t *thp; + topo_walk_t *twp; + + if (_topo_dlhandle == NULL) + return; + + /* + * Clear the FRU hash and initialize our basic structures. + */ + libzfs_fru_clear(hdl, B_FALSE); + + if ((hdl->libzfs_topo_hdl = _topo_open(TOPO_VERSION, + NULL, &err)) == NULL) + return; + + thp = hdl->libzfs_topo_hdl; + + if ((uuid = _topo_snap_hold(thp, NULL, &err)) == NULL) + return; + + _topo_hdl_strfree(thp, uuid); + + if (hdl->libzfs_fru_hash == NULL && + (hdl->libzfs_fru_hash = + calloc(ZFS_FRU_HASH_SIZE * sizeof (void *), 1)) == NULL) + return; + + /* + * We now have a topo snapshot, so iterate over the hc topology looking + * for disks to add to the hash. + */ + twp = _topo_walk_init(thp, FM_FMRI_SCHEME_HC, + libzfs_fru_gather, hdl, &err); + if (twp != NULL) { + (void) _topo_walk_step(twp, TOPO_WALK_CHILD); + _topo_walk_fini(twp); + } +} + +/* + * Given a devfs path, return the FRU for the device, if known. This will + * automatically call libzfs_fru_refresh() if it hasn't already been called by + * the consumer. The string returned is valid until the next call to + * libzfs_fru_refresh(). 
+ */ +const char * +libzfs_fru_lookup(libzfs_handle_t *hdl, const char *devpath) +{ + size_t idx = fru_strhash(devpath); + libzfs_fru_t *frup; + + if (hdl->libzfs_fru_hash == NULL) + libzfs_fru_refresh(hdl); + + if (hdl->libzfs_fru_hash == NULL) + return (NULL); + + for (frup = hdl->libzfs_fru_hash[idx]; frup != NULL; + frup = frup->zf_chain) { + if (strcmp(devpath, frup->zf_device) == 0) + return (frup->zf_fru); + } + + return (NULL); +} + +/* + * Given a fru path, return the device path. This will automatically call + * libzfs_fru_refresh() if it hasn't already been called by the consumer. The + * string returned is valid until the next call to libzfs_fru_refresh(). + */ +const char * +libzfs_fru_devpath(libzfs_handle_t *hdl, const char *fru) +{ + libzfs_fru_t *frup; + size_t idx; + + if (hdl->libzfs_fru_hash == NULL) + libzfs_fru_refresh(hdl); + + if (hdl->libzfs_fru_hash == NULL) + return (NULL); + + for (idx = 0; idx < ZFS_FRU_HASH_SIZE; idx++) { + for (frup = hdl->libzfs_fru_hash[idx]; frup != NULL; + frup = frup->zf_next) { + if (_topo_fmri_strcmp_noauth(hdl->libzfs_topo_hdl, + fru, frup->zf_fru)) + return (frup->zf_device); + } + } + + return (NULL); +} + +/* + * Change the stored FRU for the given vdev. + */ +int +zpool_fru_set(zpool_handle_t *zhp, uint64_t vdev_guid, const char *fru) +{ + zfs_cmd_t zc = { 0 }; + + (void) strncpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); + (void) strncpy(zc.zc_value, fru, sizeof (zc.zc_value)); + zc.zc_guid = vdev_guid; + + if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_VDEV_SETFRU, &zc) != 0) + return (zpool_standard_error_fmt(zhp->zpool_hdl, errno, + dgettext(TEXT_DOMAIN, "cannot set FRU"))); + + return (0); +} + +/* + * Compare to two FRUs, ignoring any authority information. + */ +boolean_t +libzfs_fru_compare(libzfs_handle_t *hdl, const char *a, const char *b) +{ + if (hdl->libzfs_fru_hash == NULL) + libzfs_fru_refresh(hdl); + + if (hdl->libzfs_fru_hash == NULL) + return (strcmp(a, b) == 0); + + return (_topo_fmri_strcmp_noauth(hdl->libzfs_topo_hdl, a, b)); +} + +/* + * This special function checks to see whether the FRU indicates it's supposed + * to be in the system chassis, but the chassis-id doesn't match. This can + * happen in a clustered case, where both head nodes have the same logical + * disk, but opening the device on the other head node is meaningless. + */ +boolean_t +libzfs_fru_notself(libzfs_handle_t *hdl, const char *fru) +{ + const char *chassisid; + size_t len; + + if (hdl->libzfs_fru_hash == NULL) + libzfs_fru_refresh(hdl); + + if (hdl->libzfs_chassis_id[0] == '\0') + return (B_FALSE); + + if (strstr(fru, "/chassis=0/") == NULL) + return (B_FALSE); + + if ((chassisid = strstr(fru, ":chassis-id=")) == NULL) + return (B_FALSE); + + chassisid += 12; + len = strlen(hdl->libzfs_chassis_id); + if (strncmp(chassisid, hdl->libzfs_chassis_id, len) == 0 && + (chassisid[len] == '/' || chassisid[len] == ':')) + return (B_FALSE); + + return (B_TRUE); +} + +/* + * Clear memory associated with the FRU hash. 
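As a rough usage sketch (hypothetical code, not part of this patch), the FRU helpers above would typically be driven by fault-management or zpool-level code: given a libzfs_handle_t from libzfs_init(), an open zpool_handle_t, and a known vdev GUID, a consumer might resolve a disk's slot FRU and record it on the vdev like this (assumes <libzfs.h>):

	/*
	 * Hypothetical consumer sketch: map a devfs path to its slot FRU and
	 * store it on the corresponding vdev.  The first lookup triggers
	 * libzfs_fru_refresh() internally.
	 */
	static int
	record_fru(libzfs_handle_t *hdl, zpool_handle_t *zhp,
	    uint64_t vdev_guid, const char *devfs_path)
	{
		const char *fru;

		if ((fru = libzfs_fru_lookup(hdl, devfs_path)) == NULL)
			return (-1);	/* no libtopo snapshot, or slot unknown */

		return (zpool_fru_set(zhp, vdev_guid, fru));
	}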
+ */ +void +libzfs_fru_clear(libzfs_handle_t *hdl, boolean_t final) +{ + libzfs_fru_t *frup; + + while ((frup = hdl->libzfs_fru_list) != NULL) { + hdl->libzfs_fru_list = frup->zf_next; + free(frup->zf_device); + free(frup->zf_fru); + free(frup); + } + + hdl->libzfs_fru_list = NULL; + + if (hdl->libzfs_topo_hdl != NULL) { + _topo_snap_release(hdl->libzfs_topo_hdl); + _topo_close(hdl->libzfs_topo_hdl); + hdl->libzfs_topo_hdl = NULL; + } + + if (final) { + free(hdl->libzfs_fru_hash); + } else if (hdl->libzfs_fru_hash != NULL) { + bzero(hdl->libzfs_fru_hash, + ZFS_FRU_HASH_SIZE * sizeof (void *)); + } +} diff --git a/external/cddl/osnet/dist/lib/libzfs/common/libzfs_graph.c b/external/cddl/osnet/dist/lib/libzfs/common/libzfs_graph.c index e7cbf2386014e..bc21c51ae26c0 100644 --- a/external/cddl/osnet/dist/lib/libzfs/common/libzfs_graph.c +++ b/external/cddl/osnet/dist/lib/libzfs/common/libzfs_graph.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Iterate over all children of the current object. This includes the normal * dataset hierarchy, but also arbitrary hierarchies due to clones. We want to @@ -399,13 +397,6 @@ iterate_children(libzfs_handle_t *hdl, zfs_graph_t *zgp, const char *dataset) for ((void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name)); ioctl(hdl->libzfs_fd, ZFS_IOC_DATASET_LIST_NEXT, &zc) == 0; (void) strlcpy(zc.zc_name, dataset, sizeof (zc.zc_name))) { - - /* - * Ignore private dataset names. - */ - if (dataset_name_hidden(zc.zc_name)) - continue; - /* * Get statistics for this dataset, to determine the type of the * dataset and clone statistics. If this fails, the dataset has diff --git a/external/cddl/osnet/dist/lib/libzfs/common/libzfs_impl.h b/external/cddl/osnet/dist/lib/libzfs/common/libzfs_impl.h index 9f1f66d51db50..ef34591fe3945 100644 --- a/external/cddl/osnet/dist/lib/libzfs/common/libzfs_impl.h +++ b/external/cddl/osnet/dist/lib/libzfs/common/libzfs_impl.h @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -38,6 +38,8 @@ #include #include +#include + #ifdef __cplusplus extern "C" { #endif @@ -47,6 +49,13 @@ extern "C" { #endif #define VERIFY verify +typedef struct libzfs_fru { + char *zf_device; + char *zf_fru; + struct libzfs_fru *zf_chain; + struct libzfs_fru *zf_next; +} libzfs_fru_t; + struct libzfs_handle { int libzfs_error; int libzfs_fd; @@ -63,7 +72,15 @@ struct libzfs_handle { int libzfs_printerr; void *libzfs_sharehdl; /* libshare handle */ uint_t libzfs_shareflags; + boolean_t libzfs_mnttab_enable; + avl_tree_t libzfs_mnttab_cache; + int libzfs_pool_iter; + topo_hdl_t *libzfs_topo_hdl; + libzfs_fru_t **libzfs_fru_hash; + libzfs_fru_t *libzfs_fru_list; + char libzfs_chassis_id[256]; }; + #define ZFSSHARE_MISS 0x01 /* Didn't find entry in cache */ struct zfs_handle { @@ -75,8 +92,10 @@ struct zfs_handle { dmu_objset_stats_t zfs_dmustats; nvlist_t *zfs_props; nvlist_t *zfs_user_props; + nvlist_t *zfs_recvd_props; boolean_t zfs_mntcheck; char *zfs_mntopts; + uint8_t *zfs_props_table; }; /* @@ -169,9 +188,6 @@ zfs_handle_t *make_dataset_handle(libzfs_handle_t *, const char *); int zpool_open_silent(libzfs_handle_t *, const char *, zpool_handle_t **); -int zvol_create_link(libzfs_handle_t *, const char *); -int zvol_remove_link(libzfs_handle_t *, const char *); -int zpool_iter_zvol(zpool_handle_t *, int (*)(const char *, void *), void *); boolean_t zpool_name_valid(libzfs_handle_t *, boolean_t, const char *); void namespace_clear(libzfs_handle_t *); @@ -184,8 +200,11 @@ extern int zfs_init_libshare(libzfs_handle_t *, int); extern void zfs_uninit_libshare(libzfs_handle_t *); extern int zfs_parse_options(char *, zfs_share_proto_t); -extern int zfs_unshare_proto(zfs_handle_t *zhp, +extern int zfs_unshare_proto(zfs_handle_t *, const char *, zfs_share_proto_t *); + +extern void libzfs_fru_clear(libzfs_handle_t *, boolean_t); + #ifdef __cplusplus } #endif diff --git a/external/cddl/osnet/dist/lib/libzfs/common/libzfs_import.c b/external/cddl/osnet/dist/lib/libzfs/common/libzfs_import.c index d67776889d350..fd3044b1da333 100644 --- a/external/cddl/osnet/dist/lib/libzfs/common/libzfs_import.c +++ b/external/cddl/osnet/dist/lib/libzfs/common/libzfs_import.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Pool import support functions. * @@ -41,15 +39,21 @@ * using our derived config, and record the results. */ +#include #include #include #include #include +#include #include #include #include #include #include +#include +#include +#include +#include #include @@ -388,8 +392,6 @@ refresh_config(libzfs_handle_t *hdl, nvlist_t *config) } if (err) { - (void) zpool_standard_error(hdl, errno, - dgettext(TEXT_DOMAIN, "cannot discover pools")); zcmd_free_nvlists(&zc); return (NULL); } @@ -403,6 +405,21 @@ refresh_config(libzfs_handle_t *hdl, nvlist_t *config) return (nvl); } +/* + * Determine if the vdev id is a hole in the namespace. + */ +boolean_t +vdev_is_hole(uint64_t *hole_array, uint_t holes, uint_t id) +{ + for (int c = 0; c < holes; c++) { + + /* Top-level is a hole */ + if (hole_array[c] == id) + return (B_TRUE); + } + return (B_FALSE); +} + /* * Convert our list of pools into the definitive set of configurations. We * start by picking the best config for each toplevel vdev. 
Once that's done, @@ -425,17 +442,20 @@ get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok) uint64_t version, guid; uint_t children = 0; nvlist_t **child = NULL; + uint_t holes; + uint64_t *hole_array, max_id; uint_t c; boolean_t isactive; uint64_t hostid; nvlist_t *nvl; boolean_t found_one = B_FALSE; + boolean_t valid_top_config = B_FALSE; if (nvlist_alloc(&ret, 0, 0) != 0) goto nomem; for (pe = pl->pools; pe != NULL; pe = pe->pe_next) { - uint64_t id; + uint64_t id, max_txg = 0; if (nvlist_alloc(&config, NV_UNIQUE_NAME, 0) != 0) goto nomem; @@ -463,6 +483,42 @@ get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok) } } + /* + * We rely on the fact that the max txg for the + * pool will contain the most up-to-date information + * about the valid top-levels in the vdev namespace. + */ + if (best_txg > max_txg) { + (void) nvlist_remove(config, + ZPOOL_CONFIG_VDEV_CHILDREN, + DATA_TYPE_UINT64); + (void) nvlist_remove(config, + ZPOOL_CONFIG_HOLE_ARRAY, + DATA_TYPE_UINT64_ARRAY); + + max_txg = best_txg; + hole_array = NULL; + holes = 0; + max_id = 0; + valid_top_config = B_FALSE; + + if (nvlist_lookup_uint64(tmp, + ZPOOL_CONFIG_VDEV_CHILDREN, &max_id) == 0) { + verify(nvlist_add_uint64(config, + ZPOOL_CONFIG_VDEV_CHILDREN, + max_id) == 0); + valid_top_config = B_TRUE; + } + + if (nvlist_lookup_uint64_array(tmp, + ZPOOL_CONFIG_HOLE_ARRAY, &hole_array, + &holes) == 0) { + verify(nvlist_add_uint64_array(config, + ZPOOL_CONFIG_HOLE_ARRAY, + hole_array, holes) == 0); + } + } + if (!config_seen) { /* * Copy the relevant pieces of data to the pool @@ -522,6 +578,7 @@ get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok) ZPOOL_CONFIG_VDEV_TREE, &nvtop) == 0); verify(nvlist_lookup_uint64(nvtop, ZPOOL_CONFIG_ID, &id) == 0); + if (id >= children) { nvlist_t **newchild; @@ -542,9 +599,74 @@ get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok) } + /* + * If we have information about all the top-levels then + * clean up the nvlist which we've constructed. This + * means removing any extraneous devices that are + * beyond the valid range or adding devices to the end + * of our array which appear to be missing. + */ + if (valid_top_config) { + if (max_id < children) { + for (c = max_id; c < children; c++) + nvlist_free(child[c]); + children = max_id; + } else if (max_id > children) { + nvlist_t **newchild; + + newchild = zfs_alloc(hdl, (max_id) * + sizeof (nvlist_t *)); + if (newchild == NULL) + goto nomem; + + for (c = 0; c < children; c++) + newchild[c] = child[c]; + + free(child); + child = newchild; + children = max_id; + } + } + verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &guid) == 0); + /* + * The vdev namespace may contain holes as a result of + * device removal. We must add them back into the vdev + * tree before we process any missing devices. + */ + if (holes > 0) { + ASSERT(valid_top_config); + + for (c = 0; c < children; c++) { + nvlist_t *holey; + + if (child[c] != NULL || + !vdev_is_hole(hole_array, holes, c)) + continue; + + if (nvlist_alloc(&holey, NV_UNIQUE_NAME, + 0) != 0) + goto nomem; + + /* + * Holes in the namespace are treated as + * "hole" top-level vdevs and have a + * special flag set on them. + */ + if (nvlist_add_string(holey, + ZPOOL_CONFIG_TYPE, + VDEV_TYPE_HOLE) != 0 || + nvlist_add_uint64(holey, + ZPOOL_CONFIG_ID, c) != 0 || + nvlist_add_uint64(holey, + ZPOOL_CONFIG_GUID, 0ULL) != 0) + goto nomem; + child[c] = holey; + } + } + /* * Look for any missing top-level vdevs. 
If this is the case, * create a faked up 'missing' vdev as a placeholder. We cannot @@ -552,7 +674,7 @@ get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok) * certain checks to make sure the vdev IDs match their location * in the configuration. */ - for (c = 0; c < children; c++) + for (c = 0; c < children; c++) { if (child[c] == NULL) { nvlist_t *missing; if (nvlist_alloc(&missing, NV_UNIQUE_NAME, @@ -570,6 +692,7 @@ get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok) } child[c] = missing; } + } /* * Put all of this pool's top-level vdevs into a root vdev. @@ -636,8 +759,11 @@ get_configs(libzfs_handle_t *hdl, pool_list_t *pl, boolean_t active_ok) continue; } - if ((nvl = refresh_config(hdl, config)) == NULL) - goto error; + if ((nvl = refresh_config(hdl, config)) == NULL) { + nvlist_free(config); + config = NULL; + continue; + } nvlist_free(config); config = nvl; @@ -777,6 +903,212 @@ zpool_read_label(int fd, nvlist_t **config) return (0); } +typedef struct rdsk_node { + char *rn_name; + int rn_dfd; + libzfs_handle_t *rn_hdl; + nvlist_t *rn_config; + avl_tree_t *rn_avl; + avl_node_t rn_node; + boolean_t rn_nozpool; +} rdsk_node_t; + +static int +slice_cache_compare(const void *arg1, const void *arg2) +{ + const char *nm1 = ((rdsk_node_t *)arg1)->rn_name; + const char *nm2 = ((rdsk_node_t *)arg2)->rn_name; + char *nm1slice, *nm2slice; + int rv; + + /* + * slices zero and two are the most likely to provide results, + * so put those first + */ + nm1slice = strstr(nm1, "s0"); + nm2slice = strstr(nm2, "s0"); + if (nm1slice && !nm2slice) { + return (-1); + } + if (!nm1slice && nm2slice) { + return (1); + } + nm1slice = strstr(nm1, "s2"); + nm2slice = strstr(nm2, "s2"); + if (nm1slice && !nm2slice) { + return (-1); + } + if (!nm1slice && nm2slice) { + return (1); + } + + rv = strcmp(nm1, nm2); + if (rv == 0) + return (0); + return (rv > 0 ? 1 : -1); +} + +static void +check_one_slice(avl_tree_t *r, char *diskname, uint_t partno, + diskaddr_t size, uint_t blksz) +{ + rdsk_node_t tmpnode; + rdsk_node_t *node; + char sname[MAXNAMELEN]; + + tmpnode.rn_name = &sname[0]; + (void) snprintf(tmpnode.rn_name, MAXNAMELEN, "%s%u", + diskname, partno); + /* + * protect against division by zero for disk labels that + * contain a bogus sector size + */ + if (blksz == 0) + blksz = DEV_BSIZE; + /* too small to contain a zpool? 
*/ + if ((size < (SPA_MINDEVSIZE / blksz)) && + (node = avl_find(r, &tmpnode, NULL))) + node->rn_nozpool = B_TRUE; +} + +static void +nozpool_all_slices(avl_tree_t *r, const char *sname) +{ + char diskname[MAXNAMELEN]; + char *ptr; + int i; + + (void) strncpy(diskname, sname, MAXNAMELEN); + if (((ptr = strrchr(diskname, 's')) == NULL) && + ((ptr = strrchr(diskname, 'p')) == NULL)) + return; + ptr[0] = 's'; + ptr[1] = '\0'; + for (i = 0; i < NDKMAP; i++) + check_one_slice(r, diskname, i, 0, 1); + ptr[0] = 'p'; + for (i = 0; i <= FD_NUMPART; i++) + check_one_slice(r, diskname, i, 0, 1); +} + +static void +check_slices(avl_tree_t *r, int fd, const char *sname) +{ + struct extvtoc vtoc; + struct dk_gpt *gpt; + char diskname[MAXNAMELEN]; + char *ptr; + int i; + + (void) strncpy(diskname, sname, MAXNAMELEN); + if ((ptr = strrchr(diskname, 's')) == NULL || !isdigit(ptr[1])) + return; + ptr[1] = '\0'; + + if (read_extvtoc(fd, &vtoc) >= 0) { + for (i = 0; i < NDKMAP; i++) + check_one_slice(r, diskname, i, + vtoc.v_part[i].p_size, vtoc.v_sectorsz); + } else if (efi_alloc_and_read(fd, &gpt) >= 0) { + /* + * on x86 we'll still have leftover links that point + * to slices s[9-15], so use NDKMAP instead + */ + for (i = 0; i < NDKMAP; i++) + check_one_slice(r, diskname, i, + gpt->efi_parts[i].p_size, gpt->efi_lbasize); + /* nodes p[1-4] are never used with EFI labels */ + ptr[0] = 'p'; + for (i = 1; i <= FD_NUMPART; i++) + check_one_slice(r, diskname, i, 0, 1); + efi_free(gpt); + } +} + +static void +zpool_open_func(void *arg) +{ + rdsk_node_t *rn = arg; + struct stat64 statbuf; + nvlist_t *config; + int fd; + + if (rn->rn_nozpool) + return; + if ((fd = openat64(rn->rn_dfd, rn->rn_name, O_RDONLY)) < 0) { + /* symlink to a device that's no longer there */ + if (errno == ENOENT) + nozpool_all_slices(rn->rn_avl, rn->rn_name); + return; + } + /* + * Ignore failed stats. We only want regular + * files, character devs and block devs. + */ + if (fstat64(fd, &statbuf) != 0 || + (!S_ISREG(statbuf.st_mode) && + !S_ISCHR(statbuf.st_mode) && + !S_ISBLK(statbuf.st_mode))) { + (void) close(fd); + return; + } + /* this file is too small to hold a zpool */ + if (S_ISREG(statbuf.st_mode) && + statbuf.st_size < SPA_MINDEVSIZE) { + (void) close(fd); + return; + } else if (!S_ISREG(statbuf.st_mode)) { + /* + * Try to read the disk label first so we don't have to + * open a bunch of minor nodes that can't have a zpool. + */ + check_slices(rn->rn_avl, fd, rn->rn_name); + } + + if ((zpool_read_label(fd, &config)) != 0) { + (void) close(fd); + (void) no_memory(rn->rn_hdl); + return; + } + (void) close(fd); + + + rn->rn_config = config; + if (config != NULL) { + assert(rn->rn_nozpool == B_FALSE); + } +} + +/* + * Given a file descriptor, clear (zero) the label information. This function + * is currently only used in the appliance stack as part of the ZFS sysevent + * module. + */ +int +zpool_clear_label(int fd) +{ + struct stat64 statbuf; + int l; + vdev_label_t *label; + uint64_t size; + + if (fstat64(fd, &statbuf) == -1) + return (0); + size = P2ALIGN_TYPED(statbuf.st_size, sizeof (vdev_label_t), uint64_t); + + if ((label = calloc(sizeof (vdev_label_t), 1)) == NULL) + return (-1); + + for (l = 0; l < VDEV_LABELS; l++) { + if (pwrite64(fd, label, sizeof (vdev_label_t), + label_offset(size, l)) != sizeof (vdev_label_t)) + return (-1); + } + + free(label); + return (0); +} + /* * Given a list of directories to search, find all pools stored on disk. This * includes partial pools which are not available to import. 
If no args are @@ -785,30 +1117,28 @@ zpool_read_label(int fd, nvlist_t **config) * to import a specific pool. */ static nvlist_t * -zpool_find_import_impl(libzfs_handle_t *hdl, int argc, char **argv, - boolean_t active_ok, char *poolname, uint64_t guid) +zpool_find_import_impl(libzfs_handle_t *hdl, importargs_t *iarg) { - int i; + int i, dirs = iarg->paths; DIR *dirp = NULL; struct dirent64 *dp; char path[MAXPATHLEN]; - char *end; + char *end, **dir = iarg->path; size_t pathleft; - struct stat64 statbuf; - nvlist_t *ret = NULL, *config; + nvlist_t *ret = NULL; static char *default_dir = "/dev/dsk"; - int fd; pool_list_t pools = { 0 }; pool_entry_t *pe, *penext; vdev_entry_t *ve, *venext; config_entry_t *ce, *cenext; name_entry_t *ne, *nenext; + avl_tree_t slice_cache; + rdsk_node_t *slice; + void *cookie; - verify(poolname == NULL || guid == 0); - - if (argc == 0) { - argc = 1; - argv = &default_dir; + if (dirs == 0) { + dirs = 1; + dir = &default_dir; } /* @@ -816,15 +1146,15 @@ zpool_find_import_impl(libzfs_handle_t *hdl, int argc, char **argv, * possible device, organizing the information according to pool GUID * and toplevel GUID. */ - for (i = 0; i < argc; i++) { + for (i = 0; i < dirs; i++) { + tpool_t *t; char *rdsk; int dfd; /* use realpath to normalize the path */ - if (realpath(argv[i], path) == 0) { + if (realpath(dir[i], path) == 0) { (void) zfs_error_fmt(hdl, EZFS_BADPATH, - dgettext(TEXT_DOMAIN, "cannot open '%s'"), - argv[i]); + dgettext(TEXT_DOMAIN, "cannot open '%s'"), dir[i]); goto error; } end = &path[strlen(path)]; @@ -851,6 +1181,8 @@ zpool_find_import_impl(libzfs_handle_t *hdl, int argc, char **argv, goto error; } + avl_create(&slice_cache, slice_cache_compare, + sizeof (rdsk_node_t), offsetof(rdsk_node_t, rn_node)); /* * This is not MT-safe, but we have no MT consumers of libzfs */ @@ -860,46 +1192,53 @@ zpool_find_import_impl(libzfs_handle_t *hdl, int argc, char **argv, (name[1] == 0 || (name[1] == '.' && name[2] == 0))) continue; - if ((fd = openat64(dfd, name, O_RDONLY)) < 0) - continue; - - /* - * Ignore failed stats. We only want regular - * files, character devs and block devs. - */ - if (fstat64(fd, &statbuf) != 0 || - (!S_ISREG(statbuf.st_mode) && - !S_ISCHR(statbuf.st_mode) && - !S_ISBLK(statbuf.st_mode))) { - (void) close(fd); - continue; - } - - if ((zpool_read_label(fd, &config)) != 0) { - (void) close(fd); - (void) no_memory(hdl); - goto error; - } - - (void) close(fd); - - if (config != NULL) { + slice = zfs_alloc(hdl, sizeof (rdsk_node_t)); + slice->rn_name = zfs_strdup(hdl, name); + slice->rn_avl = &slice_cache; + slice->rn_dfd = dfd; + slice->rn_hdl = hdl; + slice->rn_nozpool = B_FALSE; + avl_add(&slice_cache, slice); + } + /* + * create a thread pool to do all of this in parallel; + * rn_nozpool is not protected, so this is racy in that + * multiple tasks could decide that the same slice can + * not hold a zpool, which is benign. Also choose + * double the number of processors; we hold a lot of + * locks in the kernel, so going beyond this doesn't + * buy us much. 
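The scan described in this comment is exposed to callers through the importargs_t interface added by this patch (zpool_search_import(), further below). As a hypothetical caller sketch (not part of this patch), searching the default /dev/dsk directory for an exported pool by name might look like:

	/*
	 * Hypothetical sketch: leaving args.paths at 0 makes the
	 * implementation fall back to the default /dev/dsk directory.
	 */
	static nvlist_t *
	find_pool_by_name(libzfs_handle_t *hdl, char *name)
	{
		importargs_t args = { 0 };

		args.poolname = name;	/* match on pool name only */
		args.unique = B_TRUE;	/* detect an already-imported duplicate */

		return (zpool_search_import(hdl, &args));
	}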
+ */ + t = tpool_create(1, 2 * sysconf(_SC_NPROCESSORS_ONLN), + 0, NULL); + for (slice = avl_first(&slice_cache); slice; + (slice = avl_walk(&slice_cache, slice, + AVL_AFTER))) + (void) tpool_dispatch(t, zpool_open_func, slice); + tpool_wait(t); + tpool_destroy(t); + + cookie = NULL; + while ((slice = avl_destroy_nodes(&slice_cache, + &cookie)) != NULL) { + if (slice->rn_config != NULL) { + nvlist_t *config = slice->rn_config; boolean_t matched = B_TRUE; - if (poolname != NULL) { + if (iarg->poolname != NULL) { char *pname; matched = nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, &pname) == 0 && - strcmp(poolname, pname) == 0; - } else if (guid != 0) { + strcmp(iarg->poolname, pname) == 0; + } else if (iarg->guid != 0) { uint64_t this_guid; matched = nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &this_guid) == 0 && - guid == this_guid; + iarg->guid == this_guid; } if (!matched) { nvlist_free(config); @@ -907,17 +1246,20 @@ zpool_find_import_impl(libzfs_handle_t *hdl, int argc, char **argv, continue; } /* use the non-raw path for the config */ - (void) strlcpy(end, name, pathleft); + (void) strlcpy(end, slice->rn_name, pathleft); if (add_config(hdl, &pools, path, config) != 0) goto error; } + free(slice->rn_name); + free(slice); } + avl_destroy(&slice_cache); (void) closedir(dirp); dirp = NULL; } - ret = get_configs(hdl, &pools, active_ok); + ret = get_configs(hdl, &pools, iarg->can_be_active); error: for (pe = pools.pools; pe != NULL; pe = penext) { @@ -951,27 +1293,12 @@ zpool_find_import_impl(libzfs_handle_t *hdl, int argc, char **argv, nvlist_t * zpool_find_import(libzfs_handle_t *hdl, int argc, char **argv) { - return (zpool_find_import_impl(hdl, argc, argv, B_FALSE, NULL, 0)); -} + importargs_t iarg = { 0 }; -nvlist_t * -zpool_find_import_byname(libzfs_handle_t *hdl, int argc, char **argv, - char *pool) -{ - return (zpool_find_import_impl(hdl, argc, argv, B_FALSE, pool, 0)); -} + iarg.paths = argc; + iarg.path = argv; -nvlist_t * -zpool_find_import_byguid(libzfs_handle_t *hdl, int argc, char **argv, - uint64_t guid) -{ - return (zpool_find_import_impl(hdl, argc, argv, B_FALSE, NULL, guid)); -} - -nvlist_t * -zpool_find_import_activeok(libzfs_handle_t *hdl, int argc, char **argv) -{ - return (zpool_find_import_impl(hdl, argc, argv, B_TRUE, NULL, 0)); + return (zpool_find_import_impl(hdl, &iarg)); } /* @@ -1093,6 +1420,46 @@ zpool_find_import_cached(libzfs_handle_t *hdl, const char *cachefile, return (pools); } +static int +name_or_guid_exists(zpool_handle_t *zhp, void *data) +{ + importargs_t *import = data; + int found = 0; + + if (import->poolname != NULL) { + char *pool_name; + + verify(nvlist_lookup_string(zhp->zpool_config, + ZPOOL_CONFIG_POOL_NAME, &pool_name) == 0); + if (strcmp(pool_name, import->poolname) == 0) + found = 1; + } else { + uint64_t pool_guid; + + verify(nvlist_lookup_uint64(zhp->zpool_config, + ZPOOL_CONFIG_POOL_GUID, &pool_guid) == 0); + if (pool_guid == import->guid) + found = 1; + } + + zpool_close(zhp); + return (found); +} + +nvlist_t * +zpool_search_import(libzfs_handle_t *hdl, importargs_t *import) +{ + verify(import->poolname == NULL || import->guid == 0); + + if (import->unique) + import->exists = zpool_iter(hdl, name_or_guid_exists, import); + + if (import->cachefile != NULL) + return (zpool_find_import_cached(hdl, import->cachefile, + import->poolname, import->guid)); + + return (zpool_find_import_impl(hdl, import)); +} boolean_t find_guid(nvlist_t *nv, uint64_t guid) diff --git a/external/cddl/osnet/dist/lib/libzfs/common/libzfs_mount.c 
b/external/cddl/osnet/dist/lib/libzfs/common/libzfs_mount.c index 7c5c7f3ecaeed..62348b6cedc11 100644 --- a/external/cddl/osnet/dist/lib/libzfs/common/libzfs_mount.c +++ b/external/cddl/osnet/dist/lib/libzfs/common/libzfs_mount.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -74,7 +74,6 @@ #include #include #include -#include #include #include @@ -236,18 +235,9 @@ dir_is_empty(const char *dirname) boolean_t is_mounted(libzfs_handle_t *zfs_hdl, const char *special, char **where) { - struct mnttab search = { 0 }, entry; - - /* - * Search for the entry in /etc/mnttab. We don't bother getting the - * mountpoint, as we can just search for the special device. This will - * also let us find mounts when the mountpoint is 'legacy'. - */ - search.mnt_special = (char *)special; - search.mnt_fstype = MNTTYPE_ZFS; + struct mnttab entry; - rewind(zfs_hdl->libzfs_mnttab); - if (getmntany(zfs_hdl->libzfs_mnttab, &entry, &search) != 0) + if (libzfs_mnttab_find(zfs_hdl, special, &entry) != 0) return (B_FALSE); if (where != NULL) @@ -358,12 +348,14 @@ zfs_mount(zfs_handle_t *zhp, const char *options, int flags) } else { zfs_error_aux(hdl, strerror(errno)); } - return (zfs_error_fmt(hdl, EZFS_MOUNTFAILED, dgettext(TEXT_DOMAIN, "cannot mount '%s'"), zhp->zfs_name)); } + /* add the mounted entry into our cache */ + libzfs_mnttab_add(hdl, zfs_get_name(zhp), mountpoint, + mntopts); return (0); } @@ -389,26 +381,23 @@ unmount_one(libzfs_handle_t *hdl, const char *mountpoint, int flags) int zfs_unmount(zfs_handle_t *zhp, const char *mountpoint, int flags) { - struct mnttab search = { 0 }, entry; + libzfs_handle_t *hdl = zhp->zfs_hdl; + struct mnttab entry; char *mntpt = NULL; - /* check to see if need to unmount the filesystem */ - search.mnt_special = zhp->zfs_name; - search.mnt_fstype = MNTTYPE_ZFS; - rewind(zhp->zfs_hdl->libzfs_mnttab); + /* check to see if we need to unmount the filesystem */ if (mountpoint != NULL || ((zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) && - getmntany(zhp->zfs_hdl->libzfs_mnttab, &entry, &search) == 0)) { - + libzfs_mnttab_find(hdl, zhp->zfs_name, &entry) == 0)) { /* * mountpoint may have come from a call to * getmnt/getmntany if it isn't NULL. If it is NULL, - * we know it comes from getmntany which can then get - * overwritten later. We strdup it to play it safe. + * we know it comes from libzfs_mnttab_find which can + * then get freed later. We strdup it to play it safe. */ if (mountpoint == NULL) - mntpt = zfs_strdup(zhp->zfs_hdl, entry.mnt_mountp); + mntpt = zfs_strdup(hdl, entry.mnt_mountp); else - mntpt = zfs_strdup(zhp->zfs_hdl, mountpoint); + mntpt = zfs_strdup(hdl, mountpoint); /* * Unshare and unmount the filesystem @@ -416,11 +405,12 @@ zfs_unmount(zfs_handle_t *zhp, const char *mountpoint, int flags) if (zfs_unshare_proto(zhp, mntpt, share_all_proto) != 0) return (-1); - if (unmount_one(zhp->zfs_hdl, mntpt, flags) != 0) { + if (unmount_one(hdl, mntpt, flags) != 0) { free(mntpt); (void) zfs_shareall(zhp); return (-1); } + libzfs_mnttab_remove(hdl, zhp->zfs_name); free(mntpt); } @@ -849,7 +839,7 @@ unshare_one(libzfs_handle_t *hdl, const char *name, const char *mountpoint, char *mntpt; /* * Mountpoint could get trashed if libshare calls getmntany - * which id does during API initialization, so strdup the + * which it does during API initialization, so strdup the * value. 
*/ mntpt = zfs_strdup(hdl, mountpoint); @@ -887,18 +877,17 @@ int zfs_unshare_proto(zfs_handle_t *zhp, const char *mountpoint, zfs_share_proto_t *proto) { - struct mnttab search = { 0 }, entry; + libzfs_handle_t *hdl = zhp->zfs_hdl; + struct mnttab entry; char *mntpt = NULL; /* check to see if need to unmount the filesystem */ - search.mnt_special = (char *)zfs_get_name(zhp); - search.mnt_fstype = MNTTYPE_ZFS; rewind(zhp->zfs_hdl->libzfs_mnttab); if (mountpoint != NULL) - mntpt = zfs_strdup(zhp->zfs_hdl, mountpoint); + mountpoint = mntpt = zfs_strdup(hdl, mountpoint); if (mountpoint != NULL || ((zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) && - getmntany(zhp->zfs_hdl->libzfs_mnttab, &entry, &search) == 0)) { + libzfs_mnttab_find(hdl, zfs_get_name(zhp), &entry) == 0)) { zfs_share_proto_t *curr_proto; if (mountpoint == NULL) @@ -907,8 +896,8 @@ zfs_unshare_proto(zfs_handle_t *zhp, const char *mountpoint, for (curr_proto = proto; *curr_proto != PROTO_END; curr_proto++) { - if (is_shared(zhp->zfs_hdl, mntpt, *curr_proto) && - unshare_one(zhp->zfs_hdl, zhp->zfs_name, + if (is_shared(hdl, mntpt, *curr_proto) && + unshare_one(hdl, zhp->zfs_name, mntpt, *curr_proto) != 0) { if (mntpt != NULL) free(mntpt); @@ -1191,10 +1180,12 @@ zpool_enable_datasets(zpool_handle_t *zhp, const char *mntopts, int flags) /* * And mount all the datasets, keeping track of which ones - * succeeded or failed. By using zfs_alloc(), the good pointer - * will always be non-NULL. + * succeeded or failed. */ - good = zfs_alloc(zhp->zpool_hdl, cb.cb_used * sizeof (int)); + if ((good = zfs_alloc(zhp->zpool_hdl, + cb.cb_used * sizeof (int))) == NULL) + goto out; + ret = 0; for (i = 0; i < cb.cb_used; i++) { if (zfs_mount(cb.cb_datasets[i], mntopts, flags) != 0) @@ -1224,26 +1215,19 @@ zpool_enable_datasets(zpool_handle_t *zhp, const char *mntopts, int flags) return (ret); } - +/*ARGSUSED1*/ static int -zvol_cb(const char *dataset, void *data) +zvol_cb(zfs_handle_t *zhp, void *unused) { - libzfs_handle_t *hdl = data; - zfs_handle_t *zhp; - - /* - * Ignore snapshots and ignore failures from non-existant datasets. - */ - if (strchr(dataset, '@') != NULL || - (zhp = zfs_open(hdl, dataset, ZFS_TYPE_VOLUME)) == NULL) - return (0); - - if (zfs_unshare_iscsi(zhp) != 0) - return (-1); + int error = 0; + if (zfs_get_type(zhp) == ZFS_TYPE_FILESYSTEM) + (void) zfs_iter_children(zhp, zvol_cb, NULL); + if (zfs_get_type(zhp) == ZFS_TYPE_VOLUME) + error = zfs_unshare_iscsi(zhp); zfs_close(zhp); - return (0); + return (error); } static int @@ -1255,6 +1239,8 @@ mountpoint_compare(const void *a, const void *b) return (strcmp(mountb, mounta)); } +/* alias for 2002/240 */ +#pragma weak zpool_unmount_datasets = zpool_disable_datasets /* * Unshare and unmount all datasets within the given pool. We don't want to * rely on traversing the DSL to discover the filesystems within the pool, @@ -1262,7 +1248,6 @@ mountpoint_compare(const void *a, const void *b) * arbitrarily (on I/O error, for example). Instead, we walk /etc/mnttab and * gather all the filesystems that are currently mounted. 
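As a hypothetical sketch (not part of this patch) of how these entry points fit together, a caller exporting a pool would unshare and unmount everything first and then issue the export:

	/*
	 * Hypothetical sketch: cleanly export a pool by disabling its
	 * datasets first; 'force' is passed through to both steps.
	 */
	static int
	export_pool(zpool_handle_t *zhp, boolean_t force)
	{
		if (zpool_disable_datasets(zhp, force) != 0)
			return (-1);
		return (zpool_export(zhp, force));
	}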
*/ -#pragma weak zpool_unmount_datasets = zpool_disable_datasets int zpool_disable_datasets(zpool_handle_t *zhp, boolean_t force) { @@ -1270,6 +1255,7 @@ zpool_disable_datasets(zpool_handle_t *zhp, boolean_t force) struct mnttab entry; size_t namelen; char **mountpoints = NULL; + zfs_handle_t *zfp; zfs_handle_t **datasets = NULL; libzfs_handle_t *hdl = zhp->zpool_hdl; int i; @@ -1279,8 +1265,12 @@ zpool_disable_datasets(zpool_handle_t *zhp, boolean_t force) /* * First unshare all zvols. */ - if (zpool_iter_zvol(zhp, zvol_cb, hdl) != 0) - return (-1); + zfp = zfs_open(zhp->zpool_hdl, zhp->zpool_name, + ZFS_TYPE_FILESYSTEM); + if (zfp != NULL) { + (void) zfs_iter_children(zfp, zvol_cb, NULL); + zfs_close(zfp); + } namelen = strlen(zhp->zpool_name); diff --git a/external/cddl/osnet/dist/lib/libzfs/common/libzfs_pool.c b/external/cddl/osnet/dist/lib/libzfs/common/libzfs_pool.c index 18ceb4859654e..3c0f46815b49a 100644 --- a/external/cddl/osnet/dist/lib/libzfs/common/libzfs_pool.c +++ b/external/cddl/osnet/dist/lib/libzfs/common/libzfs_pool.c @@ -20,32 +20,72 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#include -#include #include #include #include -#include #include #include #include #include #include #include -#include #include #include #include -#include -#include +#include #include "zfs_namecheck.h" #include "zfs_prop.h" #include "libzfs_impl.h" +#include "zfs_comutil.h" + +const char *hist_event_table[LOG_END] = { + "invalid event", + "pool create", + "vdev add", + "pool remove", + "pool destroy", + "pool export", + "pool import", + "vdev attach", + "vdev replace", + "vdev detach", + "vdev online", + "vdev offline", + "vdev upgrade", + "pool clear", + "pool scrub", + "pool property set", + "create", + "clone", + "destroy", + "destroy_begin_sync", + "inherit", + "property set", + "quota set", + "permission update", + "permission remove", + "permission who remove", + "promote", + "receive", + "rename", + "reservation set", + "replay_inc_sync", + "replay_full_sync", + "rollback", + "snapshot", + "filesystem version upgrade", + "refquota set", + "refreservation set", + "pool scrub done", + "user hold", + "user release", + "pool split", +}; static int read_efi_label(nvlist_t *config, diskaddr_t *sb); @@ -55,6 +95,10 @@ static int read_efi_label(nvlist_t *config, diskaddr_t *sb); #define BOOTCMD "installboot(1M)" #endif +#define DISK_ROOT "/dev/dsk" +#define RDISK_ROOT "/dev/rdsk" +#define BACKUP_SLICE "s2" + /* * ==================================================================== * zpool property functions @@ -188,6 +232,8 @@ zpool_state_to_name(vdev_state_t state, vdev_aux_t aux) case VDEV_STATE_CANT_OPEN: if (aux == VDEV_AUX_CORRUPT_DATA || aux == VDEV_AUX_BAD_LOG) return (gettext("FAULTED")); + else if (aux == VDEV_AUX_SPLIT_POOL) + return (gettext("SPLIT")); else return (gettext("UNAVAIL")); case VDEV_STATE_FAULTED: @@ -217,12 +263,39 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, size_t len, uint_t vsc; if (zpool_get_state(zhp) == POOL_STATE_UNAVAIL) { - if (prop == ZPOOL_PROP_NAME) + switch (prop) { + case ZPOOL_PROP_NAME: (void) strlcpy(buf, zpool_get_name(zhp), len); - else if (prop == ZPOOL_PROP_HEALTH) + break; + + case ZPOOL_PROP_HEALTH: (void) strlcpy(buf, "FAULTED", len); - else + break; + + case ZPOOL_PROP_GUID: + intval = zpool_get_prop_int(zhp, prop, &src); + (void) snprintf(buf, len, "%llu", intval); + break; + + case 
ZPOOL_PROP_ALTROOT: + case ZPOOL_PROP_CACHEFILE: + if (zhp->zpool_props != NULL || + zpool_get_all_props(zhp) == 0) { + (void) strlcpy(buf, + zpool_get_prop_string(zhp, prop, &src), + len); + if (srctype != NULL) + *srctype = src; + return (0); + } + /* FALLTHROUGH */ + default: (void) strlcpy(buf, "-", len); + break; + } + + if (srctype != NULL) + *srctype = src; return (0); } @@ -241,8 +314,8 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, size_t len, switch (prop) { case ZPOOL_PROP_SIZE: - case ZPOOL_PROP_USED: - case ZPOOL_PROP_AVAILABLE: + case ZPOOL_PROP_ALLOCATED: + case ZPOOL_PROP_FREE: (void) zfs_nicenum(intval, buf, len); break; @@ -251,6 +324,12 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, size_t len, (u_longlong_t)intval); break; + case ZPOOL_PROP_DEDUPRATIO: + (void) snprintf(buf, len, "%llu.%02llux", + (u_longlong_t)(intval / 100), + (u_longlong_t)(intval % 100)); + break; + case ZPOOL_PROP_HEALTH: verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL), ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); @@ -532,9 +611,6 @@ zpool_set_prop(zpool_handle_t *zhp, const char *propname, const char *propval) dgettext(TEXT_DOMAIN, "cannot set property for '%s'"), zhp->zpool_name); - if (zhp->zpool_props == NULL && zpool_get_all_props(zhp)) - return (zfs_error(zhp->zpool_hdl, EZFS_POOLPROPS, errbuf)); - if (nvlist_alloc(&nvl, NV_UNIQUE_NAME, 0) != 0) return (no_memory(zhp->zpool_hdl)); @@ -603,6 +679,12 @@ zpool_expand_proplist(zpool_handle_t *zhp, zprop_list_t **plp) } +/* + * Don't start the slice at the default block of 34; many storage + * devices will use a stripe width of 128k, so start there instead. + */ +#define NEW_START_BLOCK 256 + /* * Validate the given pool name, optionally putting an extended error message in * 'buf'. @@ -969,9 +1051,6 @@ zpool_destroy(zpool_handle_t *zhp) ZFS_TYPE_FILESYSTEM)) == NULL) return (-1); - if (zpool_remove_zvol_links(zhp) != 0) - return (-1); - (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_DESTROY, &zc) != 0) { @@ -1037,7 +1116,8 @@ zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot) zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "device '%s' contains an EFI label and " "cannot be used on root pools."), - zpool_vdev_name(hdl, NULL, spares[s])); + zpool_vdev_name(hdl, NULL, spares[s], + B_FALSE)); return (zfs_error(hdl, EZFS_POOL_NOTSUP, msg)); } } @@ -1127,19 +1207,17 @@ zpool_add(zpool_handle_t *zhp, nvlist_t *nvroot) * mounted datasets in the pool. 
*/ int -zpool_export(zpool_handle_t *zhp, boolean_t force) +zpool_export_common(zpool_handle_t *zhp, boolean_t force, boolean_t hardforce) { zfs_cmd_t zc = { 0 }; char msg[1024]; - if (zpool_remove_zvol_links(zhp) != 0) - return (-1); - (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "cannot export '%s'"), zhp->zpool_name); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zc.zc_cookie = force; + zc.zc_guid = hardforce; if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_POOL_EXPORT, &zc) != 0) { switch (errno) { @@ -1160,6 +1238,139 @@ zpool_export(zpool_handle_t *zhp, boolean_t force) return (0); } +int +zpool_export(zpool_handle_t *zhp, boolean_t force) +{ + return (zpool_export_common(zhp, force, B_FALSE)); +} + +int +zpool_export_force(zpool_handle_t *zhp) +{ + return (zpool_export_common(zhp, B_TRUE, B_TRUE)); +} + +static void +zpool_rewind_exclaim(libzfs_handle_t *hdl, const char *name, boolean_t dryrun, + nvlist_t *rbi) +{ + uint64_t rewindto; + int64_t loss = -1; + struct tm t; + char timestr[128]; + + if (!hdl->libzfs_printerr || rbi == NULL) + return; + + if (nvlist_lookup_uint64(rbi, ZPOOL_CONFIG_LOAD_TIME, &rewindto) != 0) + return; + (void) nvlist_lookup_int64(rbi, ZPOOL_CONFIG_REWIND_TIME, &loss); + + if (localtime_r((time_t *)&rewindto, &t) != NULL && + strftime(timestr, 128, 0, &t) != 0) { + if (dryrun) { + (void) printf(dgettext(TEXT_DOMAIN, + "Would be able to return %s " + "to its state as of %s.\n"), + name, timestr); + } else { + (void) printf(dgettext(TEXT_DOMAIN, + "Pool %s returned to its state as of %s.\n"), + name, timestr); + } + if (loss > 120) { + (void) printf(dgettext(TEXT_DOMAIN, + "%s approximately %lld "), + dryrun ? "Would discard" : "Discarded", + (loss + 30) / 60); + (void) printf(dgettext(TEXT_DOMAIN, + "minutes of transactions.\n")); + } else if (loss > 0) { + (void) printf(dgettext(TEXT_DOMAIN, + "%s approximately %lld "), + dryrun ? "Would discard" : "Discarded", loss); + (void) printf(dgettext(TEXT_DOMAIN, + "seconds of transactions.\n")); + } + } +} + +void +zpool_explain_recover(libzfs_handle_t *hdl, const char *name, int reason, + nvlist_t *config) +{ + int64_t loss = -1; + uint64_t edata = UINT64_MAX; + uint64_t rewindto; + struct tm t; + char timestr[128]; + + if (!hdl->libzfs_printerr) + return; + + if (reason >= 0) + (void) printf(dgettext(TEXT_DOMAIN, "action: ")); + else + (void) printf(dgettext(TEXT_DOMAIN, "\t")); + + /* All attempted rewinds failed if ZPOOL_CONFIG_LOAD_TIME missing */ + if (nvlist_lookup_uint64(config, + ZPOOL_CONFIG_LOAD_TIME, &rewindto) != 0) + goto no_info; + + (void) nvlist_lookup_int64(config, ZPOOL_CONFIG_REWIND_TIME, &loss); + (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_LOAD_DATA_ERRORS, + &edata); + + (void) printf(dgettext(TEXT_DOMAIN, + "Recovery is possible, but will result in some data loss.\n")); + + if (localtime_r((time_t *)&rewindto, &t) != NULL && + strftime(timestr, 128, 0, &t) != 0) { + (void) printf(dgettext(TEXT_DOMAIN, + "\tReturning the pool to its state as of %s\n" + "\tshould correct the problem. "), + timestr); + } else { + (void) printf(dgettext(TEXT_DOMAIN, + "\tReverting the pool to an earlier state " + "should correct the problem.\n\t")); + } + + if (loss > 120) { + (void) printf(dgettext(TEXT_DOMAIN, + "Approximately %lld minutes of data\n" + "\tmust be discarded, irreversibly. "), (loss + 30) / 60); + } else if (loss > 0) { + (void) printf(dgettext(TEXT_DOMAIN, + "Approximately %lld seconds of data\n" + "\tmust be discarded, irreversibly. 
"), loss); + } + if (edata != 0 && edata != UINT64_MAX) { + if (edata == 1) { + (void) printf(dgettext(TEXT_DOMAIN, + "After rewind, at least\n" + "\tone persistent user-data error will remain. ")); + } else { + (void) printf(dgettext(TEXT_DOMAIN, + "After rewind, several\n" + "\tpersistent user-data errors will remain. ")); + } + } + (void) printf(dgettext(TEXT_DOMAIN, + "Recovery can be attempted\n\tby executing 'zpool %s -F %s'. "), + reason >= 0 ? "clear" : "import", name); + + (void) printf(dgettext(TEXT_DOMAIN, + "A scrub of the pool\n" + "\tis strongly recommended after recovery.\n")); + return; + +no_info: + (void) printf(dgettext(TEXT_DOMAIN, + "Destroy and re-create the pool from\n\ta backup source.\n")); +} + /* * zpool_import() is a contracted interface. Should be kept the same * if possible. @@ -1209,8 +1420,11 @@ zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname, nvlist_t *props, boolean_t importfaulted) { zfs_cmd_t zc = { 0 }; + zpool_rewind_policy_t policy; + nvlist_t *nvi = NULL; char *thename; char *origname; + uint64_t returned_size; int ret; char errbuf[1024]; @@ -1254,11 +1468,30 @@ zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname, nvlist_free(props); return (-1); } + returned_size = zc.zc_nvlist_conf_size + 512; + if (zcmd_alloc_dst_nvlist(hdl, &zc, returned_size) != 0) { + nvlist_free(props); + return (-1); + } zc.zc_cookie = (uint64_t)importfaulted; ret = 0; if (zfs_ioctl(hdl, ZFS_IOC_POOL_IMPORT, &zc) != 0) { char desc[1024]; + + (void) zcmd_read_dst_nvlist(hdl, &zc, &nvi); + zpool_get_rewind_policy(config, &policy); + /* + * Dry-run failed, but we print out what success + * looks like if we found a best txg + */ + if ((policy.zrp_request & ZPOOL_TRY_REWIND) && nvi) { + zpool_rewind_exclaim(hdl, newname ? origname : thename, + B_TRUE, nvi); + nvlist_free(nvi); + return (-1); + } + if (newname == NULL) (void) snprintf(desc, sizeof (desc), dgettext(TEXT_DOMAIN, "cannot import '%s'"), @@ -1281,7 +1514,12 @@ zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname, break; default: + (void) zcmd_read_dst_nvlist(hdl, &zc, &nvi); (void) zpool_standard_error(hdl, errno, desc); + zpool_explain_recover(hdl, + newname ? origname : thename, -errno, nvi); + nvlist_free(nvi); + break; } ret = -1; @@ -1291,13 +1529,20 @@ zpool_import_props(libzfs_handle_t *hdl, nvlist_t *config, const char *newname, /* * This should never fail, but play it safe anyway. */ - if (zpool_open_silent(hdl, thename, &zhp) != 0) { + if (zpool_open_silent(hdl, thename, &zhp) != 0) ret = -1; - } else if (zhp != NULL) { - ret = zpool_create_zvol_links(zhp); + else if (zhp != NULL) zpool_close(zhp); + (void) zcmd_read_dst_nvlist(hdl, &zc, &nvi); + zpool_get_rewind_policy(config, &policy); + if (policy.zrp_request & + (ZPOOL_DO_REWIND | ZPOOL_TRY_REWIND)) { + zpool_rewind_exclaim(hdl, newname ? origname : thename, + ((policy.zrp_request & ZPOOL_TRY_REWIND) != 0), + nvi); } - + nvlist_free(nvi); + return (0); } zcmd_free_nvlists(&zc); @@ -1332,46 +1577,137 @@ zpool_scrub(zpool_handle_t *zhp, pool_scrub_type_t type) } /* + * Find a vdev that matches the search criteria specified. We use the + * the nvpair name to determine how we should look for the device. * 'avail_spare' is set to TRUE if the provided guid refers to an AVAIL * spare; but FALSE if its an INUSE spare. 
*/ static nvlist_t * -vdev_to_nvlist_iter(nvlist_t *nv, const char *search, uint64_t guid, - boolean_t *avail_spare, boolean_t *l2cache, boolean_t *log) +vdev_to_nvlist_iter(nvlist_t *nv, nvlist_t *search, boolean_t *avail_spare, + boolean_t *l2cache, boolean_t *log) { uint_t c, children; nvlist_t **child; - uint64_t theguid, present; - char *path; - uint64_t wholedisk = 0; nvlist_t *ret; uint64_t is_log; + char *srchkey; + nvpair_t *pair = nvlist_next_nvpair(search, NULL); + + /* Nothing to look for */ + if (search == NULL || pair == NULL) + return (NULL); + + /* Obtain the key we will use to search */ + srchkey = nvpair_name(pair); + + switch (nvpair_type(pair)) { + case DATA_TYPE_UINT64: { + uint64_t srchval, theguid, present; + + verify(nvpair_value_uint64(pair, &srchval) == 0); + if (strcmp(srchkey, ZPOOL_CONFIG_GUID) == 0) { + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, + &present) == 0) { + /* + * If the device has never been present since + * import, the only reliable way to match the + * vdev is by GUID. + */ + verify(nvlist_lookup_uint64(nv, + ZPOOL_CONFIG_GUID, &theguid) == 0); + if (theguid == srchval) + return (nv); + } + } + break; + } + + case DATA_TYPE_STRING: { + char *srchval, *val; - verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &theguid) == 0); + verify(nvpair_value_string(pair, &srchval) == 0); + if (nvlist_lookup_string(nv, srchkey, &val) != 0) + break; - if (search == NULL && - nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, &present) == 0) { /* - * If the device has never been present since import, the only - * reliable way to match the vdev is by GUID. + * Search for the requested value. We special case the search + * for ZPOOL_CONFIG_PATH when it's a wholedisk and when + * Looking for a top-level vdev name (i.e. ZPOOL_CONFIG_TYPE). + * Otherwise, all other searches are simple string compares. */ - if (theguid == guid) - return (nv); - } else if (search != NULL && - nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) == 0) { - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, - &wholedisk); - if (wholedisk) { + if (strcmp(srchkey, ZPOOL_CONFIG_PATH) == 0 && val) { + uint64_t wholedisk = 0; + + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_WHOLE_DISK, + &wholedisk); + if (wholedisk) { + /* + * For whole disks, the internal path has 's0', + * but the path passed in by the user doesn't. + */ + if (strlen(srchval) == strlen(val) - 2 && + strncmp(srchval, val, strlen(srchval)) == 0) + return (nv); + break; + } + } else if (strcmp(srchkey, ZPOOL_CONFIG_TYPE) == 0 && val) { + char *type, *idx, *end, *p; + uint64_t id, vdev_id; + + /* + * Determine our vdev type, keeping in mind + * that the srchval is composed of a type and + * vdev id pair (i.e. mirror-4). + */ + if ((type = strdup(srchval)) == NULL) + return (NULL); + + if ((p = strrchr(type, '-')) == NULL) { + free(type); + break; + } + idx = p + 1; + *p = '\0'; + /* - * For whole disks, the internal path has 's0', but the - * path passed in by the user doesn't. + * If the types don't match then keep looking. 
*/ - if (strlen(search) == strlen(path) - 2 && - strncmp(search, path, strlen(search)) == 0) + if (strncmp(val, type, strlen(val)) != 0) { + free(type); + break; + } + + verify(strncmp(type, VDEV_TYPE_RAIDZ, + strlen(VDEV_TYPE_RAIDZ)) == 0 || + strncmp(type, VDEV_TYPE_MIRROR, + strlen(VDEV_TYPE_MIRROR)) == 0); + verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, + &id) == 0); + + errno = 0; + vdev_id = strtoull(idx, &end, 10); + + free(type); + if (errno != 0) + return (NULL); + + /* + * Now verify that we have the correct vdev id. + */ + if (vdev_id == id) return (nv); - } else if (strcmp(search, path) == 0) { - return (nv); } + + /* + * Common case + */ + if (strcmp(srchval, val) == 0) + return (nv); + break; + } + + default: + break; } if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, @@ -1379,7 +1715,7 @@ vdev_to_nvlist_iter(nvlist_t *nv, const char *search, uint64_t guid, return (NULL); for (c = 0; c < children; c++) { - if ((ret = vdev_to_nvlist_iter(child[c], search, guid, + if ((ret = vdev_to_nvlist_iter(child[c], search, avail_spare, l2cache, NULL)) != NULL) { /* * The 'is_log' value is only set for the toplevel @@ -1400,7 +1736,7 @@ vdev_to_nvlist_iter(nvlist_t *nv, const char *search, uint64_t guid, if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_SPARES, &child, &children) == 0) { for (c = 0; c < children; c++) { - if ((ret = vdev_to_nvlist_iter(child[c], search, guid, + if ((ret = vdev_to_nvlist_iter(child[c], search, avail_spare, l2cache, NULL)) != NULL) { *avail_spare = B_TRUE; return (ret); @@ -1411,7 +1747,7 @@ vdev_to_nvlist_iter(nvlist_t *nv, const char *search, uint64_t guid, if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_L2CACHE, &child, &children) == 0) { for (c = 0; c < children; c++) { - if ((ret = vdev_to_nvlist_iter(child[c], search, guid, + if ((ret = vdev_to_nvlist_iter(child[c], search, avail_spare, l2cache, NULL)) != NULL) { *l2cache = B_TRUE; return (ret); @@ -1422,24 +1758,62 @@ vdev_to_nvlist_iter(nvlist_t *nv, const char *search, uint64_t guid, return (NULL); } +/* + * Given a physical path (minus the "/devices" prefix), find the + * associated vdev. + */ +nvlist_t * +zpool_find_vdev_by_physpath(zpool_handle_t *zhp, const char *ppath, + boolean_t *avail_spare, boolean_t *l2cache, boolean_t *log) +{ + nvlist_t *search, *nvroot, *ret; + + verify(nvlist_alloc(&search, NV_UNIQUE_NAME, KM_SLEEP) == 0); + verify(nvlist_add_string(search, ZPOOL_CONFIG_PHYS_PATH, ppath) == 0); + + verify(nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0); + + *avail_spare = B_FALSE; + ret = vdev_to_nvlist_iter(nvroot, search, avail_spare, l2cache, log); + nvlist_free(search); + + return (ret); +} + +/* + * Determine if we have an "interior" top-level vdev (i.e mirror/raidz). 
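The search nvlist carries a single nvpair, and its name selects the match strategy above (GUID, path, or type-index). As a hypothetical sketch (not part of this patch), the new type-index form lets a caller name an interior top-level vdev such as "mirror-1" directly through zpool_find_vdev(), shown just below:

	/*
	 * Hypothetical sketch: check whether a named top-level vdev
	 * (e.g. "mirror-1" or "raidz-0") exists in the pool.
	 */
	static boolean_t
	toplevel_exists(zpool_handle_t *zhp, const char *name)
	{
		boolean_t spare, l2cache, log;

		return (zpool_find_vdev(zhp, name, &spare, &l2cache,
		    &log) != NULL);
	}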
+ */ +boolean_t +zpool_vdev_is_interior(const char *name) +{ + if (strncmp(name, VDEV_TYPE_RAIDZ, strlen(VDEV_TYPE_RAIDZ)) == 0 || + strncmp(name, VDEV_TYPE_MIRROR, strlen(VDEV_TYPE_MIRROR)) == 0) + return (B_TRUE); + return (B_FALSE); +} + nvlist_t * zpool_find_vdev(zpool_handle_t *zhp, const char *path, boolean_t *avail_spare, boolean_t *l2cache, boolean_t *log) { char buf[MAXPATHLEN]; - const char *search; char *end; - nvlist_t *nvroot; + nvlist_t *nvroot, *search, *ret; uint64_t guid; + verify(nvlist_alloc(&search, NV_UNIQUE_NAME, KM_SLEEP) == 0); + guid = strtoull(path, &end, 10); if (guid != 0 && *end == '\0') { - search = NULL; + verify(nvlist_add_uint64(search, ZPOOL_CONFIG_GUID, guid) == 0); + } else if (zpool_vdev_is_interior(path)) { + verify(nvlist_add_string(search, ZPOOL_CONFIG_TYPE, path) == 0); } else if (path[0] != '/') { (void) snprintf(buf, sizeof (buf), "%s%s", "/dev/dsk/", path); - search = buf; + verify(nvlist_add_string(search, ZPOOL_CONFIG_PATH, buf) == 0); } else { - search = path; + verify(nvlist_add_string(search, ZPOOL_CONFIG_PATH, path) == 0); } verify(nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE, @@ -1449,8 +1823,10 @@ zpool_find_vdev(zpool_handle_t *zhp, const char *path, boolean_t *avail_spare, *l2cache = B_FALSE; if (log != NULL) *log = B_FALSE; - return (vdev_to_nvlist_iter(nvroot, search, guid, avail_spare, - l2cache, log)); + ret = vdev_to_nvlist_iter(nvroot, search, avail_spare, l2cache, log); + nvlist_free(search); + + return (ret); } static int @@ -1467,106 +1843,178 @@ vdev_online(nvlist_t *nv) } /* - * Get phys_path for a root pool - * Return 0 on success; non-zeron on failure. + * Helper function for zpool_get_physpaths(). */ -int -zpool_get_physpath(zpool_handle_t *zhp, char *physpath) +static int +vdev_get_one_physpath(nvlist_t *config, char *physpath, size_t physpath_size, + size_t *bytes_written) +{ + size_t bytes_left, pos, rsz; + char *tmppath; + const char *format; + + if (nvlist_lookup_string(config, ZPOOL_CONFIG_PHYS_PATH, + &tmppath) != 0) + return (EZFS_NODEVICE); + + pos = *bytes_written; + bytes_left = physpath_size - pos; + format = (pos == 0) ? "%s" : " %s"; + + rsz = snprintf(physpath + pos, bytes_left, format, tmppath); + *bytes_written += rsz; + + if (rsz >= bytes_left) { + /* if physpath was not copied properly, clear it */ + if (bytes_left != 0) { + physpath[pos] = 0; + } + return (EZFS_NOSPC); + } + return (0); +} + +static int +vdev_get_physpaths(nvlist_t *nv, char *physpath, size_t phypath_size, + size_t *rsz, boolean_t is_spare) +{ + char *type; + int ret; + + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type) != 0) + return (EZFS_INVALCONFIG); + + if (strcmp(type, VDEV_TYPE_DISK) == 0) { + /* + * An active spare device has ZPOOL_CONFIG_IS_SPARE set. + * For a spare vdev, we only want to boot from the active + * spare device. 
+ */ + if (is_spare) { + uint64_t spare = 0; + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE, + &spare); + if (!spare) + return (EZFS_INVALCONFIG); + } + + if (vdev_online(nv)) { + if ((ret = vdev_get_one_physpath(nv, physpath, + phypath_size, rsz)) != 0) + return (ret); + } + } else if (strcmp(type, VDEV_TYPE_MIRROR) == 0 || + strcmp(type, VDEV_TYPE_REPLACING) == 0 || + (is_spare = (strcmp(type, VDEV_TYPE_SPARE) == 0))) { + nvlist_t **child; + uint_t count; + int i, ret; + + if (nvlist_lookup_nvlist_array(nv, + ZPOOL_CONFIG_CHILDREN, &child, &count) != 0) + return (EZFS_INVALCONFIG); + + for (i = 0; i < count; i++) { + ret = vdev_get_physpaths(child[i], physpath, + phypath_size, rsz, is_spare); + if (ret == EZFS_NOSPC) + return (ret); + } + } + + return (EZFS_POOL_INVALARG); +} + +/* + * Get phys_path for a root pool config. + * Return 0 on success; non-zero on failure. + */ +static int +zpool_get_config_physpath(nvlist_t *config, char *physpath, size_t phypath_size) { + size_t rsz; nvlist_t *vdev_root; nvlist_t **child; uint_t count; - int i; + char *type; - /* - * Make sure this is a root pool, as phys_path doesn't mean - * anything to a non-root pool. - */ - if (!pool_is_bootable(zhp)) - return (-1); + rsz = 0; - verify(nvlist_lookup_nvlist(zhp->zpool_config, - ZPOOL_CONFIG_VDEV_TREE, &vdev_root) == 0); + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &vdev_root) != 0) + return (EZFS_INVALCONFIG); - if (nvlist_lookup_nvlist_array(vdev_root, ZPOOL_CONFIG_CHILDREN, + if (nvlist_lookup_string(vdev_root, ZPOOL_CONFIG_TYPE, &type) != 0 || + nvlist_lookup_nvlist_array(vdev_root, ZPOOL_CONFIG_CHILDREN, &child, &count) != 0) - return (-2); + return (EZFS_INVALCONFIG); - for (i = 0; i < count; i++) { - nvlist_t **child2; - uint_t count2; - char *type; - char *tmppath; - int j; + /* + * root pool can not have EFI labeled disks and can only have + * a single top-level vdev. + */ + if (strcmp(type, VDEV_TYPE_ROOT) != 0 || count != 1 || + pool_uses_efi(vdev_root)) + return (EZFS_POOL_INVALARG); - if (nvlist_lookup_string(child[i], ZPOOL_CONFIG_TYPE, &type) - != 0) - return (-3); - - if (strcmp(type, VDEV_TYPE_DISK) == 0) { - if (!vdev_online(child[i])) - return (-8); - verify(nvlist_lookup_string(child[i], - ZPOOL_CONFIG_PHYS_PATH, &tmppath) == 0); - (void) strncpy(physpath, tmppath, strlen(tmppath)); - } else if (strcmp(type, VDEV_TYPE_MIRROR) == 0) { - if (nvlist_lookup_nvlist_array(child[i], - ZPOOL_CONFIG_CHILDREN, &child2, &count2) != 0) - return (-4); - - for (j = 0; j < count2; j++) { - if (!vdev_online(child2[j])) - return (-8); - if (nvlist_lookup_string(child2[j], - ZPOOL_CONFIG_PHYS_PATH, &tmppath) != 0) - return (-5); - - if ((strlen(physpath) + strlen(tmppath)) > - MAXNAMELEN) - return (-6); - - if (strlen(physpath) == 0) { - (void) strncpy(physpath, tmppath, - strlen(tmppath)); - } else { - (void) strcat(physpath, " "); - (void) strcat(physpath, tmppath); - } - } - } else { - return (-7); - } - } + (void) vdev_get_physpaths(child[0], physpath, phypath_size, &rsz, + B_FALSE); + + /* No online devices */ + if (rsz == 0) + return (EZFS_NODEVICE); return (0); } /* - * Returns TRUE if the given guid corresponds to the given type. - * This is used to check for hot spares (INUSE or not), and level 2 cache - * devices. + * Get phys_path for a root pool + * Return 0 on success; non-zero on failure. 
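As a hypothetical usage sketch (not part of this patch), a boot-related consumer would call the new size-checked zpool_get_physpath(), defined just below, with its own buffer; the buffer size here is an arbitrary choice for the sketch, since a mirrored root pool returns several space-separated paths:

	/*
	 * Hypothetical sketch: print the physical path(s) of the root
	 * pool's boot devices.
	 */
	static void
	print_bootpath(zpool_handle_t *zhp)
	{
		char physpath[MAXPATHLEN];

		if (zpool_get_physpath(zhp, physpath, sizeof (physpath)) == 0)
			(void) printf("%s\n", physpath);
	}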
*/ -static boolean_t -is_guid_type(zpool_handle_t *zhp, uint64_t guid, const char *type) +int +zpool_get_physpath(zpool_handle_t *zhp, char *physpath, size_t phypath_size) { - uint64_t target_guid; - nvlist_t *nvroot; - nvlist_t **list; - uint_t count; - int i; + return (zpool_get_config_physpath(zhp->zpool_config, physpath, + phypath_size)); +} - verify(nvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE, - &nvroot) == 0); - if (nvlist_lookup_nvlist_array(nvroot, type, &list, &count) == 0) { - for (i = 0; i < count; i++) { - verify(nvlist_lookup_uint64(list[i], ZPOOL_CONFIG_GUID, - &target_guid) == 0); - if (guid == target_guid) - return (B_TRUE); - } +/* + * If the device has being dynamically expanded then we need to relabel + * the disk to use the new unallocated space. + */ +static int +zpool_relabel_disk(libzfs_handle_t *hdl, const char *name) +{ + char path[MAXPATHLEN]; + char errbuf[1024]; + int fd, error; + int (*_efi_use_whole_disk)(int); + + if ((_efi_use_whole_disk = (int (*)(int))dlsym(RTLD_DEFAULT, + "efi_use_whole_disk")) == NULL) + return (-1); + + (void) snprintf(path, sizeof (path), "%s/%s", RDISK_ROOT, name); + + if ((fd = open(path, O_RDWR | O_NDELAY)) < 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot " + "relabel '%s': unable to open device"), name); + return (zfs_error(hdl, EZFS_OPENFAILED, errbuf)); } - return (B_FALSE); + /* + * It's possible that we might encounter an error if the device + * does not have any unallocated space left. If so, we simply + * ignore that error and continue on. + */ + error = _efi_use_whole_disk(fd); + (void) close(fd); + if (error && error != VT_ENOSPC) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot " + "relabel '%s': unable to read disk capacity"), name); + return (zfs_error(hdl, EZFS_NOCAP, errbuf)); + } + return (0); } /* @@ -1580,28 +2028,64 @@ zpool_vdev_online(zpool_handle_t *zhp, const char *path, int flags, zfs_cmd_t zc = { 0 }; char msg[1024]; nvlist_t *tgt; - boolean_t avail_spare, l2cache; + boolean_t avail_spare, l2cache, islog; libzfs_handle_t *hdl = zhp->zpool_hdl; - (void) snprintf(msg, sizeof (msg), - dgettext(TEXT_DOMAIN, "cannot online %s"), path); + if (flags & ZFS_ONLINE_EXPAND) { + (void) snprintf(msg, sizeof (msg), + dgettext(TEXT_DOMAIN, "cannot expand %s"), path); + } else { + (void) snprintf(msg, sizeof (msg), + dgettext(TEXT_DOMAIN, "cannot online %s"), path); + } (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache, - NULL)) == NULL) + &islog)) == NULL) return (zfs_error(hdl, EZFS_NODEVICE, msg)); verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0); - if (avail_spare || - is_guid_type(zhp, zc.zc_guid, ZPOOL_CONFIG_SPARES) == B_TRUE) + if (avail_spare) return (zfs_error(hdl, EZFS_ISSPARE, msg)); + if (flags & ZFS_ONLINE_EXPAND || + zpool_get_prop_int(zhp, ZPOOL_PROP_AUTOEXPAND, NULL)) { + char *pathname = NULL; + uint64_t wholedisk = 0; + + (void) nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_WHOLE_DISK, + &wholedisk); + verify(nvlist_lookup_string(tgt, ZPOOL_CONFIG_PATH, + &pathname) == 0); + + /* + * XXX - L2ARC 1.0 devices can't support expansion. 
+ */ + if (l2cache) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "cannot expand cache devices")); + return (zfs_error(hdl, EZFS_VDEVNOTSUP, msg)); + } + + if (wholedisk) { + pathname += strlen(DISK_ROOT) + 1; + (void) zpool_relabel_disk(zhp->zpool_hdl, pathname); + } + } + zc.zc_cookie = VDEV_STATE_ONLINE; zc.zc_obj = flags; - if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_VDEV_SET_STATE, &zc) != 0) + if (zfs_ioctl(zhp->zpool_hdl, ZFS_IOC_VDEV_SET_STATE, &zc) != 0) { + if (errno == EINVAL) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "was split " + "from this pool into a new one. Use '%s' " + "instead"), "zpool detach"); + return (zfs_error(hdl, EZFS_POSTSPLIT_ONLINE, msg)); + } return (zpool_standard_error(hdl, errno, msg)); + } *newstate = zc.zc_cookie; return (0); @@ -1629,8 +2113,7 @@ zpool_vdev_offline(zpool_handle_t *zhp, const char *path, boolean_t istmp) verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0); - if (avail_spare || - is_guid_type(zhp, zc.zc_guid, ZPOOL_CONFIG_SPARES) == B_TRUE) + if (avail_spare) return (zfs_error(hdl, EZFS_ISSPARE, msg)); zc.zc_cookie = VDEV_STATE_OFFLINE; @@ -1647,6 +2130,12 @@ zpool_vdev_offline(zpool_handle_t *zhp, const char *path, boolean_t istmp) */ return (zfs_error(hdl, EZFS_NOREPLICAS, msg)); + case EEXIST: + /* + * The log device has unplayed logs + */ + return (zfs_error(hdl, EZFS_UNPLAYED_LOGS, msg)); + default: return (zpool_standard_error(hdl, errno, msg)); } @@ -1656,7 +2145,7 @@ zpool_vdev_offline(zpool_handle_t *zhp, const char *path, boolean_t istmp) * Mark the given vdev faulted. */ int -zpool_vdev_fault(zpool_handle_t *zhp, uint64_t guid) +zpool_vdev_fault(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux) { zfs_cmd_t zc = { 0 }; char msg[1024]; @@ -1668,6 +2157,7 @@ zpool_vdev_fault(zpool_handle_t *zhp, uint64_t guid) (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zc.zc_guid = guid; zc.zc_cookie = VDEV_STATE_FAULTED; + zc.zc_obj = aux; if (ioctl(zhp->zpool_hdl->libzfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0) return (0); @@ -1690,7 +2180,7 @@ zpool_vdev_fault(zpool_handle_t *zhp, uint64_t guid) * Mark the given vdev degraded. */ int -zpool_vdev_degrade(zpool_handle_t *zhp, uint64_t guid) +zpool_vdev_degrade(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux) { zfs_cmd_t zc = { 0 }; char msg[1024]; @@ -1702,6 +2192,7 @@ zpool_vdev_degrade(zpool_handle_t *zhp, uint64_t guid) (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zc.zc_guid = guid; zc.zc_cookie = VDEV_STATE_DEGRADED; + zc.zc_obj = aux; if (ioctl(zhp->zpool_hdl->libzfs_fd, ZFS_IOC_VDEV_SET_STATE, &zc) == 0) return (0); @@ -1799,7 +2290,7 @@ zpool_vdev_attach(zpool_handle_t *zhp, verify(nvlist_lookup_nvlist(zpool_get_config(zhp, NULL), ZPOOL_CONFIG_VDEV_TREE, &config_root) == 0); - if ((newname = zpool_vdev_name(NULL, NULL, child[0])) == NULL) + if ((newname = zpool_vdev_name(NULL, NULL, child[0], B_FALSE)) == NULL) return (-1); /* @@ -1851,6 +2342,14 @@ zpool_vdev_attach(zpool_handle_t *zhp, (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Please " "be sure to invoke %s to make '%s' bootable.\n"), BOOTCMD, new_disk); + + /* + * XXX need a better way to prevent user from + * booting up a half-baked vdev. + */ + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Make " + "sure to wait until resilver is done " + "before rebooting.\n")); } return (0); } @@ -1978,6 +2477,257 @@ zpool_vdev_detach(zpool_handle_t *zhp, const char *path) return (-1); } +/* + * Find a mirror vdev in the source nvlist. 
+ * + * The mchild array contains a list of disks in one of the top-level mirrors + * of the source pool. The schild array contains a list of disks that the + * user specified on the command line. We loop over the mchild array to + * see if any entry in the schild array matches. + * + * If a disk in the mchild array is found in the schild array, we return + * the index of that entry. Otherwise we return -1. + */ +static int +find_vdev_entry(zpool_handle_t *zhp, nvlist_t **mchild, uint_t mchildren, + nvlist_t **schild, uint_t schildren) +{ + uint_t mc; + + for (mc = 0; mc < mchildren; mc++) { + uint_t sc; + char *mpath = zpool_vdev_name(zhp->zpool_hdl, zhp, + mchild[mc], B_FALSE); + + for (sc = 0; sc < schildren; sc++) { + char *spath = zpool_vdev_name(zhp->zpool_hdl, zhp, + schild[sc], B_FALSE); + boolean_t result = (strcmp(mpath, spath) == 0); + + free(spath); + if (result) { + free(mpath); + return (mc); + } + } + + free(mpath); + } + + return (-1); +} + +/* + * Split a mirror pool. If newroot points to null, then a new nvlist + * is generated and it is the responsibility of the caller to free it. + */ +int +zpool_vdev_split(zpool_handle_t *zhp, char *newname, nvlist_t **newroot, + nvlist_t *props, splitflags_t flags) +{ + zfs_cmd_t zc = { 0 }; + char msg[1024]; + nvlist_t *tree, *config, **child, **newchild, *newconfig = NULL; + nvlist_t **varray = NULL, *zc_props = NULL; + uint_t c, children, newchildren, lastlog = 0, vcount, found = 0; + libzfs_handle_t *hdl = zhp->zpool_hdl; + uint64_t vers; + boolean_t freelist = B_FALSE, memory_err = B_TRUE; + int retval = 0; + + (void) snprintf(msg, sizeof (msg), + dgettext(TEXT_DOMAIN, "Unable to split %s"), zhp->zpool_name); + + if (!zpool_name_valid(hdl, B_FALSE, newname)) + return (zfs_error(hdl, EZFS_INVALIDNAME, msg)); + + if ((config = zpool_get_config(zhp, NULL)) == NULL) { + (void) fprintf(stderr, gettext("Internal error: unable to " + "retrieve pool configuration\n")); + return (-1); + } + + verify(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &tree) + == 0); + verify(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &vers) == 0); + + if (props) { + if ((zc_props = zpool_valid_proplist(hdl, zhp->zpool_name, + props, vers, B_TRUE, msg)) == NULL) + return (-1); + } + + if (nvlist_lookup_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN, &child, + &children) != 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Source pool is missing vdev tree")); + if (zc_props) + nvlist_free(zc_props); + return (-1); + } + + varray = zfs_alloc(hdl, children * sizeof (nvlist_t *)); + vcount = 0; + + if (*newroot == NULL || + nvlist_lookup_nvlist_array(*newroot, ZPOOL_CONFIG_CHILDREN, + &newchild, &newchildren) != 0) + newchildren = 0; + + for (c = 0; c < children; c++) { + uint64_t is_log = B_FALSE, is_hole = B_FALSE; + char *type; + nvlist_t **mchild, *vdev; + uint_t mchildren; + int entry; + + /* + * Unlike cache & spares, slogs are stored in the + * ZPOOL_CONFIG_CHILDREN array. We filter them out here. + */ + (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_LOG, + &is_log); + (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, + &is_hole); + if (is_log || is_hole) { + /* + * Create a hole vdev and put it in the config. 
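+			 * The hole keeps the top-level vdev numbering of the
+			 * new pool consistent with the source pool; trailing
+			 * holes are trimmed off later using lastlog.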
+ */ + if (nvlist_alloc(&vdev, NV_UNIQUE_NAME, 0) != 0) + goto out; + if (nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, + VDEV_TYPE_HOLE) != 0) + goto out; + if (nvlist_add_uint64(vdev, ZPOOL_CONFIG_IS_HOLE, + 1) != 0) + goto out; + if (lastlog == 0) + lastlog = vcount; + varray[vcount++] = vdev; + continue; + } + lastlog = 0; + verify(nvlist_lookup_string(child[c], ZPOOL_CONFIG_TYPE, &type) + == 0); + if (strcmp(type, VDEV_TYPE_MIRROR) != 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Source pool must be composed only of mirrors\n")); + retval = zfs_error(hdl, EZFS_INVALCONFIG, msg); + goto out; + } + + verify(nvlist_lookup_nvlist_array(child[c], + ZPOOL_CONFIG_CHILDREN, &mchild, &mchildren) == 0); + + /* find or add an entry for this top-level vdev */ + if (newchildren > 0 && + (entry = find_vdev_entry(zhp, mchild, mchildren, + newchild, newchildren)) >= 0) { + /* We found a disk that the user specified. */ + vdev = mchild[entry]; + ++found; + } else { + /* User didn't specify a disk for this vdev. */ + vdev = mchild[mchildren - 1]; + } + + if (nvlist_dup(vdev, &varray[vcount++], 0) != 0) + goto out; + } + + /* did we find every disk the user specified? */ + if (found != newchildren) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "Device list must " + "include at most one disk from each mirror")); + retval = zfs_error(hdl, EZFS_INVALCONFIG, msg); + goto out; + } + + /* Prepare the nvlist for populating. */ + if (*newroot == NULL) { + if (nvlist_alloc(newroot, NV_UNIQUE_NAME, 0) != 0) + goto out; + freelist = B_TRUE; + if (nvlist_add_string(*newroot, ZPOOL_CONFIG_TYPE, + VDEV_TYPE_ROOT) != 0) + goto out; + } else { + verify(nvlist_remove_all(*newroot, ZPOOL_CONFIG_CHILDREN) == 0); + } + + /* Add all the children we found */ + if (nvlist_add_nvlist_array(*newroot, ZPOOL_CONFIG_CHILDREN, varray, + lastlog == 0 ? vcount : lastlog) != 0) + goto out; + + /* + * If we're just doing a dry run, exit now with success. + */ + if (flags.dryrun) { + memory_err = B_FALSE; + freelist = B_FALSE; + goto out; + } + + /* now build up the config list & call the ioctl */ + if (nvlist_alloc(&newconfig, NV_UNIQUE_NAME, 0) != 0) + goto out; + + if (nvlist_add_nvlist(newconfig, + ZPOOL_CONFIG_VDEV_TREE, *newroot) != 0 || + nvlist_add_string(newconfig, + ZPOOL_CONFIG_POOL_NAME, newname) != 0 || + nvlist_add_uint64(newconfig, ZPOOL_CONFIG_VERSION, vers) != 0) + goto out; + + /* + * The new pool is automatically part of the namespace unless we + * explicitly export it. + */ + if (!flags.import) + zc.zc_cookie = ZPOOL_EXPORT_AFTER_SPLIT; + (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); + (void) strlcpy(zc.zc_string, newname, sizeof (zc.zc_string)); + if (zcmd_write_conf_nvlist(hdl, &zc, newconfig) != 0) + goto out; + if (zc_props != NULL && zcmd_write_src_nvlist(hdl, &zc, zc_props) != 0) + goto out; + + if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SPLIT, &zc) != 0) { + retval = zpool_standard_error(hdl, errno, msg); + goto out; + } + + freelist = B_FALSE; + memory_err = B_FALSE; + +out: + if (varray != NULL) { + int v; + + for (v = 0; v < vcount; v++) + nvlist_free(varray[v]); + free(varray); + } + zcmd_free_nvlists(&zc); + if (zc_props) + nvlist_free(zc_props); + if (newconfig) + nvlist_free(newconfig); + if (freelist) { + nvlist_free(*newroot); + *newroot = NULL; + } + + if (retval != 0) + return (retval); + + if (memory_err) + return (no_memory(hdl)); + + return (0); +} + /* * Remove the given device. Currently, this is supported only for hot spares * and level 2 cache devices. 
@@ -1988,24 +2738,34 @@ zpool_vdev_remove(zpool_handle_t *zhp, const char *path) zfs_cmd_t zc = { 0 }; char msg[1024]; nvlist_t *tgt; - boolean_t avail_spare, l2cache; + boolean_t avail_spare, l2cache, islog; libzfs_handle_t *hdl = zhp->zpool_hdl; + uint64_t version; (void) snprintf(msg, sizeof (msg), dgettext(TEXT_DOMAIN, "cannot remove %s"), path); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); if ((tgt = zpool_find_vdev(zhp, path, &avail_spare, &l2cache, - NULL)) == 0) + &islog)) == 0) return (zfs_error(hdl, EZFS_NODEVICE, msg)); - - if (!avail_spare && !l2cache) { + /* + * XXX - this should just go away. + */ + if (!avail_spare && !l2cache && !islog) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "only inactive hot spares or cache devices " - "can be removed")); + "only inactive hot spares, cache, top-level, " + "or log devices can be removed")); return (zfs_error(hdl, EZFS_NODEVICE, msg)); } + version = zpool_get_prop_int(zhp, ZPOOL_PROP_VERSION, NULL); + if (islog && version < SPA_VERSION_HOLES) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "pool must be upgrade to support log removal")); + return (zfs_error(hdl, EZFS_BADVERSION, msg)); + } + verify(nvlist_lookup_uint64(tgt, ZPOOL_CONFIG_GUID, &zc.zc_guid) == 0); if (zfs_ioctl(hdl, ZFS_IOC_VDEV_REMOVE, &zc) == 0) @@ -2018,13 +2778,15 @@ zpool_vdev_remove(zpool_handle_t *zhp, const char *path) * Clear the errors for the pool, or the particular device if specified. */ int -zpool_clear(zpool_handle_t *zhp, const char *path) +zpool_clear(zpool_handle_t *zhp, const char *path, nvlist_t *rewindnvl) { zfs_cmd_t zc = { 0 }; char msg[1024]; nvlist_t *tgt; + zpool_rewind_policy_t policy; boolean_t avail_spare, l2cache; libzfs_handle_t *hdl = zhp->zpool_hdl; + nvlist_t *nvi = NULL; if (path) (void) snprintf(msg, sizeof (msg), @@ -2052,9 +2814,31 @@ zpool_clear(zpool_handle_t *zhp, const char *path) &zc.zc_guid) == 0); } - if (zfs_ioctl(hdl, ZFS_IOC_CLEAR, &zc) == 0) + zpool_get_rewind_policy(rewindnvl, &policy); + zc.zc_cookie = policy.zrp_request; + + if (zcmd_alloc_dst_nvlist(hdl, &zc, 8192) != 0) + return (-1); + + if (zcmd_write_src_nvlist(zhp->zpool_hdl, &zc, rewindnvl) != 0) + return (-1); + + if (zfs_ioctl(hdl, ZFS_IOC_CLEAR, &zc) == 0 || + ((policy.zrp_request & ZPOOL_TRY_REWIND) && + errno != EPERM && errno != EACCES)) { + if (policy.zrp_request & + (ZPOOL_DO_REWIND | ZPOOL_TRY_REWIND)) { + (void) zcmd_read_dst_nvlist(hdl, &zc, &nvi); + zpool_rewind_exclaim(hdl, zc.zc_name, + ((policy.zrp_request & ZPOOL_TRY_REWIND) != 0), + nvi); + nvlist_free(nvi); + } + zcmd_free_nvlists(&zc); return (0); + } + zcmd_free_nvlists(&zc); return (zpool_standard_error(hdl, errno, msg)); } @@ -2081,173 +2865,6 @@ zpool_vdev_clear(zpool_handle_t *zhp, uint64_t guid) return (zpool_standard_error(hdl, errno, msg)); } -/* - * Iterate over all zvols in a given pool by walking the /dev/zvol/dsk/ - * hierarchy. - */ -int -zpool_iter_zvol(zpool_handle_t *zhp, int (*cb)(const char *, void *), - void *data) -{ - libzfs_handle_t *hdl = zhp->zpool_hdl; - char (*paths)[MAXPATHLEN]; - size_t size = 4; - int curr, fd, base, ret = 0; - DIR *dirp; - struct dirent *dp; - struct stat st; - - if ((base = open("/dev/zvol/dsk", O_RDONLY)) < 0) - return (errno == ENOENT ? 0 : -1); - - if (fstatat(base, zhp->zpool_name, &st, 0) != 0) { - int err = errno; - (void) close(base); - return (err == ENOENT ? 0 : -1); - } - - /* - * Oddly this wasn't a directory -- ignore that failure since we - * know there are no links lower in the (non-existant) hierarchy. 
- */ - if (!S_ISDIR(st.st_mode)) { - (void) close(base); - return (0); - } - - if ((paths = zfs_alloc(hdl, size * sizeof (paths[0]))) == NULL) { - (void) close(base); - return (-1); - } - - (void) strlcpy(paths[0], zhp->zpool_name, sizeof (paths[0])); - curr = 0; - - while (curr >= 0) { - if (fstatat(base, paths[curr], &st, AT_SYMLINK_NOFOLLOW) != 0) - goto err; - - if (S_ISDIR(st.st_mode)) { - if ((fd = openat(base, paths[curr], O_RDONLY)) < 0) - goto err; - - if ((dirp = fdopendir(fd)) == NULL) { - (void) close(fd); - goto err; - } - - while ((dp = readdir(dirp)) != NULL) { - if (dp->d_name[0] == '.') - continue; - - if (curr + 1 == size) { - paths = zfs_realloc(hdl, paths, - size * sizeof (paths[0]), - size * 2 * sizeof (paths[0])); - if (paths == NULL) { - (void) closedir(dirp); - (void) close(fd); - goto err; - } - - size *= 2; - } - - (void) strlcpy(paths[curr + 1], paths[curr], - sizeof (paths[curr + 1])); - (void) strlcat(paths[curr], "/", - sizeof (paths[curr])); - (void) strlcat(paths[curr], dp->d_name, - sizeof (paths[curr])); - curr++; - } - - (void) closedir(dirp); - - } else { - if ((ret = cb(paths[curr], data)) != 0) - break; - } - - curr--; - } - - free(paths); - (void) close(base); - - return (ret); - -err: - free(paths); - (void) close(base); - return (-1); -} - -typedef struct zvol_cb { - zpool_handle_t *zcb_pool; - boolean_t zcb_create; -} zvol_cb_t; - -/*ARGSUSED*/ -static int -do_zvol_create(zfs_handle_t *zhp, void *data) -{ - int ret = 0; - - if (ZFS_IS_VOLUME(zhp)) { - (void) zvol_create_link(zhp->zfs_hdl, zhp->zfs_name); - ret = zfs_iter_snapshots(zhp, do_zvol_create, NULL); - } - - if (ret == 0) - ret = zfs_iter_filesystems(zhp, do_zvol_create, NULL); - - zfs_close(zhp); - - return (ret); -} - -/* - * Iterate over all zvols in the pool and make any necessary minor nodes. - */ -int -zpool_create_zvol_links(zpool_handle_t *zhp) -{ - zfs_handle_t *zfp; - int ret; - - /* - * If the pool is unavailable, just return success. - */ - if ((zfp = make_dataset_handle(zhp->zpool_hdl, - zhp->zpool_name)) == NULL) - return (0); - - ret = zfs_iter_filesystems(zfp, do_zvol_create, NULL); - - zfs_close(zfp); - return (ret); -} - -static int -do_zvol_remove(const char *dataset, void *data) -{ - zpool_handle_t *zhp = data; - - return (zvol_remove_link(zhp->zpool_hdl, dataset)); -} - -/* - * Iterate over all zvols in the pool and remove any minor nodes. We iterate - * by examining the /dev links so that a corrupted pool doesn't impede this - * operation. - */ -int -zpool_remove_zvol_links(zpool_handle_t *zhp) -{ - return (zpool_iter_zvol(zhp, do_zvol_remove, zhp)); -} - /* * Convert from a devid string to a path. */ @@ -2340,7 +2957,8 @@ set_path(zpool_handle_t *zhp, nvlist_t *nv, const char *path) * of these checks. */ char * -zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv) +zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv, + boolean_t verbose) { char *path, *devid; uint64_t value; @@ -2419,6 +3037,20 @@ zpool_vdev_name(libzfs_handle_t *hdl, zpool_handle_t *zhp, nvlist_t *nv) (u_longlong_t)value); path = buf; } + + /* + * We identify each top-level vdev by using a + * naming convention. + */ + if (verbose) { + uint64_t id; + + verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ID, + &id) == 0); + (void) snprintf(buf, sizeof (buf), "%s-%llu", path, + (u_longlong_t)id); + path = buf; + } } return (zfs_strdup(hdl, path)); @@ -2637,7 +3269,7 @@ get_history(zpool_handle_t *zhp, char *buf, uint64_t *off, uint64_t *len) * into 'records'. 
'leftover' is set to the number of bytes that weren't * processed as there wasn't a complete record. */ -static int +int zpool_history_unpack(char *buf, uint64_t bytes_read, uint64_t *leftover, nvlist_t ***records, uint_t *numrecords) { @@ -2766,14 +3398,6 @@ zpool_obj_to_path(zpool_handle_t *zhp, uint64_t dsobj, uint64_t obj, free(mntpnt); } -#define RDISK_ROOT "/dev/rdsk" -#define BACKUP_SLICE "s2" -/* - * Don't start the slice at the default block of 34; many storage - * devices will use a stripe width of 128k, so start there instead. - */ -#define NEW_START_BLOCK 256 - /* * Read the EFI label from the config, if a label does not exist then * pass back the error to the caller. If the caller has passed a non-NULL @@ -2964,6 +3588,7 @@ supported_dump_vdev_type(libzfs_handle_t *hdl, nvlist_t *config, char *errbuf) if (strcmp(type, VDEV_TYPE_RAIDZ) == 0 || strcmp(type, VDEV_TYPE_FILE) == 0 || strcmp(type, VDEV_TYPE_LOG) == 0 || + strcmp(type, VDEV_TYPE_HOLE) == 0 || strcmp(type, VDEV_TYPE_MISSING) == 0) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "vdev type '%s' is not supported"), type); diff --git a/external/cddl/osnet/dist/lib/libzfs/common/libzfs_sendrecv.c b/external/cddl/osnet/dist/lib/libzfs/common/libzfs_sendrecv.c index a3ed5cea8589b..c8d85c8b86024 100644 --- a/external/cddl/osnet/dist/lib/libzfs/common/libzfs_sendrecv.c +++ b/external/cddl/osnet/dist/lib/libzfs/common/libzfs_sendrecv.c @@ -20,14 +20,13 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #include #include #include -#include #include #include #include @@ -36,22 +35,382 @@ #include #include #include -#include -#include -#include -#include +#include +#include #include #include "zfs_namecheck.h" #include "zfs_prop.h" +#include "zfs_fletcher.h" #include "libzfs_impl.h" +#include +#include +#include -#include /* XXX */ +/* in libzfs_dataset.c */ +extern void zfs_setprop_error(libzfs_handle_t *, zfs_prop_t, int, char *); static int zfs_receive_impl(libzfs_handle_t *, const char *, recvflags_t, int, avl_tree_t *, char **); +static const zio_cksum_t zero_cksum = { 0 }; + +typedef struct dedup_arg { + int inputfd; + int outputfd; + libzfs_handle_t *dedup_hdl; +} dedup_arg_t; + +typedef struct dataref { + uint64_t ref_guid; + uint64_t ref_object; + uint64_t ref_offset; +} dataref_t; + +typedef struct dedup_entry { + struct dedup_entry *dde_next; + zio_cksum_t dde_chksum; + uint64_t dde_prop; + dataref_t dde_ref; +} dedup_entry_t; + +#define MAX_DDT_PHYSMEM_PERCENT 20 +#define SMALLEST_POSSIBLE_MAX_DDT_MB 128 + +typedef struct dedup_table { + dedup_entry_t **dedup_hash_array; + umem_cache_t *ddecache; + uint64_t max_ddt_size; /* max dedup table size in bytes */ + uint64_t cur_ddt_size; /* current dedup table size in bytes */ + uint64_t ddt_count; + int numhashbits; + boolean_t ddt_full; +} dedup_table_t; + +static int +high_order_bit(uint64_t n) +{ + int count; + + for (count = 0; n != 0; count++) + n >>= 1; + return (count); +} + +static size_t +ssread(void *buf, size_t len, FILE *stream) +{ + size_t outlen; + + if ((outlen = fread(buf, len, 1, stream)) == 0) + return (0); + + return (outlen); +} + +static void +ddt_hash_append(libzfs_handle_t *hdl, dedup_table_t *ddt, dedup_entry_t **ddepp, + zio_cksum_t *cs, uint64_t prop, dataref_t *dr) +{ + dedup_entry_t *dde; + + if (ddt->cur_ddt_size >= ddt->max_ddt_size) { + if (ddt->ddt_full == B_FALSE) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "Dedup table 
full. Deduplication will continue " + "with existing table entries")); + ddt->ddt_full = B_TRUE; + } + return; + } + + if ((dde = umem_cache_alloc(ddt->ddecache, UMEM_DEFAULT)) + != NULL) { + assert(*ddepp == NULL); + dde->dde_next = NULL; + dde->dde_chksum = *cs; + dde->dde_prop = prop; + dde->dde_ref = *dr; + *ddepp = dde; + ddt->cur_ddt_size += sizeof (dedup_entry_t); + ddt->ddt_count++; + } +} + +/* + * Using the specified dedup table, do a lookup for an entry with + * the checksum cs. If found, return the block's reference info + * in *dr. Otherwise, insert a new entry in the dedup table, using + * the reference information specified by *dr. + * + * return value: true - entry was found + * false - entry was not found + */ +static boolean_t +ddt_update(libzfs_handle_t *hdl, dedup_table_t *ddt, zio_cksum_t *cs, + uint64_t prop, dataref_t *dr) +{ + uint32_t hashcode; + dedup_entry_t **ddepp; + + hashcode = BF64_GET(cs->zc_word[0], 0, ddt->numhashbits); + + for (ddepp = &(ddt->dedup_hash_array[hashcode]); *ddepp != NULL; + ddepp = &((*ddepp)->dde_next)) { + if (ZIO_CHECKSUM_EQUAL(((*ddepp)->dde_chksum), *cs) && + (*ddepp)->dde_prop == prop) { + *dr = (*ddepp)->dde_ref; + return (B_TRUE); + } + } + ddt_hash_append(hdl, ddt, ddepp, cs, prop, dr); + return (B_FALSE); +} + +static int +cksum_and_write(const void *buf, uint64_t len, zio_cksum_t *zc, int outfd) +{ + fletcher_4_incremental_native(buf, len, zc); + return (write(outfd, buf, len)); +} + +/* + * This function is started in a separate thread when the dedup option + * has been requested. The main send thread determines the list of + * snapshots to be included in the send stream and makes the ioctl calls + * for each one. But instead of having the ioctl send the output to the + * the output fd specified by the caller of zfs_send()), the + * ioctl is told to direct the output to a pipe, which is read by the + * alternate thread running THIS function. This function does the + * dedup'ing by: + * 1. building a dedup table (the DDT) + * 2. doing checksums on each data block and inserting a record in the DDT + * 3. looking for matching checksums, and + * 4. sending a DRR_WRITE_BYREF record instead of a write record whenever + * a duplicate block is found. + * The output of this function then goes to the output fd requested + * by the caller of zfs_send(). + */ +static void * +cksummer(void *arg) +{ + dedup_arg_t *dda = arg; + char *buf = malloc(1<<20); + dmu_replay_record_t thedrr; + dmu_replay_record_t *drr = &thedrr; + struct drr_begin *drrb = &thedrr.drr_u.drr_begin; + struct drr_end *drre = &thedrr.drr_u.drr_end; + struct drr_object *drro = &thedrr.drr_u.drr_object; + struct drr_write *drrw = &thedrr.drr_u.drr_write; + FILE *ofp; + int outfd; + dmu_replay_record_t wbr_drr = {0}; + struct drr_write_byref *wbr_drrr = &wbr_drr.drr_u.drr_write_byref; + dedup_table_t ddt; + zio_cksum_t stream_cksum; + uint64_t physmem = sysconf(_SC_PHYS_PAGES) * sysconf(_SC_PAGESIZE); + uint64_t numbuckets; + + ddt.max_ddt_size = + MAX((physmem * MAX_DDT_PHYSMEM_PERCENT)/100, + SMALLEST_POSSIBLE_MAX_DDT_MB<<20); + + numbuckets = ddt.max_ddt_size/(sizeof (dedup_entry_t)); + + /* + * numbuckets must be a power of 2. Increase number to + * a power of 2 if necessary. 
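+	 * high_order_bit() returns one past the index of the highest set
+	 * bit, so the shift below rounds numbuckets up to the next power
+	 * of two.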
+ */ + if (!ISP2(numbuckets)) + numbuckets = 1 << high_order_bit(numbuckets); + + ddt.dedup_hash_array = calloc(numbuckets, sizeof (dedup_entry_t *)); + ddt.ddecache = umem_cache_create("dde", sizeof (dedup_entry_t), 0, + NULL, NULL, NULL, NULL, NULL, 0); + ddt.cur_ddt_size = numbuckets * sizeof (dedup_entry_t *); + ddt.numhashbits = high_order_bit(numbuckets) - 1; + ddt.ddt_full = B_FALSE; + + /* Initialize the write-by-reference block. */ + wbr_drr.drr_type = DRR_WRITE_BYREF; + wbr_drr.drr_payloadlen = 0; + + outfd = dda->outputfd; + ofp = fdopen(dda->inputfd, "r"); + while (ssread(drr, sizeof (dmu_replay_record_t), ofp) != 0) { + + switch (drr->drr_type) { + case DRR_BEGIN: + { + int fflags; + ZIO_SET_CHECKSUM(&stream_cksum, 0, 0, 0, 0); + + /* set the DEDUP feature flag for this stream */ + fflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); + fflags |= (DMU_BACKUP_FEATURE_DEDUP | + DMU_BACKUP_FEATURE_DEDUPPROPS); + DMU_SET_FEATUREFLAGS(drrb->drr_versioninfo, fflags); + + if (cksum_and_write(drr, sizeof (dmu_replay_record_t), + &stream_cksum, outfd) == -1) + goto out; + if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == + DMU_COMPOUNDSTREAM && drr->drr_payloadlen != 0) { + int sz = drr->drr_payloadlen; + + if (sz > 1<<20) { + free(buf); + buf = malloc(sz); + } + (void) ssread(buf, sz, ofp); + if (ferror(stdin)) + perror("fread"); + if (cksum_and_write(buf, sz, &stream_cksum, + outfd) == -1) + goto out; + } + break; + } + + case DRR_END: + { + /* use the recalculated checksum */ + ZIO_SET_CHECKSUM(&drre->drr_checksum, + stream_cksum.zc_word[0], stream_cksum.zc_word[1], + stream_cksum.zc_word[2], stream_cksum.zc_word[3]); + if ((write(outfd, drr, + sizeof (dmu_replay_record_t))) == -1) + goto out; + break; + } + + case DRR_OBJECT: + { + if (cksum_and_write(drr, sizeof (dmu_replay_record_t), + &stream_cksum, outfd) == -1) + goto out; + if (drro->drr_bonuslen > 0) { + (void) ssread(buf, + P2ROUNDUP((uint64_t)drro->drr_bonuslen, 8), + ofp); + if (cksum_and_write(buf, + P2ROUNDUP((uint64_t)drro->drr_bonuslen, 8), + &stream_cksum, outfd) == -1) + goto out; + } + break; + } + + case DRR_FREEOBJECTS: + { + if (cksum_and_write(drr, sizeof (dmu_replay_record_t), + &stream_cksum, outfd) == -1) + goto out; + break; + } + + case DRR_WRITE: + { + dataref_t dataref; + + (void) ssread(buf, drrw->drr_length, ofp); + + /* + * Use the existing checksum if it's dedup-capable, + * else calculate a SHA256 checksum for it. 
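+			 * Either way the block ends up with a dedup-capable
+			 * checksum, which is what lets a later matching block
+			 * be replaced by a DRR_WRITE_BYREF record.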
+ */ + + if (ZIO_CHECKSUM_EQUAL(drrw->drr_key.ddk_cksum, + zero_cksum) || + !DRR_IS_DEDUP_CAPABLE(drrw->drr_checksumflags)) { + SHA256_CTX ctx; + zio_cksum_t tmpsha256; + + SHA256Init(&ctx); + SHA256Update(&ctx, buf, drrw->drr_length); + SHA256Final(&tmpsha256, &ctx); + drrw->drr_key.ddk_cksum.zc_word[0] = + BE_64(tmpsha256.zc_word[0]); + drrw->drr_key.ddk_cksum.zc_word[1] = + BE_64(tmpsha256.zc_word[1]); + drrw->drr_key.ddk_cksum.zc_word[2] = + BE_64(tmpsha256.zc_word[2]); + drrw->drr_key.ddk_cksum.zc_word[3] = + BE_64(tmpsha256.zc_word[3]); + drrw->drr_checksumtype = ZIO_CHECKSUM_SHA256; + drrw->drr_checksumflags = DRR_CHECKSUM_DEDUP; + } + + dataref.ref_guid = drrw->drr_toguid; + dataref.ref_object = drrw->drr_object; + dataref.ref_offset = drrw->drr_offset; + + if (ddt_update(dda->dedup_hdl, &ddt, + &drrw->drr_key.ddk_cksum, drrw->drr_key.ddk_prop, + &dataref)) { + /* block already present in stream */ + wbr_drrr->drr_object = drrw->drr_object; + wbr_drrr->drr_offset = drrw->drr_offset; + wbr_drrr->drr_length = drrw->drr_length; + wbr_drrr->drr_toguid = drrw->drr_toguid; + wbr_drrr->drr_refguid = dataref.ref_guid; + wbr_drrr->drr_refobject = + dataref.ref_object; + wbr_drrr->drr_refoffset = + dataref.ref_offset; + + wbr_drrr->drr_checksumtype = + drrw->drr_checksumtype; + wbr_drrr->drr_checksumflags = + drrw->drr_checksumtype; + wbr_drrr->drr_key.ddk_cksum = + drrw->drr_key.ddk_cksum; + wbr_drrr->drr_key.ddk_prop = + drrw->drr_key.ddk_prop; + + if (cksum_and_write(&wbr_drr, + sizeof (dmu_replay_record_t), &stream_cksum, + outfd) == -1) + goto out; + } else { + /* block not previously seen */ + if (cksum_and_write(drr, + sizeof (dmu_replay_record_t), &stream_cksum, + outfd) == -1) + goto out; + if (cksum_and_write(buf, + drrw->drr_length, + &stream_cksum, outfd) == -1) + goto out; + } + break; + } + + case DRR_FREE: + { + if (cksum_and_write(drr, sizeof (dmu_replay_record_t), + &stream_cksum, outfd) == -1) + goto out; + break; + } + + default: + (void) printf("INVALID record type 0x%x\n", + drr->drr_type); + /* should never happen, so assert */ + assert(B_FALSE); + } + } +out: + umem_cache_destroy(ddt.ddecache); + free(ddt.dedup_hash_array); + free(buf); + (void) fclose(ofp); + + return (NULL); +} + /* * Routines for dealing with the AVL tree of fs-nvlists */ @@ -113,6 +472,9 @@ fsavl_destroy(avl_tree_t *avl) free(avl); } +/* + * Given an nvlist, produce an avl tree of snapshots, ordered by guid + */ static avl_tree_t * fsavl_create(nvlist_t *fss) { @@ -170,6 +532,7 @@ typedef struct send_data { nvlist_t *snapprops; const char *fromsnap; const char *tosnap; + boolean_t recursive; /* * The header nvlist is of the following format: @@ -237,23 +600,50 @@ send_iterate_prop(zfs_handle_t *zhp, nvlist_t *nv) zfs_prop_t prop = zfs_name_to_prop(propname); nvlist_t *propnv; - if (!zfs_prop_user(propname) && zfs_prop_readonly(prop)) - continue; + if (!zfs_prop_user(propname)) { + /* + * Realistically, this should never happen. However, + * we want the ability to add DSL properties without + * needing to make incompatible version changes. We + * need to ignore unknown properties to allow older + * software to still send datasets containing these + * properties, with the unknown properties elided. 
+ */ + if (prop == ZPROP_INVAL) + continue; + + if (zfs_prop_readonly(prop)) + continue; + } verify(nvpair_value_nvlist(elem, &propnv) == 0); - if (prop == ZFS_PROP_QUOTA || prop == ZFS_PROP_RESERVATION) { - /* these guys are modifyable, but have no source */ + if (prop == ZFS_PROP_QUOTA || prop == ZFS_PROP_RESERVATION || + prop == ZFS_PROP_REFQUOTA || + prop == ZFS_PROP_REFRESERVATION) { + char *source; uint64_t value; verify(nvlist_lookup_uint64(propnv, ZPROP_VALUE, &value) == 0); if (zhp->zfs_type == ZFS_TYPE_SNAPSHOT) continue; + /* + * May have no source before SPA_VERSION_RECVD_PROPS, + * but is still modifiable. + */ + if (nvlist_lookup_string(propnv, + ZPROP_SOURCE, &source) == 0) { + if ((strcmp(source, zhp->zfs_name) != 0) && + (strcmp(source, + ZPROP_SOURCE_VAL_RECVD) != 0)) + continue; + } } else { char *source; if (nvlist_lookup_string(propnv, ZPROP_SOURCE, &source) != 0) continue; - if (strcmp(source, zhp->zfs_name) != 0) + if ((strcmp(source, zhp->zfs_name) != 0) && + (strcmp(source, ZPROP_SOURCE_VAL_RECVD) != 0)) continue; } @@ -272,12 +662,17 @@ send_iterate_prop(zfs_handle_t *zhp, nvlist_t *nv) } } +/* + * recursively generate nvlists describing datasets. See comment + * for the data structure send_data_t above for description of contents + * of the nvlist. + */ static int send_iterate_fs(zfs_handle_t *zhp, void *arg) { send_data_t *sd = arg; nvlist_t *nvfs, *nv; - int rv; + int rv = 0; uint64_t parent_fromsnap_guid_save = sd->parent_fromsnap_guid; uint64_t guid = zhp->zfs_dmustats.dds_guid; char guidstring[64]; @@ -319,7 +714,8 @@ send_iterate_fs(zfs_handle_t *zhp, void *arg) nvlist_free(nvfs); /* iterate over children */ - rv = zfs_iter_filesystems(zhp, send_iterate_fs, sd); + if (sd->recursive) + rv = zfs_iter_filesystems(zhp, send_iterate_fs, sd); sd->parent_fromsnap_guid = parent_fromsnap_guid_save; @@ -329,7 +725,7 @@ send_iterate_fs(zfs_handle_t *zhp, void *arg) static int gather_nvlist(libzfs_handle_t *hdl, const char *fsname, const char *fromsnap, - const char *tosnap, nvlist_t **nvlp, avl_tree_t **avlp) + const char *tosnap, boolean_t recursive, nvlist_t **nvlp, avl_tree_t **avlp) { zfs_handle_t *zhp; send_data_t sd = { 0 }; @@ -342,6 +738,7 @@ gather_nvlist(libzfs_handle_t *hdl, const char *fsname, const char *fromsnap, VERIFY(0 == nvlist_alloc(&sd.fss, NV_UNIQUE_NAME, 0)); sd.fromsnap = fromsnap; sd.tosnap = tosnap; + sd.recursive = recursive; if ((error = send_iterate_fs(zhp, &sd)) != 0) { nvlist_free(sd.fss); @@ -403,7 +800,7 @@ zfs_snapshot_compare(const void *larg, const void *rarg) return (0); } -static int +int zfs_iter_snapshots_sorted(zfs_handle_t *zhp, zfs_iter_f callback, void *data) { int ret = 0; @@ -434,13 +831,15 @@ typedef struct send_dump_data { /* these are all just the short snapname (the part after the @) */ const char *fromsnap; const char *tosnap; - char lastsnap[ZFS_MAXNAMELEN]; + char prevsnap[ZFS_MAXNAMELEN]; boolean_t seenfrom, seento, replicate, doall, fromorigin; boolean_t verbose; int outfd; boolean_t err; nvlist_t *fss; avl_tree_t *fsavl; + snapfilter_cb_t *filter_cb; + void *filter_cb_arg; } send_dump_data_t; /* @@ -449,7 +848,7 @@ typedef struct send_dump_data { */ static int dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, boolean_t fromorigin, - int outfd) + int outfd, boolean_t enoent_ok, boolean_t *got_enoent) { zfs_cmd_t zc = { 0 }; libzfs_handle_t *hdl = zhp->zfs_hdl; @@ -463,6 +862,8 @@ dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, boolean_t fromorigin, zc.zc_cookie = outfd; zc.zc_obj = fromorigin; + *got_enoent = 
B_FALSE; + if (ioctl(zhp->zfs_hdl->libzfs_fd, ZFS_IOC_SEND, &zc) != 0) { char errbuf[1024]; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, @@ -476,6 +877,10 @@ dump_ioctl(zfs_handle_t *zhp, const char *fromsnap, boolean_t fromorigin, return (zfs_error(hdl, EZFS_CROSSTARGET, errbuf)); case ENOENT: + if (enoent_ok) { + *got_enoent = B_TRUE; + return (0); + } if (zfs_dataset_exists(hdl, zc.zc_name, ZFS_TYPE_SNAPSHOT)) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, @@ -512,13 +917,14 @@ dump_snapshot(zfs_handle_t *zhp, void *arg) send_dump_data_t *sdd = arg; const char *thissnap; int err; + boolean_t got_enoent; thissnap = strchr(zhp->zfs_name, '@') + 1; if (sdd->fromsnap && !sdd->seenfrom && strcmp(sdd->fromsnap, thissnap) == 0) { sdd->seenfrom = B_TRUE; - (void) strcpy(sdd->lastsnap, thissnap); + (void) strcpy(sdd->prevsnap, thissnap); zfs_close(zhp); return (0); } @@ -528,20 +934,41 @@ dump_snapshot(zfs_handle_t *zhp, void *arg) return (0); } + if (strcmp(sdd->tosnap, thissnap) == 0) + sdd->seento = B_TRUE; + + /* + * If a filter function exists, call it to determine whether + * this snapshot will be sent. + */ + if (sdd->filter_cb != NULL && + sdd->filter_cb(zhp, sdd->filter_cb_arg) == B_FALSE) { + /* + * This snapshot is filtered out. Don't send it, and don't + * set prevsnap, so it will be as if this snapshot didn't + * exist, and the next accepted snapshot will be sent as + * an incremental from the last accepted one, or as the + * first (and full) snapshot in the case of a replication, + * non-incremental send. + */ + zfs_close(zhp); + return (0); + } + /* send it */ if (sdd->verbose) { (void) fprintf(stderr, "sending from @%s to %s\n", - sdd->lastsnap, zhp->zfs_name); + sdd->prevsnap, zhp->zfs_name); } - err = dump_ioctl(zhp, sdd->lastsnap, - sdd->lastsnap[0] == '\0' && (sdd->fromorigin || sdd->replicate), - sdd->outfd); + err = dump_ioctl(zhp, sdd->prevsnap, + sdd->prevsnap[0] == '\0' && (sdd->fromorigin || sdd->replicate), + sdd->outfd, B_TRUE, &got_enoent); - if (!sdd->seento && strcmp(sdd->tosnap, thissnap) == 0) - sdd->seento = B_TRUE; - - (void) strcpy(sdd->lastsnap, thissnap); + if (got_enoent) + err = 0; + else + (void) strcpy(sdd->prevsnap, thissnap); zfs_close(zhp); return (err); } @@ -581,7 +1008,7 @@ dump_filesystem(zfs_handle_t *zhp, void *arg) } if (sdd->doall) { - sdd->seenfrom = sdd->seento = sdd->lastsnap[0] = 0; + sdd->seenfrom = sdd->seento = sdd->prevsnap[0] = 0; if (sdd->fromsnap == NULL || missingfrom) sdd->seenfrom = B_TRUE; @@ -594,12 +1021,18 @@ dump_filesystem(zfs_handle_t *zhp, void *arg) zhp->zfs_name, sdd->fromsnap); sdd->err = B_TRUE; } else if (!sdd->seento) { - (void) fprintf(stderr, - "WARNING: could not send %s@%s:\n" - "incremental source (%s@%s) " - "is not earlier than it\n", - zhp->zfs_name, sdd->tosnap, - zhp->zfs_name, sdd->fromsnap); + if (sdd->fromsnap) { + (void) fprintf(stderr, + "WARNING: could not send %s@%s:\n" + "incremental source (%s@%s) " + "is not earlier than it\n", + zhp->zfs_name, sdd->tosnap, + zhp->zfs_name, sdd->fromsnap); + } else { + (void) fprintf(stderr, "WARNING: " + "could not send %s@%s: does not exist\n", + zhp->zfs_name, sdd->tosnap); + } sdd->err = B_TRUE; } } else { @@ -612,10 +1045,16 @@ dump_filesystem(zfs_handle_t *zhp, void *arg) if (snapzhp == NULL) { rv = -1; } else { - rv = dump_ioctl(snapzhp, - missingfrom ? 
NULL : sdd->fromsnap, - sdd->fromorigin || missingfrom, - sdd->outfd); + if (sdd->filter_cb == NULL || + sdd->filter_cb(snapzhp, sdd->filter_cb_arg) == + B_TRUE) { + boolean_t got_enoent; + + rv = dump_ioctl(snapzhp, + missingfrom ? NULL : sdd->fromsnap, + sdd->fromorigin || missingfrom, + sdd->outfd, B_FALSE, &got_enoent); + } sdd->seento = B_TRUE; zfs_close(snapzhp); } @@ -681,20 +1120,39 @@ dump_filesystems(zfs_handle_t *rzhp, void *arg) } /* - * Dumps a backup of tosnap, incremental from fromsnap if it isn't NULL. - * If 'doall', dump all intermediate snaps. - * If 'replicate', dump special header and do recursively. + * Generate a send stream for the dataset identified by the argument zhp. + * + * The content of the send stream is the snapshot identified by + * 'tosnap'. Incremental streams are requested in two ways: + * - from the snapshot identified by "fromsnap" (if non-null) or + * - from the origin of the dataset identified by zhp, which must + * be a clone. In this case, "fromsnap" is null and "fromorigin" + * is TRUE. + * + * The send stream is recursive (i.e. dumps a hierarchy of snapshots) and + * uses a special header (with a hdrtype field of DMU_COMPOUNDSTREAM) + * if "replicate" is set. If "doall" is set, dump all the intermediate + * snapshots. The DMU_COMPOUNDSTREAM header is used in the "doall" + * case too. If "props" is set, send properties. */ int zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, - boolean_t replicate, boolean_t doall, boolean_t fromorigin, - boolean_t verbose, int outfd) + sendflags_t flags, int outfd, snapfilter_cb_t filter_func, + void *cb_arg) { char errbuf[1024]; send_dump_data_t sdd = { 0 }; int err; nvlist_t *fss = NULL; avl_tree_t *fsavl = NULL; + char holdtag[128]; + static uint64_t holdseq; + int spa_version; + boolean_t holdsnaps = B_FALSE; + pthread_t tid; + int pipefd[2]; + dedup_arg_t dda = { 0 }; + int featureflags = 0; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot send '%s'"), zhp->zfs_name); @@ -705,15 +1163,47 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, return (zfs_error(zhp->zfs_hdl, EZFS_NOENT, errbuf)); } - if (replicate || doall) { + if (zfs_spa_version(zhp, &spa_version) == 0 && + spa_version >= SPA_VERSION_USERREFS) + holdsnaps = B_TRUE; + + if (flags.dedup) { + featureflags |= (DMU_BACKUP_FEATURE_DEDUP | + DMU_BACKUP_FEATURE_DEDUPPROPS); + if (err = pipe(pipefd)) { + zfs_error_aux(zhp->zfs_hdl, strerror(errno)); + return (zfs_error(zhp->zfs_hdl, EZFS_PIPEFAILED, + errbuf)); + } + dda.outputfd = outfd; + dda.inputfd = pipefd[1]; + dda.dedup_hdl = zhp->zfs_hdl; + if (err = pthread_create(&tid, NULL, cksummer, &dda)) { + (void) close(pipefd[0]); + (void) close(pipefd[1]); + zfs_error_aux(zhp->zfs_hdl, strerror(errno)); + return (zfs_error(zhp->zfs_hdl, + EZFS_THREADCREATEFAILED, errbuf)); + } + } + + if (flags.replicate || flags.doall || flags.props) { dmu_replay_record_t drr = { 0 }; char *packbuf = NULL; size_t buflen = 0; zio_cksum_t zc = { 0 }; - assert(fromsnap || doall); + if (holdsnaps) { + (void) snprintf(holdtag, sizeof (holdtag), + ".send-%d-%llu", getpid(), (u_longlong_t)holdseq); + ++holdseq; + err = zfs_hold_range(zhp, fromsnap, tosnap, + holdtag, flags.replicate, B_TRUE); + if (err) + goto err_out; + } - if (replicate) { + if (flags.replicate || flags.props) { nvlist_t *hdrnv; VERIFY(0 == nvlist_alloc(&hdrnv, NV_UNIQUE_NAME, 0)); @@ -722,11 +1212,20 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, "fromsnap", 
fromsnap)); } VERIFY(0 == nvlist_add_string(hdrnv, "tosnap", tosnap)); + if (!flags.replicate) { + VERIFY(0 == nvlist_add_boolean(hdrnv, + "not_recursive")); + } err = gather_nvlist(zhp->zfs_hdl, zhp->zfs_name, - fromsnap, tosnap, &fss, &fsavl); - if (err) - return (err); + fromsnap, tosnap, flags.replicate, &fss, &fsavl); + if (err) { + if (holdsnaps) { + (void) zfs_release_range(zhp, fromsnap, + tosnap, holdtag, flags.replicate); + } + goto err_out; + } VERIFY(0 == nvlist_add_nvlist(hdrnv, "fss", fss)); err = nvlist_pack(hdrnv, &packbuf, &buflen, NV_ENCODE_XDR, 0); @@ -734,33 +1233,41 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, if (err) { fsavl_destroy(fsavl); nvlist_free(fss); - return (zfs_standard_error(zhp->zfs_hdl, - err, errbuf)); + if (holdsnaps) { + (void) zfs_release_range(zhp, fromsnap, + tosnap, holdtag, flags.replicate); + } + goto stderr_out; } } /* write first begin record */ drr.drr_type = DRR_BEGIN; drr.drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; - drr.drr_u.drr_begin.drr_version = DMU_BACKUP_HEADER_VERSION; + DMU_SET_STREAM_HDRTYPE(drr.drr_u.drr_begin.drr_versioninfo, + DMU_COMPOUNDSTREAM); + DMU_SET_FEATUREFLAGS(drr.drr_u.drr_begin.drr_versioninfo, + featureflags); (void) snprintf(drr.drr_u.drr_begin.drr_toname, sizeof (drr.drr_u.drr_begin.drr_toname), "%s@%s", zhp->zfs_name, tosnap); drr.drr_payloadlen = buflen; - fletcher_4_incremental_native(&drr, sizeof (drr), &zc); - err = write(outfd, &drr, sizeof (drr)); + err = cksum_and_write(&drr, sizeof (drr), &zc, outfd); /* write header nvlist */ - if (err != -1) { - fletcher_4_incremental_native(packbuf, buflen, &zc); - err = write(outfd, packbuf, buflen); + if (err != -1 && packbuf != NULL) { + err = cksum_and_write(packbuf, buflen, &zc, outfd); } free(packbuf); if (err == -1) { fsavl_destroy(fsavl); nvlist_free(fss); - return (zfs_standard_error(zhp->zfs_hdl, - errno, errbuf)); + if (holdsnaps) { + (void) zfs_release_range(zhp, fromsnap, tosnap, + holdtag, flags.replicate); + } + err = errno; + goto stderr_out; } /* write end record */ @@ -772,8 +1279,12 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, if (err == -1) { fsavl_destroy(fsavl); nvlist_free(fss); - return (zfs_standard_error(zhp->zfs_hdl, - errno, errbuf)); + err = errno; + if (holdsnaps) { + (void) zfs_release_range(zhp, fromsnap, + tosnap, holdtag, flags.replicate); + } + goto stderr_out; } } } @@ -781,18 +1292,28 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, /* dump each stream */ sdd.fromsnap = fromsnap; sdd.tosnap = tosnap; - sdd.outfd = outfd; - sdd.replicate = replicate; - sdd.doall = doall; - sdd.fromorigin = fromorigin; + if (flags.dedup) + sdd.outfd = pipefd[0]; + else + sdd.outfd = outfd; + sdd.replicate = flags.replicate; + sdd.doall = flags.doall; + sdd.fromorigin = flags.fromorigin; sdd.fss = fss; sdd.fsavl = fsavl; - sdd.verbose = verbose; + sdd.verbose = flags.verbose; + sdd.filter_cb = filter_func; + sdd.filter_cb_arg = cb_arg; err = dump_filesystems(zhp, &sdd); fsavl_destroy(fsavl); nvlist_free(fss); - if (replicate || doall) { + if (flags.dedup) { + (void) close(pipefd[0]); + (void) pthread_join(tid, NULL); + } + + if (flags.replicate || flags.doall || flags.props) { /* * write final end record. 
NB: want to do this even if * there was some error, because it might not be totally @@ -800,6 +1321,10 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, */ dmu_replay_record_t drr = { 0 }; drr.drr_type = DRR_END; + if (holdsnaps) { + (void) zfs_release_range(zhp, fromsnap, tosnap, + holdtag, flags.replicate); + } if (write(outfd, &drr, sizeof (drr)) == -1) { return (zfs_standard_error(zhp->zfs_hdl, errno, errbuf)); @@ -807,6 +1332,16 @@ zfs_send(zfs_handle_t *zhp, const char *fromsnap, const char *tosnap, } return (err || sdd.err); + +stderr_out: + err = zfs_standard_error(zhp->zfs_hdl, err, errbuf); +err_out: + if (flags.dedup) { + (void) pthread_cancel(tid); + (void) pthread_join(tid, NULL); + (void) close(pipefd[0]); + } + return (err); } /* @@ -892,11 +1427,12 @@ recv_rename(libzfs_handle_t *hdl, const char *name, const char *tryname, if (err) return (err); + zc.zc_objset_type = DMU_OST_ZFS; + (void) strlcpy(zc.zc_name, name, sizeof (zc.zc_name)); + if (tryname) { (void) strcpy(newname, tryname); - zc.zc_objset_type = DMU_OST_ZFS; - (void) strlcpy(zc.zc_name, name, sizeof (zc.zc_name)); (void) strlcpy(zc.zc_value, tryname, sizeof (zc.zc_value)); if (flags.verbose) { @@ -951,12 +1487,18 @@ recv_destroy(libzfs_handle_t *hdl, const char *name, int baselen, int err = 0; prop_changelist_t *clp; zfs_handle_t *zhp; + boolean_t defer = B_FALSE; + int spa_version; zhp = zfs_open(hdl, name, ZFS_TYPE_DATASET); if (zhp == NULL) return (-1); clp = changelist_gather(zhp, ZFS_PROP_NAME, 0, flags.force ? MS_FORCE : 0); + if (zfs_get_type(zhp) == ZFS_TYPE_SNAPSHOT && + zfs_spa_version(zhp, &spa_version) == 0 && + spa_version >= SPA_VERSION_USERREFS) + defer = B_TRUE; zfs_close(zhp); if (clp == NULL) return (-1); @@ -965,12 +1507,12 @@ recv_destroy(libzfs_handle_t *hdl, const char *name, int baselen, return (err); zc.zc_objset_type = DMU_OST_ZFS; + zc.zc_defer_destroy = defer; (void) strlcpy(zc.zc_name, name, sizeof (zc.zc_name)); if (flags.verbose) (void) printf("attempting destroy %s\n", zc.zc_name); err = ioctl(hdl->libzfs_fd, ZFS_IOC_DESTROY, &zc); - if (err == 0) { if (flags.verbose) (void) printf("success\n"); @@ -980,8 +1522,14 @@ recv_destroy(libzfs_handle_t *hdl, const char *name, int baselen, (void) changelist_postfix(clp); changelist_free(clp); - if (err != 0) + /* + * Deferred destroy might destroy the snapshot or only mark it to be + * destroyed later, and it returns success in either case. 
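+	 * If the snapshot is still present (or the destroy failed), fall
+	 * back to renaming it out of the way.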
+ */ + if (err != 0 || (defer && zfs_dataset_exists(hdl, name, + ZFS_TYPE_SNAPSHOT))) { err = recv_rename(hdl, name, NULL, baselen, newname, flags); + } return (err); } @@ -999,6 +1547,7 @@ guid_to_name_cb(zfs_handle_t *zhp, void *arg) if (zhp->zfs_dmustats.dds_guid == gtnd->guid) { (void) strcpy(gtnd->name, zhp->zfs_name); + zfs_close(zhp); return (EEXIST); } err = zfs_iter_children(zhp, guid_to_name_cb, gtnd); @@ -1097,11 +1646,15 @@ recv_incremental_replication(libzfs_handle_t *hdl, const char *tofs, char *tosnap, *fromsnap; char newname[ZFS_MAXNAMELEN]; int error; - boolean_t needagain, progress; + boolean_t needagain, progress, recursive; + char *s1, *s2; VERIFY(0 == nvlist_lookup_string(stream_nv, "fromsnap", &fromsnap)); VERIFY(0 == nvlist_lookup_string(stream_nv, "tosnap", &tosnap)); + recursive = (nvlist_lookup_boolean(stream_nv, "not_recursive") == + ENOENT); + if (flags.dryrun) return (0); @@ -1109,7 +1662,7 @@ recv_incremental_replication(libzfs_handle_t *hdl, const char *tofs, needagain = progress = B_FALSE; if ((error = gather_nvlist(hdl, tofs, fromsnap, NULL, - &local_nv, &local_avl)) != 0) + recursive, &local_nv, &local_avl)) != 0) return (error); /* @@ -1232,7 +1785,7 @@ recv_incremental_replication(libzfs_handle_t *hdl, const char *tofs, stream_snapname, &props)) { zfs_cmd_t zc = { 0 }; - zc.zc_cookie = B_TRUE; /* clear current props */ + zc.zc_cookie = B_TRUE; /* received */ (void) snprintf(zc.zc_name, sizeof (zc.zc_name), "%s@%s", fsname, nvpair_name(snapelem)); if (zcmd_write_src_nvlist(hdl, &zc, @@ -1292,11 +1845,13 @@ recv_incremental_replication(libzfs_handle_t *hdl, const char *tofs, VERIFY(0 == nvlist_lookup_uint64(stream_nvfs, "parentfromsnap", &stream_parent_fromsnap_guid)); + s1 = strrchr(fsname, '/'); + s2 = strrchr(stream_fsname, '/'); + /* check for rename */ if ((stream_parent_fromsnap_guid != 0 && stream_parent_fromsnap_guid != parent_fromsnap_guid) || - strcmp(strrchr(fsname, '/'), - strrchr(stream_fsname, '/')) != 0) { + ((s1 != NULL) && (s2 != NULL) && strcmp(s1, s2) != 0)) { nvlist_t *parent; char tryname[ZFS_MAXNAMELEN]; @@ -1372,19 +1927,13 @@ zfs_receive_package(libzfs_handle_t *hdl, int fd, const char *destname, assert(drr->drr_type == DRR_BEGIN); assert(drr->drr_u.drr_begin.drr_magic == DMU_BACKUP_MAGIC); - assert(drr->drr_u.drr_begin.drr_version == DMU_BACKUP_HEADER_VERSION); + assert(DMU_GET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo) == + DMU_COMPOUNDSTREAM); /* * Read in the nvlist from the stream. */ if (drr->drr_payloadlen != 0) { - if (!flags.isprefix) { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "must use -d to receive replication " - "(send -R) stream")); - return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); - } - error = recv_read_nvlist(hdl, fd, drr->drr_payloadlen, &stream_nv, flags.byteswap, zc); if (error) { @@ -1490,11 +2039,28 @@ zfs_receive_package(libzfs_handle_t *hdl, int fd, const char *destname, return (error); } +static void +trunc_prop_errs(int truncated) +{ + ASSERT(truncated != 0); + + if (truncated == 1) + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "1 more property could not be set\n")); + else + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, + "%d more properties could not be set\n"), truncated); +} + static int recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap) { dmu_replay_record_t *drr; void *buf = malloc(1<<20); + char errbuf[1024]; + + (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, + "cannot receive:")); /* XXX would be great to use lseek if possible... 
*/ drr = buf; @@ -1507,7 +2073,11 @@ recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap) switch (drr->drr_type) { case DRR_BEGIN: /* NB: not to be used on v2 stream packages */ - assert(drr->drr_payloadlen == 0); + if (drr->drr_payloadlen != 0) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "invalid substream header")); + return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); + } break; case DRR_END: @@ -1534,12 +2104,15 @@ recv_skip(libzfs_handle_t *hdl, int fd, boolean_t byteswap) drr->drr_u.drr_write.drr_length, B_FALSE, NULL); break; + case DRR_WRITE_BYREF: case DRR_FREEOBJECTS: case DRR_FREE: break; default: - assert(!"invalid record type"); + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "invalid record type")); + return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); } } @@ -1562,12 +2135,14 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, char *cp; struct drr_begin *drrb = &drr->drr_u.drr_begin; char errbuf[1024]; + char prop_errbuf[1024]; char chopprefix[ZFS_MAXNAMELEN]; boolean_t newfs = B_FALSE; boolean_t stream_wantsnewfs; uint64_t parent_snapguid = 0; prop_changelist_t *clp = NULL; nvlist_t *snapprops_nvlist = NULL; + zprop_errflags_t prop_errflags; begin_time = time(NULL); @@ -1615,23 +2190,27 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, (void) strcpy(chopprefix, drrb->drr_toname); if (flags.isprefix) { /* - * They specified a fs with -d, we want to tack on - * everything but the pool name stored in the stream + * They specified a fs with -d or -e. We want to tack on + * everything but the first element of the sent snapshot path + * (all but the pool name) in the case of -d, or only the tail + * of the sent snapshot path in the case of -e. */ if (strchr(tosnap, '@')) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid " - "argument - snapshot not allowed with -d")); + "argument - snapshot not allowed with %s"), + (flags.istail ? "-e" : "-d")); return (zfs_error(hdl, EZFS_INVALIDNAME, errbuf)); } - cp = strchr(chopprefix, '/'); + cp = (flags.istail ? strrchr(chopprefix, '/') : + strchr(chopprefix, '/')); if (cp == NULL) cp = strchr(chopprefix, '@'); *cp = '\0'; } else if (strchr(tosnap, '@') == NULL) { /* - * If they specified a filesystem without -d, we want to - * tack on everything after the fs specified in the - * first name from the stream. + * If they specified a filesystem without -d or -e, we want to + * tack on everything after the fs specified in the first name + * from the stream. */ cp = strchr(chopprefix, '@'); *cp = '\0'; @@ -1641,6 +2220,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, /* * Determine name of destination snapshot, store in zc_value. 
*/ + (void) strcpy(zc.zc_top_ds, tosnap); (void) strcpy(zc.zc_value, tosnap); (void) strncat(zc.zc_value, drrb->drr_toname+choplen, sizeof (zc.zc_value)); @@ -1767,21 +2347,17 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, /* We can't do online recv in this case */ clp = changelist_gather(zhp, ZFS_PROP_NAME, 0, 0); if (clp == NULL) { + zfs_close(zhp); zcmd_free_nvlists(&zc); return (-1); } if (changelist_prefix(clp) != 0) { changelist_free(clp); + zfs_close(zhp); zcmd_free_nvlists(&zc); return (-1); } } - if (!flags.dryrun && zhp->zfs_type == ZFS_TYPE_VOLUME && - zvol_remove_link(hdl, zhp->zfs_name) != 0) { - zfs_close(zhp); - zcmd_free_nvlists(&zc); - return (-1); - } zfs_close(zhp); } else { /* @@ -1830,14 +2406,52 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, return (recv_skip(hdl, infd, flags.byteswap)); } + zc.zc_nvlist_dst = (uint64_t)(uintptr_t)prop_errbuf; + zc.zc_nvlist_dst_size = sizeof (prop_errbuf); + err = ioctl_err = zfs_ioctl(hdl, ZFS_IOC_RECV, &zc); ioctl_errno = errno; + prop_errflags = (zprop_errflags_t)zc.zc_obj; + + if (err == 0) { + nvlist_t *prop_errors; + VERIFY(0 == nvlist_unpack((void *)(uintptr_t)zc.zc_nvlist_dst, + zc.zc_nvlist_dst_size, &prop_errors, 0)); + + nvpair_t *prop_err = NULL; + + while ((prop_err = nvlist_next_nvpair(prop_errors, + prop_err)) != NULL) { + char tbuf[1024]; + zfs_prop_t prop; + int intval; + + prop = zfs_name_to_prop(nvpair_name(prop_err)); + (void) nvpair_value_int32(prop_err, &intval); + if (strcmp(nvpair_name(prop_err), + ZPROP_N_MORE_ERRORS) == 0) { + trunc_prop_errs(intval); + break; + } else { + (void) snprintf(tbuf, sizeof (tbuf), + dgettext(TEXT_DOMAIN, + "cannot receive %s property on %s"), + nvpair_name(prop_err), zc.zc_name); + zfs_setprop_error(hdl, prop, intval, tbuf); + } + } + nvlist_free(prop_errors); + } + + zc.zc_nvlist_dst = 0; + zc.zc_nvlist_dst_size = 0; zcmd_free_nvlists(&zc); if (err == 0 && snapprops_nvlist) { zfs_cmd_t zc2 = { 0 }; (void) strcpy(zc2.zc_name, zc.zc_value); + zc2.zc_cookie = B_TRUE; /* received */ if (zcmd_write_src_nvlist(hdl, &zc2, snapprops_nvlist) == 0) { (void) zfs_ioctl(hdl, ZFS_IOC_SET_PROP, &zc2); zcmd_free_nvlists(&zc2); @@ -1860,7 +2474,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, * get a strange "does not exist" error message. */ *cp = '\0'; - if (gather_nvlist(hdl, zc.zc_value, NULL, NULL, + if (gather_nvlist(hdl, zc.zc_value, NULL, NULL, B_FALSE, &local_nv, &local_avl) == 0) { *cp = '@'; fs = fsavl_find(local_avl, drrb->drr_toguid, NULL); @@ -1872,14 +2486,13 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, (void) printf("snap %s already exists; " "ignoring\n", zc.zc_value); } - ioctl_err = recv_skip(hdl, infd, + err = ioctl_err = recv_skip(hdl, infd, flags.byteswap); } } *cp = '@'; } - if (ioctl_err != 0) { switch (ioctl_errno) { case ENODEV: @@ -1924,11 +2537,9 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, } /* - * Mount or recreate the /dev links for the target filesystem - * (if created, or if we tore them down to do an incremental - * restore), and the /dev links for the new snapshot (if - * created). Also mount any children of the target filesystem - * if we did an incremental receive. + * Mount the target filesystem (if created). Also mount any + * children of the target filesystem if we did a replication + * receive (indicated by stream_avl being non-NULL). 
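+	 * Volumes need no additional handling here; only filesystems are
+	 * recorded (via *top_zfs) for mounting and sharing afterwards.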
*/ cp = strchr(zc.zc_value, '@'); if (cp && (ioctl_err == 0 || !newfs)) { @@ -1940,11 +2551,7 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, if (h != NULL) { if (h->zfs_type == ZFS_TYPE_VOLUME) { *cp = '@'; - err = zvol_create_link(hdl, h->zfs_name); - if (err == 0 && ioctl_err == 0) - err = zvol_create_link(hdl, - zc.zc_value); - } else if (newfs) { + } else if (newfs || stream_avl) { /* * Track the first/top of hierarchy fs, * for mounting and sharing later. @@ -1962,6 +2569,19 @@ zfs_receive_one(libzfs_handle_t *hdl, int infd, const char *tosnap, changelist_free(clp); } + if (prop_errflags & ZPROP_ERR_NOCLEAR) { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Warning: " + "failed to clear unreceived properties on %s"), + zc.zc_name); + (void) fprintf(stderr, "\n"); + } + if (prop_errflags & ZPROP_ERR_NORESTORE) { + (void) fprintf(stderr, dgettext(TEXT_DOMAIN, "Warning: " + "failed to restore original properties on %s"), + zc.zc_name); + (void) fprintf(stderr, "\n"); + } + if (err || ioctl_err) return (-1); @@ -1991,6 +2611,8 @@ zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, recvflags_t flags, struct drr_begin *drrb = &drr.drr_u.drr_begin; char errbuf[1024]; zio_cksum_t zcksum = { 0 }; + uint64_t featureflags; + int hdrtype; (void) snprintf(errbuf, sizeof (errbuf), dgettext(TEXT_DOMAIN, "cannot receive")); @@ -2028,7 +2650,7 @@ zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, recvflags_t flags, drr.drr_type = BSWAP_32(drr.drr_type); drr.drr_payloadlen = BSWAP_32(drr.drr_payloadlen); drrb->drr_magic = BSWAP_64(drrb->drr_magic); - drrb->drr_version = BSWAP_64(drrb->drr_version); + drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo); drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time); drrb->drr_type = BSWAP_32(drrb->drr_type); drrb->drr_flags = BSWAP_32(drrb->drr_flags); @@ -2042,23 +2664,31 @@ zfs_receive_impl(libzfs_handle_t *hdl, const char *tosnap, recvflags_t flags, return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); } + featureflags = DMU_GET_FEATUREFLAGS(drrb->drr_versioninfo); + hdrtype = DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo); + + if (!DMU_STREAM_SUPPORTED(featureflags) || + (hdrtype != DMU_SUBSTREAM && hdrtype != DMU_COMPOUNDSTREAM)) { + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "stream has unsupported feature, feature flags = %lx"), + featureflags); + return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); + } + if (strchr(drrb->drr_toname, '@') == NULL) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid " "stream (bad snapshot name)")); return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); } - if (drrb->drr_version == DMU_BACKUP_STREAM_VERSION) { + if (DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == DMU_SUBSTREAM) { return (zfs_receive_one(hdl, infd, tosnap, flags, &drr, &drr_noswap, stream_avl, top_zfs)); - } else if (drrb->drr_version == DMU_BACKUP_HEADER_VERSION) { + } else { /* must be DMU_COMPOUNDSTREAM */ + assert(DMU_GET_STREAM_HDRTYPE(drrb->drr_versioninfo) == + DMU_COMPOUNDSTREAM); return (zfs_receive_package(hdl, infd, tosnap, flags, &drr, &zcksum, top_zfs)); - } else { - zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, - "stream is unsupported version %llu"), - drrb->drr_version); - return (zfs_error(hdl, EZFS_BADSTREAM, errbuf)); } } @@ -2077,7 +2707,7 @@ zfs_receive(libzfs_handle_t *hdl, const char *tosnap, recvflags_t flags, err = zfs_receive_impl(hdl, tosnap, flags, infd, stream_avl, &top_zfs); - if (err == 0 && top_zfs) { + if (err == 0 && !flags.nomount && top_zfs) { zfs_handle_t *zhp; prop_changelist_t *clp; diff 
--git a/external/cddl/osnet/dist/lib/libzfs/common/libzfs_status.c b/external/cddl/osnet/dist/lib/libzfs/common/libzfs_status.c index c7eb04e74cac8..c4f907733f017 100644 --- a/external/cddl/osnet/dist/lib/libzfs/common/libzfs_status.c +++ b/external/cddl/osnet/dist/lib/libzfs/common/libzfs_status.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -104,6 +104,13 @@ vdev_offlined(uint64_t state, uint64_t aux, uint64_t errs) return (state == VDEV_STATE_OFFLINE); } +/* ARGSUSED */ +static int +vdev_removed(uint64_t state, uint64_t aux, uint64_t errs) +{ + return (state == VDEV_STATE_REMOVED); +} + /* * Detect if any leaf devices that have seen errors or could not be opened. */ @@ -275,6 +282,12 @@ check_status(nvlist_t *config, boolean_t isimport) if (find_vdev_problem(nvroot, vdev_offlined)) return (ZPOOL_STATUS_OFFLINE_DEV); + /* + * Removed device + */ + if (find_vdev_problem(nvroot, vdev_removed)) + return (ZPOOL_STATUS_REMOVED_DEV); + /* * Currently resilvering */ @@ -315,3 +328,68 @@ zpool_import_status(nvlist_t *config, char **msgid) return (ret); } + +static void +dump_ddt_stat(const ddt_stat_t *dds, int h) +{ + char refcnt[6]; + char blocks[6], lsize[6], psize[6], dsize[6]; + char ref_blocks[6], ref_lsize[6], ref_psize[6], ref_dsize[6]; + + if (dds == NULL || dds->dds_blocks == 0) + return; + + if (h == -1) + (void) strcpy(refcnt, "Total"); + else + zfs_nicenum(1ULL << h, refcnt, sizeof (refcnt)); + + zfs_nicenum(dds->dds_blocks, blocks, sizeof (blocks)); + zfs_nicenum(dds->dds_lsize, lsize, sizeof (lsize)); + zfs_nicenum(dds->dds_psize, psize, sizeof (psize)); + zfs_nicenum(dds->dds_dsize, dsize, sizeof (dsize)); + zfs_nicenum(dds->dds_ref_blocks, ref_blocks, sizeof (ref_blocks)); + zfs_nicenum(dds->dds_ref_lsize, ref_lsize, sizeof (ref_lsize)); + zfs_nicenum(dds->dds_ref_psize, ref_psize, sizeof (ref_psize)); + zfs_nicenum(dds->dds_ref_dsize, ref_dsize, sizeof (ref_dsize)); + + (void) printf("%6s %6s %5s %5s %5s %6s %5s %5s %5s\n", + refcnt, + blocks, lsize, psize, dsize, + ref_blocks, ref_lsize, ref_psize, ref_dsize); +} + +/* + * Print the DDT histogram and the column totals. + */ +void +zpool_dump_ddt(const ddt_stat_t *dds_total, const ddt_histogram_t *ddh) +{ + int h; + + (void) printf("\n"); + + (void) printf("bucket " + " allocated " + " referenced \n"); + (void) printf("______ " + "______________________________ " + "______________________________\n"); + + (void) printf("%6s %6s %5s %5s %5s %6s %5s %5s %5s\n", + "refcnt", + "blocks", "LSIZE", "PSIZE", "DSIZE", + "blocks", "LSIZE", "PSIZE", "DSIZE"); + + (void) printf("%6s %6s %5s %5s %5s %6s %5s %5s %5s\n", + "------", + "------", "-----", "-----", "-----", + "------", "-----", "-----", "-----"); + + for (h = 0; h < 64; h++) + dump_ddt_stat(&ddh->ddh_stat[h], h); + + dump_ddt_stat(dds_total, -1); + + (void) printf("\n"); +} diff --git a/external/cddl/osnet/dist/lib/libzfs/common/libzfs_util.c b/external/cddl/osnet/dist/lib/libzfs/common/libzfs_util.c index 54de0f4b50a4c..a400dc9c1e114 100644 --- a/external/cddl/osnet/dist/lib/libzfs/common/libzfs_util.c +++ b/external/cddl/osnet/dist/lib/libzfs/common/libzfs_util.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -94,8 +94,6 @@ libzfs_error_description(libzfs_handle_t *hdl) case EZFS_VOLTOOBIG: return (dgettext(TEXT_DOMAIN, "volume size exceeds limit for " "this system")); - case EZFS_VOLHASDATA: - return (dgettext(TEXT_DOMAIN, "volume has data")); case EZFS_INVALIDNAME: return (dgettext(TEXT_DOMAIN, "invalid name")); case EZFS_BADRESTORE: @@ -142,8 +140,6 @@ libzfs_error_description(libzfs_handle_t *hdl) return (dgettext(TEXT_DOMAIN, "iscsitgt service need to be enabled by " "a privileged user")); - case EZFS_DEVLINKS: - return (dgettext(TEXT_DOMAIN, "failed to create /dev links")); case EZFS_PERM: return (dgettext(TEXT_DOMAIN, "permission denied")); case EZFS_NOSPC: @@ -210,6 +206,23 @@ libzfs_error_description(libzfs_handle_t *hdl) case EZFS_ACTIVE_SPARE: return (dgettext(TEXT_DOMAIN, "pool has active shared spare " "device")); + case EZFS_UNPLAYED_LOGS: + return (dgettext(TEXT_DOMAIN, "log device has unplayed intent " + "logs")); + case EZFS_REFTAG_RELE: + return (dgettext(TEXT_DOMAIN, "no such tag on this dataset")); + case EZFS_REFTAG_HOLD: + return (dgettext(TEXT_DOMAIN, "tag already exists on this " + "dataset")); + case EZFS_TAGTOOLONG: + return (dgettext(TEXT_DOMAIN, "tag too long")); + case EZFS_PIPEFAILED: + return (dgettext(TEXT_DOMAIN, "pipe create failed")); + case EZFS_THREADCREATEFAILED: + return (dgettext(TEXT_DOMAIN, "thread create failed")); + case EZFS_POSTSPLIT_ONLINE: + return (dgettext(TEXT_DOMAIN, "disk was split from this pool " + "into a new one")); case EZFS_UNKNOWN: return (dgettext(TEXT_DOMAIN, "unknown error")); default: @@ -364,8 +377,13 @@ zfs_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...) case ENOTSUP: zfs_verror(hdl, EZFS_BADVERSION, fmt, ap); break; + case EAGAIN: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "pool I/O is currently suspended")); + zfs_verror(hdl, EZFS_POOLUNAVAIL, fmt, ap); + break; default: - zfs_error_aux(hdl, strerror(errno)); + zfs_error_aux(hdl, strerror(error)); zfs_verror(hdl, EZFS_UNKNOWN, fmt, ap); break; } @@ -437,6 +455,11 @@ zpool_standard_error_fmt(libzfs_handle_t *hdl, int error, const char *fmt, ...) 
case EDQUOT: zfs_verror(hdl, EZFS_NOSPC, fmt, ap); return (-1); + case EAGAIN: + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, + "pool I/O is currently suspended")); + zfs_verror(hdl, EZFS_POOLUNAVAIL, fmt, ap); + break; default: zfs_error_aux(hdl, strerror(error)); @@ -480,7 +503,6 @@ zfs_realloc(libzfs_handle_t *hdl, void *ptr, size_t oldsize, size_t newsize) if ((ret = realloc(ptr, newsize)) == NULL) { (void) no_memory(hdl); - free(ptr); return (NULL); } @@ -576,6 +598,7 @@ libzfs_init(void) zfs_prop_init(); zpool_prop_init(); + libzfs_mnttab_init(hdl); return (hdl); } @@ -592,7 +615,9 @@ libzfs_fini(libzfs_handle_t *hdl) if (hdl->libzfs_log_str) (void) free(hdl->libzfs_log_str); zpool_free_handles(hdl); + libzfs_fru_clear(hdl, B_TRUE); namespace_clear(hdl); + libzfs_mnttab_fini(hdl); free(hdl); } @@ -667,7 +692,7 @@ int zcmd_alloc_dst_nvlist(libzfs_handle_t *hdl, zfs_cmd_t *zc, size_t len) { if (len == 0) - len = 2048; + len = 4*1024; zc->zc_nvlist_dst_size = len; if ((zc->zc_nvlist_dst = (uint64_t)(uintptr_t) zfs_alloc(hdl, zc->zc_nvlist_dst_size)) == NULL) @@ -793,16 +818,22 @@ zprop_print_headers(zprop_get_cbdata_t *cbp, zfs_type_t type) "PROPERTY")); cbp->cb_colwidths[GET_COL_VALUE] = strlen(dgettext(TEXT_DOMAIN, "VALUE")); + cbp->cb_colwidths[GET_COL_RECVD] = strlen(dgettext(TEXT_DOMAIN, + "RECEIVED")); cbp->cb_colwidths[GET_COL_SOURCE] = strlen(dgettext(TEXT_DOMAIN, "SOURCE")); + /* first property is always NAME */ + assert(cbp->cb_proplist->pl_prop == + ((type == ZFS_TYPE_POOL) ? ZPOOL_PROP_NAME : ZFS_PROP_NAME)); + /* * Go through and calculate the widths for each column. For the * 'source' column, we kludge it up by taking the worst-case scenario of * inheriting from the longest name. This is acceptable because in the * majority of cases 'SOURCE' is the last column displayed, and we don't * use the width anyway. Note that the 'VALUE' column can be oversized, - * if the name of the property is much longer the any values we find. + * if the name of the property is much longer than any values we find. */ for (pl = cbp->cb_proplist; pl != NULL; pl = pl->pl_next) { /* @@ -823,12 +854,21 @@ zprop_print_headers(zprop_get_cbdata_t *cbp, zfs_type_t type) } /* - * 'VALUE' column + * 'VALUE' column. The first property is always the 'name' + * property that was tacked on either by /sbin/zfs's + * zfs_do_get() or when calling zprop_expand_list(), so we + * ignore its width. If the user specified the name property + * to display, then it will be later in the list in any case. */ - if ((pl->pl_prop != ZFS_PROP_NAME || !pl->pl_all) && + if (pl != cbp->cb_proplist && pl->pl_width > cbp->cb_colwidths[GET_COL_VALUE]) cbp->cb_colwidths[GET_COL_VALUE] = pl->pl_width; + /* 'RECEIVED' column. */ + if (pl != cbp->cb_proplist && + pl->pl_recvd_width > cbp->cb_colwidths[GET_COL_RECVD]) + cbp->cb_colwidths[GET_COL_RECVD] = pl->pl_recvd_width; + /* * 'NAME' and 'SOURCE' columns */ @@ -844,7 +884,7 @@ zprop_print_headers(zprop_get_cbdata_t *cbp, zfs_type_t type) /* * Now go through and print the headers. 
*/ - for (i = 0; i < 4; i++) { + for (i = 0; i < ZFS_GET_NCOLS; i++) { switch (cbp->cb_columns[i]) { case GET_COL_NAME: title = dgettext(TEXT_DOMAIN, "NAME"); @@ -855,6 +895,9 @@ zprop_print_headers(zprop_get_cbdata_t *cbp, zfs_type_t type) case GET_COL_VALUE: title = dgettext(TEXT_DOMAIN, "VALUE"); break; + case GET_COL_RECVD: + title = dgettext(TEXT_DOMAIN, "RECEIVED"); + break; case GET_COL_SOURCE: title = dgettext(TEXT_DOMAIN, "SOURCE"); break; @@ -863,7 +906,8 @@ zprop_print_headers(zprop_get_cbdata_t *cbp, zfs_type_t type) } if (title != NULL) { - if (i == 3 || cbp->cb_columns[i + 1] == 0) + if (i == (ZFS_GET_NCOLS - 1) || + cbp->cb_columns[i + 1] == GET_COL_NONE) (void) printf("%s", title); else (void) printf("%-*s ", @@ -881,7 +925,7 @@ zprop_print_headers(zprop_get_cbdata_t *cbp, zfs_type_t type) void zprop_print_one_property(const char *name, zprop_get_cbdata_t *cbp, const char *propname, const char *value, zprop_source_t sourcetype, - const char *source) + const char *source, const char *recvd_value) { int i; const char *str; @@ -896,7 +940,7 @@ zprop_print_one_property(const char *name, zprop_get_cbdata_t *cbp, if (cbp->cb_first) zprop_print_headers(cbp, cbp->cb_type); - for (i = 0; i < 4; i++) { + for (i = 0; i < ZFS_GET_NCOLS; i++) { switch (cbp->cb_columns[i]) { case GET_COL_NAME: str = name; @@ -933,14 +977,21 @@ zprop_print_one_property(const char *name, zprop_get_cbdata_t *cbp, "inherited from %s", source); str = buf; break; + case ZPROP_SRC_RECEIVED: + str = "received"; + break; } break; + case GET_COL_RECVD: + str = (recvd_value == NULL ? "-" : recvd_value); + break; + default: continue; } - if (cbp->cb_columns[i + 1] == 0) + if (cbp->cb_columns[i + 1] == GET_COL_NONE) (void) printf("%s", str); else if (cbp->cb_scripted) (void) printf("%s\t", str); @@ -948,7 +999,6 @@ zprop_print_one_property(const char *name, zprop_get_cbdata_t *cbp, (void) printf("%-*s ", cbp->cb_colwidths[cbp->cb_columns[i]], str); - } (void) printf("\n"); @@ -1010,9 +1060,9 @@ zfs_nicestrtonum(libzfs_handle_t *hdl, const char *value, uint64_t *num) return (-1); } - /* Rely on stroll() to process the numeric portion. */ + /* Rely on strtoull() to process the numeric portion. */ errno = 0; - *num = strtoll(value, &end, 10); + *num = strtoull(value, &end, 10); /* * Check for ERANGE, which indicates that the value is too large to fit @@ -1202,7 +1252,7 @@ addlist(libzfs_handle_t *hdl, char *propname, zprop_list_t **listp, * dataset property, */ if (prop == ZPROP_INVAL && (type == ZFS_TYPE_POOL || - !zfs_prop_user(propname))) { + (!zfs_prop_user(propname) && !zfs_prop_userquota(propname)))) { zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "invalid property '%s'"), propname); return (zfs_error(hdl, EZFS_BADPROP, diff --git a/external/cddl/osnet/dist/lib/libzpool/common/taskq.c b/external/cddl/osnet/dist/lib/libzpool/common/taskq.c index 93acdcf8e4e37..142cd73f08f47 100644 --- a/external/cddl/osnet/dist/lib/libzpool/common/taskq.c +++ b/external/cddl/osnet/dist/lib/libzpool/common/taskq.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -114,8 +114,13 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t tqflags) mutex_exit(&tq->tq_lock); return (0); } - t->task_next = &tq->tq_task; - t->task_prev = tq->tq_task.task_prev; + if (tqflags & TQ_FRONT) { + t->task_next = tq->tq_task.task_next; + t->task_prev = &tq->tq_task; + } else { + t->task_next = &tq->tq_task; + t->task_prev = tq->tq_task.task_prev; + } t->task_next->task_prev = t; t->task_prev->task_next = t; t->task_func = func; @@ -174,6 +179,19 @@ taskq_create(const char *name, int nthreads, pri_t pri, taskq_t *tq = kmem_zalloc(sizeof (taskq_t), KM_SLEEP); int t; + if (flags & TASKQ_THREADS_CPU_PCT) { + int pct; + ASSERT3S(nthreads, >=, 0); + ASSERT3S(nthreads, <=, 100); + pct = MIN(nthreads, 100); + pct = MAX(pct, 0); + + nthreads = (sysconf(_SC_NPROCESSORS_ONLN) * pct) / 100; + nthreads = MAX(nthreads, 1); /* need at least 1 thread */ + } else { + ASSERT3S(nthreads, >=, 1); + } + rw_init(&tq->tq_threadlock, NULL, RW_DEFAULT, NULL); mutex_init(&tq->tq_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&tq->tq_dispatch_cv, NULL, CV_DEFAULT, NULL); @@ -259,3 +277,10 @@ system_taskq_init(void) system_taskq = taskq_create("system_taskq", 64, minclsyspri, 4, 512, TASKQ_DYNAMIC | TASKQ_PREPOPULATE); } + +void +system_taskq_fini(void) +{ + taskq_destroy(system_taskq); + system_taskq = NULL; /* defensive */ +} diff --git a/external/cddl/osnet/dist/tools/ctf/cvt/ctf.c b/external/cddl/osnet/dist/tools/ctf/cvt/ctf.c index 91e0f611cbb91..1e425758c2495 100644 --- a/external/cddl/osnet/dist/tools/ctf/cvt/ctf.c +++ b/external/cddl/osnet/dist/tools/ctf/cvt/ctf.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Create and parse buffers containing CTF data. 
*/ @@ -172,6 +170,12 @@ write_functions(iidesc_t *idp, ctf_buf_t *b) } nargs = idp->ii_nargs + (idp->ii_vargs != 0); + + if (nargs > CTF_MAX_VLEN) { + terminate("function %s has too many args: %d > %d\n", + idp->ii_name, nargs, CTF_MAX_VLEN); + } + fdata[0] = CTF_TYPE_INFO(CTF_K_FUNCTION, 1, nargs); fdata[1] = idp->ii_dtype->t_id; ctf_buf_write(b, fdata, sizeof (fdata)); @@ -312,6 +316,11 @@ write_type(tdesc_t *tp, ctf_buf_t *b) for (i = 0, mp = tp->t_members; mp != NULL; mp = mp->ml_next) i++; /* count up struct or union members */ + if (i > CTF_MAX_VLEN) { + terminate("sou %s has too many members: %d > %d\n", + tdesc_name(tp), i, CTF_MAX_VLEN); + } + if (tp->t_type == STRUCT) ctt.ctt_info = CTF_TYPE_INFO(CTF_K_STRUCT, isroot, i); else @@ -351,6 +360,11 @@ write_type(tdesc_t *tp, ctf_buf_t *b) for (i = 0, ep = tp->t_emem; ep != NULL; ep = ep->el_next) i++; /* count up enum members */ + if (i > CTF_MAX_VLEN) { + terminate("enum %s has too many values: %d > %d\n", + tdesc_name(tp), i, CTF_MAX_VLEN); + } + ctt.ctt_info = CTF_TYPE_INFO(CTF_K_ENUM, isroot, i); write_sized_type_rec(b, &ctt, tp->t_size); @@ -387,8 +401,14 @@ write_type(tdesc_t *tp, ctf_buf_t *b) break; case FUNCTION: - ctt.ctt_info = CTF_TYPE_INFO(CTF_K_FUNCTION, isroot, - tp->t_fndef->fn_nargs + tp->t_fndef->fn_vargs); + i = tp->t_fndef->fn_nargs + tp->t_fndef->fn_vargs; + + if (i > CTF_MAX_VLEN) { + terminate("function %s has too many args: %d > %d\n", + tdesc_name(tp), i, CTF_MAX_VLEN); + } + + ctt.ctt_info = CTF_TYPE_INFO(CTF_K_FUNCTION, isroot, i); ctt.ctt_type = tp->t_fndef->fn_ret->t_id; write_unsized_type_rec(b, &ctt); @@ -927,7 +947,7 @@ resurrect_types(ctf_header_t *h, tdata_t *td, tdesc_t **tdarr, int tdsize, if (CTF_NAME_STID(ctt->ctt_name) != CTF_STRTAB_0) parseterminate( - "Unable to cope with non-zero strtab id"); + "Unable to cope with non-zero strtab id"); if (CTF_NAME_OFFSET(ctt->ctt_name) != 0) { tdp->t_name = xstrdup(sbuf + CTF_NAME_OFFSET(ctt->ctt_name)); diff --git a/external/cddl/osnet/dist/tools/ctf/cvt/tdata.c b/external/cddl/osnet/dist/tools/ctf/cvt/tdata.c index 32d84829d70e5..295928586e136 100644 --- a/external/cddl/osnet/dist/tools/ctf/cvt/tdata.c +++ b/external/cddl/osnet/dist/tools/ctf/cvt/tdata.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Routines for manipulating tdesc and tdata structures */ @@ -86,9 +84,10 @@ tdesc_layouthash(int nbuckets, void *node) * Unnamed structures, which cannot have forward * declarations pointing to them. We can therefore * incorporate the name of the first member into - * the hash value. + * the hash value, assuming there are any. */ - name = tdp->t_members->ml_name; + if (tdp->t_members != NULL) + name = tdp->t_members->ml_name; break; case ENUM: /* Use the first element in the hash value */ diff --git a/external/cddl/osnet/dist/uts/common/dtrace/dtrace.c b/external/cddl/osnet/dist/uts/common/dtrace/dtrace.c index 91e3230737a6f..ea1ac53d44a97 100644 --- a/external/cddl/osnet/dist/uts/common/dtrace/dtrace.c +++ b/external/cddl/osnet/dist/uts/common/dtrace/dtrace.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms.
*/ @@ -239,10 +239,16 @@ static void dtrace_nullop(void) {} +static int +dtrace_enable_nullop(void) +{ + return (0); +} + static dtrace_pops_t dtrace_provider_ops = { (void (*)(void *, const dtrace_probedesc_t *))dtrace_nullop, (void (*)(void *, struct modctl *))dtrace_nullop, - (void (*)(void *, dtrace_id_t, void *))dtrace_nullop, + (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop, (void (*)(void *, dtrace_id_t, void *))dtrace_nullop, (void (*)(void *, dtrace_id_t, void *))dtrace_nullop, (void (*)(void *, dtrace_id_t, void *))dtrace_nullop, @@ -426,6 +432,7 @@ dtrace_load##bits(uintptr_t addr) \ #define DTRACE_DYNHASH_SINK 1 #define DTRACE_DYNHASH_VALID 2 +#define DTRACE_MATCH_FAIL -1 #define DTRACE_MATCH_NEXT 0 #define DTRACE_MATCH_DONE 1 #define DTRACE_ANCHORED(probe) ((probe)->dtpr_func[0] != '\0') @@ -6654,7 +6661,7 @@ dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid, { dtrace_probe_t template, *probe; dtrace_hash_t *hash = NULL; - int len, best = INT_MAX, nmatched = 0; + int len, rc, best = INT_MAX, nmatched = 0; dtrace_id_t i; ASSERT(MUTEX_HELD(&dtrace_lock)); @@ -6666,7 +6673,8 @@ dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid, if (pkp->dtpk_id != DTRACE_IDNONE) { if ((probe = dtrace_probe_lookup_id(pkp->dtpk_id)) != NULL && dtrace_match_probe(probe, pkp, priv, uid, zoneid) > 0) { - (void) (*matched)(probe, arg); + if ((*matched)(probe, arg) == DTRACE_MATCH_FAIL) + return (DTRACE_MATCH_FAIL); nmatched++; } return (nmatched); @@ -6713,8 +6721,12 @@ dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid, nmatched++; - if ((*matched)(probe, arg) != DTRACE_MATCH_NEXT) + if ((rc = (*matched)(probe, arg)) != + DTRACE_MATCH_NEXT) { + if (rc == DTRACE_MATCH_FAIL) + return (DTRACE_MATCH_FAIL); break; + } } return (nmatched); @@ -6733,8 +6745,11 @@ dtrace_match(const dtrace_probekey_t *pkp, uint32_t priv, uid_t uid, nmatched++; - if ((*matched)(probe, arg) != DTRACE_MATCH_NEXT) + if ((rc = (*matched)(probe, arg)) != DTRACE_MATCH_NEXT) { + if (rc == DTRACE_MATCH_FAIL) + return (DTRACE_MATCH_FAIL); break; + } } return (nmatched); @@ -6954,7 +6969,7 @@ dtrace_unregister(dtrace_provider_id_t id) dtrace_probe_t *probe, *first = NULL; if (old->dtpv_pops.dtps_enable == - (void (*)(void *, dtrace_id_t, void *))dtrace_nullop) { + (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop) { /* * If DTrace itself is the provider, we're called with locks * already held. @@ -7100,7 +7115,7 @@ dtrace_invalidate(dtrace_provider_id_t id) dtrace_provider_t *pvp = (dtrace_provider_t *)id; ASSERT(pvp->dtpv_pops.dtps_enable != - (void (*)(void *, dtrace_id_t, void *))dtrace_nullop); + (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop); mutex_enter(&dtrace_provider_lock); mutex_enter(&dtrace_lock); @@ -7141,7 +7156,7 @@ dtrace_condense(dtrace_provider_id_t id) * Make sure this isn't the dtrace provider itself. 
*/ ASSERT(prov->dtpv_pops.dtps_enable != - (void (*)(void *, dtrace_id_t, void *))dtrace_nullop); + (int (*)(void *, dtrace_id_t, void *))dtrace_enable_nullop); mutex_enter(&dtrace_provider_lock); mutex_enter(&dtrace_lock); @@ -8102,7 +8117,7 @@ dtrace_difo_validate(dtrace_difo_t *dp, dtrace_vstate_t *vstate, uint_t nregs, break; default: - err += efunc(dp->dtdo_len - 1, "bad return size"); + err += efunc(dp->dtdo_len - 1, "bad return size\n"); } } @@ -9095,7 +9110,7 @@ dtrace_ecb_add(dtrace_state_t *state, dtrace_probe_t *probe) return (ecb); } -static void +static int dtrace_ecb_enable(dtrace_ecb_t *ecb) { dtrace_probe_t *probe = ecb->dte_probe; @@ -9108,7 +9123,7 @@ dtrace_ecb_enable(dtrace_ecb_t *ecb) /* * This is the NULL probe -- there's nothing to do. */ - return; + return (0); } if (probe->dtpr_ecb == NULL) { @@ -9122,8 +9137,8 @@ dtrace_ecb_enable(dtrace_ecb_t *ecb) if (ecb->dte_predicate != NULL) probe->dtpr_predcache = ecb->dte_predicate->dtp_cacheid; - prov->dtpv_pops.dtps_enable(prov->dtpv_arg, - probe->dtpr_id, probe->dtpr_arg); + return (prov->dtpv_pops.dtps_enable(prov->dtpv_arg, + probe->dtpr_id, probe->dtpr_arg)); } else { /* * This probe is already active. Swing the last pointer to @@ -9136,6 +9151,7 @@ dtrace_ecb_enable(dtrace_ecb_t *ecb) probe->dtpr_predcache = 0; dtrace_sync(); + return (0); } } @@ -9919,7 +9935,9 @@ dtrace_ecb_create_enable(dtrace_probe_t *probe, void *arg) if ((ecb = dtrace_ecb_create(state, probe, enab)) == NULL) return (DTRACE_MATCH_DONE); - dtrace_ecb_enable(ecb); + if (dtrace_ecb_enable(ecb) < 0) + return (DTRACE_MATCH_FAIL); + return (DTRACE_MATCH_NEXT); } @@ -10714,7 +10732,7 @@ static int dtrace_enabling_match(dtrace_enabling_t *enab, int *nmatched) { int i = 0; - int matched = 0; + int total_matched = 0, matched = 0; ASSERT(MUTEX_HELD(&cpu_lock)); ASSERT(MUTEX_HELD(&dtrace_lock)); @@ -10725,7 +10743,14 @@ dtrace_enabling_match(dtrace_enabling_t *enab, int *nmatched) enab->dten_current = ep; enab->dten_error = 0; - matched += dtrace_probe_enable(&ep->dted_probe, enab); + /* + * If a provider failed to enable a probe then get out and + * let the consumer know we failed. 
+ */ + if ((matched = dtrace_probe_enable(&ep->dted_probe, enab)) < 0) + return (EBUSY); + + total_matched += matched; if (enab->dten_error != 0) { /* @@ -10753,7 +10778,7 @@ dtrace_enabling_match(dtrace_enabling_t *enab, int *nmatched) enab->dten_probegen = dtrace_probegen; if (nmatched != NULL) - *nmatched = matched; + *nmatched = total_matched; return (0); } @@ -10991,7 +11016,8 @@ dtrace_dof_copyin(uintptr_t uarg, int *errp) dof = kmem_alloc(hdr.dofh_loadsz, KM_SLEEP); - if (copyin((void *)uarg, dof, hdr.dofh_loadsz) != 0) { + if (copyin((void *)uarg, dof, hdr.dofh_loadsz) != 0 || + dof->dofh_loadsz != hdr.dofh_loadsz) { kmem_free(dof, hdr.dofh_loadsz); *errp = EFAULT; return (NULL); @@ -11719,6 +11745,13 @@ dtrace_dof_slurp(dof_hdr_t *dof, dtrace_vstate_t *vstate, cred_t *cr, } } + if (DOF_SEC_ISLOADABLE(sec->dofs_type) && + !(sec->dofs_flags & DOF_SECF_LOAD)) { + dtrace_dof_error(dof, "loadable section with load " + "flag unset"); + return (-1); + } + if (!(sec->dofs_flags & DOF_SECF_LOAD)) continue; /* just ignore non-loadable sections */ @@ -14449,7 +14482,7 @@ dtrace_open(dev_t *devp, int flag, int otyp, cred_t *cred_p) mutex_exit(&cpu_lock); if (state == NULL) { - if (--dtrace_opens == 0) + if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL) (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE); mutex_exit(&dtrace_lock); return (EAGAIN); @@ -14485,7 +14518,12 @@ dtrace_close(dev_t dev, int flag, int otyp, cred_t *cred_p) dtrace_state_destroy(state); ASSERT(dtrace_opens > 0); - if (--dtrace_opens == 0) + + /* + * Only relinquish control of the kernel debugger interface when there + * are no consumers and no anonymous enablings. + */ + if (--dtrace_opens == 0 && dtrace_anon.dta_enabling == NULL) (void) kdi_dtrace_set(KDI_DTSET_DTRACE_DEACTIVATE); mutex_exit(&dtrace_lock); diff --git a/external/cddl/osnet/dist/uts/common/dtrace/fasttrap.c b/external/cddl/osnet/dist/uts/common/dtrace/fasttrap.c index fee6d60a572ee..42263e4ef2745 100644 --- a/external/cddl/osnet/dist/uts/common/dtrace/fasttrap.c +++ b/external/cddl/osnet/dist/uts/common/dtrace/fasttrap.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -875,7 +875,7 @@ fasttrap_disable_callbacks(void) } /*ARGSUSED*/ -static void +static int fasttrap_pid_enable(void *arg, dtrace_id_t id, void *parg) { fasttrap_probe_t *probe = parg; @@ -903,7 +903,7 @@ fasttrap_pid_enable(void *arg, dtrace_id_t id, void *parg) * provider can't go away while we're in this code path. */ if (probe->ftp_prov->ftp_retired) - return; + return (0); /* * If we can't find the process, it may be that we're in the context of @@ -912,7 +912,7 @@ fasttrap_pid_enable(void *arg, dtrace_id_t id, void *parg) */ if ((p = sprlock(probe->ftp_pid)) == NULL) { if ((curproc->p_flag & SFORKING) == 0) - return; + return (0); mutex_enter(&pidlock); p = prfind(probe->ftp_pid); @@ -974,7 +974,7 @@ fasttrap_pid_enable(void *arg, dtrace_id_t id, void *parg) * drop our reference on the trap table entry. 
*/ fasttrap_disable_callbacks(); - return; + return (0); } } @@ -982,6 +982,7 @@ fasttrap_pid_enable(void *arg, dtrace_id_t id, void *parg) sprunlock(p); probe->ftp_enabled = 1; + return (0); } /*ARGSUSED*/ @@ -1945,7 +1946,8 @@ fasttrap_ioctl(dev_t dev, int cmd, intptr_t arg, int md, cred_t *cr, int *rv) probe = kmem_alloc(size, KM_SLEEP); - if (copyin(uprobe, probe, size) != 0) { + if (copyin(uprobe, probe, size) != 0 || + probe->ftps_noffs != noffs) { kmem_free(probe, size); return (EFAULT); } diff --git a/external/cddl/osnet/dist/uts/common/dtrace/lockstat.c b/external/cddl/osnet/dist/uts/common/dtrace/lockstat.c index 55b3fcf8ff7bc..69c8b7254486e 100644 --- a/external/cddl/osnet/dist/uts/common/dtrace/lockstat.c +++ b/external/cddl/osnet/dist/uts/common/dtrace/lockstat.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -83,7 +83,7 @@ static kmutex_t lockstat_test; /* for testing purposes only */ static dtrace_provider_id_t lockstat_id; /*ARGSUSED*/ -static void +static int lockstat_enable(void *arg, dtrace_id_t id, void *parg) { lockstat_probe_t *probe = parg; @@ -102,6 +102,7 @@ lockstat_enable(void *arg, dtrace_id_t id, void *parg) */ mutex_enter(&lockstat_test); mutex_exit(&lockstat_test); + return (0); } /*ARGSUSED*/ diff --git a/external/cddl/osnet/dist/uts/common/dtrace/profile.c b/external/cddl/osnet/dist/uts/common/dtrace/profile.c index da8f58a378619..c1a2d1f1c12fe 100644 --- a/external/cddl/osnet/dist/uts/common/dtrace/profile.c +++ b/external/cddl/osnet/dist/uts/common/dtrace/profile.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -360,7 +360,7 @@ profile_offline(void *arg, cpu_t *cpu, void *oarg) } /*ARGSUSED*/ -static void +static int profile_enable(void *arg, dtrace_id_t id, void *parg) { profile_probe_t *prof = parg; @@ -390,6 +390,7 @@ profile_enable(void *arg, dtrace_id_t id, void *parg) } else { prof->prof_cyclic = cyclic_add_omni(&omni); } + return (0); } /*ARGSUSED*/ diff --git a/external/cddl/osnet/dist/uts/common/dtrace/sdt_subr.c b/external/cddl/osnet/dist/uts/common/dtrace/sdt_subr.c index 20aabcc20867a..a89403ea75859 100644 --- a/external/cddl/osnet/dist/uts/common/dtrace/sdt_subr.c +++ b/external/cddl/osnet/dist/uts/common/dtrace/sdt_subr.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include static dtrace_pattr_t vtrace_attr = { @@ -43,6 +41,14 @@ static dtrace_pattr_t info_attr = { { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA }, }; +static dtrace_pattr_t fc_attr = { +{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_ISA }, +{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN }, +{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN }, +{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA }, +{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_ISA }, +}; + static dtrace_pattr_t fpu_attr = { { DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_ISA }, { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN }, @@ -83,6 +89,14 @@ static dtrace_pattr_t xpv_attr = { { DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_PLATFORM }, }; +static dtrace_pattr_t iscsi_attr = { +{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_ISA }, +{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN }, +{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_UNKNOWN }, +{ DTRACE_STABILITY_PRIVATE, DTRACE_STABILITY_PRIVATE, DTRACE_CLASS_ISA }, +{ DTRACE_STABILITY_EVOLVING, DTRACE_STABILITY_EVOLVING, DTRACE_CLASS_ISA }, +}; + sdt_provider_t sdt_providers[] = { { "vtrace", "__vtrace_", &vtrace_attr, 0 }, { "sysinfo", "__cpu_sysinfo_", &info_attr, 0 }, @@ -94,9 +108,12 @@ sdt_provider_t sdt_providers[] = { { "ip", "__ip_", &stab_attr, 0 }, { "mib", "__mib_", &stab_attr, 0 }, { "fsinfo", "__fsinfo_", &fsinfo_attr, 0 }, + { "iscsi", "__iscsi_", &iscsi_attr, 0 }, { "nfsv3", "__nfsv3_", &stab_attr, 0 }, { "nfsv4", "__nfsv4_", &stab_attr, 0 }, { "xpv", "__xpv_", &xpv_attr, 0 }, + { "fc", "__fc_", &fc_attr, 0 }, + { "srp", "__srp_", &fc_attr, 0 }, { "sysevent", "__sysevent_", &stab_attr, 0 }, { "sdt", NULL, &sdt_attr, 0 }, { NULL } @@ -170,6 +187,73 @@ sdt_argdesc_t sdt_args[] = { { "fsinfo", NULL, 0, 0, "vnode_t *", "fileinfo_t *" }, { "fsinfo", NULL, 1, 1, "int", "int" }, + { "iscsi", "async-send", 0, 0, "idm_conn_t *", "conninfo_t *" }, + { "iscsi", "async-send", 1, 1, "iscsi_async_evt_hdr_t *", + "iscsiinfo_t *" }, + { "iscsi", "login-command", 0, 0, "idm_conn_t *", "conninfo_t *" }, + { "iscsi", "login-command", 1, 1, "iscsi_login_hdr_t *", + "iscsiinfo_t *" }, + { "iscsi", "login-response", 0, 0, "idm_conn_t *", "conninfo_t *" }, + { "iscsi", "login-response", 1, 1, "iscsi_login_rsp_hdr_t *", + "iscsiinfo_t *" }, + { "iscsi", "logout-command", 0, 0, "idm_conn_t *", "conninfo_t *" }, + { "iscsi", "logout-command", 1, 1, "iscsi_logout_hdr_t *", + "iscsiinfo_t *" }, + { "iscsi", "logout-response", 0, 0, "idm_conn_t *", "conninfo_t *" }, + { "iscsi", "logout-response", 1, 1, "iscsi_logout_rsp_hdr_t *", + "iscsiinfo_t *" }, + { "iscsi", "data-request", 0, 0, "idm_conn_t *", "conninfo_t *" }, + { "iscsi", "data-request", 1, 1, "iscsi_rtt_hdr_t *", + "iscsiinfo_t *" }, + { "iscsi", "data-send", 0, 0, "idm_conn_t *", "conninfo_t *" }, + { "iscsi", "data-send", 1, 1, "iscsi_data_rsp_hdr_t *", + "iscsiinfo_t *" }, + { "iscsi", "data-receive", 0, 0, "idm_conn_t *", "conninfo_t *" }, + { "iscsi", "data-receive", 1, 1, "iscsi_data_hdr_t *", + "iscsiinfo_t *" }, + { "iscsi", "nop-send", 0, 0, "idm_conn_t *", "conninfo_t *" }, + { "iscsi", "nop-send", 1, 1, "iscsi_nop_in_hdr_t *", "iscsiinfo_t *" }, + { "iscsi", "nop-receive", 0, 0, "idm_conn_t *", 
"conninfo_t *" }, + { "iscsi", "nop-receive", 1, 1, "iscsi_nop_out_hdr_t *", + "iscsiinfo_t *" }, + { "iscsi", "scsi-command", 0, 0, "idm_conn_t *", "conninfo_t *" }, + { "iscsi", "scsi-command", 1, 1, "iscsi_scsi_cmd_hdr_t *", + "iscsiinfo_t *" }, + { "iscsi", "scsi-command", 2, 2, "scsi_task_t *", "scsicmd_t *" }, + { "iscsi", "scsi-response", 0, 0, "idm_conn_t *", "conninfo_t *" }, + { "iscsi", "scsi-response", 1, 1, "iscsi_scsi_rsp_hdr_t *", + "iscsiinfo_t *" }, + { "iscsi", "task-command", 0, 0, "idm_conn_t *", "conninfo_t *" }, + { "iscsi", "task-command", 1, 1, "iscsi_scsi_task_mgt_hdr_t *", + "iscsiinfo_t *" }, + { "iscsi", "task-response", 0, 0, "idm_conn_t *", "conninfo_t *" }, + { "iscsi", "task-response", 1, 1, "iscsi_scsi_task_mgt_rsp_hdr_t *", + "iscsiinfo_t *" }, + { "iscsi", "text-command", 0, 0, "idm_conn_t *", "conninfo_t *" }, + { "iscsi", "text-command", 1, 1, "iscsi_text_hdr_t *", + "iscsiinfo_t *" }, + { "iscsi", "text-response", 0, 0, "idm_conn_t *", "conninfo_t *" }, + { "iscsi", "text-response", 1, 1, "iscsi_text_rsp_hdr_t *", + "iscsiinfo_t *" }, + { "iscsi", "xfer-start", 0, 0, "idm_conn_t *", "conninfo_t *" }, + { "iscsi", "xfer-start", 1, 0, "idm_conn_t *", "iscsiinfo_t *" }, + { "iscsi", "xfer-start", 2, 1, "uintptr_t", "xferinfo_t *" }, + { "iscsi", "xfer-start", 3, 2, "uint32_t"}, + { "iscsi", "xfer-start", 4, 3, "uintptr_t"}, + { "iscsi", "xfer-start", 5, 4, "uint32_t"}, + { "iscsi", "xfer-start", 6, 5, "uint32_t"}, + { "iscsi", "xfer-start", 7, 6, "uint32_t"}, + { "iscsi", "xfer-start", 8, 7, "int"}, + { "iscsi", "xfer-done", 0, 0, "idm_conn_t *", "conninfo_t *" }, + { "iscsi", "xfer-done", 1, 0, "idm_conn_t *", "iscsiinfo_t *" }, + { "iscsi", "xfer-done", 2, 1, "uintptr_t", "xferinfo_t *" }, + { "iscsi", "xfer-done", 3, 2, "uint32_t"}, + { "iscsi", "xfer-done", 4, 3, "uintptr_t"}, + { "iscsi", "xfer-done", 5, 4, "uint32_t"}, + { "iscsi", "xfer-done", 6, 5, "uint32_t"}, + { "iscsi", "xfer-done", 7, 6, "uint32_t"}, + { "iscsi", "xfer-done", 8, 7, "int"}, + { "nfsv3", "op-getattr-start", 0, 0, "struct svc_req *", "conninfo_t *" }, { "nfsv3", "op-getattr-start", 1, 1, "nfsv3oparg_t *", @@ -864,6 +948,154 @@ sdt_argdesc_t sdt_args[] = { { "xpv", "setvcpucontext-end", 0, 0, "int" }, { "xpv", "setvcpucontext-start", 0, 0, "domid_t" }, { "xpv", "setvcpucontext-start", 1, 1, "vcpu_guest_context_t *" }, + + { "srp", "service-up", 0, 0, "srpt_session_t *", "conninfo_t *" }, + { "srp", "service-up", 1, 0, "srpt_session_t *", "srp_portinfo_t *" }, + { "srp", "service-down", 0, 0, "srpt_session_t *", "conninfo_t *" }, + { "srp", "service-down", 1, 0, "srpt_session_t *", + "srp_portinfo_t *" }, + { "srp", "login-command", 0, 0, "srpt_session_t *", "conninfo_t *" }, + { "srp", "login-command", 1, 0, "srpt_session_t *", + "srp_portinfo_t *" }, + { "srp", "login-command", 2, 1, "srp_login_req_t *", + "srp_logininfo_t *" }, + { "srp", "login-response", 0, 0, "srpt_session_t *", "conninfo_t *" }, + { "srp", "login-response", 1, 0, "srpt_session_t *", + "srp_portinfo_t *" }, + { "srp", "login-response", 2, 1, "srp_login_rsp_t *", + "srp_logininfo_t *" }, + { "srp", "login-response", 3, 2, "srp_login_rej_t *" }, + { "srp", "logout-command", 0, 0, "srpt_channel_t *", "conninfo_t *" }, + { "srp", "logout-command", 1, 0, "srpt_channel_t *", + "srp_portinfo_t *" }, + { "srp", "task-command", 0, 0, "srpt_channel_t *", "conninfo_t *" }, + { "srp", "task-command", 1, 0, "srpt_channel_t *", + "srp_portinfo_t *" }, + { "srp", "task-command", 2, 1, "srp_cmd_req_t *", "srp_taskinfo_t *" 
}, + { "srp", "task-response", 0, 0, "srpt_channel_t *", "conninfo_t *" }, + { "srp", "task-response", 1, 0, "srpt_channel_t *", + "srp_portinfo_t *" }, + { "srp", "task-response", 2, 1, "srp_rsp_t *", "srp_taskinfo_t *" }, + { "srp", "task-response", 3, 2, "scsi_task_t *" }, + { "srp", "task-response", 4, 3, "int8_t" }, + { "srp", "scsi-command", 0, 0, "srpt_channel_t *", "conninfo_t *" }, + { "srp", "scsi-command", 1, 0, "srpt_channel_t *", + "srp_portinfo_t *" }, + { "srp", "scsi-command", 2, 1, "scsi_task_t *", "scsicmd_t *" }, + { "srp", "scsi-command", 3, 2, "srp_cmd_req_t *", "srp_taskinfo_t *" }, + { "srp", "scsi-response", 0, 0, "srpt_channel_t *", "conninfo_t *" }, + { "srp", "scsi-response", 1, 0, "srpt_channel_t *", + "srp_portinfo_t *" }, + { "srp", "scsi-response", 2, 1, "srp_rsp_t *", "srp_taskinfo_t *" }, + { "srp", "scsi-response", 3, 2, "scsi_task_t *" }, + { "srp", "scsi-response", 4, 3, "int8_t" }, + { "srp", "xfer-start", 0, 0, "srpt_channel_t *", "conninfo_t *" }, + { "srp", "xfer-start", 1, 0, "srpt_channel_t *", + "srp_portinfo_t *" }, + { "srp", "xfer-start", 2, 1, "ibt_wr_ds_t *", "xferinfo_t *" }, + { "srp", "xfer-start", 3, 2, "srpt_iu_t *", "srp_taskinfo_t *" }, + { "srp", "xfer-start", 4, 3, "ibt_send_wr_t *"}, + { "srp", "xfer-start", 5, 4, "uint32_t" }, + { "srp", "xfer-start", 6, 5, "uint32_t" }, + { "srp", "xfer-start", 7, 6, "uint32_t" }, + { "srp", "xfer-start", 8, 7, "uint32_t" }, + { "srp", "xfer-done", 0, 0, "srpt_channel_t *", "conninfo_t *" }, + { "srp", "xfer-done", 1, 0, "srpt_channel_t *", + "srp_portinfo_t *" }, + { "srp", "xfer-done", 2, 1, "ibt_wr_ds_t *", "xferinfo_t *" }, + { "srp", "xfer-done", 3, 2, "srpt_iu_t *", "srp_taskinfo_t *" }, + { "srp", "xfer-done", 4, 3, "ibt_send_wr_t *"}, + { "srp", "xfer-done", 5, 4, "uint32_t" }, + { "srp", "xfer-done", 6, 5, "uint32_t" }, + { "srp", "xfer-done", 7, 6, "uint32_t" }, + { "srp", "xfer-done", 8, 7, "uint32_t" }, + + { "fc", "link-up", 0, 0, "fct_i_local_port_t *", "conninfo_t *" }, + { "fc", "link-down", 0, 0, "fct_i_local_port_t *", "conninfo_t *" }, + { "fc", "fabric-login-start", 0, 0, "fct_i_local_port_t *", + "conninfo_t *" }, + { "fc", "fabric-login-start", 1, 0, "fct_i_local_port_t *", + "fc_port_info_t *" }, + { "fc", "fabric-login-end", 0, 0, "fct_i_local_port_t *", + "conninfo_t *" }, + { "fc", "fabric-login-end", 1, 0, "fct_i_local_port_t *", + "fc_port_info_t *" }, + { "fc", "rport-login-start", 0, 0, "fct_cmd_t *", + "conninfo_t *" }, + { "fc", "rport-login-start", 1, 1, "fct_local_port_t *", + "fc_port_info_t *" }, + { "fc", "rport-login-start", 2, 2, "fct_i_remote_port_t *", + "fc_port_info_t *" }, + { "fc", "rport-login-start", 3, 3, "int", "int" }, + { "fc", "rport-login-end", 0, 0, "fct_cmd_t *", + "conninfo_t *" }, + { "fc", "rport-login-end", 1, 1, "fct_local_port_t *", + "fc_port_info_t *" }, + { "fc", "rport-login-end", 2, 2, "fct_i_remote_port_t *", + "fc_port_info_t *" }, + { "fc", "rport-login-end", 3, 3, "int", "int" }, + { "fc", "rport-login-end", 4, 4, "int", "int" }, + { "fc", "rport-logout-start", 0, 0, "fct_cmd_t *", + "conninfo_t *" }, + { "fc", "rport-logout-start", 1, 1, "fct_local_port_t *", + "fc_port_info_t *" }, + { "fc", "rport-logout-start", 2, 2, "fct_i_remote_port_t *", + "fc_port_info_t *" }, + { "fc", "rport-logout-start", 3, 3, "int", "int" }, + { "fc", "rport-logout-end", 0, 0, "fct_cmd_t *", + "conninfo_t *" }, + { "fc", "rport-logout-end", 1, 1, "fct_local_port_t *", + "fc_port_info_t *" }, + { "fc", "rport-logout-end", 2, 2, "fct_i_remote_port_t 
*", + "fc_port_info_t *" }, + { "fc", "rport-logout-end", 3, 3, "int", "int" }, + { "fc", "scsi-command", 0, 0, "fct_cmd_t *", + "conninfo_t *" }, + { "fc", "scsi-command", 1, 1, "fct_i_local_port_t *", + "fc_port_info_t *" }, + { "fc", "scsi-command", 2, 2, "scsi_task_t *", + "scsicmd_t *" }, + { "fc", "scsi-command", 3, 3, "fct_i_remote_port_t *", + "fc_port_info_t *" }, + { "fc", "scsi-response", 0, 0, "fct_cmd_t *", + "conninfo_t *" }, + { "fc", "scsi-response", 1, 1, "fct_i_local_port_t *", + "fc_port_info_t *" }, + { "fc", "scsi-response", 2, 2, "scsi_task_t *", + "scsicmd_t *" }, + { "fc", "scsi-response", 3, 3, "fct_i_remote_port_t *", + "fc_port_info_t *" }, + { "fc", "xfer-start", 0, 0, "fct_cmd_t *", + "conninfo_t *" }, + { "fc", "xfer-start", 1, 1, "fct_i_local_port_t *", + "fc_port_info_t *" }, + { "fc", "xfer-start", 2, 2, "scsi_task_t *", + "scsicmd_t *" }, + { "fc", "xfer-start", 3, 3, "fct_i_remote_port_t *", + "fc_port_info_t *" }, + { "fc", "xfer-start", 4, 4, "stmf_data_buf_t *", + "fc_xferinfo_t *" }, + { "fc", "xfer-done", 0, 0, "fct_cmd_t *", + "conninfo_t *" }, + { "fc", "xfer-done", 1, 1, "fct_i_local_port_t *", + "fc_port_info_t *" }, + { "fc", "xfer-done", 2, 2, "scsi_task_t *", + "scsicmd_t *" }, + { "fc", "xfer-done", 3, 3, "fct_i_remote_port_t *", + "fc_port_info_t *" }, + { "fc", "xfer-done", 4, 4, "stmf_data_buf_t *", + "fc_xferinfo_t *" }, + { "fc", "rscn-receive", 0, 0, "fct_i_local_port_t *", + "conninfo_t *" }, + { "fc", "rscn-receive", 1, 1, "int", "int"}, + { "fc", "abts-receive", 0, 0, "fct_cmd_t *", + "conninfo_t *" }, + { "fc", "abts-receive", 1, 1, "fct_i_local_port_t *", + "fc_port_info_t *" }, + { "fc", "abts-receive", 2, 2, "fct_i_remote_port_t *", + "fc_port_info_t *" }, + + { NULL } }; diff --git a/external/cddl/osnet/dist/uts/common/dtrace/systrace.c b/external/cddl/osnet/dist/uts/common/dtrace/systrace.c index fe7bee1ac85ff..b864041c450da 100644 --- a/external/cddl/osnet/dist/uts/common/dtrace/systrace.c +++ b/external/cddl/osnet/dist/uts/common/dtrace/systrace.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -140,7 +140,7 @@ systrace_destroy(void *arg, dtrace_id_t id, void *parg) } /*ARGSUSED*/ -static void +static int systrace_enable(void *arg, dtrace_id_t id, void *parg) { int sysnum = SYSTRACE_SYSNUM((uintptr_t)parg); @@ -161,7 +161,7 @@ systrace_enable(void *arg, dtrace_id_t id, void *parg) if (enabled) { ASSERT(sysent[sysnum].sy_callc == dtrace_systrace_syscall); - return; + return (0); } (void) casptr(&sysent[sysnum].sy_callc, @@ -172,6 +172,7 @@ systrace_enable(void *arg, dtrace_id_t id, void *parg) (void *)systrace_sysent32[sysnum].stsy_underlying, (void *)dtrace_systrace_syscall32); #endif + return (0); } /*ARGSUSED*/ diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/arc.c b/external/cddl/osnet/dist/uts/common/fs/zfs/arc.c index e6bba841ace5c..d0bf26e5f3b00 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/arc.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/arc.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -119,11 +119,11 @@ #include #include -#include #include #include #include #include +#include #ifdef _KERNEL #include #include @@ -132,6 +132,7 @@ #endif #include #include +#include static kmutex_t arc_reclaim_thr_lock; static kcondvar_t arc_reclaim_thr_cv; /* used to signal reclaim thr */ @@ -152,6 +153,12 @@ typedef enum arc_reclaim_strategy { /* number of seconds before growing cache again */ static int arc_grow_retry = 60; +/* shift of arc_c for calculating both min and max arc_p */ +static int arc_p_min_shift = 4; + +/* log2(fraction of arc to reclaim) */ +static int arc_shrink_shift = 5; + /* * minimum lifespan of a prefetch block in clock ticks * (initialized in arc_init()) @@ -171,7 +178,9 @@ static boolean_t arc_warm; uint64_t zfs_arc_max; uint64_t zfs_arc_min; uint64_t zfs_arc_meta_limit = 0; -int zfs_mdcomp_disable = 0; +int zfs_arc_grow_retry = 0; +int zfs_arc_shrink_shift = 0; +int zfs_arc_p_min_shift = 0; /* * Note that buffers can be in one of 6 states: @@ -239,6 +248,9 @@ typedef struct arc_stats { kstat_named_t arcstat_recycle_miss; kstat_named_t arcstat_mutex_miss; kstat_named_t arcstat_evict_skip; + kstat_named_t arcstat_evict_l2_cached; + kstat_named_t arcstat_evict_l2_eligible; + kstat_named_t arcstat_evict_l2_ineligible; kstat_named_t arcstat_hash_elements; kstat_named_t arcstat_hash_elements_max; kstat_named_t arcstat_hash_collisions; @@ -250,10 +262,14 @@ typedef struct arc_stats { kstat_named_t arcstat_c_max; kstat_named_t arcstat_size; kstat_named_t arcstat_hdr_size; + kstat_named_t arcstat_data_size; + kstat_named_t arcstat_other_size; kstat_named_t arcstat_l2_hits; kstat_named_t arcstat_l2_misses; kstat_named_t arcstat_l2_feeds; kstat_named_t arcstat_l2_rw_clash; + kstat_named_t arcstat_l2_read_bytes; + kstat_named_t arcstat_l2_write_bytes; kstat_named_t arcstat_l2_writes_sent; kstat_named_t arcstat_l2_writes_done; kstat_named_t arcstat_l2_writes_error; @@ -288,6 +304,9 @@ static arc_stats_t arc_stats = { { "recycle_miss", KSTAT_DATA_UINT64 }, { "mutex_miss", KSTAT_DATA_UINT64 }, { "evict_skip", KSTAT_DATA_UINT64 }, + { "evict_l2_cached", KSTAT_DATA_UINT64 }, + { "evict_l2_eligible", KSTAT_DATA_UINT64 }, + { "evict_l2_ineligible", KSTAT_DATA_UINT64 }, { "hash_elements", KSTAT_DATA_UINT64 }, { "hash_elements_max", KSTAT_DATA_UINT64 }, { "hash_collisions", KSTAT_DATA_UINT64 }, @@ -299,10 +318,14 @@ static arc_stats_t arc_stats = { { "c_max", KSTAT_DATA_UINT64 }, { "size", KSTAT_DATA_UINT64 }, { "hdr_size", KSTAT_DATA_UINT64 }, + { "data_size", KSTAT_DATA_UINT64 }, + { "other_size", KSTAT_DATA_UINT64 }, { "l2_hits", KSTAT_DATA_UINT64 }, { "l2_misses", KSTAT_DATA_UINT64 }, { "l2_feeds", KSTAT_DATA_UINT64 }, { "l2_rw_clash", KSTAT_DATA_UINT64 }, + { "l2_read_bytes", KSTAT_DATA_UINT64 }, + { "l2_write_bytes", KSTAT_DATA_UINT64 }, { "l2_writes_sent", KSTAT_DATA_UINT64 }, { "l2_writes_done", KSTAT_DATA_UINT64 }, { "l2_writes_error", KSTAT_DATA_UINT64 }, @@ -323,7 +346,7 @@ static arc_stats_t arc_stats = { #define ARCSTAT_INCR(stat, val) \ atomic_add_64(&arc_stats.stat.value.ui64, (val)); -#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) +#define ARCSTAT_BUMP(stat) ARCSTAT_INCR(stat, 1) #define ARCSTAT_BUMPDOWN(stat) ARCSTAT_INCR(stat, -1) #define ARCSTAT_MAX(stat, val) { \ @@ -357,7 +380,7 @@ static arc_stats_t arc_stats = { } kstat_t *arc_ksp; -static arc_state_t *arc_anon; +static arc_state_t *arc_anon; static arc_state_t *arc_mru; static arc_state_t *arc_mru_ghost; static arc_state_t *arc_mfu; @@ -380,6 +403,7 @@ static arc_state_t *arc_l2c_only; static int 
arc_no_grow; /* Don't try to grow cache size */ static uint64_t arc_tempreserve; +static uint64_t arc_loaned_bytes; static uint64_t arc_meta_used; static uint64_t arc_meta_limit; static uint64_t arc_meta_max = 0; @@ -425,7 +449,7 @@ struct arc_buf_hdr { /* immutable */ arc_buf_contents_t b_type; uint64_t b_size; - spa_t *b_spa; + uint64_t b_spa; /* protected by arc state mutex */ arc_state_t *b_state; @@ -447,7 +471,9 @@ static arc_buf_hdr_t arc_eviction_hdr; static void arc_get_data_buf(arc_buf_t *buf); static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock); static int arc_evict_needed(arc_buf_contents_t type); -static void arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes); +static void arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes); + +static boolean_t l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab); #define GHOST_STATE(state) \ ((state) == arc_mru_ghost || (state) == arc_mfu_ghost || \ @@ -471,11 +497,11 @@ static void arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes); #define ARC_L2_WRITING (1 << 16) /* L2ARC write in progress */ #define ARC_L2_EVICTED (1 << 17) /* evicted during I/O */ #define ARC_L2_WRITE_HEAD (1 << 18) /* head of write list */ -#define ARC_STORED (1 << 19) /* has been store()d to */ #define HDR_IN_HASH_TABLE(hdr) ((hdr)->b_flags & ARC_IN_HASH_TABLE) #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_IO_IN_PROGRESS) #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_IO_ERROR) +#define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_PREFETCH) #define HDR_FREED_IN_READ(hdr) ((hdr)->b_flags & ARC_FREED_IN_READ) #define HDR_BUF_AVAILABLE(hdr) ((hdr)->b_flags & ARC_BUF_AVAILABLE) #define HDR_FREE_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FREE_IN_PROGRESS) @@ -529,8 +555,9 @@ uint64_t zfs_crc64_table[256]; */ #define L2ARC_WRITE_SIZE (8 * 1024 * 1024) /* initial write max */ -#define L2ARC_HEADROOM 4 /* num of writes */ -#define L2ARC_FEED_SECS 1 /* caching interval */ +#define L2ARC_HEADROOM 2 /* num of writes */ +#define L2ARC_FEED_SECS 1 /* caching interval secs */ +#define L2ARC_FEED_MIN_MS 200 /* min caching interval ms */ #define l2arc_writes_sent ARCSTAT(arcstat_l2_writes_sent) #define l2arc_writes_done ARCSTAT(arcstat_l2_writes_done) @@ -542,7 +569,10 @@ uint64_t l2arc_write_max = L2ARC_WRITE_SIZE; /* default max write size */ uint64_t l2arc_write_boost = L2ARC_WRITE_SIZE; /* extra write during warmup */ uint64_t l2arc_headroom = L2ARC_HEADROOM; /* number of dev writes */ uint64_t l2arc_feed_secs = L2ARC_FEED_SECS; /* interval seconds */ +uint64_t l2arc_feed_min_ms = L2ARC_FEED_MIN_MS; /* min interval milliseconds */ boolean_t l2arc_noprefetch = B_TRUE; /* don't cache prefetch bufs */ +boolean_t l2arc_feed_again = B_TRUE; /* turbo warmup */ +boolean_t l2arc_norw = B_TRUE; /* no reads during writes */ /* * L2ARC Internals @@ -557,6 +587,7 @@ typedef struct l2arc_dev { uint64_t l2ad_end; /* last addr on device */ uint64_t l2ad_evict; /* last addr eviction reached */ boolean_t l2ad_first; /* first sweep through */ + boolean_t l2ad_writing; /* currently writing */ list_t *l2ad_buflist; /* buffer list */ list_node_t l2ad_node; /* device list node */ } l2arc_dev_t; @@ -587,7 +618,7 @@ typedef struct l2arc_write_callback { struct l2arc_buf_hdr { /* protected by arc_buf_hdr mutex */ l2arc_dev_t *b_dev; /* L2ARC device */ - daddr_t b_daddr; /* disk address, offset byte */ + uint64_t b_daddr; /* disk address, offset byte */ }; typedef struct l2arc_data_free { @@ -607,9 +638,8 @@ static void l2arc_hdr_stat_add(void); static 
void l2arc_hdr_stat_remove(void); static uint64_t -buf_hash(spa_t *spa, const dva_t *dva, uint64_t birth) +buf_hash(uint64_t spa, const dva_t *dva, uint64_t birth) { - uintptr_t spav = (uintptr_t)spa; uint8_t *vdva = (uint8_t *)dva; uint64_t crc = -1ULL; int i; @@ -619,7 +649,7 @@ buf_hash(spa_t *spa, const dva_t *dva, uint64_t birth) for (i = 0; i < sizeof (dva_t); i++) crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ vdva[i]) & 0xFF]; - crc ^= (spav>>8) ^ birth; + crc ^= (spa>>8) ^ birth; return (crc); } @@ -635,7 +665,7 @@ buf_hash(spa_t *spa, const dva_t *dva, uint64_t birth) ((buf)->b_birth == birth) && ((buf)->b_spa == spa) static arc_buf_hdr_t * -buf_hash_find(spa_t *spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp) +buf_hash_find(uint64_t spa, const dva_t *dva, uint64_t birth, kmutex_t **lockp) { uint64_t idx = BUF_HASH_INDEX(spa, dva, birth); kmutex_t *hash_lock = BUF_HASH_LOCK(idx); @@ -755,8 +785,8 @@ hdr_cons(void *vbuf, void *unused, int kmflag) refcount_create(&buf->b_refcnt); cv_init(&buf->b_cv, NULL, CV_DEFAULT, NULL); mutex_init(&buf->b_freeze_lock, NULL, MUTEX_DEFAULT, NULL); + arc_space_consume(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS); - ARCSTAT_INCR(arcstat_hdr_size, HDR_SIZE); return (0); } @@ -768,6 +798,8 @@ buf_cons(void *vbuf, void *unused, int kmflag) bzero(buf, sizeof (arc_buf_t)); rw_init(&buf->b_lock, NULL, RW_DEFAULT, NULL); + arc_space_consume(sizeof (arc_buf_t), ARC_SPACE_HDRS); + return (0); } @@ -781,11 +813,11 @@ hdr_dest(void *vbuf, void *unused) { arc_buf_hdr_t *buf = vbuf; + ASSERT(BUF_EMPTY(buf)); refcount_destroy(&buf->b_refcnt); cv_destroy(&buf->b_cv); mutex_destroy(&buf->b_freeze_lock); - - ARCSTAT_INCR(arcstat_hdr_size, -HDR_SIZE); + arc_space_return(sizeof (arc_buf_hdr_t), ARC_SPACE_HDRS); } /* ARGSUSED */ @@ -795,6 +827,7 @@ buf_dest(void *vbuf, void *unused) arc_buf_t *buf = vbuf; rw_destroy(&buf->b_lock); + arc_space_return(sizeof (arc_buf_t), ARC_SPACE_HDRS); } /* @@ -1004,6 +1037,8 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) ASSERT(new_state != old_state); ASSERT(refcnt == 0 || ab->b_datacnt > 0); ASSERT(ab->b_datacnt == 0 || !GHOST_STATE(new_state)); + ASSERT(ab->b_datacnt <= 1 || new_state != arc_anon); + ASSERT(ab->b_datacnt <= 1 || old_state != arc_anon); from_delta = to_delta = ab->b_datacnt * ab->b_size; @@ -1081,15 +1116,49 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *ab, kmutex_t *hash_lock) } void -arc_space_consume(uint64_t space) +arc_space_consume(uint64_t space, arc_space_type_t type) { + ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); + + switch (type) { + case ARC_SPACE_DATA: + ARCSTAT_INCR(arcstat_data_size, space); + break; + case ARC_SPACE_OTHER: + ARCSTAT_INCR(arcstat_other_size, space); + break; + case ARC_SPACE_HDRS: + ARCSTAT_INCR(arcstat_hdr_size, space); + break; + case ARC_SPACE_L2HDRS: + ARCSTAT_INCR(arcstat_l2_hdr_size, space); + break; + } + atomic_add_64(&arc_meta_used, space); atomic_add_64(&arc_size, space); } void -arc_space_return(uint64_t space) +arc_space_return(uint64_t space, arc_space_type_t type) { + ASSERT(type >= 0 && type < ARC_SPACE_NUMTYPES); + + switch (type) { + case ARC_SPACE_DATA: + ARCSTAT_INCR(arcstat_data_size, -space); + break; + case ARC_SPACE_OTHER: + ARCSTAT_INCR(arcstat_other_size, -space); + break; + case ARC_SPACE_HDRS: + ARCSTAT_INCR(arcstat_hdr_size, -space); + break; + case ARC_SPACE_L2HDRS: + ARCSTAT_INCR(arcstat_l2_hdr_size, -space); + break; + } + ASSERT(arc_meta_used >= space); if (arc_meta_max < arc_meta_used) arc_meta_max = 
arc_meta_used; @@ -1126,7 +1195,7 @@ arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type) ASSERT(BUF_EMPTY(hdr)); hdr->b_size = size; hdr->b_type = type; - hdr->b_spa = spa; + hdr->b_spa = spa_guid(spa); hdr->b_state = arc_anon; hdr->b_arc_access = 0; buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); @@ -1145,6 +1214,58 @@ arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type) return (buf); } +static char *arc_onloan_tag = "onloan"; + +/* + * Loan out an anonymous arc buffer. Loaned buffers are not counted as in + * flight data by arc_tempreserve_space() until they are "returned". Loaned + * buffers must be returned to the arc before they can be used by the DMU or + * freed. + */ +arc_buf_t * +arc_loan_buf(spa_t *spa, int size) +{ + arc_buf_t *buf; + + buf = arc_buf_alloc(spa, size, arc_onloan_tag, ARC_BUFC_DATA); + + atomic_add_64(&arc_loaned_bytes, size); + return (buf); +} + +/* + * Return a loaned arc buffer to the arc. + */ +void +arc_return_buf(arc_buf_t *buf, void *tag) +{ + arc_buf_hdr_t *hdr = buf->b_hdr; + + ASSERT(buf->b_data != NULL); + (void) refcount_add(&hdr->b_refcnt, tag); + (void) refcount_remove(&hdr->b_refcnt, arc_onloan_tag); + + atomic_add_64(&arc_loaned_bytes, -hdr->b_size); +} + +/* Detach an arc_buf from a dbuf (tag) */ +void +arc_loan_inuse_buf(arc_buf_t *buf, void *tag) +{ + arc_buf_hdr_t *hdr; + + rw_enter(&buf->b_lock, RW_WRITER); + ASSERT(buf->b_data != NULL); + hdr = buf->b_hdr; + (void) refcount_add(&hdr->b_refcnt, arc_onloan_tag); + (void) refcount_remove(&hdr->b_refcnt, tag); + buf->b_efunc = NULL; + buf->b_private = NULL; + + atomic_add_64(&arc_loaned_bytes, hdr->b_size); + rw_exit(&buf->b_lock); +} + static arc_buf_t * arc_buf_clone(arc_buf_t *from) { @@ -1152,6 +1273,8 @@ arc_buf_clone(arc_buf_t *from) arc_buf_hdr_t *hdr = from->b_hdr; uint64_t size = hdr->b_size; + ASSERT(hdr->b_state != arc_anon); + buf = kmem_cache_alloc(buf_cache, KM_PUSHPAGE); buf->b_hdr = hdr; buf->b_data = NULL; @@ -1189,6 +1312,7 @@ arc_buf_add_ref(arc_buf_t *buf, void* tag) ASSERT(hdr->b_state == arc_mru || hdr->b_state == arc_mfu); add_reference(hdr, hash_lock, tag); + DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); arc_access(hdr, hash_lock); mutex_exit(hash_lock); ARCSTAT_BUMP(arcstat_hits); @@ -1232,15 +1356,17 @@ arc_buf_destroy(arc_buf_t *buf, boolean_t recycle, boolean_t all) arc_buf_contents_t type = buf->b_hdr->b_type; arc_cksum_verify(buf); + if (!recycle) { if (type == ARC_BUFC_METADATA) { arc_buf_data_free(buf->b_hdr, zio_buf_free, buf->b_data, size); - arc_space_return(size); + arc_space_return(size, ARC_SPACE_DATA); } else { ASSERT(type == ARC_BUFC_DATA); arc_buf_data_free(buf->b_hdr, zio_data_buf_free, buf->b_data, size); + ARCSTAT_INCR(arcstat_data_size, -size); atomic_add_64(&arc_size, -size); } } @@ -1282,34 +1408,36 @@ arc_hdr_destroy(arc_buf_hdr_t *hdr) ASSERT(refcount_is_zero(&hdr->b_refcnt)); ASSERT3P(hdr->b_state, ==, arc_anon); ASSERT(!HDR_IO_IN_PROGRESS(hdr)); - ASSERT(!(hdr->b_flags & ARC_STORED)); + l2arc_buf_hdr_t *l2hdr = hdr->b_l2hdr; - if (hdr->b_l2hdr != NULL) { - if (!MUTEX_HELD(&l2arc_buflist_mtx)) { - /* - * To prevent arc_free() and l2arc_evict() from - * attempting to free the same buffer at the same time, - * a FREE_IN_PROGRESS flag is given to arc_free() to - * give it priority. l2arc_evict() can't destroy this - * header while we are waiting on l2arc_buflist_mtx. - * - * The hdr may be removed from l2ad_buflist before we - * grab l2arc_buflist_mtx, so b_l2hdr is rechecked. 
- */ + if (l2hdr != NULL) { + boolean_t buflist_held = MUTEX_HELD(&l2arc_buflist_mtx); + /* + * To prevent arc_free() and l2arc_evict() from + * attempting to free the same buffer at the same time, + * a FREE_IN_PROGRESS flag is given to arc_free() to + * give it priority. l2arc_evict() can't destroy this + * header while we are waiting on l2arc_buflist_mtx. + * + * The hdr may be removed from l2ad_buflist before we + * grab l2arc_buflist_mtx, so b_l2hdr is rechecked. + */ + if (!buflist_held) { mutex_enter(&l2arc_buflist_mtx); - if (hdr->b_l2hdr != NULL) { - list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist, - hdr); - } - mutex_exit(&l2arc_buflist_mtx); - } else { - list_remove(hdr->b_l2hdr->b_dev->l2ad_buflist, hdr); + l2hdr = hdr->b_l2hdr; } - ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); - kmem_free(hdr->b_l2hdr, sizeof (l2arc_buf_hdr_t)); - if (hdr->b_state == arc_l2c_only) - l2arc_hdr_stat_remove(); - hdr->b_l2hdr = NULL; + + if (l2hdr != NULL) { + list_remove(l2hdr->b_dev->l2ad_buflist, hdr); + ARCSTAT_INCR(arcstat_l2_size, -hdr->b_size); + kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); + if (hdr->b_state == arc_l2c_only) + l2arc_hdr_stat_remove(); + hdr->b_l2hdr = NULL; + } + + if (!buflist_held) + mutex_exit(&l2arc_buflist_mtx); } if (!BUF_EMPTY(hdr)) { @@ -1361,10 +1489,13 @@ arc_buf_free(arc_buf_t *buf, void *tag) mutex_enter(hash_lock); (void) remove_reference(hdr, hash_lock, tag); - if (hdr->b_datacnt > 1) + if (hdr->b_datacnt > 1) { arc_buf_destroy(buf, FALSE, TRUE); - else + } else { + ASSERT(buf == hdr->b_buf); + ASSERT(buf->b_efunc == NULL); hdr->b_flags |= ARC_BUF_AVAILABLE; + } mutex_exit(hash_lock); } else if (HDR_IO_IN_PROGRESS(hdr)) { int destroy_hdr; @@ -1398,6 +1529,7 @@ arc_buf_remove_ref(arc_buf_t *buf, void* tag) int no_callback = (buf->b_efunc == NULL); if (hdr->b_state == arc_anon) { + ASSERT(hdr->b_datacnt == 1); arc_buf_free(buf, tag); return (no_callback); } @@ -1412,6 +1544,7 @@ arc_buf_remove_ref(arc_buf_t *buf, void* tag) arc_buf_destroy(buf, FALSE, TRUE); } else if (no_callback) { ASSERT(hdr->b_buf == buf && buf->b_next == NULL); + ASSERT(buf->b_efunc == NULL); hdr->b_flags |= ARC_BUF_AVAILABLE; } ASSERT(no_callback || hdr->b_datacnt > 1 || @@ -1440,7 +1573,7 @@ arc_buf_size(arc_buf_t *buf) * It may also return without evicting as much space as requested. */ static void * -arc_evict(arc_state_t *state, spa_t *spa, int64_t bytes, boolean_t recycle, +arc_evict(arc_state_t *state, uint64_t spa, int64_t bytes, boolean_t recycle, arc_buf_contents_t type) { arc_state_t *evicted_state; @@ -1464,7 +1597,8 @@ arc_evict(arc_state_t *state, spa_t *spa, int64_t bytes, boolean_t recycle, if (HDR_IO_IN_PROGRESS(ab) || (spa && ab->b_spa != spa) || (ab->b_flags & (ARC_PREFETCH|ARC_INDIRECT) && - lbolt - ab->b_arc_access < arc_min_prefetch_lifespan)) { + ddi_get_lbolt() - ab->b_arc_access < + arc_min_prefetch_lifespan)) { skipped++; continue; } @@ -1508,6 +1642,21 @@ arc_evict(arc_state_t *state, spa_t *spa, int64_t bytes, boolean_t recycle, buf->b_data == stolen, TRUE); } } + + if (ab->b_l2hdr) { + ARCSTAT_INCR(arcstat_evict_l2_cached, + ab->b_size); + } else { + if (l2arc_write_eligible(ab->b_spa, ab)) { + ARCSTAT_INCR(arcstat_evict_l2_eligible, + ab->b_size); + } else { + ARCSTAT_INCR( + arcstat_evict_l2_ineligible, + ab->b_size); + } + } + if (ab->b_datacnt == 0) { arc_change_state(evicted_state, ab, hash_lock); ASSERT(HDR_IN_HASH_TABLE(ab)); @@ -1566,13 +1715,14 @@ arc_evict(arc_state_t *state, spa_t *spa, int64_t bytes, boolean_t recycle, * bytes. 
Destroy the buffers that are removed. */ static void -arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes) +arc_evict_ghost(arc_state_t *state, uint64_t spa, int64_t bytes) { arc_buf_hdr_t *ab, *ab_prev; list_t *list = &state->arcs_list[ARC_BUFC_DATA]; kmutex_t *hash_lock; uint64_t bytes_deleted = 0; uint64_t bufs_skipped = 0; + boolean_t have_lock; ASSERT(GHOST_STATE(state)); top: @@ -1582,7 +1732,8 @@ arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes) if (spa && ab->b_spa != spa) continue; hash_lock = HDR_LOCK(ab); - if (mutex_tryenter(hash_lock)) { + have_lock = MUTEX_HELD(hash_lock); + if (have_lock || mutex_tryenter(hash_lock)) { ASSERT(!HDR_IO_IN_PROGRESS(ab)); ASSERT(ab->b_buf == NULL); ARCSTAT_BUMP(arcstat_deleted); @@ -1594,10 +1745,12 @@ arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes) * don't destroy the header. */ arc_change_state(arc_l2c_only, ab, hash_lock); - mutex_exit(hash_lock); + if (!have_lock) + mutex_exit(hash_lock); } else { arc_change_state(arc_anon, ab, hash_lock); - mutex_exit(hash_lock); + if (!have_lock) + mutex_exit(hash_lock); arc_hdr_destroy(ab); } @@ -1635,61 +1788,63 @@ arc_evict_ghost(arc_state_t *state, spa_t *spa, int64_t bytes) static void arc_adjust(void) { - int64_t top_sz, mru_over, arc_over, todelete; + int64_t adjustment, delta; - top_sz = arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used; + /* + * Adjust MRU size + */ + + adjustment = MIN(arc_size - arc_c, + arc_anon->arcs_size + arc_mru->arcs_size + arc_meta_used - arc_p); - if (top_sz > arc_p && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) { - int64_t toevict = - MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], top_sz - arc_p); - (void) arc_evict(arc_mru, NULL, toevict, FALSE, ARC_BUFC_DATA); - top_sz = arc_anon->arcs_size + arc_mru->arcs_size; + if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_DATA] > 0) { + delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_DATA], adjustment); + (void) arc_evict(arc_mru, NULL, delta, FALSE, ARC_BUFC_DATA); + adjustment -= delta; } - if (top_sz > arc_p && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { - int64_t toevict = - MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], top_sz - arc_p); - (void) arc_evict(arc_mru, NULL, toevict, FALSE, + if (adjustment > 0 && arc_mru->arcs_lsize[ARC_BUFC_METADATA] > 0) { + delta = MIN(arc_mru->arcs_lsize[ARC_BUFC_METADATA], adjustment); + (void) arc_evict(arc_mru, NULL, delta, FALSE, ARC_BUFC_METADATA); - top_sz = arc_anon->arcs_size + arc_mru->arcs_size; } - mru_over = top_sz + arc_mru_ghost->arcs_size - arc_c; + /* + * Adjust MFU size + */ - if (mru_over > 0) { - if (arc_mru_ghost->arcs_size > 0) { - todelete = MIN(arc_mru_ghost->arcs_size, mru_over); - arc_evict_ghost(arc_mru_ghost, NULL, todelete); - } + adjustment = arc_size - arc_c; + + if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) { + delta = MIN(adjustment, arc_mfu->arcs_lsize[ARC_BUFC_DATA]); + (void) arc_evict(arc_mfu, NULL, delta, FALSE, ARC_BUFC_DATA); + adjustment -= delta; } - if ((arc_over = arc_size - arc_c) > 0) { - int64_t tbl_over; + if (adjustment > 0 && arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { + int64_t delta = MIN(adjustment, + arc_mfu->arcs_lsize[ARC_BUFC_METADATA]); + (void) arc_evict(arc_mfu, NULL, delta, FALSE, + ARC_BUFC_METADATA); + } - if (arc_mfu->arcs_lsize[ARC_BUFC_DATA] > 0) { - int64_t toevict = - MIN(arc_mfu->arcs_lsize[ARC_BUFC_DATA], arc_over); - (void) arc_evict(arc_mfu, NULL, toevict, FALSE, - ARC_BUFC_DATA); - arc_over = arc_size - arc_c; - } + /* + * Adjust ghost lists + */ - if (arc_over > 0 && - 
arc_mfu->arcs_lsize[ARC_BUFC_METADATA] > 0) { - int64_t toevict = - MIN(arc_mfu->arcs_lsize[ARC_BUFC_METADATA], - arc_over); - (void) arc_evict(arc_mfu, NULL, toevict, FALSE, - ARC_BUFC_METADATA); - } + adjustment = arc_mru->arcs_size + arc_mru_ghost->arcs_size - arc_c; - tbl_over = arc_size + arc_mru_ghost->arcs_size + - arc_mfu_ghost->arcs_size - arc_c * 2; + if (adjustment > 0 && arc_mru_ghost->arcs_size > 0) { + delta = MIN(arc_mru_ghost->arcs_size, adjustment); + arc_evict_ghost(arc_mru_ghost, NULL, delta); + } - if (tbl_over > 0 && arc_mfu_ghost->arcs_size > 0) { - todelete = MIN(arc_mfu_ghost->arcs_size, tbl_over); - arc_evict_ghost(arc_mfu_ghost, NULL, todelete); - } + adjustment = + arc_mru_ghost->arcs_size + arc_mfu_ghost->arcs_size - arc_c; + + if (adjustment > 0 && arc_mfu_ghost->arcs_size > 0) { + delta = MIN(arc_mfu_ghost->arcs_size, adjustment); + arc_evict_ghost(arc_mfu_ghost, NULL, delta); } } @@ -1723,29 +1878,34 @@ arc_do_user_evicts(void) void arc_flush(spa_t *spa) { + uint64_t guid = 0; + + if (spa) + guid = spa_guid(spa); + while (list_head(&arc_mru->arcs_list[ARC_BUFC_DATA])) { - (void) arc_evict(arc_mru, spa, -1, FALSE, ARC_BUFC_DATA); + (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_DATA); if (spa) break; } while (list_head(&arc_mru->arcs_list[ARC_BUFC_METADATA])) { - (void) arc_evict(arc_mru, spa, -1, FALSE, ARC_BUFC_METADATA); + (void) arc_evict(arc_mru, guid, -1, FALSE, ARC_BUFC_METADATA); if (spa) break; } while (list_head(&arc_mfu->arcs_list[ARC_BUFC_DATA])) { - (void) arc_evict(arc_mfu, spa, -1, FALSE, ARC_BUFC_DATA); + (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_DATA); if (spa) break; } while (list_head(&arc_mfu->arcs_list[ARC_BUFC_METADATA])) { - (void) arc_evict(arc_mfu, spa, -1, FALSE, ARC_BUFC_METADATA); + (void) arc_evict(arc_mfu, guid, -1, FALSE, ARC_BUFC_METADATA); if (spa) break; } - arc_evict_ghost(arc_mru_ghost, spa, -1); - arc_evict_ghost(arc_mfu_ghost, spa, -1); + arc_evict_ghost(arc_mru_ghost, guid, -1); + arc_evict_ghost(arc_mfu_ghost, guid, -1); mutex_enter(&arc_reclaim_thr_lock); arc_do_user_evicts(); @@ -1753,8 +1913,6 @@ arc_flush(spa_t *spa) ASSERT(spa || arc_eviction_list == NULL); } -int arc_shrink_shift = 5; /* log2(fraction of arc to reclaim) */ - void arc_shrink(void) { @@ -1915,12 +2073,12 @@ arc_reclaim_thread(void) } /* reset the growth delay for every reclaim */ - growtime = lbolt + (arc_grow_retry * hz); + growtime = ddi_get_lbolt() + (arc_grow_retry * hz); arc_kmem_reap_now(last_reclaim); arc_warm = B_TRUE; - } else if (arc_no_grow && lbolt >= growtime) { + } else if (arc_no_grow && ddi_get_lbolt() >= growtime) { arc_no_grow = FALSE; } @@ -1934,7 +2092,7 @@ arc_reclaim_thread(void) /* block until needed, or one second, whichever is shorter */ CALLB_CPR_SAFE_BEGIN(&cpr); (void) cv_timedwait(&arc_reclaim_thr_cv, - &arc_reclaim_thr_lock, (lbolt + hz)); + &arc_reclaim_thr_lock, (ddi_get_lbolt() + hz)); CALLB_CPR_SAFE_END(&cpr, &arc_reclaim_thr_lock); } @@ -1953,6 +2111,7 @@ static void arc_adapt(int bytes, arc_state_t *state) { int mult; + uint64_t arc_p_min = (arc_c >> arc_p_min_shift); if (state == arc_l2c_only) return; @@ -1970,12 +2129,15 @@ arc_adapt(int bytes, arc_state_t *state) mult = ((arc_mru_ghost->arcs_size >= arc_mfu_ghost->arcs_size) ? 
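/*
 * Worked example of the rewritten arc_adjust() above, using purely
 * hypothetical sizes: arc_c = 8 GB, arc_p = 3 GB, arc_size = 9 GB, and
 * 4 GB on the anon + MRU lists plus metadata.  The MRU pass evicts
 * MIN(arc_size - arc_c, anon + mru + meta - arc_p) = MIN(1 GB, 1 GB) =
 * 1 GB, taking data buffers first and metadata only for the remainder.
 * The MFU pass then re-reads arc_size; if the MRU pass freed the full
 * 1 GB, the MFU lists are left alone.  Finally the ghost passes trim
 * until mru + mru_ghost <= arc_c and mru_ghost + mfu_ghost <= arc_c.
 */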
1 : (arc_mfu_ghost->arcs_size/arc_mru_ghost->arcs_size)); - arc_p = MIN(arc_c, arc_p + bytes * mult); + arc_p = MIN(arc_c - arc_p_min, arc_p + bytes * mult); } else if (state == arc_mfu_ghost) { + uint64_t delta; + mult = ((arc_mfu_ghost->arcs_size >= arc_mru_ghost->arcs_size) ? 1 : (arc_mru_ghost->arcs_size/arc_mfu_ghost->arcs_size)); - arc_p = MAX(0, (int64_t)arc_p - bytes * mult); + delta = MIN(bytes * mult, arc_p); + arc_p = MAX(arc_p_min, arc_p - delta); } ASSERT((int64_t)arc_p >= 0); @@ -2073,10 +2235,11 @@ arc_get_data_buf(arc_buf_t *buf) if (!arc_evict_needed(type)) { if (type == ARC_BUFC_METADATA) { buf->b_data = zio_buf_alloc(size); - arc_space_consume(size); + arc_space_consume(size, ARC_SPACE_DATA); } else { ASSERT(type == ARC_BUFC_DATA); buf->b_data = zio_data_buf_alloc(size); + ARCSTAT_INCR(arcstat_data_size, size); atomic_add_64(&arc_size, size); } goto out; @@ -2093,21 +2256,22 @@ arc_get_data_buf(arc_buf_t *buf) if (state == arc_mru || state == arc_anon) { uint64_t mru_used = arc_anon->arcs_size + arc_mru->arcs_size; - state = (arc_mfu->arcs_lsize[type] > 0 && + state = (arc_mfu->arcs_lsize[type] >= size && arc_p > mru_used) ? arc_mfu : arc_mru; } else { /* MFU cases */ uint64_t mfu_space = arc_c - arc_p; - state = (arc_mru->arcs_lsize[type] > 0 && + state = (arc_mru->arcs_lsize[type] >= size && mfu_space > arc_mfu->arcs_size) ? arc_mru : arc_mfu; } if ((buf->b_data = arc_evict(state, NULL, size, TRUE, type)) == NULL) { if (type == ARC_BUFC_METADATA) { buf->b_data = zio_buf_alloc(size); - arc_space_consume(size); + arc_space_consume(size, ARC_SPACE_DATA); } else { ASSERT(type == ARC_BUFC_DATA); buf->b_data = zio_data_buf_alloc(size); + ARCSTAT_INCR(arcstat_data_size, size); atomic_add_64(&arc_size, size); } ARCSTAT_BUMP(arcstat_recycle_miss); @@ -2143,6 +2307,8 @@ arc_get_data_buf(arc_buf_t *buf) static void arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) { + clock_t now; + ASSERT(MUTEX_HELD(hash_lock)); if (buf->b_state == arc_anon) { @@ -2153,11 +2319,13 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) */ ASSERT(buf->b_arc_access == 0); - buf->b_arc_access = lbolt; + buf->b_arc_access = ddi_get_lbolt(); DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, buf); arc_change_state(arc_mru, buf, hash_lock); } else if (buf->b_state == arc_mru) { + now = ddi_get_lbolt(); + /* * If this buffer is here because of a prefetch, then either: * - clear the flag if this is a "referencing" read @@ -2173,7 +2341,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) buf->b_flags &= ~ARC_PREFETCH; ARCSTAT_BUMP(arcstat_mru_hits); } - buf->b_arc_access = lbolt; + buf->b_arc_access = now; return; } @@ -2182,13 +2350,13 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) * but it is still in the cache. Move it to the MFU * state. */ - if (lbolt > buf->b_arc_access + ARC_MINTIME) { + if (now > buf->b_arc_access + ARC_MINTIME) { /* * More than 125ms have passed since we * instantiated this buffer. Move it to the * most frequently used state. 
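/*
 * Sketch of the MRU hit-promotion rule in arc_access(): a second hit on
 * an MRU buffer only moves it to MFU once more than ARC_MINTIME ticks
 * (the "125ms" mentioned in the comment) have elapsed since
 * b_arc_access was last stamped, and all timestamps now come from
 * ddi_get_lbolt() rather than the lbolt global.
 */
static boolean_t
arc_mru_hit_promotes(arc_buf_hdr_t *buf)
{
	clock_t now = ddi_get_lbolt();

	return (now > buf->b_arc_access + ARC_MINTIME);
}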
*/ - buf->b_arc_access = lbolt; + buf->b_arc_access = now; DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); arc_change_state(arc_mfu, buf, hash_lock); } @@ -2211,7 +2379,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); } - buf->b_arc_access = lbolt; + buf->b_arc_access = ddi_get_lbolt(); arc_change_state(new_state, buf, hash_lock); ARCSTAT_BUMP(arcstat_mru_ghost_hits); @@ -2230,7 +2398,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) ASSERT(list_link_active(&buf->b_arc_node)); } ARCSTAT_BUMP(arcstat_mfu_hits); - buf->b_arc_access = lbolt; + buf->b_arc_access = ddi_get_lbolt(); } else if (buf->b_state == arc_mfu_ghost) { arc_state_t *new_state = arc_mfu; /* @@ -2248,7 +2416,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) new_state = arc_mru; } - buf->b_arc_access = lbolt; + buf->b_arc_access = ddi_get_lbolt(); DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); arc_change_state(new_state, buf, hash_lock); @@ -2258,7 +2426,7 @@ arc_access(arc_buf_hdr_t *buf, kmutex_t *hash_lock) * This buffer is on the 2nd Level ARC. */ - buf->b_arc_access = lbolt; + buf->b_arc_access = ddi_get_lbolt(); DTRACE_PROBE1(new_state__mfu, arc_buf_hdr_t *, buf); arc_change_state(arc_mfu, buf, hash_lock); } else { @@ -2309,7 +2477,7 @@ arc_read_done(zio_t *zio) * reason for it not to be found is if we were freed during the * read. */ - found = buf_hash_find(zio->io_spa, &hdr->b_dva, hdr->b_birth, + found = buf_hash_find(hdr->b_spa, &hdr->b_dva, hdr->b_birth, &hash_lock); ASSERT((found == NULL && HDR_FREED_IN_READ(hdr) && hash_lock == NULL) || @@ -2323,7 +2491,7 @@ arc_read_done(zio_t *zio) /* byteswap if necessary */ callback_list = hdr->b_acb; ASSERT(callback_list != NULL); - if (BP_SHOULD_BYTESWAP(zio->io_bp)) { + if (BP_SHOULD_BYTESWAP(zio->io_bp) && zio->io_error == 0) { arc_byteswap_func_t *func = BP_GET_LEVEL(zio->io_bp) > 0 ? byteswap_uint64_array : dmu_ot[BP_GET_TYPE(zio->io_bp)].ot_byteswap; @@ -2332,6 +2500,16 @@ arc_read_done(zio_t *zio) arc_cksum_compute(buf, B_FALSE); + if (hash_lock && zio->io_error == 0 && hdr->b_state == arc_anon) { + /* + * Only call arc_access on anonymous buffers. This is because + * if we've issued an I/O for an evicted buffer, we've already + * called arc_access (to prevent any simultaneous readers from + * getting confused). + */ + arc_access(hdr, hash_lock); + } + /* create copies of the data buffer for the callers */ abuf = buf; for (acb = callback_list; acb; acb = acb->acb_next) { @@ -2345,8 +2523,11 @@ arc_read_done(zio_t *zio) hdr->b_acb = NULL; hdr->b_flags &= ~ARC_IO_IN_PROGRESS; ASSERT(!HDR_BUF_AVAILABLE(hdr)); - if (abuf == buf) + if (abuf == buf) { + ASSERT(buf->b_efunc == NULL); + ASSERT(hdr->b_datacnt == 1); hdr->b_flags |= ARC_BUF_AVAILABLE; + } ASSERT(refcount_is_zero(&hdr->b_refcnt) || callback_list != NULL); @@ -2367,14 +2548,6 @@ arc_read_done(zio_t *zio) cv_broadcast(&hdr->b_cv); if (hash_lock) { - /* - * Only call arc_access on anonymous buffers. This is because - * if we've issued an I/O for an evicted buffer, we've already - * called arc_access (to prevent any simultaneous readers from - * getting confused). - */ - if (zio->io_error == 0 && hdr->b_state == arc_anon) - arc_access(hdr, hash_lock); mutex_exit(hash_lock); } else { /* @@ -2425,16 +2598,14 @@ arc_read_done(zio_t *zio) * * Normal callers should use arc_read and pass the arc buffer and offset * for the bp. But if you know you don't need locking, you can use - * arc_read_nolock. 
Callers cannot use a "done" function in a prefetch - * call (i.e., with ARC_NOWAIT set). + * arc_read_bp. */ int -arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf, +arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_buf_t *pbuf, arc_done_func_t *done, void *private, int priority, int zio_flags, uint32_t *arc_flags, const zbookmark_t *zb) { int err; - arc_buf_hdr_t *hdr = pbuf->b_hdr; ASSERT(!refcount_is_zero(&pbuf->b_hdr->b_refcnt)); ASSERT3U((char *)bp - (char *)pbuf->b_data, <, pbuf->b_hdr->b_size); @@ -2442,14 +2613,13 @@ arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf, err = arc_read_nolock(pio, spa, bp, done, private, priority, zio_flags, arc_flags, zb); - - ASSERT3P(hdr, ==, pbuf->b_hdr); rw_exit(&pbuf->b_lock); + return (err); } int -arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp, +arc_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, void *private, int priority, int zio_flags, uint32_t *arc_flags, const zbookmark_t *zb) { @@ -2457,9 +2627,11 @@ arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *buf; kmutex_t *hash_lock; zio_t *rzio; + uint64_t guid = spa_guid(spa); top: - hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock); + hdr = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp), + &hash_lock); if (hdr && hdr->b_datacnt > 0) { *arc_flags |= ARC_CACHED; @@ -2482,7 +2654,7 @@ arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp, acb->acb_private = private; if (pio != NULL) acb->acb_zio_dummy = zio_null(pio, - spa, NULL, NULL, zio_flags); + spa, NULL, NULL, NULL, zio_flags); ASSERT(acb->acb_done != NULL); acb->acb_next = hdr->b_acb; @@ -2513,6 +2685,7 @@ arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp, } else { buf = arc_buf_clone(buf); } + } else if (*arc_flags & ARC_PREFETCH && refcount_count(&hdr->b_refcnt) == 0) { hdr->b_flags |= ARC_PREFETCH; @@ -2533,7 +2706,8 @@ arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp, uint64_t size = BP_GET_LSIZE(bp); arc_callback_t *acb; vdev_t *vd = NULL; - daddr_t addr; + uint64_t addr; + boolean_t devw = B_FALSE; if (hdr == NULL) { /* this block is not in the cache */ @@ -2542,7 +2716,7 @@ arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp, buf = arc_buf_alloc(spa, size, private, type); hdr = buf->b_hdr; hdr->b_dva = *BP_IDENTITY(bp); - hdr->b_birth = bp->blk_birth; + hdr->b_birth = BP_PHYSICAL_BIRTH(bp); hdr->b_cksum0 = bp->blk_cksum.zc_word[0]; exists = buf_hash_insert(hdr, &hash_lock); if (exists) { @@ -2588,7 +2762,6 @@ arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_get_data_buf(buf); ASSERT(hdr->b_datacnt == 0); hdr->b_datacnt = 1; - } acb = kmem_zalloc(sizeof (arc_callback_t), KM_SLEEP); @@ -2612,6 +2785,7 @@ arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp, if (HDR_L2CACHE(hdr) && hdr->b_l2hdr != NULL && (vd = hdr->b_l2hdr->b_dev->l2ad_vdev) != NULL) { + devw = hdr->b_l2hdr->b_dev->l2ad_writing; addr = hdr->b_l2hdr->b_daddr; /* * Lock out device removal. 
@@ -2624,14 +2798,14 @@ arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp, mutex_exit(hash_lock); ASSERT3U(hdr->b_size, ==, size); - DTRACE_PROBE3(arc__miss, blkptr_t *, bp, uint64_t, size, - zbookmark_t *, zb); + DTRACE_PROBE4(arc__miss, arc_buf_hdr_t *, hdr, blkptr_t *, bp, + uint64_t, size, zbookmark_t *, zb); ARCSTAT_BUMP(arcstat_misses); ARCSTAT_CONDSTAT(!(hdr->b_flags & ARC_PREFETCH), demand, prefetch, hdr->b_type != ARC_BUFC_METADATA, data, metadata, misses); - if (vd != NULL) { + if (vd != NULL && l2arc_ndev != 0 && !(l2arc_norw && devw)) { /* * Read from the L2ARC if the following are true: * 1. The L2ARC vdev was previously cached. @@ -2639,9 +2813,11 @@ arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp, * 3. This buffer isn't currently writing to the L2ARC. * 4. The L2ARC entry wasn't evicted, which may * also have invalidated the vdev. + * 5. This isn't prefetch and l2arc_noprefetch is set. */ if (hdr->b_l2hdr != NULL && - !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr)) { + !HDR_L2_WRITING(hdr) && !HDR_L2_EVICTED(hdr) && + !(l2arc_noprefetch && HDR_PREFETCH(hdr))) { l2arc_read_callback_t *cb; DTRACE_PROBE1(l2arc__hit, arc_buf_hdr_t *, hdr); @@ -2667,6 +2843,7 @@ arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp, ZIO_FLAG_DONT_RETRY, B_FALSE); DTRACE_PROBE2(l2arc__read, vdev_t *, vd, zio_t *, rzio); + ARCSTAT_INCR(arcstat_l2_read_bytes, size); if (*arc_flags & ARC_NOWAIT) { zio_nowait(rzio); @@ -2686,6 +2863,14 @@ arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp, ARCSTAT_BUMP(arcstat_l2_rw_clash); spa_config_exit(spa, SCL_L2ARC, vd); } + } else { + if (vd != NULL) + spa_config_exit(spa, SCL_L2ARC, vd); + if (l2arc_ndev != 0) { + DTRACE_PROBE1(l2arc__miss, + arc_buf_hdr_t *, hdr); + ARCSTAT_BUMP(arcstat_l2_misses); + } } rzio = zio_read(pio, spa, bp, buf->b_data, size, @@ -2700,46 +2885,15 @@ arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp, return (0); } -/* - * arc_read() variant to support pool traversal. If the block is already - * in the ARC, make a copy of it; otherwise, the caller will do the I/O. - * The idea is that we don't want pool traversal filling up memory, but - * if the ARC already has the data anyway, we shouldn't pay for the I/O. 
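/*
 * Sketch of the combined L2ARC read-eligibility test that the hunk
 * above spells out (conditions 1 through 5 in its comment), assuming
 * vd and devw were captured from hdr->b_l2hdr while the hash lock was
 * held.  Only when every clause holds does arc_read_nolock() issue the
 * read against the cache device instead of the main pool.
 */
static boolean_t
l2arc_read_ok_example(arc_buf_hdr_t *hdr, vdev_t *vd, boolean_t devw)
{
	return (vd != NULL && l2arc_ndev != 0 &&
	    !(l2arc_norw && devw) &&		/* device not busy writing */
	    hdr->b_l2hdr != NULL &&		/* entry not evicted */
	    !HDR_L2_WRITING(hdr) &&
	    !HDR_L2_EVICTED(hdr) &&
	    !(l2arc_noprefetch && HDR_PREFETCH(hdr)));
}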
- */ -int -arc_tryread(spa_t *spa, blkptr_t *bp, void *data) -{ - arc_buf_hdr_t *hdr; - kmutex_t *hash_mtx; - int rc = 0; - - hdr = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_mtx); - - if (hdr && hdr->b_datacnt > 0 && !HDR_IO_IN_PROGRESS(hdr)) { - arc_buf_t *buf = hdr->b_buf; - - ASSERT(buf); - while (buf->b_data == NULL) { - buf = buf->b_next; - ASSERT(buf); - } - bcopy(buf->b_data, data, hdr->b_size); - } else { - rc = ENOENT; - } - - if (hash_mtx) - mutex_exit(hash_mtx); - - return (rc); -} - void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private) { ASSERT(buf->b_hdr != NULL); ASSERT(buf->b_hdr->b_state != arc_anon); ASSERT(!refcount_is_zero(&buf->b_hdr->b_refcnt) || func == NULL); + ASSERT(buf->b_efunc == NULL); + ASSERT(!HDR_BUF_AVAILABLE(buf->b_hdr)); + buf->b_efunc = func; buf->b_private = private; } @@ -2838,13 +2992,13 @@ arc_release(arc_buf_t *buf, void *tag) kmutex_t *hash_lock; l2arc_buf_hdr_t *l2hdr; uint64_t buf_size; + boolean_t released = B_FALSE; rw_enter(&buf->b_lock, RW_WRITER); hdr = buf->b_hdr; /* this buffer is not on any list */ ASSERT(refcount_count(&hdr->b_refcnt) > 0); - ASSERT(!(hdr->b_flags & ARC_STORED)); if (hdr->b_state == arc_anon) { /* this buffer is already released */ @@ -2853,12 +3007,12 @@ arc_release(arc_buf_t *buf, void *tag) ASSERT(buf->b_efunc == NULL); arc_buf_thaw(buf); rw_exit(&buf->b_lock); - return; + released = B_TRUE; + } else { + hash_lock = HDR_LOCK(hdr); + mutex_enter(hash_lock); } - hash_lock = HDR_LOCK(hdr); - mutex_enter(hash_lock); - l2hdr = hdr->b_l2hdr; if (l2hdr) { mutex_enter(&l2arc_buflist_mtx); @@ -2866,6 +3020,9 @@ arc_release(arc_buf_t *buf, void *tag) buf_size = hdr->b_size; } + if (released) + goto out; + /* * Do we have more than one buf? */ @@ -2873,7 +3030,7 @@ arc_release(arc_buf_t *buf, void *tag) arc_buf_hdr_t *nhdr; arc_buf_t **bufp; uint64_t blksz = hdr->b_size; - spa_t *spa = hdr->b_spa; + uint64_t spa = hdr->b_spa; arc_buf_contents_t type = hdr->b_type; uint32_t flags = hdr->b_flags; @@ -2933,6 +3090,7 @@ arc_release(arc_buf_t *buf, void *tag) buf->b_efunc = NULL; buf->b_private = NULL; +out: if (l2hdr) { list_remove(l2hdr->b_dev->l2ad_buflist, hdr); kmem_free(l2hdr, sizeof (l2arc_buf_hdr_t)); @@ -3011,11 +3169,16 @@ arc_write_done(zio_t *zio) arc_buf_t *buf = callback->awcb_buf; arc_buf_hdr_t *hdr = buf->b_hdr; - hdr->b_acb = NULL; + ASSERT(hdr->b_acb == NULL); + + if (zio->io_error == 0) { + hdr->b_dva = *BP_IDENTITY(zio->io_bp); + hdr->b_birth = BP_PHYSICAL_BIRTH(zio->io_bp); + hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0]; + } else { + ASSERT(BUF_EMPTY(hdr)); + } - hdr->b_dva = *BP_IDENTITY(zio->io_bp); - hdr->b_birth = zio->io_bp->blk_birth; - hdr->b_cksum0 = zio->io_bp->blk_cksum.zc_word[0]; /* * If the block to be written was all-zero, we may have * compressed it away. In this case no write was performed @@ -3026,6 +3189,8 @@ arc_write_done(zio_t *zio) arc_buf_hdr_t *exists; kmutex_t *hash_lock; + ASSERT(zio->io_error == 0); + arc_cksum_verify(buf); exists = buf_hash_insert(hdr, &hash_lock); @@ -3035,106 +3200,54 @@ arc_write_done(zio_t *zio) * sync-to-convergence, because we remove * buffers from the hash table when we arc_free(). 
*/ - ASSERT(zio->io_flags & ZIO_FLAG_IO_REWRITE); - ASSERT(DVA_EQUAL(BP_IDENTITY(&zio->io_bp_orig), - BP_IDENTITY(zio->io_bp))); - ASSERT3U(zio->io_bp_orig.blk_birth, ==, - zio->io_bp->blk_birth); - - ASSERT(refcount_is_zero(&exists->b_refcnt)); - arc_change_state(arc_anon, exists, hash_lock); - mutex_exit(hash_lock); - arc_hdr_destroy(exists); - exists = buf_hash_insert(hdr, &hash_lock); - ASSERT3P(exists, ==, NULL); + if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { + if (!BP_EQUAL(&zio->io_bp_orig, zio->io_bp)) + panic("bad overwrite, hdr=%p exists=%p", + (void *)hdr, (void *)exists); + ASSERT(refcount_is_zero(&exists->b_refcnt)); + arc_change_state(arc_anon, exists, hash_lock); + mutex_exit(hash_lock); + arc_hdr_destroy(exists); + exists = buf_hash_insert(hdr, &hash_lock); + ASSERT3P(exists, ==, NULL); + } else { + /* Dedup */ + ASSERT(hdr->b_datacnt == 1); + ASSERT(hdr->b_state == arc_anon); + ASSERT(BP_GET_DEDUP(zio->io_bp)); + ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); + } } hdr->b_flags &= ~ARC_IO_IN_PROGRESS; /* if it's not anon, we are doing a scrub */ - if (hdr->b_state == arc_anon) + if (!exists && hdr->b_state == arc_anon) arc_access(hdr, hash_lock); mutex_exit(hash_lock); - } else if (callback->awcb_done == NULL) { - int destroy_hdr; - /* - * This is an anonymous buffer with no user callback, - * destroy it if there are no active references. - */ - mutex_enter(&arc_eviction_mtx); - destroy_hdr = refcount_is_zero(&hdr->b_refcnt); - hdr->b_flags &= ~ARC_IO_IN_PROGRESS; - mutex_exit(&arc_eviction_mtx); - if (destroy_hdr) - arc_hdr_destroy(hdr); } else { hdr->b_flags &= ~ARC_IO_IN_PROGRESS; } - hdr->b_flags &= ~ARC_STORED; - if (callback->awcb_done) { - ASSERT(!refcount_is_zero(&hdr->b_refcnt)); - callback->awcb_done(zio, buf, callback->awcb_private); - } + ASSERT(!refcount_is_zero(&hdr->b_refcnt)); + callback->awcb_done(zio, buf, callback->awcb_private); kmem_free(callback, sizeof (arc_write_callback_t)); } -void -write_policy(spa_t *spa, const writeprops_t *wp, zio_prop_t *zp) -{ - boolean_t ismd = (wp->wp_level > 0 || dmu_ot[wp->wp_type].ot_metadata); - - /* Determine checksum setting */ - if (ismd) { - /* - * Metadata always gets checksummed. If the data - * checksum is multi-bit correctable, and it's not a - * ZBT-style checksum, then it's suitable for metadata - * as well. Otherwise, the metadata checksum defaults - * to fletcher4. - */ - if (zio_checksum_table[wp->wp_oschecksum].ci_correctable && - !zio_checksum_table[wp->wp_oschecksum].ci_zbt) - zp->zp_checksum = wp->wp_oschecksum; - else - zp->zp_checksum = ZIO_CHECKSUM_FLETCHER_4; - } else { - zp->zp_checksum = zio_checksum_select(wp->wp_dnchecksum, - wp->wp_oschecksum); - } - - /* Determine compression setting */ - if (ismd) { - /* - * XXX -- we should design a compression algorithm - * that specializes in arrays of bps. - */ - zp->zp_compress = zfs_mdcomp_disable ? 
ZIO_COMPRESS_EMPTY : - ZIO_COMPRESS_LZJB; - } else { - zp->zp_compress = zio_compress_select(wp->wp_dncompress, - wp->wp_oscompress); - } - - zp->zp_type = wp->wp_type; - zp->zp_level = wp->wp_level; - zp->zp_ndvas = MIN(wp->wp_copies + ismd, spa_max_replication(spa)); -} - zio_t * -arc_write(zio_t *pio, spa_t *spa, const writeprops_t *wp, - boolean_t l2arc, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, - arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority, - int zio_flags, const zbookmark_t *zb) +arc_write(zio_t *pio, spa_t *spa, uint64_t txg, + blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp, + arc_done_func_t *ready, arc_done_func_t *done, void *private, + int priority, int zio_flags, const zbookmark_t *zb) { arc_buf_hdr_t *hdr = buf->b_hdr; arc_write_callback_t *callback; zio_t *zio; - zio_prop_t zp; ASSERT(ready != NULL); + ASSERT(done != NULL); ASSERT(!HDR_IO_ERROR(hdr)); ASSERT((hdr->b_flags & ARC_IO_IN_PROGRESS) == 0); - ASSERT(hdr->b_acb == 0); + ASSERT(hdr->b_acb == NULL); if (l2arc) hdr->b_flags |= ARC_L2CACHE; callback = kmem_zalloc(sizeof (arc_write_callback_t), KM_SLEEP); @@ -3143,36 +3256,25 @@ arc_write(zio_t *pio, spa_t *spa, const writeprops_t *wp, callback->awcb_private = private; callback->awcb_buf = buf; - write_policy(spa, wp, &zp); - zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, &zp, + zio = zio_write(pio, spa, txg, bp, buf->b_data, hdr->b_size, zp, arc_write_ready, arc_write_done, callback, priority, zio_flags, zb); return (zio); } -int -arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - zio_done_func_t *done, void *private, uint32_t arc_flags) +void +arc_free(spa_t *spa, const blkptr_t *bp) { arc_buf_hdr_t *ab; kmutex_t *hash_lock; - zio_t *zio; + uint64_t guid = spa_guid(spa); /* - * If this buffer is in the cache, release it, so it - * can be re-used. + * If this buffer is in the cache, release it, so it can be re-used. */ - ab = buf_hash_find(spa, BP_IDENTITY(bp), bp->blk_birth, &hash_lock); + ab = buf_hash_find(guid, BP_IDENTITY(bp), BP_PHYSICAL_BIRTH(bp), + &hash_lock); if (ab != NULL) { - /* - * The checksum of blocks to free is not always - * preserved (eg. on the deadlist). However, if it is - * nonzero, it should match what we have in the cache. - */ - ASSERT(bp->blk_cksum.zc_word[0] == 0 || - bp->blk_cksum.zc_word[0] == ab->b_cksum0 || - bp->blk_fill == BLK_FILL_ALREADY_FREED); - if (ab->b_state != arc_anon) arc_change_state(arc_anon, ab, hash_lock); if (HDR_IO_IN_PROGRESS(ab)) { @@ -3191,44 +3293,20 @@ arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, ab->b_buf->b_efunc = NULL; ab->b_buf->b_private = NULL; mutex_exit(hash_lock); - } else if (refcount_is_zero(&ab->b_refcnt)) { + } else { + ASSERT(refcount_is_zero(&ab->b_refcnt)); ab->b_flags |= ARC_FREE_IN_PROGRESS; mutex_exit(hash_lock); arc_hdr_destroy(ab); ARCSTAT_BUMP(arcstat_deleted); - } else { - /* - * We still have an active reference on this - * buffer. This can happen, e.g., from - * dbuf_unoverride(). 
- */ - ASSERT(!HDR_IN_HASH_TABLE(ab)); - ab->b_arc_access = 0; - bzero(&ab->b_dva, sizeof (dva_t)); - ab->b_birth = 0; - ab->b_cksum0 = 0; - ab->b_buf->b_efunc = NULL; - ab->b_buf->b_private = NULL; - mutex_exit(hash_lock); } } - - zio = zio_free(pio, spa, txg, bp, done, private, ZIO_FLAG_MUSTSUCCEED); - - if (arc_flags & ARC_WAIT) - return (zio_wait(zio)); - - ASSERT(arc_flags & ARC_NOWAIT); - zio_nowait(zio); - - return (0); } static int -arc_memory_throttle(uint64_t reserve, uint64_t txg) +arc_memory_throttle(uint64_t reserve, uint64_t inflight_data, uint64_t txg) { #ifdef _KERNEL - uint64_t inflight_data = arc_anon->arcs_size; uint64_t available_memory = ptob(freemem); static uint64_t page_load = 0; static uint64_t last_txg = 0; @@ -3290,6 +3368,7 @@ int arc_tempreserve_space(uint64_t reserve, uint64_t txg) { int error; + uint64_t anon_size; #ifdef ZFS_DEBUG /* @@ -3305,12 +3384,19 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg) if (reserve > arc_c) return (ENOMEM); + /* + * Don't count loaned bufs as in flight dirty data to prevent long + * network delays from blocking transactions that are ready to be + * assigned to a txg. + */ + anon_size = MAX((int64_t)(arc_anon->arcs_size - arc_loaned_bytes), 0); + /* * Writes will, almost always, require additional memory allocations * in order to compress/encrypt/etc the data. We therefor need to * make sure that there is sufficient available memory for this. */ - if (error = arc_memory_throttle(reserve, txg)) + if (error = arc_memory_throttle(reserve, anon_size, txg)) return (error); /* @@ -3320,8 +3406,9 @@ arc_tempreserve_space(uint64_t reserve, uint64_t txg) * Note: if two requests come in concurrently, we might let them * both succeed, when one of them should fail. Not a huge deal. */ - if (reserve + arc_tempreserve + arc_anon->arcs_size > arc_c / 2 && - arc_anon->arcs_size > arc_c / 4) { + + if (reserve + arc_tempreserve + anon_size > arc_c / 2 && + anon_size > arc_c / 4) { dprintf("failing, arc_tempreserve=%lluK anon_meta=%lluK " "anon_data=%lluK tempreserve=%lluK arc_c=%lluK\n", arc_tempreserve>>10, @@ -3386,6 +3473,15 @@ arc_init(void) if (arc_c_min < arc_meta_limit / 2 && zfs_arc_min == 0) arc_c_min = arc_meta_limit / 2; + if (zfs_arc_grow_retry > 0) + arc_grow_retry = zfs_arc_grow_retry; + + if (zfs_arc_shrink_shift > 0) + arc_shrink_shift = zfs_arc_shrink_shift; + + if (zfs_arc_p_min_shift > 0) + arc_p_min_shift = zfs_arc_p_min_shift; + /* if kmem_flags are set, lets try to use less memory */ if (kmem_debugging()) arc_c = arc_c / 2; @@ -3492,10 +3588,13 @@ arc_fini(void) mutex_destroy(&arc_mru_ghost->arcs_mtx); mutex_destroy(&arc_mfu->arcs_mtx); mutex_destroy(&arc_mfu_ghost->arcs_mtx); + mutex_destroy(&arc_l2c_only->arcs_mtx); mutex_destroy(&zfs_write_limit_lock); buf_fini(); + + ASSERT(arc_loaned_bytes == 0); } /* @@ -3623,8 +3722,70 @@ arc_fini(void) * * Tunables may be removed or added as future performance improvements are * integrated, and also may become zpool properties. + * + * There are three key functions that control how the L2ARC warms up: + * + * l2arc_write_eligible() check if a buffer is eligible to cache + * l2arc_write_size() calculate how much to write + * l2arc_write_interval() calculate sleep delay between writes + * + * These three functions determine what to write, how much, and how quickly + * to send writes. */ +static boolean_t +l2arc_write_eligible(uint64_t spa_guid, arc_buf_hdr_t *ab) +{ + /* + * A buffer is *not* eligible for the L2ARC if it: + * 1. belongs to a different spa. + * 2. 
is already cached on the L2ARC. + * 3. has an I/O in progress (it may be an incomplete read). + * 4. is flagged not eligible (zfs property). + */ + if (ab->b_spa != spa_guid || ab->b_l2hdr != NULL || + HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab)) + return (B_FALSE); + + return (B_TRUE); +} + +static uint64_t +l2arc_write_size(l2arc_dev_t *dev) +{ + uint64_t size; + + size = dev->l2ad_write; + + if (arc_warm == B_FALSE) + size += dev->l2ad_boost; + + return (size); + +} + +static clock_t +l2arc_write_interval(clock_t began, uint64_t wanted, uint64_t wrote) +{ + clock_t interval, next, now; + + /* + * If the ARC lists are busy, increase our write rate; if the + * lists are stale, idle back. This is achieved by checking + * how much we previously wrote - if it was more than half of + * what we wanted, schedule the next write much sooner. + */ + if (l2arc_feed_again && wrote > (wanted / 2)) + interval = (hz * l2arc_feed_min_ms) / 1000; + else + interval = hz * l2arc_feed_secs; + + now = ddi_get_lbolt(); + next = MAX(now, MIN(now + interval, began + interval)); + + return (next); +} + static void l2arc_hdr_stat_add(void) { @@ -3857,11 +4018,15 @@ l2arc_read_done(zio_t *zio) * storage now. If there *is* a waiter, the caller must * issue the i/o in a context where it's OK to block. */ - if (zio->io_waiter == NULL) - zio_nowait(zio_read(zio->io_parent, - cb->l2rcb_spa, &cb->l2rcb_bp, + if (zio->io_waiter == NULL) { + zio_t *pio = zio_unique_parent(zio); + + ASSERT(!pio || pio->io_child_type == ZIO_CHILD_LOGICAL); + + zio_nowait(zio_read(pio, cb->l2rcb_spa, &cb->l2rcb_bp, buf->b_data, zio->io_size, arc_read_done, buf, zio->io_priority, cb->l2rcb_flags, &cb->l2rcb_zb)); + } } kmem_free(cb, sizeof (l2arc_read_callback_t)); @@ -4035,7 +4200,7 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) } mutex_exit(&l2arc_buflist_mtx); - spa_l2cache_space_update(dev->l2ad_vdev, 0, -(taddr - dev->l2ad_evict)); + vdev_space_update(dev->l2ad_vdev, -(taddr - dev->l2ad_evict), 0, 0); dev->l2ad_evict = taddr; } @@ -4045,7 +4210,7 @@ l2arc_evict(l2arc_dev_t *dev, uint64_t distance, boolean_t all) * An ARC_L2_WRITING flag is set so that the L2ARC buffers are not valid * for reading until they have completed writing. */ -static void +static uint64_t l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) { arc_buf_hdr_t *ab, *ab_prev, *head; @@ -4057,6 +4222,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) boolean_t have_lock, full; l2arc_write_callback_t *cb; zio_t *pio, *wzio; + uint64_t guid = spa_guid(spa); ASSERT(dev->l2ad_vdev != NULL); @@ -4110,20 +4276,7 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) break; } - if (ab->b_spa != spa) { - mutex_exit(hash_lock); - continue; - } - - if (ab->b_l2hdr != NULL) { - /* - * Already in L2ARC. 
- */ - mutex_exit(hash_lock); - continue; - } - - if (HDR_IO_IN_PROGRESS(ab) || !HDR_L2CACHE(ab)) { + if (!l2arc_write_eligible(guid, ab)) { mutex_exit(hash_lock); continue; } @@ -4134,12 +4287,6 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) break; } - if (ab->b_buf == NULL) { - DTRACE_PROBE1(l2arc__buf__null, void *, ab); - mutex_exit(hash_lock); - continue; - } - if (pio == NULL) { /* * Insert a dummy header on the buflist so @@ -4206,27 +4353,32 @@ l2arc_write_buffers(spa_t *spa, l2arc_dev_t *dev, uint64_t target_sz) if (pio == NULL) { ASSERT3U(write_sz, ==, 0); kmem_cache_free(hdr_cache, head); - return; + return (0); } ASSERT3U(write_sz, <=, target_sz); ARCSTAT_BUMP(arcstat_l2_writes_sent); + ARCSTAT_INCR(arcstat_l2_write_bytes, write_sz); ARCSTAT_INCR(arcstat_l2_size, write_sz); - spa_l2cache_space_update(dev->l2ad_vdev, 0, write_sz); + vdev_space_update(dev->l2ad_vdev, write_sz, 0, 0); /* * Bump device hand to the device start if it is approaching the end. * l2arc_evict() will already have evicted ahead for this case. */ if (dev->l2ad_hand >= (dev->l2ad_end - target_sz)) { - spa_l2cache_space_update(dev->l2ad_vdev, 0, - dev->l2ad_end - dev->l2ad_hand); + vdev_space_update(dev->l2ad_vdev, + dev->l2ad_end - dev->l2ad_hand, 0, 0); dev->l2ad_hand = dev->l2ad_start; dev->l2ad_evict = dev->l2ad_start; dev->l2ad_first = B_FALSE; } + dev->l2ad_writing = B_TRUE; (void) zio_wait(pio); + dev->l2ad_writing = B_FALSE; + + return (write_sz); } /* @@ -4239,20 +4391,19 @@ l2arc_feed_thread(void) callb_cpr_t cpr; l2arc_dev_t *dev; spa_t *spa; - uint64_t size; + uint64_t size, wrote; + clock_t begin, next = ddi_get_lbolt(); CALLB_CPR_INIT(&cpr, &l2arc_feed_thr_lock, callb_generic_cpr, FTAG); mutex_enter(&l2arc_feed_thr_lock); while (l2arc_thread_exit == 0) { - /* - * Pause for l2arc_feed_secs seconds between writes. - */ CALLB_CPR_SAFE_BEGIN(&cpr); (void) cv_timedwait(&l2arc_feed_thr_cv, &l2arc_feed_thr_lock, - lbolt + (hz * l2arc_feed_secs)); + next); CALLB_CPR_SAFE_END(&cpr, &l2arc_feed_thr_lock); + next = ddi_get_lbolt() + hz; /* * Quick check for L2ARC devices. @@ -4263,6 +4414,7 @@ l2arc_feed_thread(void) continue; } mutex_exit(&l2arc_dev_mtx); + begin = ddi_get_lbolt(); /* * This selects the next l2arc device to write to, and in @@ -4291,9 +4443,7 @@ l2arc_feed_thread(void) ARCSTAT_BUMP(arcstat_l2_feeds); - size = dev->l2ad_write; - if (arc_warm == B_FALSE) - size += dev->l2ad_boost; + size = l2arc_write_size(dev); /* * Evict L2ARC buffers that will be overwritten. @@ -4303,7 +4453,12 @@ l2arc_feed_thread(void) /* * Write ARC buffers. */ - l2arc_write_buffers(spa, dev, size); + wrote = l2arc_write_buffers(spa, dev, size); + + /* + * Calculate interval between writes. + */ + next = l2arc_write_interval(begin, size, wrote); spa_config_exit(spa, SCL_L2ARC, dev); } @@ -4334,7 +4489,7 @@ l2arc_vdev_present(vdev_t *vd) * validated the vdev and opened it. 
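/*
 * Worked example of l2arc_write_interval() as used by the feed thread
 * above, with hypothetical tunables hz = 100, l2arc_feed_secs = 1 and
 * l2arc_feed_min_ms = 200.  If the last pass wrote more than half of
 * its target (say wanted = 8 MB, wrote = 6 MB), the next feed is
 * scheduled 20 ticks (about 200 ms) after `began`; otherwise the
 * thread idles back to the full 100-tick (1 s) interval.  The
 * MAX(now, ...) clamp keeps the wakeup from landing in the past when
 * the write itself took longer than the chosen interval.
 */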
*/ void -l2arc_add_vdev(spa_t *spa, vdev_t *vd, uint64_t start, uint64_t end) +l2arc_add_vdev(spa_t *spa, vdev_t *vd) { l2arc_dev_t *adddev; @@ -4348,11 +4503,12 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd, uint64_t start, uint64_t end) adddev->l2ad_vdev = vd; adddev->l2ad_write = l2arc_write_max; adddev->l2ad_boost = l2arc_write_boost; - adddev->l2ad_start = start; - adddev->l2ad_end = end; + adddev->l2ad_start = VDEV_LABEL_START_SIZE; + adddev->l2ad_end = VDEV_LABEL_START_SIZE + vdev_get_min_asize(vd); adddev->l2ad_hand = adddev->l2ad_start; adddev->l2ad_evict = adddev->l2ad_start; adddev->l2ad_first = B_TRUE; + adddev->l2ad_writing = B_FALSE; ASSERT3U(adddev->l2ad_write, >, 0); /* @@ -4363,7 +4519,7 @@ l2arc_add_vdev(spa_t *spa, vdev_t *vd, uint64_t start, uint64_t end) list_create(adddev->l2ad_buflist, sizeof (arc_buf_hdr_t), offsetof(arc_buf_hdr_t, b_l2node)); - spa_l2cache_space_update(vd, adddev->l2ad_end - adddev->l2ad_hand, 0); + vdev_space_update(vd, 0, 0, adddev->l2ad_end - adddev->l2ad_hand); /* * Add device to global list @@ -4458,7 +4614,7 @@ l2arc_fini(void) void l2arc_start(void) { - if (!(spa_mode & FWRITE)) + if (!(spa_mode_global & FWRITE)) return; (void) thread_create(NULL, 0, l2arc_feed_thread, NULL, 0, &p0, @@ -4468,7 +4624,7 @@ l2arc_start(void) void l2arc_stop(void) { - if (!(spa_mode & FWRITE)) + if (!(spa_mode_global & FWRITE)) return; mutex_enter(&l2arc_feed_thr_lock); diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/bplist.c b/external/cddl/osnet/dist/uts/common/fs/zfs/bplist.c index 93b7741d77be2..e03dd2e6f98c2 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/bplist.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/bplist.c @@ -19,13 +19,27 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #include #include +void +bplist_init(bplist_t *bpl) +{ + bzero(bpl, sizeof (*bpl)); + mutex_init(&bpl->bpl_lock, NULL, MUTEX_DEFAULT, NULL); +} + +void +bplist_fini(bplist_t *bpl) +{ + ASSERT(bpl->bpl_queue == NULL); + mutex_destroy(&bpl->bpl_lock); +} + static int bplist_hold(bplist_t *bpl) { @@ -208,12 +222,13 @@ bplist_enqueue(bplist_t *bpl, const blkptr_t *bp, dmu_tx_t *tx) bparray[off].blk_fill = 0; /* The bplist will compress better if we can leave off the checksum */ - bzero(&bparray[off].blk_cksum, sizeof (bparray[off].blk_cksum)); + if (!BP_GET_DEDUP(&bparray[off])) + bzero(&bparray[off].blk_cksum, sizeof (bparray[off].blk_cksum)); dmu_buf_will_dirty(bpl->bpl_dbuf, tx); bpl->bpl_phys->bpl_entries++; bpl->bpl_phys->bpl_bytes += - bp_get_dasize(dmu_objset_spa(bpl->bpl_mos), bp); + bp_get_dsize_sync(dmu_objset_spa(bpl->bpl_mos), bp); if (bpl->bpl_havecomp) { bpl->bpl_phys->bpl_comp += BP_GET_PSIZE(bp); bpl->bpl_phys->bpl_uncomp += BP_GET_UCSIZE(bp); @@ -223,8 +238,14 @@ bplist_enqueue(bplist_t *bpl, const blkptr_t *bp, dmu_tx_t *tx) return (0); } +void +bplist_enqueue_cb(void *bpl, const blkptr_t *bp, dmu_tx_t *tx) +{ + VERIFY(bplist_enqueue(bpl, bp, tx) == 0); +} + /* - * Deferred entry; will be written later by bplist_sync(). + * Deferred entry; will be processed later by bplist_sync(). 
*/ void bplist_enqueue_deferred(bplist_t *bpl, const blkptr_t *bp) @@ -240,7 +261,7 @@ bplist_enqueue_deferred(bplist_t *bpl, const blkptr_t *bp) } void -bplist_sync(bplist_t *bpl, dmu_tx_t *tx) +bplist_sync(bplist_t *bpl, bplist_sync_cb_t *func, void *arg, dmu_tx_t *tx) { bplist_q_t *bpq; @@ -248,7 +269,7 @@ bplist_sync(bplist_t *bpl, dmu_tx_t *tx) while ((bpq = bpl->bpl_queue) != NULL) { bpl->bpl_queue = bpq->bpq_next; mutex_exit(&bpl->bpl_lock); - VERIFY(0 == bplist_enqueue(bpl, &bpq->bpq_blk, tx)); + func(arg, &bpq->bpq_blk, tx); kmem_free(bpq, sizeof (*bpq)); mutex_enter(&bpl->bpl_lock); } @@ -311,12 +332,12 @@ bplist_space(bplist_t *bpl, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp) } /* - * Return (in *dasizep) the amount of space on the deadlist which is: + * Return (in *dsizep) the amount of space on the deadlist which is: * mintxg < blk_birth <= maxtxg */ int bplist_space_birthrange(bplist_t *bpl, uint64_t mintxg, uint64_t maxtxg, - uint64_t *dasizep) + uint64_t *dsizep) { uint64_t size = 0; uint64_t itor = 0; @@ -331,19 +352,18 @@ bplist_space_birthrange(bplist_t *bpl, uint64_t mintxg, uint64_t maxtxg, mutex_enter(&bpl->bpl_lock); err = bplist_hold(bpl); if (err == 0) - *dasizep = bpl->bpl_phys->bpl_bytes; + *dsizep = bpl->bpl_phys->bpl_bytes; mutex_exit(&bpl->bpl_lock); return (err); } while ((err = bplist_iterate(bpl, &itor, &bp)) == 0) { if (bp.blk_birth > mintxg && bp.blk_birth <= maxtxg) { - size += - bp_get_dasize(dmu_objset_spa(bpl->bpl_mos), &bp); + size += bp_get_dsize(dmu_objset_spa(bpl->bpl_mos), &bp); } } if (err == ENOENT) err = 0; - *dasizep = size; + *dsizep = size; return (err); } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/dbuf.c b/external/cddl/osnet/dist/uts/common/fs/zfs/dbuf.c index d04610317a4ea..1608f7d3c1cf1 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/dbuf.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/dbuf.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -38,10 +38,6 @@ static void dbuf_destroy(dmu_buf_impl_t *db); static int dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx); static void dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx); -static arc_done_func_t dbuf_write_ready; -static arc_done_func_t dbuf_write_done; -static zio_done_func_t dbuf_skip_write_ready; -static zio_done_func_t dbuf_skip_write_done; /* * Global data structures and functions for the dbuf cache. 
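/*
 * Sketch of the new callback form of bplist_sync() introduced above.
 * Draining the deferred queue through bplist_enqueue_cb() reproduces
 * the old behaviour of writing every deferred entry into the on-disk
 * list; other callers can supply their own bplist_sync_cb_t instead.
 * The helper name is hypothetical and syncing context is assumed.
 */
static void
bplist_drain_deferred(bplist_t *bpl, dmu_tx_t *tx)
{
	bplist_sync(bpl, bplist_enqueue_cb, bpl, tx);
}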
@@ -109,7 +105,7 @@ dmu_buf_impl_t * dbuf_find(dnode_t *dn, uint8_t level, uint64_t blkid) { dbuf_hash_table_t *h = &dbuf_hash_table; - objset_impl_t *os = dn->dn_objset; + objset_t *os = dn->dn_objset; uint64_t obj = dn->dn_object; uint64_t hv = DBUF_HASH(os, obj, level, blkid); uint64_t idx = hv & h->hash_table_mask; @@ -140,7 +136,7 @@ static dmu_buf_impl_t * dbuf_hash_insert(dmu_buf_impl_t *db) { dbuf_hash_table_t *h = &dbuf_hash_table; - objset_impl_t *os = db->db_objset; + objset_t *os = db->db_objset; uint64_t obj = db->db.db_object; int level = db->db_level; uint64_t blkid = db->db_blkid; @@ -285,6 +281,7 @@ static void dbuf_verify(dmu_buf_impl_t *db) { dnode_t *dn = db->db_dnode; + dbuf_dirty_record_t *dr; ASSERT(MUTEX_HELD(&db->db_mtx)); @@ -310,13 +307,19 @@ dbuf_verify(dmu_buf_impl_t *db) ASSERT3U(db->db.db_offset, ==, db->db_blkid * db->db.db_size); } + for (dr = db->db_data_pending; dr != NULL; dr = dr->dr_next) + ASSERT(dr->dr_dbuf == db); + + for (dr = db->db_last_dirty; dr != NULL; dr = dr->dr_next) + ASSERT(dr->dr_dbuf == db); + /* * We can't assert that db_size matches dn_datablksz because it * can be momentarily different when another thread is doing * dnode_set_blksz(). */ if (db->db_level == 0 && db->db.db_object == DMU_META_DNODE_OBJECT) { - dbuf_dirty_record_t *dr = db->db_data_pending; + dr = db->db_data_pending; /* * It should only be modified in syncing context, so * make sure we only have one copy of the data. @@ -329,7 +332,7 @@ dbuf_verify(dmu_buf_impl_t *db) if (db->db_parent == dn->dn_dbuf) { /* db is pointed to by the dnode */ /* ASSERT3U(db->db_blkid, <, dn->dn_nblkptr); */ - if (db->db.db_object == DMU_META_DNODE_OBJECT) + if (DMU_OBJECT_IS_SPECIAL(db->db.db_object)) ASSERT(db->db_parent == NULL); else ASSERT(db->db_parent != NULL); @@ -403,6 +406,29 @@ dbuf_set_data(dmu_buf_impl_t *db, arc_buf_t *buf) } } +/* + * Loan out an arc_buf for read. Return the loaned arc_buf. + */ +arc_buf_t * +dbuf_loan_arcbuf(dmu_buf_impl_t *db) +{ + arc_buf_t *abuf; + + mutex_enter(&db->db_mtx); + if (arc_released(db->db_buf) || refcount_count(&db->db_holds) > 1) { + int blksz = db->db.db_size; + mutex_exit(&db->db_mtx); + abuf = arc_loan_buf(db->db_dnode->dn_objset->os_spa, blksz); + bcopy(db->db.db_data, abuf->b_data, blksz); + } else { + abuf = db->db_buf; + arc_loan_inuse_buf(abuf, db); + dbuf_set_data(db, NULL); + mutex_exit(&db->db_mtx); + } + return (abuf); +} + uint64_t dbuf_whichblock(dnode_t *dn, uint64_t offset) { @@ -465,15 +491,15 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) ASSERT(db->db_buf == NULL); if (db->db_blkid == DB_BONUS_BLKID) { - int bonuslen = dn->dn_bonuslen; + int bonuslen = MIN(dn->dn_bonuslen, dn->dn_phys->dn_bonuslen); ASSERT3U(bonuslen, <=, db->db.db_size); db->db.db_data = zio_buf_alloc(DN_MAX_BONUSLEN); - arc_space_consume(DN_MAX_BONUSLEN); + arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); if (bonuslen < DN_MAX_BONUSLEN) bzero(db->db.db_data, DN_MAX_BONUSLEN); - bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, - bonuslen); + if (bonuslen) + bcopy(DN_BONUS(dn->dn_phys), db->db.db_data, bonuslen); dbuf_update_data(db); db->db_state = DB_CACHED; mutex_exit(&db->db_mtx); @@ -505,11 +531,9 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t *flags) if (DBUF_IS_L2CACHEABLE(db)) aflags |= ARC_L2CACHE; - zb.zb_objset = db->db_objset->os_dsl_dataset ? 
- db->db_objset->os_dsl_dataset->ds_object : 0; - zb.zb_object = db->db.db_object; - zb.zb_level = db->db_level; - zb.zb_blkid = db->db_blkid; + SET_BOOKMARK(&zb, db->db_objset->os_dsl_dataset ? + db->db_objset->os_dsl_dataset->ds_object : DMU_META_OBJSET, + db->db.db_object, db->db_level, db->db_blkid); dbuf_add_ref(db, NULL); /* ZIO_FLAG_CANFAIL callers have to check the parent zio's error */ @@ -665,7 +689,7 @@ dbuf_fix_old_data(dmu_buf_impl_t *db, uint64_t txg) if (db->db_blkid == DB_BONUS_BLKID) { /* Note that the data bufs here are zio_bufs */ dr->dt.dl.dr_data = zio_buf_alloc(DN_MAX_BONUSLEN); - arc_space_consume(DN_MAX_BONUSLEN); + arc_space_consume(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); bcopy(db->db.db_data, dr->dt.dl.dr_data, DN_MAX_BONUSLEN); } else if (refcount_count(&db->db_holds) > db->db_dirtycnt) { int size = db->db.db_size; @@ -682,6 +706,7 @@ void dbuf_unoverride(dbuf_dirty_record_t *dr) { dmu_buf_impl_t *db = dr->dr_dbuf; + blkptr_t *bp = &dr->dt.dl.dr_overridden_by; uint64_t txg = dr->dr_txg; ASSERT(MUTEX_HELD(&db->db_mtx)); @@ -692,13 +717,12 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN) return; + ASSERT(db->db_data_pending != dr); + /* free this block */ - if (!BP_IS_HOLE(&dr->dt.dl.dr_overridden_by)) { - /* XXX can get silent EIO here */ - (void) dsl_free(NULL, - spa_get_dsl(db->db_dnode->dn_objset->os_spa), - txg, &dr->dt.dl.dr_overridden_by, NULL, NULL, ARC_WAIT); - } + if (!BP_IS_HOLE(bp)) + dsl_free(spa_get_dsl(db->db_dnode->dn_objset->os_spa), txg, bp); + dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; /* * Release the already-written buffer, so we leave it in @@ -894,7 +918,7 @@ dbuf_dirty_record_t * dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) { dnode_t *dn = db->db_dnode; - objset_impl_t *os = dn->dn_objset; + objset_t *os = dn->dn_objset; dbuf_dirty_record_t **drp, *dr; int drop_struct_lock = FALSE; boolean_t do_free_accounting = B_FALSE; @@ -908,15 +932,11 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) * Shouldn't dirty a regular buffer in syncing context. Private * objects may be dirtied in syncing context, but only if they * were already pre-dirtied in open context. - * XXX We may want to prohibit dirtying in syncing context even - * if they did pre-dirty. */ ASSERT(!dmu_tx_is_syncing(tx) || BP_IS_HOLE(dn->dn_objset->os_rootbp) || - dn->dn_object == DMU_META_DNODE_OBJECT || - dn->dn_objset->os_dsl_dataset == NULL || - dsl_dir_is_private(dn->dn_objset->os_dsl_dataset->ds_dir)); - + DMU_OBJECT_IS_SPECIAL(dn->dn_object) || + dn->dn_objset->os_dsl_dataset == NULL); /* * We make this assert for private objects as well, but after we * check if we're already dirty. They are allowed to re-dirty @@ -965,7 +985,8 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) * we now need to reset its state. */ dbuf_unoverride(dr); - if (db->db.db_object != DMU_META_DNODE_OBJECT) + if (db->db.db_object != DMU_META_DNODE_OBJECT && + db->db_state != DB_NOFILL) arc_buf_thaw(db->db_buf); } mutex_exit(&db->db_mtx); @@ -975,7 +996,8 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) /* * Only valid if not already dirty. */ - ASSERT(dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == + ASSERT(dn->dn_object == 0 || + dn->dn_dirtyctx == DN_UNDIRTIED || dn->dn_dirtyctx == (dmu_tx_is_syncing(tx) ? 
DN_DIRTY_SYNC : DN_DIRTY_OPEN)); ASSERT3U(dn->dn_nlevels, >, db->db_level); @@ -987,15 +1009,13 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) /* * We should only be dirtying in syncing context if it's the - * mos, a spa os, or we're initializing the os. However, we are - * allowed to dirty in syncing context provided we already - * dirtied it in open context. Hence we must make this - * assertion only if we're not already dirty. + * mos or we're initializing the os or it's a special object. + * However, we are allowed to dirty in syncing context provided + * we already dirtied it in open context. Hence we must make + * this assertion only if we're not already dirty. */ - ASSERT(!dmu_tx_is_syncing(tx) || - os->os_dsl_dataset == NULL || - !dsl_dir_is_private(os->os_dsl_dataset->ds_dir) || - !BP_IS_HOLE(os->os_rootbp)); + ASSERT(!dmu_tx_is_syncing(tx) || DMU_OBJECT_IS_SPECIAL(dn->dn_object) || + os->os_dsl_dataset == NULL || BP_IS_HOLE(os->os_rootbp)); ASSERT(db->db.db_size != 0); dprintf_dbuf(db, "size=%llx\n", (u_longlong_t)db->db.db_size); @@ -1005,7 +1025,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) * Update the accounting. * Note: we delay "free accounting" until after we drop * the db_mtx. This keeps us from grabbing other locks - * (and possibly deadlocking) in bp_get_dasize() while + * (and possibly deadlocking) in bp_get_dsize() while * also holding the db_mtx. */ dnode_willuse_space(dn, db->db.db_size, tx); @@ -1084,7 +1104,7 @@ dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx) } else if (do_free_accounting) { blkptr_t *bp = db->db_blkptr; int64_t willfree = (bp && !BP_IS_HOLE(bp)) ? - bp_get_dasize(os->os_spa, bp) : db->db.db_size; + bp_get_dsize(os->os_spa, bp) : db->db.db_size; /* * This is only a guess -- if the dbuf is dirty * in a previous txg, we don't know how much @@ -1165,7 +1185,6 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) ASSERT(db->db_blkid != DB_BONUS_BLKID); mutex_enter(&db->db_mtx); - /* * If this buffer is not dirty, we're done. */ @@ -1177,6 +1196,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) return (0); } ASSERT(dr->dr_txg == txg); + ASSERT(dr->dr_dbuf == db); /* * If this buffer is currently held, we cannot undirty @@ -1236,7 +1256,7 @@ dbuf_undirty(dmu_buf_impl_t *db, dmu_tx_t *tx) if (refcount_remove(&db->db_holds, (void *)(uintptr_t)txg) == 0) { arc_buf_t *buf = db->db_buf; - ASSERT(arc_released(buf)); + ASSERT(db->db_state == DB_NOFILL || arc_released(buf)); dbuf_set_data(db, NULL); VERIFY(arc_buf_remove_ref(buf, db) == 1); dbuf_evict(db); @@ -1311,6 +1331,70 @@ dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx) mutex_exit(&db->db_mtx); } +/* + * Directly assign a provided arc buf to a given dbuf if it's not referenced + * by anybody except our caller. Otherwise copy arcbuf's contents to dbuf. 
+ */ +void +dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) +{ + ASSERT(!refcount_is_zero(&db->db_holds)); + ASSERT(db->db_dnode->dn_object != DMU_META_DNODE_OBJECT); + ASSERT(db->db_blkid != DB_BONUS_BLKID); + ASSERT(db->db_level == 0); + ASSERT(DBUF_GET_BUFC_TYPE(db) == ARC_BUFC_DATA); + ASSERT(buf != NULL); + ASSERT(arc_buf_size(buf) == db->db.db_size); + ASSERT(tx->tx_txg != 0); + + arc_return_buf(buf, db); + ASSERT(arc_released(buf)); + + mutex_enter(&db->db_mtx); + + while (db->db_state == DB_READ || db->db_state == DB_FILL) + cv_wait(&db->db_changed, &db->db_mtx); + + ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED); + + if (db->db_state == DB_CACHED && + refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) { + mutex_exit(&db->db_mtx); + (void) dbuf_dirty(db, tx); + bcopy(buf->b_data, db->db.db_data, db->db.db_size); + VERIFY(arc_buf_remove_ref(buf, db) == 1); + xuio_stat_wbuf_copied(); + return; + } + + xuio_stat_wbuf_nocopy(); + if (db->db_state == DB_CACHED) { + dbuf_dirty_record_t *dr = db->db_last_dirty; + + ASSERT(db->db_buf != NULL); + if (dr != NULL && dr->dr_txg == tx->tx_txg) { + ASSERT(dr->dt.dl.dr_data == db->db_buf); + if (!arc_released(db->db_buf)) { + ASSERT(dr->dt.dl.dr_override_state == + DR_OVERRIDDEN); + arc_release(db->db_buf, db); + } + dr->dt.dl.dr_data = buf; + VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1); + } else if (dr == NULL || dr->dt.dl.dr_data != db->db_buf) { + arc_release(db->db_buf, db); + VERIFY(arc_buf_remove_ref(db->db_buf, db) == 1); + } + db->db_buf = NULL; + } + ASSERT(db->db_buf == NULL); + dbuf_set_data(db, buf); + db->db_state = DB_FILL; + mutex_exit(&db->db_mtx); + (void) dbuf_dirty(db, tx); + dbuf_fill_done(db, tx); +} + /* * "Clear" the contents of this dbuf. This will mark the dbuf * EVICTING and clear *most* of its references. 
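/*
 * Sketch of a write that hands a loaned buffer straight to a dbuf via
 * dbuf_assign_arcbuf() above.  When the dbuf has no holds beyond its
 * dirty records, the loaned buffer is attached directly and no copy is
 * made; if extra holds exist, dbuf_assign_arcbuf() falls back to a
 * bcopy() (the xuio_stat_wbuf_copied() case).  The helper and its
 * direct use of dmu_buf_impl_t are illustrative only; real callers
 * reach this path through the DMU.
 */
static void
dbuf_write_loaned_example(spa_t *spa, dmu_buf_impl_t *db, const void *src,
    dmu_tx_t *tx)
{
	int size = db->db.db_size;
	arc_buf_t *abuf = arc_loan_buf(spa, size);

	bcopy(src, abuf->b_data, size);
	dbuf_assign_arcbuf(db, abuf, tx);	/* returns the loan internally */
}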
Unfortunetely, @@ -1341,7 +1425,7 @@ dbuf_clear(dmu_buf_impl_t *db) ASSERT(db->db.db_data != NULL); if (db->db_blkid == DB_BONUS_BLKID) { zio_buf_free(db->db.db_data, DN_MAX_BONUSLEN); - arc_space_return(DN_MAX_BONUSLEN); + arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); } db->db.db_data = NULL; db->db_state = DB_UNCACHED; @@ -1431,7 +1515,7 @@ static dmu_buf_impl_t * dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, dmu_buf_impl_t *parent, blkptr_t *blkptr) { - objset_impl_t *os = dn->dn_objset; + objset_t *os = dn->dn_objset; dmu_buf_impl_t *db, *odb; ASSERT(RW_LOCK_HELD(&dn->dn_struct_rwlock)); @@ -1463,7 +1547,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, db->db.db_offset = DB_BONUS_BLKID; db->db_state = DB_UNCACHED; /* the bonus dbuf is not placed in the hash table */ - arc_space_consume(sizeof (dmu_buf_impl_t)); + arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); return (db); } else { int blocksize = @@ -1490,7 +1574,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid, list_insert_head(&dn->dn_dbufs, db); db->db_state = DB_UNCACHED; mutex_exit(&dn->dn_dbufs_mtx); - arc_space_consume(sizeof (dmu_buf_impl_t)); + arc_space_consume(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); if (parent && parent != dn->dn_dbuf) dbuf_add_ref(parent, db); @@ -1559,7 +1643,7 @@ dbuf_destroy(dmu_buf_impl_t *db) ASSERT(db->db_data_pending == NULL); kmem_cache_free(dbuf_cache, db); - arc_space_return(sizeof (dmu_buf_impl_t)); + arc_space_return(sizeof (dmu_buf_impl_t), ARC_SPACE_OTHER); } void @@ -1592,13 +1676,12 @@ dbuf_prefetch(dnode_t *dn, uint64_t blkid) if (dbuf_findbp(dn, 0, blkid, TRUE, &db, &bp) == 0) { if (bp && !BP_IS_HOLE(bp)) { arc_buf_t *pbuf; + dsl_dataset_t *ds = dn->dn_objset->os_dsl_dataset; uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; zbookmark_t zb; - zb.zb_objset = dn->dn_objset->os_dsl_dataset ? - dn->dn_objset->os_dsl_dataset->ds_object : 0; - zb.zb_object = dn->dn_object; - zb.zb_level = 0; - zb.zb_blkid = blkid; + + SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, + dn->dn_object, 0, blkid); if (db) pbuf = db->db_buf; @@ -1743,10 +1826,21 @@ dbuf_add_ref(dmu_buf_impl_t *db, void *tag) #pragma weak dmu_buf_rele = dbuf_rele void dbuf_rele(dmu_buf_impl_t *db, void *tag) +{ + mutex_enter(&db->db_mtx); + dbuf_rele_and_unlock(db, tag); +} + +/* + * dbuf_rele() for an already-locked dbuf. This is necessary to allow + * db_dirtycnt and db_holds to be updated atomically. 
+ */ +void +dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag) { int64_t holds; - mutex_enter(&db->db_mtx); + ASSERT(MUTEX_HELD(&db->db_mtx)); DBUF_VERIFY(db); holds = refcount_remove(&db->db_holds, tag); @@ -1855,6 +1949,19 @@ dmu_buf_get_user(dmu_buf_t *db_fake) return (db->db_user_ptr); } +boolean_t +dmu_buf_freeable(dmu_buf_t *dbuf) +{ + boolean_t res = B_FALSE; + dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; + + if (db->db_blkptr) + res = dsl_dataset_block_freeable(db->db_objset->os_dsl_dataset, + db->db_blkptr->blk_birth); + + return (res); +} + static void dbuf_check_blkptr(dnode_t *dn, dmu_buf_impl_t *db) { @@ -1941,9 +2048,8 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) arc_buf_t **datap = &dr->dt.dl.dr_data; dmu_buf_impl_t *db = dr->dr_dbuf; dnode_t *dn = db->db_dnode; - objset_impl_t *os = dn->dn_objset; + objset_t *os = dn->dn_objset; uint64_t txg = tx->tx_txg; - int blksz; ASSERT(dmu_tx_is_syncing(tx)); @@ -1980,19 +2086,19 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) bcopy(*datap, DN_BONUS(dn->dn_phys), dn->dn_phys->dn_bonuslen); if (*datap != db->db.db_data) { zio_buf_free(*datap, DN_MAX_BONUSLEN); - arc_space_return(DN_MAX_BONUSLEN); + arc_space_return(DN_MAX_BONUSLEN, ARC_SPACE_OTHER); } db->db_data_pending = NULL; drp = &db->db_last_dirty; while (*drp != dr) drp = &(*drp)->dr_next; ASSERT(dr->dr_next == NULL); + ASSERT(dr->dr_dbuf == db); *drp = dr->dr_next; kmem_free(dr, sizeof (dbuf_dirty_record_t)); ASSERT(db->db_dirtycnt > 0); db->db_dirtycnt -= 1; - mutex_exit(&db->db_mtx); - dbuf_rele(db, (void *)(uintptr_t)txg); + dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); return; } @@ -2014,67 +2120,26 @@ dbuf_sync_leaf(dbuf_dirty_record_t *dr, dmu_tx_t *tx) ASSERT(dr->dt.dl.dr_override_state != DR_NOT_OVERRIDDEN); } - /* - * If this dbuf has already been written out via an immediate write, - * just complete the write by copying over the new block pointer and - * updating the accounting via the write-completion functions. - */ - if (dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { - zio_t zio_fake; - - zio_fake.io_private = &db; - zio_fake.io_error = 0; - zio_fake.io_bp = db->db_blkptr; - zio_fake.io_bp_orig = *db->db_blkptr; - zio_fake.io_txg = txg; - zio_fake.io_flags = 0; - - *db->db_blkptr = dr->dt.dl.dr_overridden_by; - dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; - db->db_data_pending = dr; - dr->dr_zio = &zio_fake; - mutex_exit(&db->db_mtx); - - ASSERT(!DVA_EQUAL(BP_IDENTITY(zio_fake.io_bp), - BP_IDENTITY(&zio_fake.io_bp_orig)) || - BP_IS_HOLE(zio_fake.io_bp)); - - if (BP_IS_OLDER(&zio_fake.io_bp_orig, txg)) - (void) dsl_dataset_block_kill(os->os_dsl_dataset, - &zio_fake.io_bp_orig, dn->dn_zio, tx); - - dbuf_write_ready(&zio_fake, db->db_buf, db); - dbuf_write_done(&zio_fake, db->db_buf, db); - - return; - } - - if (db->db_state != DB_NOFILL) { - blksz = arc_buf_size(*datap); - - if (dn->dn_object != DMU_META_DNODE_OBJECT) { - /* - * If this buffer is currently "in use" (i.e., there - * are active holds and db_data still references it), - * then make a copy before we start the write so that - * any modifications from the open txg will not leak - * into this write. - * - * NOTE: this copy does not need to be made for - * objects only modified in the syncing context (e.g. - * DNONE_DNODE blocks). 
- */ - if (refcount_count(&db->db_holds) > 1 && - *datap == db->db_buf) { - arc_buf_contents_t type = - DBUF_GET_BUFC_TYPE(db); - *datap = - arc_buf_alloc(os->os_spa, blksz, db, type); - bcopy(db->db.db_data, (*datap)->b_data, blksz); - } - } - - ASSERT(*datap != NULL); + if (db->db_state != DB_NOFILL && + dn->dn_object != DMU_META_DNODE_OBJECT && + refcount_count(&db->db_holds) > 1 && + dr->dt.dl.dr_override_state != DR_OVERRIDDEN && + *datap == db->db_buf) { + /* + * If this buffer is currently "in use" (i.e., there + * are active holds and db_data still references it), + * then make a copy before we start the write so that + * any modifications from the open txg will not leak + * into this write. + * + * NOTE: this copy does not need to be made for + * objects only modified in the syncing context (e.g. + * DNONE_DNODE blocks). + */ + int blksz = arc_buf_size(*datap); + arc_buf_contents_t type = DBUF_GET_BUFC_TYPE(db); + *datap = arc_buf_alloc(os->os_spa, blksz, db, type); + bcopy(db->db.db_data, (*datap)->b_data, blksz); } db->db_data_pending = dr; @@ -2115,130 +2180,27 @@ dbuf_sync_list(list_t *list, dmu_tx_t *tx) } } -static void -dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) -{ - dmu_buf_impl_t *db = dr->dr_dbuf; - dnode_t *dn = db->db_dnode; - objset_impl_t *os = dn->dn_objset; - dmu_buf_impl_t *parent = db->db_parent; - uint64_t txg = tx->tx_txg; - zbookmark_t zb; - writeprops_t wp = { 0 }; - zio_t *zio; - - if (!BP_IS_HOLE(db->db_blkptr) && - (db->db_level > 0 || dn->dn_type == DMU_OT_DNODE)) { - /* - * Private object buffers are released here rather - * than in dbuf_dirty() since they are only modified - * in the syncing context and we don't want the - * overhead of making multiple copies of the data. - */ - arc_release(data, db); - } else if (db->db_state != DB_NOFILL) { - ASSERT(arc_released(data)); - /* XXX why do we need to thaw here? */ - arc_buf_thaw(data); - } - - if (parent != dn->dn_dbuf) { - ASSERT(parent && parent->db_data_pending); - ASSERT(db->db_level == parent->db_level-1); - ASSERT(arc_released(parent->db_buf)); - zio = parent->db_data_pending->dr_zio; - } else { - ASSERT(db->db_level == dn->dn_phys->dn_nlevels-1); - ASSERT3P(db->db_blkptr, ==, - &dn->dn_phys->dn_blkptr[db->db_blkid]); - zio = dn->dn_zio; - } - - ASSERT(db->db_level == 0 || data == db->db_buf); - ASSERT3U(db->db_blkptr->blk_birth, <=, txg); - ASSERT(zio); - - zb.zb_objset = os->os_dsl_dataset ? 
os->os_dsl_dataset->ds_object : 0; - zb.zb_object = db->db.db_object; - zb.zb_level = db->db_level; - zb.zb_blkid = db->db_blkid; - - wp.wp_type = dn->dn_type; - wp.wp_level = db->db_level; - wp.wp_copies = os->os_copies; - wp.wp_dncompress = dn->dn_compress; - wp.wp_oscompress = os->os_compress; - wp.wp_dnchecksum = dn->dn_checksum; - wp.wp_oschecksum = os->os_checksum; - - if (BP_IS_OLDER(db->db_blkptr, txg)) - (void) dsl_dataset_block_kill( - os->os_dsl_dataset, db->db_blkptr, zio, tx); - - if (db->db_state == DB_NOFILL) { - zio_prop_t zp = { 0 }; - - write_policy(os->os_spa, &wp, &zp); - dr->dr_zio = zio_write(zio, os->os_spa, - txg, db->db_blkptr, NULL, - db->db.db_size, &zp, dbuf_skip_write_ready, - dbuf_skip_write_done, db, ZIO_PRIORITY_ASYNC_WRITE, - ZIO_FLAG_MUSTSUCCEED, &zb); - } else { - dr->dr_zio = arc_write(zio, os->os_spa, &wp, - DBUF_IS_L2CACHEABLE(db), txg, db->db_blkptr, - data, dbuf_write_ready, dbuf_write_done, db, - ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); - } -} - -/* wrapper function for dbuf_write_ready bypassing ARC */ -static void -dbuf_skip_write_ready(zio_t *zio) -{ - blkptr_t *bp = zio->io_bp; - - if (!BP_IS_GANG(bp)) - zio_skip_write(zio); - - dbuf_write_ready(zio, NULL, zio->io_private); -} - -/* wrapper function for dbuf_write_done bypassing ARC */ -static void -dbuf_skip_write_done(zio_t *zio) -{ - dbuf_write_done(zio, NULL, zio->io_private); -} - /* ARGSUSED */ static void dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) { dmu_buf_impl_t *db = vdb; - dnode_t *dn = db->db_dnode; - objset_impl_t *os = dn->dn_objset; blkptr_t *bp = zio->io_bp; blkptr_t *bp_orig = &zio->io_bp_orig; + dnode_t *dn = db->db_dnode; + spa_t *spa = zio->io_spa; + int64_t delta; uint64_t fill = 0; - int old_size, new_size, i; + int i; ASSERT(db->db_blkptr == bp); - dprintf_dbuf_bp(db, bp_orig, "bp_orig: %s", ""); - - old_size = bp_get_dasize(os->os_spa, bp_orig); - new_size = bp_get_dasize(os->os_spa, bp); - - dnode_diduse_space(dn, new_size - old_size); + delta = bp_get_dsize_sync(spa, bp) - bp_get_dsize_sync(spa, bp_orig); + dnode_diduse_space(dn, delta - zio->io_prev_space_delta); + zio->io_prev_space_delta = delta; if (BP_IS_HOLE(bp)) { - dsl_dataset_t *ds = os->os_dsl_dataset; - dmu_tx_t *tx = os->os_synctx; - - if (bp_orig->blk_birth == tx->tx_txg) - (void) dsl_dataset_block_kill(ds, bp_orig, zio, tx); - ASSERT3U(bp->blk_fill, ==, 0); + ASSERT(bp->blk_fill == 0); return; } @@ -2269,9 +2231,6 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) for (i = db->db.db_size >> SPA_BLKPTRSHIFT; i > 0; i--, ibp++) { if (BP_IS_HOLE(ibp)) continue; - ASSERT3U(BP_GET_LSIZE(ibp), ==, - db->db_level == 1 ? 
dn->dn_datablksz : - (1<dn_phys->dn_indblkshift)); fill += ibp->blk_fill; } } @@ -2279,17 +2238,6 @@ dbuf_write_ready(zio_t *zio, arc_buf_t *buf, void *vdb) bp->blk_fill = fill; mutex_exit(&db->db_mtx); - - if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { - ASSERT(DVA_EQUAL(BP_IDENTITY(bp), BP_IDENTITY(bp_orig))); - } else { - dsl_dataset_t *ds = os->os_dsl_dataset; - dmu_tx_t *tx = os->os_synctx; - - if (bp_orig->blk_birth == tx->tx_txg) - (void) dsl_dataset_block_kill(ds, bp_orig, zio, tx); - dsl_dataset_block_born(ds, bp, tx); - } } /* ARGSUSED */ @@ -2297,37 +2245,50 @@ static void dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) { dmu_buf_impl_t *db = vdb; + blkptr_t *bp = zio->io_bp; + blkptr_t *bp_orig = &zio->io_bp_orig; + dnode_t *dn = db->db_dnode; + objset_t *os = dn->dn_objset; uint64_t txg = zio->io_txg; dbuf_dirty_record_t **drp, *dr; ASSERT3U(zio->io_error, ==, 0); + ASSERT(db->db_blkptr == bp); + + if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { + ASSERT(BP_EQUAL(bp, bp_orig)); + } else { + dsl_dataset_t *ds = os->os_dsl_dataset; + dmu_tx_t *tx = os->os_synctx; + + (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); + dsl_dataset_block_born(ds, bp, tx); + } mutex_enter(&db->db_mtx); + DBUF_VERIFY(db); + drp = &db->db_last_dirty; while ((dr = *drp) != db->db_data_pending) drp = &dr->dr_next; ASSERT(!list_link_active(&dr->dr_dirty_node)); ASSERT(dr->dr_txg == txg); + ASSERT(dr->dr_dbuf == db); ASSERT(dr->dr_next == NULL); *drp = dr->dr_next; if (db->db_level == 0) { ASSERT(db->db_blkid != DB_BONUS_BLKID); ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); - if (db->db_state != DB_NOFILL) { if (dr->dt.dl.dr_data != db->db_buf) VERIFY(arc_buf_remove_ref(dr->dt.dl.dr_data, db) == 1); - else if (!BP_IS_HOLE(db->db_blkptr)) + else if (!arc_released(db->db_buf)) arc_set_callback(db->db_buf, dbuf_do_evict, db); - else - ASSERT(arc_released(db->db_buf)); } } else { - dnode_t *dn = db->db_dnode; - ASSERT(list_head(&dr->dt.di.dr_children) == NULL); ASSERT3U(db->db.db_size, ==, 1<dn_phys->dn_indblkshift); if (!BP_IS_HOLE(db->db_blkptr)) { @@ -2348,9 +2309,122 @@ dbuf_write_done(zio_t *zio, arc_buf_t *buf, void *vdb) ASSERT(db->db_dirtycnt > 0); db->db_dirtycnt -= 1; db->db_data_pending = NULL; + dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); +} + +static void +dbuf_write_nofill_ready(zio_t *zio) +{ + dbuf_write_ready(zio, NULL, zio->io_private); +} + +static void +dbuf_write_nofill_done(zio_t *zio) +{ + dbuf_write_done(zio, NULL, zio->io_private); +} + +static void +dbuf_write_override_ready(zio_t *zio) +{ + dbuf_dirty_record_t *dr = zio->io_private; + dmu_buf_impl_t *db = dr->dr_dbuf; + + dbuf_write_ready(zio, NULL, db); +} + +static void +dbuf_write_override_done(zio_t *zio) +{ + dbuf_dirty_record_t *dr = zio->io_private; + dmu_buf_impl_t *db = dr->dr_dbuf; + blkptr_t *obp = &dr->dt.dl.dr_overridden_by; + + mutex_enter(&db->db_mtx); + if (!BP_EQUAL(zio->io_bp, obp)) { + if (!BP_IS_HOLE(obp)) + dsl_free(spa_get_dsl(zio->io_spa), zio->io_txg, obp); + arc_release(dr->dt.dl.dr_data, db); + } mutex_exit(&db->db_mtx); - dprintf_dbuf_bp(db, zio->io_bp, "bp: %s", ""); + dbuf_write_done(zio, NULL, db); +} + +static void +dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) +{ + dmu_buf_impl_t *db = dr->dr_dbuf; + dnode_t *dn = db->db_dnode; + objset_t *os = dn->dn_objset; + dmu_buf_impl_t *parent = db->db_parent; + uint64_t txg = tx->tx_txg; + zbookmark_t zb; + zio_prop_t zp; + zio_t *zio; - dbuf_rele(db, (void *)(uintptr_t)txg); + if (db->db_state != DB_NOFILL) { + if 
(db->db_level > 0 || dn->dn_type == DMU_OT_DNODE) { + /* + * Private object buffers are released here rather + * than in dbuf_dirty() since they are only modified + * in the syncing context and we don't want the + * overhead of making multiple copies of the data. + */ + if (BP_IS_HOLE(db->db_blkptr)) { + arc_buf_thaw(data); + } else { + arc_release(data, db); + } + } + } + + if (parent != dn->dn_dbuf) { + ASSERT(parent && parent->db_data_pending); + ASSERT(db->db_level == parent->db_level-1); + ASSERT(arc_released(parent->db_buf)); + zio = parent->db_data_pending->dr_zio; + } else { + ASSERT(db->db_level == dn->dn_phys->dn_nlevels-1); + ASSERT3P(db->db_blkptr, ==, + &dn->dn_phys->dn_blkptr[db->db_blkid]); + zio = dn->dn_zio; + } + + ASSERT(db->db_level == 0 || data == db->db_buf); + ASSERT3U(db->db_blkptr->blk_birth, <=, txg); + ASSERT(zio); + + SET_BOOKMARK(&zb, os->os_dsl_dataset ? + os->os_dsl_dataset->ds_object : DMU_META_OBJSET, + db->db.db_object, db->db_level, db->db_blkid); + + dmu_write_policy(os, dn, db->db_level, + db->db_state == DB_NOFILL ? WP_NOFILL : 0, &zp); + + if (db->db_level == 0 && dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { + ASSERT(db->db_state != DB_NOFILL); + dr->dr_zio = zio_write(zio, os->os_spa, txg, + db->db_blkptr, data->b_data, arc_buf_size(data), &zp, + dbuf_write_override_ready, dbuf_write_override_done, dr, + ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); + mutex_enter(&db->db_mtx); + dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; + zio_write_override(dr->dr_zio, &dr->dt.dl.dr_overridden_by, + dr->dt.dl.dr_copies); + mutex_exit(&db->db_mtx); + } else if (db->db_state == DB_NOFILL) { + ASSERT(zp.zp_checksum == ZIO_CHECKSUM_OFF); + dr->dr_zio = zio_write(zio, os->os_spa, txg, + db->db_blkptr, NULL, db->db.db_size, &zp, + dbuf_write_nofill_ready, dbuf_write_nofill_done, db, + ZIO_PRIORITY_ASYNC_WRITE, + ZIO_FLAG_MUSTSUCCEED | ZIO_FLAG_NODATA, &zb); + } else { + ASSERT(arc_released(data)); + dr->dr_zio = arc_write(zio, os->os_spa, txg, + db->db_blkptr, data, DBUF_IS_L2CACHEABLE(db), &zp, + dbuf_write_ready, dbuf_write_done, db, + ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); + } } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/ddt.c b/external/cddl/osnet/dist/uts/common/fs/zfs/ddt.c new file mode 100644 index 0000000000000..afe72af7db11c --- /dev/null +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/ddt.c @@ -0,0 +1,1064 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static const ddt_ops_t *ddt_ops[DDT_TYPES] = { + &ddt_zap_ops, +}; + +static const char *ddt_class_name[DDT_CLASSES] = { + "ditto", + "duplicate", + "unique", +}; + +static void +ddt_object_create(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + dmu_tx_t *tx) +{ + spa_t *spa = ddt->ddt_spa; + objset_t *os = ddt->ddt_os; + uint64_t *objectp = &ddt->ddt_object[type][class]; + boolean_t prehash = zio_checksum_table[ddt->ddt_checksum].ci_dedup; + char name[DDT_NAMELEN]; + + ddt_object_name(ddt, type, class, name); + + ASSERT(*objectp == 0); + VERIFY(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash) == 0); + ASSERT(*objectp != 0); + + VERIFY(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name, + sizeof (uint64_t), 1, objectp, tx) == 0); + + VERIFY(zap_add(os, spa->spa_ddt_stat_object, name, + sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), + &ddt->ddt_histogram[type][class], tx) == 0); +} + +static void +ddt_object_destroy(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + dmu_tx_t *tx) +{ + spa_t *spa = ddt->ddt_spa; + objset_t *os = ddt->ddt_os; + uint64_t *objectp = &ddt->ddt_object[type][class]; + char name[DDT_NAMELEN]; + + ddt_object_name(ddt, type, class, name); + + ASSERT(*objectp != 0); + ASSERT(ddt_object_count(ddt, type, class) == 0); + ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class])); + VERIFY(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx) == 0); + VERIFY(zap_remove(os, spa->spa_ddt_stat_object, name, tx) == 0); + VERIFY(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx) == 0); + + *objectp = 0; +} + +static int +ddt_object_load(ddt_t *ddt, enum ddt_type type, enum ddt_class class) +{ + char name[DDT_NAMELEN]; + int error; + + ddt_object_name(ddt, type, class, name); + + error = zap_lookup(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, name, + sizeof (uint64_t), 1, &ddt->ddt_object[type][class]); + + if (error) + return (error); + + error = zap_lookup(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name, + sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), + &ddt->ddt_histogram[type][class]); + + ASSERT(error == 0); + return (error); +} + +static void +ddt_object_sync(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + dmu_tx_t *tx) +{ + char name[DDT_NAMELEN]; + + ddt_object_name(ddt, type, class, name); + + VERIFY(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name, + sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), + &ddt->ddt_histogram[type][class], tx) == 0); +} + +static int +ddt_object_lookup(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + ddt_entry_t *dde) +{ + if (!ddt_object_exists(ddt, type, class)) + return (ENOENT); + + return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os, + ddt->ddt_object[type][class], dde)); +} + +static int +ddt_object_update(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + ddt_entry_t *dde, dmu_tx_t *tx) +{ + ASSERT(ddt_object_exists(ddt, type, class)); + + return (ddt_ops[type]->ddt_op_update(ddt->ddt_os, + ddt->ddt_object[type][class], dde, tx)); +} + +static int +ddt_object_remove(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + ddt_entry_t *dde, dmu_tx_t *tx) +{ + ASSERT(ddt_object_exists(ddt, type, class)); + + return (ddt_ops[type]->ddt_op_remove(ddt->ddt_os, + ddt->ddt_object[type][class], dde, tx)); +} + +int +ddt_object_walk(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + uint64_t *walk, ddt_entry_t *dde) +{ + 
ASSERT(ddt_object_exists(ddt, type, class)); + + return (ddt_ops[type]->ddt_op_walk(ddt->ddt_os, + ddt->ddt_object[type][class], dde, walk)); +} + +uint64_t +ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class) +{ + ASSERT(ddt_object_exists(ddt, type, class)); + + return (ddt_ops[type]->ddt_op_count(ddt->ddt_os, + ddt->ddt_object[type][class])); +} + +int +ddt_object_info(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + dmu_object_info_t *doi) +{ + if (!ddt_object_exists(ddt, type, class)) + return (ENOENT); + + return (dmu_object_info(ddt->ddt_os, ddt->ddt_object[type][class], + doi)); +} + +boolean_t +ddt_object_exists(ddt_t *ddt, enum ddt_type type, enum ddt_class class) +{ + return (!!ddt->ddt_object[type][class]); +} + +void +ddt_object_name(ddt_t *ddt, enum ddt_type type, enum ddt_class class, + char *name) +{ + (void) sprintf(name, DMU_POOL_DDT, + zio_checksum_table[ddt->ddt_checksum].ci_name, + ddt_ops[type]->ddt_op_name, ddt_class_name[class]); +} + +void +ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg) +{ + ASSERT(txg != 0); + + for (int d = 0; d < SPA_DVAS_PER_BP; d++) + bp->blk_dva[d] = ddp->ddp_dva[d]; + BP_SET_BIRTH(bp, txg, ddp->ddp_phys_birth); +} + +void +ddt_bp_create(enum zio_checksum checksum, + const ddt_key_t *ddk, const ddt_phys_t *ddp, blkptr_t *bp) +{ + BP_ZERO(bp); + + if (ddp != NULL) + ddt_bp_fill(ddp, bp, ddp->ddp_phys_birth); + + bp->blk_cksum = ddk->ddk_cksum; + + BP_SET_LSIZE(bp, DDK_GET_LSIZE(ddk)); + BP_SET_PSIZE(bp, DDK_GET_PSIZE(ddk)); + BP_SET_COMPRESS(bp, DDK_GET_COMPRESS(ddk)); + BP_SET_CHECKSUM(bp, checksum); + BP_SET_TYPE(bp, DMU_OT_NONE); + BP_SET_LEVEL(bp, 0); + BP_SET_DEDUP(bp, 0); + BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); +} + +void +ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp) +{ + ddk->ddk_cksum = bp->blk_cksum; + ddk->ddk_prop = 0; + + DDK_SET_LSIZE(ddk, BP_GET_LSIZE(bp)); + DDK_SET_PSIZE(ddk, BP_GET_PSIZE(bp)); + DDK_SET_COMPRESS(ddk, BP_GET_COMPRESS(bp)); +} + +void +ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp) +{ + ASSERT(ddp->ddp_phys_birth == 0); + + for (int d = 0; d < SPA_DVAS_PER_BP; d++) + ddp->ddp_dva[d] = bp->blk_dva[d]; + ddp->ddp_phys_birth = BP_PHYSICAL_BIRTH(bp); +} + +void +ddt_phys_clear(ddt_phys_t *ddp) +{ + bzero(ddp, sizeof (*ddp)); +} + +void +ddt_phys_addref(ddt_phys_t *ddp) +{ + ddp->ddp_refcnt++; +} + +void +ddt_phys_decref(ddt_phys_t *ddp) +{ + ASSERT((int64_t)ddp->ddp_refcnt > 0); + ddp->ddp_refcnt--; +} + +void +ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg) +{ + blkptr_t blk; + + ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); + ddt_phys_clear(ddp); + zio_free(ddt->ddt_spa, txg, &blk); +} + +ddt_phys_t * +ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp) +{ + ddt_phys_t *ddp = (ddt_phys_t *)dde->dde_phys; + + for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { + if (DVA_EQUAL(BP_IDENTITY(bp), &ddp->ddp_dva[0]) && + BP_PHYSICAL_BIRTH(bp) == ddp->ddp_phys_birth) + return (ddp); + } + return (NULL); +} + +uint64_t +ddt_phys_total_refcnt(const ddt_entry_t *dde) +{ + uint64_t refcnt = 0; + + for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) + refcnt += dde->dde_phys[p].ddp_refcnt; + + return (refcnt); +} + +static void +ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds) +{ + spa_t *spa = ddt->ddt_spa; + ddt_phys_t *ddp = dde->dde_phys; + ddt_key_t *ddk = &dde->dde_key; + uint64_t lsize = DDK_GET_LSIZE(ddk); + uint64_t psize = DDK_GET_PSIZE(ddk); + + bzero(dds, sizeof (*dds)); + + for (int p = 0; p < 
DDT_PHYS_TYPES; p++, ddp++) { + uint64_t dsize = 0; + uint64_t refcnt = ddp->ddp_refcnt; + + if (ddp->ddp_phys_birth == 0) + continue; + + for (int d = 0; d < SPA_DVAS_PER_BP; d++) + dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]); + + dds->dds_blocks += 1; + dds->dds_lsize += lsize; + dds->dds_psize += psize; + dds->dds_dsize += dsize; + + dds->dds_ref_blocks += refcnt; + dds->dds_ref_lsize += lsize * refcnt; + dds->dds_ref_psize += psize * refcnt; + dds->dds_ref_dsize += dsize * refcnt; + } +} + +void +ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg) +{ + const uint64_t *s = (const uint64_t *)src; + uint64_t *d = (uint64_t *)dst; + uint64_t *d_end = (uint64_t *)(dst + 1); + + ASSERT(neg == 0 || neg == -1ULL); /* add or subtract */ + + while (d < d_end) + *d++ += (*s++ ^ neg) - neg; +} + +static void +ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg) +{ + ddt_stat_t dds; + ddt_histogram_t *ddh; + int bucket; + + ddt_stat_generate(ddt, dde, &dds); + + bucket = highbit(dds.dds_ref_blocks) - 1; + ASSERT(bucket >= 0); + + ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class]; + + ddt_stat_add(&ddh->ddh_stat[bucket], &dds, neg); +} + +void +ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src) +{ + for (int h = 0; h < 64; h++) + ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0); +} + +void +ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh) +{ + bzero(dds, sizeof (*dds)); + + for (int h = 0; h < 64; h++) + ddt_stat_add(dds, &ddh->ddh_stat[h], 0); +} + +boolean_t +ddt_histogram_empty(const ddt_histogram_t *ddh) +{ + const uint64_t *s = (const uint64_t *)ddh; + const uint64_t *s_end = (const uint64_t *)(ddh + 1); + + while (s < s_end) + if (*s++ != 0) + return (B_FALSE); + + return (B_TRUE); +} + +void +ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo) +{ + dmu_object_info_t doi; + uint64_t count; + int error; + + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + ddt_t *ddt = spa->spa_ddt[c]; + for (enum ddt_type type = 0; type < DDT_TYPES; type++) { + for (enum ddt_class class = 0; class < DDT_CLASSES; + class++) { + error = ddt_object_info(ddt, type, class, &doi); + if (error == ENOENT) + continue; + ASSERT3U(error, ==, 0); + + count = ddt_object_count(ddt, type, class); + ddo->ddo_count += count; + ddo->ddo_dspace += + (doi.doi_physical_blocks_512 << 9) / count; + ddo->ddo_mspace += doi.doi_fill_count * + doi.doi_data_block_size / count; + } + } + } +} + +void +ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh) +{ + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + ddt_t *ddt = spa->spa_ddt[c]; + for (enum ddt_type type = 0; type < DDT_TYPES; type++) { + for (enum ddt_class class = 0; class < DDT_CLASSES; + class++) { + ddt_histogram_add(ddh, + &ddt->ddt_histogram[type][class]); + } + } + } +} + +void +ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total) +{ + ddt_histogram_t *ddh_total; + + ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP); + ddt_get_dedup_histogram(spa, ddh_total); + ddt_histogram_stat(dds_total, ddh_total); + kmem_free(ddh_total, sizeof (ddt_histogram_t)); +} + +uint64_t +ddt_get_dedup_dspace(spa_t *spa) +{ + ddt_stat_t dds_total = { 0 }; + + ddt_get_dedup_stats(spa, &dds_total); + return (dds_total.dds_ref_dsize - dds_total.dds_dsize); +} + +uint64_t +ddt_get_pool_dedup_ratio(spa_t *spa) +{ + ddt_stat_t dds_total = { 0 }; + + ddt_get_dedup_stats(spa, &dds_total); + if (dds_total.dds_dsize == 0) + return (100); + + return (dds_total.dds_ref_dsize * 
100 / dds_total.dds_dsize); +} + +int +ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde, ddt_phys_t *ddp_willref) +{ + spa_t *spa = ddt->ddt_spa; + uint64_t total_refcnt = 0; + uint64_t ditto = spa->spa_dedup_ditto; + int total_copies = 0; + int desired_copies = 0; + + for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { + ddt_phys_t *ddp = &dde->dde_phys[p]; + zio_t *zio = dde->dde_lead_zio[p]; + uint64_t refcnt = ddp->ddp_refcnt; /* committed refs */ + if (zio != NULL) + refcnt += zio->io_parent_count; /* pending refs */ + if (ddp == ddp_willref) + refcnt++; /* caller's ref */ + if (refcnt != 0) { + total_refcnt += refcnt; + total_copies += p; + } + } + + if (ditto == 0 || ditto > UINT32_MAX) + ditto = UINT32_MAX; + + if (total_refcnt >= 1) + desired_copies++; + if (total_refcnt >= ditto) + desired_copies++; + if (total_refcnt >= ditto * ditto) + desired_copies++; + + return (MAX(desired_copies, total_copies) - total_copies); +} + +int +ddt_ditto_copies_present(ddt_entry_t *dde) +{ + ddt_phys_t *ddp = &dde->dde_phys[DDT_PHYS_DITTO]; + dva_t *dva = ddp->ddp_dva; + int copies = 0 - DVA_GET_GANG(dva); + + for (int d = 0; d < SPA_DVAS_PER_BP; d++, dva++) + if (DVA_IS_VALID(dva)) + copies++; + + ASSERT(copies >= 0 && copies < SPA_DVAS_PER_BP); + + return (copies); +} + +size_t +ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len) +{ + uchar_t *version = dst++; + int cpfunc = ZIO_COMPRESS_ZLE; + zio_compress_info_t *ci = &zio_compress_table[cpfunc]; + size_t c_len; + + ASSERT(d_len >= s_len + 1); /* no compression plus version byte */ + + c_len = ci->ci_compress(src, dst, s_len, d_len - 1, ci->ci_level); + + if (c_len == s_len) { + cpfunc = ZIO_COMPRESS_OFF; + bcopy(src, dst, s_len); + } + + *version = (ZFS_HOST_BYTEORDER & DDT_COMPRESS_BYTEORDER_MASK) | cpfunc; + + return (c_len + 1); +} + +void +ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len) +{ + uchar_t version = *src++; + int cpfunc = version & DDT_COMPRESS_FUNCTION_MASK; + zio_compress_info_t *ci = &zio_compress_table[cpfunc]; + + if (ci->ci_decompress != NULL) + (void) ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level); + else + bcopy(src, dst, d_len); + + if ((version ^ ZFS_HOST_BYTEORDER) & DDT_COMPRESS_BYTEORDER_MASK) + byteswap_uint64_array(dst, d_len); +} + +ddt_t * +ddt_select_by_checksum(spa_t *spa, enum zio_checksum c) +{ + return (spa->spa_ddt[c]); +} + +ddt_t * +ddt_select(spa_t *spa, const blkptr_t *bp) +{ + return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]); +} + +void +ddt_enter(ddt_t *ddt) +{ + mutex_enter(&ddt->ddt_lock); +} + +void +ddt_exit(ddt_t *ddt) +{ + mutex_exit(&ddt->ddt_lock); +} + +static ddt_entry_t * +ddt_alloc(const ddt_key_t *ddk) +{ + ddt_entry_t *dde; + + dde = kmem_zalloc(sizeof (ddt_entry_t), KM_SLEEP); + cv_init(&dde->dde_cv, NULL, CV_DEFAULT, NULL); + + dde->dde_key = *ddk; + + return (dde); +} + +static void +ddt_free(ddt_entry_t *dde) +{ + ASSERT(!dde->dde_loading); + + for (int p = 0; p < DDT_PHYS_TYPES; p++) + ASSERT(dde->dde_lead_zio[p] == NULL); + + if (dde->dde_repair_data != NULL) + zio_buf_free(dde->dde_repair_data, + DDK_GET_PSIZE(&dde->dde_key)); + + cv_destroy(&dde->dde_cv); + kmem_free(dde, sizeof (*dde)); +} + +void +ddt_remove(ddt_t *ddt, ddt_entry_t *dde) +{ + ASSERT(MUTEX_HELD(&ddt->ddt_lock)); + + avl_remove(&ddt->ddt_tree, dde); + ddt_free(dde); +} + +ddt_entry_t * +ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add) +{ + ddt_entry_t *dde, dde_search; + enum ddt_type type; + enum ddt_class class; + avl_index_t where; + int error; + + 
ASSERT(MUTEX_HELD(&ddt->ddt_lock)); + + ddt_key_fill(&dde_search.dde_key, bp); + + dde = avl_find(&ddt->ddt_tree, &dde_search, &where); + if (dde == NULL) { + if (!add) + return (NULL); + dde = ddt_alloc(&dde_search.dde_key); + avl_insert(&ddt->ddt_tree, dde, where); + } + + while (dde->dde_loading) + cv_wait(&dde->dde_cv, &ddt->ddt_lock); + + if (dde->dde_loaded) + return (dde); + + dde->dde_loading = B_TRUE; + + ddt_exit(ddt); + + error = ENOENT; + + for (type = 0; type < DDT_TYPES; type++) { + for (class = 0; class < DDT_CLASSES; class++) { + error = ddt_object_lookup(ddt, type, class, dde); + if (error != ENOENT) + break; + } + if (error != ENOENT) + break; + } + + ASSERT(error == 0 || error == ENOENT); + + ddt_enter(ddt); + + ASSERT(dde->dde_loaded == B_FALSE); + ASSERT(dde->dde_loading == B_TRUE); + + dde->dde_type = type; /* will be DDT_TYPES if no entry found */ + dde->dde_class = class; /* will be DDT_CLASSES if no entry found */ + dde->dde_loaded = B_TRUE; + dde->dde_loading = B_FALSE; + + if (error == 0) + ddt_stat_update(ddt, dde, -1ULL); + + cv_broadcast(&dde->dde_cv); + + return (dde); +} + +int +ddt_entry_compare(const void *x1, const void *x2) +{ + const ddt_entry_t *dde1 = x1; + const ddt_entry_t *dde2 = x2; + const uint64_t *u1 = (const uint64_t *)&dde1->dde_key; + const uint64_t *u2 = (const uint64_t *)&dde2->dde_key; + + for (int i = 0; i < DDT_KEY_WORDS; i++) { + if (u1[i] < u2[i]) + return (-1); + if (u1[i] > u2[i]) + return (1); + } + + return (0); +} + +static ddt_t * +ddt_table_alloc(spa_t *spa, enum zio_checksum c) +{ + ddt_t *ddt; + + ddt = kmem_zalloc(sizeof (*ddt), KM_SLEEP); + + mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL); + avl_create(&ddt->ddt_tree, ddt_entry_compare, + sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node)); + avl_create(&ddt->ddt_repair_tree, ddt_entry_compare, + sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node)); + ddt->ddt_checksum = c; + ddt->ddt_spa = spa; + ddt->ddt_os = spa->spa_meta_objset; + + return (ddt); +} + +static void +ddt_table_free(ddt_t *ddt) +{ + ASSERT(avl_numnodes(&ddt->ddt_tree) == 0); + ASSERT(avl_numnodes(&ddt->ddt_repair_tree) == 0); + avl_destroy(&ddt->ddt_tree); + avl_destroy(&ddt->ddt_repair_tree); + mutex_destroy(&ddt->ddt_lock); + kmem_free(ddt, sizeof (*ddt)); +} + +void +ddt_create(spa_t *spa) +{ + spa->spa_dedup_checksum = ZIO_DEDUPCHECKSUM; + + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) + spa->spa_ddt[c] = ddt_table_alloc(spa, c); +} + +int +ddt_load(spa_t *spa) +{ + int error; + + ddt_create(spa); + + error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_DDT_STATS, sizeof (uint64_t), 1, + &spa->spa_ddt_stat_object); + + if (error) + return (error == ENOENT ? 
0 : error); + + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + for (enum ddt_type type = 0; type < DDT_TYPES; type++) { + for (enum ddt_class class = 0; class < DDT_CLASSES; + class++) { + ddt_t *ddt = spa->spa_ddt[c]; + error = ddt_object_load(ddt, type, class); + if (error != 0 && error != ENOENT) + return (error); + } + } + } + + return (0); +} + +void +ddt_unload(spa_t *spa) +{ + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + if (spa->spa_ddt[c]) { + ddt_table_free(spa->spa_ddt[c]); + spa->spa_ddt[c] = NULL; + } + } +} + +boolean_t +ddt_class_contains(spa_t *spa, enum ddt_class max_class, const blkptr_t *bp) +{ + ddt_t *ddt; + ddt_entry_t dde; + + if (!BP_GET_DEDUP(bp)) + return (B_FALSE); + + if (max_class == DDT_CLASS_UNIQUE) + return (B_TRUE); + + ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)]; + + ddt_key_fill(&dde.dde_key, bp); + + for (enum ddt_type type = 0; type < DDT_TYPES; type++) + for (enum ddt_class class = 0; class <= max_class; class++) + if (ddt_object_lookup(ddt, type, class, &dde) == 0) + return (B_TRUE); + + return (B_FALSE); +} + +ddt_entry_t * +ddt_repair_start(ddt_t *ddt, const blkptr_t *bp) +{ + ddt_key_t ddk; + ddt_entry_t *dde; + + ddt_key_fill(&ddk, bp); + + dde = ddt_alloc(&ddk); + + for (enum ddt_type type = 0; type < DDT_TYPES; type++) { + for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { + /* + * We can only do repair if there are multiple copies + * of the block. For anything in the UNIQUE class, + * there's definitely only one copy, so don't even try. + */ + if (class != DDT_CLASS_UNIQUE && + ddt_object_lookup(ddt, type, class, dde) == 0) + return (dde); + } + } + + bzero(dde->dde_phys, sizeof (dde->dde_phys)); + + return (dde); +} + +void +ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde) +{ + avl_index_t where; + + ddt_enter(ddt); + + if (dde->dde_repair_data != NULL && spa_writeable(ddt->ddt_spa) && + avl_find(&ddt->ddt_repair_tree, dde, &where) == NULL) + avl_insert(&ddt->ddt_repair_tree, dde, where); + else + ddt_free(dde); + + ddt_exit(ddt); +} + +static void +ddt_repair_entry_done(zio_t *zio) +{ + ddt_entry_t *rdde = zio->io_private; + + ddt_free(rdde); +} + +static void +ddt_repair_entry(ddt_t *ddt, ddt_entry_t *dde, ddt_entry_t *rdde, zio_t *rio) +{ + ddt_phys_t *ddp = dde->dde_phys; + ddt_phys_t *rddp = rdde->dde_phys; + ddt_key_t *ddk = &dde->dde_key; + ddt_key_t *rddk = &rdde->dde_key; + zio_t *zio; + blkptr_t blk; + + zio = zio_null(rio, rio->io_spa, NULL, + ddt_repair_entry_done, rdde, rio->io_flags); + + for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++, rddp++) { + if (ddp->ddp_phys_birth == 0 || + ddp->ddp_phys_birth != rddp->ddp_phys_birth || + bcmp(ddp->ddp_dva, rddp->ddp_dva, sizeof (ddp->ddp_dva))) + continue; + ddt_bp_create(ddt->ddt_checksum, ddk, ddp, &blk); + zio_nowait(zio_rewrite(zio, zio->io_spa, 0, &blk, + rdde->dde_repair_data, DDK_GET_PSIZE(rddk), NULL, NULL, + ZIO_PRIORITY_SYNC_WRITE, ZIO_DDT_CHILD_FLAGS(zio), NULL)); + } + + zio_nowait(zio); +} + +static void +ddt_repair_table(ddt_t *ddt, zio_t *rio) +{ + spa_t *spa = ddt->ddt_spa; + ddt_entry_t *dde, *rdde_next, *rdde; + avl_tree_t *t = &ddt->ddt_repair_tree; + blkptr_t blk; + + if (spa_sync_pass(spa) > 1) + return; + + ddt_enter(ddt); + for (rdde = avl_first(t); rdde != NULL; rdde = rdde_next) { + rdde_next = AVL_NEXT(t, rdde); + avl_remove(&ddt->ddt_repair_tree, rdde); + ddt_exit(ddt); + ddt_bp_create(ddt->ddt_checksum, &rdde->dde_key, NULL, &blk); + dde = ddt_repair_start(ddt, &blk); + ddt_repair_entry(ddt, dde, rdde, rio); + 
ddt_repair_done(ddt, dde); + ddt_enter(ddt); + } + ddt_exit(ddt); +} + +static void +ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg) +{ + dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool; + ddt_phys_t *ddp = dde->dde_phys; + ddt_key_t *ddk = &dde->dde_key; + enum ddt_type otype = dde->dde_type; + enum ddt_type ntype = DDT_TYPE_CURRENT; + enum ddt_class oclass = dde->dde_class; + enum ddt_class nclass; + uint64_t total_refcnt = 0; + + ASSERT(dde->dde_loaded); + ASSERT(!dde->dde_loading); + + for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { + ASSERT(dde->dde_lead_zio[p] == NULL); + ASSERT((int64_t)ddp->ddp_refcnt >= 0); + if (ddp->ddp_phys_birth == 0) { + ASSERT(ddp->ddp_refcnt == 0); + continue; + } + if (p == DDT_PHYS_DITTO) { + if (ddt_ditto_copies_needed(ddt, dde, NULL) == 0) + ddt_phys_free(ddt, ddk, ddp, txg); + continue; + } + if (ddp->ddp_refcnt == 0) + ddt_phys_free(ddt, ddk, ddp, txg); + total_refcnt += ddp->ddp_refcnt; + } + + if (dde->dde_phys[DDT_PHYS_DITTO].ddp_phys_birth != 0) + nclass = DDT_CLASS_DITTO; + else if (total_refcnt > 1) + nclass = DDT_CLASS_DUPLICATE; + else + nclass = DDT_CLASS_UNIQUE; + + if (otype != DDT_TYPES && + (otype != ntype || oclass != nclass || total_refcnt == 0)) { + VERIFY(ddt_object_remove(ddt, otype, oclass, dde, tx) == 0); + ASSERT(ddt_object_lookup(ddt, otype, oclass, dde) == ENOENT); + } + + if (total_refcnt != 0) { + dde->dde_type = ntype; + dde->dde_class = nclass; + ddt_stat_update(ddt, dde, 0); + if (!ddt_object_exists(ddt, ntype, nclass)) + ddt_object_create(ddt, ntype, nclass, tx); + VERIFY(ddt_object_update(ddt, ntype, nclass, dde, tx) == 0); + + if (dp->dp_scrub_func != SCRUB_FUNC_NONE && + oclass > nclass && + nclass <= dp->dp_scrub_ddt_class_max) + dsl_pool_scrub_ddt_entry(dp, ddt->ddt_checksum, dde); + } +} + +static void +ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg) +{ + spa_t *spa = ddt->ddt_spa; + ddt_entry_t *dde; + void *cookie = NULL; + + if (avl_numnodes(&ddt->ddt_tree) == 0) + return; + + ASSERT(spa_sync_pass(spa) == 1); + ASSERT(spa->spa_uberblock.ub_version >= SPA_VERSION_DEDUP); + + if (spa->spa_ddt_stat_object == 0) { + spa->spa_ddt_stat_object = zap_create(ddt->ddt_os, + DMU_OT_DDT_STATS, DMU_OT_NONE, 0, tx); + VERIFY(zap_add(ddt->ddt_os, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_DDT_STATS, sizeof (uint64_t), 1, + &spa->spa_ddt_stat_object, tx) == 0); + } + + while ((dde = avl_destroy_nodes(&ddt->ddt_tree, &cookie)) != NULL) { + ddt_sync_entry(ddt, dde, tx, txg); + ddt_free(dde); + } + + for (enum ddt_type type = 0; type < DDT_TYPES; type++) { + for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { + if (!ddt_object_exists(ddt, type, class)) + continue; + ddt_object_sync(ddt, type, class, tx); + if (ddt_object_count(ddt, type, class) == 0) + ddt_object_destroy(ddt, type, class, tx); + } + } +} + +void +ddt_sync(spa_t *spa, uint64_t txg) +{ + dmu_tx_t *tx; + zio_t *rio = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); + + ASSERT(spa_syncing_txg(spa) == txg); + + tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); + + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + ddt_t *ddt = spa->spa_ddt[c]; + if (ddt == NULL) + continue; + ddt_sync_table(ddt, tx, txg); + ddt_repair_table(ddt, rio); + } + + (void) zio_wait(rio); + + dmu_tx_commit(tx); +} + +int +ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde) +{ + do { + do { + do { + ddt_t *ddt = spa->spa_ddt[ddb->ddb_checksum]; + int error = ENOENT; + if (ddt_object_exists(ddt, ddb->ddb_type, + 
ddb->ddb_class)) { + error = ddt_object_walk(ddt, + ddb->ddb_type, ddb->ddb_class, + &ddb->ddb_cursor, dde); + } + if (error == 0) + return (0); + if (error != ENOENT) + return (error); + ddb->ddb_cursor = 0; + } while (++ddb->ddb_checksum < ZIO_CHECKSUM_FUNCTIONS); + ddb->ddb_checksum = 0; + } while (++ddb->ddb_type < DDT_TYPES); + ddb->ddb_type = 0; + } while (++ddb->ddb_class < DDT_CLASSES); + + return (ENOENT); +} diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/ddt_zap.c b/external/cddl/osnet/dist/uts/common/fs/zfs/ddt_zap.c new file mode 100644 index 0000000000000..1ba5278193373 --- /dev/null +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/ddt_zap.c @@ -0,0 +1,150 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#include +#include +#include +#include +#include +#include +#include + +int ddt_zap_leaf_blockshift = 12; +int ddt_zap_indirect_blockshift = 12; + +static int +ddt_zap_create(objset_t *os, uint64_t *objectp, dmu_tx_t *tx, boolean_t prehash) +{ + zap_flags_t flags = ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY; + + if (prehash) + flags |= ZAP_FLAG_PRE_HASHED_KEY; + + *objectp = zap_create_flags(os, 0, flags, DMU_OT_DDT_ZAP, + ddt_zap_leaf_blockshift, ddt_zap_indirect_blockshift, + DMU_OT_NONE, 0, tx); + + return (*objectp == 0 ? 
ENOTSUP : 0); +} + +static int +ddt_zap_destroy(objset_t *os, uint64_t object, dmu_tx_t *tx) +{ + return (zap_destroy(os, object, tx)); +} + +static int +ddt_zap_lookup(objset_t *os, uint64_t object, ddt_entry_t *dde) +{ + uchar_t cbuf[sizeof (dde->dde_phys) + 1]; + uint64_t one, csize; + int error; + + error = zap_length_uint64(os, object, (uint64_t *)&dde->dde_key, + DDT_KEY_WORDS, &one, &csize); + if (error) + return (error); + + ASSERT(one == 1); + ASSERT(csize <= sizeof (cbuf)); + + error = zap_lookup_uint64(os, object, (uint64_t *)&dde->dde_key, + DDT_KEY_WORDS, 1, csize, cbuf); + if (error) + return (error); + + ddt_decompress(cbuf, dde->dde_phys, csize, sizeof (dde->dde_phys)); + + return (0); +} + +static int +ddt_zap_update(objset_t *os, uint64_t object, ddt_entry_t *dde, dmu_tx_t *tx) +{ + uchar_t cbuf[sizeof (dde->dde_phys) + 1]; + uint64_t csize; + + csize = ddt_compress(dde->dde_phys, cbuf, + sizeof (dde->dde_phys), sizeof (cbuf)); + + return (zap_update_uint64(os, object, (uint64_t *)&dde->dde_key, + DDT_KEY_WORDS, 1, csize, cbuf, tx)); +} + +static int +ddt_zap_remove(objset_t *os, uint64_t object, ddt_entry_t *dde, dmu_tx_t *tx) +{ + return (zap_remove_uint64(os, object, (uint64_t *)&dde->dde_key, + DDT_KEY_WORDS, tx)); +} + +static int +ddt_zap_walk(objset_t *os, uint64_t object, ddt_entry_t *dde, uint64_t *walk) +{ + zap_cursor_t zc; + zap_attribute_t za; + int error; + + zap_cursor_init_serialized(&zc, os, object, *walk); + if ((error = zap_cursor_retrieve(&zc, &za)) == 0) { + uchar_t cbuf[sizeof (dde->dde_phys) + 1]; + uint64_t csize = za.za_num_integers; + ASSERT(za.za_integer_length == 1); + error = zap_lookup_uint64(os, object, (uint64_t *)za.za_name, + DDT_KEY_WORDS, 1, csize, cbuf); + ASSERT(error == 0); + if (error == 0) { + ddt_decompress(cbuf, dde->dde_phys, csize, + sizeof (dde->dde_phys)); + dde->dde_key = *(ddt_key_t *)za.za_name; + } + zap_cursor_advance(&zc); + *walk = zap_cursor_serialize(&zc); + } + zap_cursor_fini(&zc); + return (error); +} + +static uint64_t +ddt_zap_count(objset_t *os, uint64_t object) +{ + uint64_t count = 0; + + VERIFY(zap_count(os, object, &count) == 0); + + return (count); +} + +const ddt_ops_t ddt_zap_ops = { + "zap", + ddt_zap_create, + ddt_zap_destroy, + ddt_zap_lookup, + ddt_zap_update, + ddt_zap_remove, + ddt_zap_walk, + ddt_zap_count, +}; diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/dmu.c b/external/cddl/osnet/dist/uts/common/fs/zfs/dmu.c index b6205bd500a87..ad73451cb3a6c 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/dmu.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/dmu.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -85,6 +85,11 @@ const dmu_object_type_info_t dmu_ot[DMU_OT_NUMTYPES] = { { byteswap_uint64_array, TRUE, "FUID table size" }, { zap_byteswap, TRUE, "DSL dataset next clones"}, { zap_byteswap, TRUE, "scrub work queue" }, + { zap_byteswap, TRUE, "ZFS user/group used" }, + { zap_byteswap, TRUE, "ZFS user/group quota" }, + { zap_byteswap, TRUE, "snapshot refcount tags"}, + { zap_byteswap, TRUE, "DDT ZAP algorithm" }, + { zap_byteswap, TRUE, "DDT statistics" }, }; int @@ -96,7 +101,7 @@ dmu_buf_hold(objset_t *os, uint64_t object, uint64_t offset, dmu_buf_impl_t *db; int err; - err = dnode_hold(os->os, object, FTAG, &dn); + err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); blkid = dbuf_whichblock(dn, offset); @@ -147,7 +152,7 @@ dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp) dmu_buf_impl_t *db; int error; - error = dnode_hold(os->os, object, FTAG, &dn); + error = dnode_hold(os, object, FTAG, &dn); if (error) return (error); @@ -180,22 +185,22 @@ dmu_bonus_hold(objset_t *os, uint64_t object, void *tag, dmu_buf_t **dbp) * whose dnodes are in the same block. */ static int -dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, - uint64_t length, int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp) +dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length, + int read, void *tag, int *numbufsp, dmu_buf_t ***dbpp, uint32_t flags) { dsl_pool_t *dp = NULL; dmu_buf_t **dbp; uint64_t blkid, nblks, i; - uint32_t flags; + uint32_t dbuf_flags; int err; zio_t *zio; hrtime_t start; ASSERT(length <= DMU_MAX_ACCESS); - flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT; - if (length > zfetch_array_rd_sz) - flags |= DB_RF_NOPREFETCH; + dbuf_flags = DB_RF_CANFAIL | DB_RF_NEVERWAIT | DB_RF_HAVESTRUCT; + if (flags & DMU_READ_NO_PREFETCH || length > zfetch_array_rd_sz) + dbuf_flags |= DB_RF_NOPREFETCH; rw_enter(&dn->dn_struct_rwlock, RW_READER); if (dn->dn_datablkshift) { @@ -210,6 +215,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, os_dsl_dataset->ds_object, (longlong_t)dn->dn_object, dn->dn_datablksz, (longlong_t)offset, (longlong_t)length); + rw_exit(&dn->dn_struct_rwlock); return (EIO); } nblks = 1; @@ -232,9 +238,7 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, } /* initiate async i/o */ if (read) { - rw_exit(&dn->dn_struct_rwlock); - (void) dbuf_read(db, zio, flags); - rw_enter(&dn->dn_struct_rwlock, RW_READER); + (void) dbuf_read(db, zio, dbuf_flags); } dbp[i] = &db->db; } @@ -280,12 +284,12 @@ dmu_buf_hold_array(objset_t *os, uint64_t object, uint64_t offset, dnode_t *dn; int err; - err = dnode_hold(os->os, object, FTAG, &dn); + err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, - numbufsp, dbpp); + numbufsp, dbpp, DMU_READ_PREFETCH); dnode_rele(dn, FTAG); @@ -300,7 +304,7 @@ dmu_buf_hold_array_by_bonus(dmu_buf_t *db, uint64_t offset, int err; err = dmu_buf_hold_array_by_dnode(dn, offset, length, read, tag, - numbufsp, dbpp); + numbufsp, dbpp, DMU_READ_PREFETCH); return (err); } @@ -333,7 +337,7 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) return; if (len == 0) { /* they're interested in the bonus buffer */ - dn = os->os->os_meta_dnode; + dn = os->os_meta_dnode; if (object == 0 || object >= DN_MAX_OBJECT) return; @@ -350,7 +354,7 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) * already cached, we will do a *synchronous* read in the * dnode_hold() call. 
The same is true for any indirects. */ - err = dnode_hold(os->os, object, FTAG, &dn); + err = dnode_hold(os, object, FTAG, &dn); if (err != 0) return; @@ -374,56 +378,51 @@ dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len) dnode_rele(dn, FTAG); } +/* + * Get the next "chunk" of file data to free. We traverse the file from + * the end so that the file gets shorter over time (if we crashes in the + * middle, this will leave us in a better state). We find allocated file + * data by simply searching the allocated level 1 indirects. + */ static int -get_next_chunk(dnode_t *dn, uint64_t *offset, uint64_t limit) +get_next_chunk(dnode_t *dn, uint64_t *start, uint64_t limit) { - uint64_t len = *offset - limit; - uint64_t chunk_len = dn->dn_datablksz * DMU_MAX_DELETEBLKCNT; - uint64_t subchunk = + uint64_t len = *start - limit; + uint64_t blkcnt = 0; + uint64_t maxblks = DMU_MAX_ACCESS / (1ULL << (dn->dn_indblkshift + 1)); + uint64_t iblkrange = dn->dn_datablksz * EPB(dn->dn_indblkshift, SPA_BLKPTRSHIFT); - ASSERT(limit <= *offset); + ASSERT(limit <= *start); - if (len <= chunk_len) { - *offset = limit; + if (len <= iblkrange * maxblks) { + *start = limit; return (0); } + ASSERT(ISP2(iblkrange)); - ASSERT(ISP2(subchunk)); - - while (*offset > limit) { - uint64_t initial_offset = P2ROUNDUP(*offset, subchunk); - uint64_t delta; + while (*start > limit && blkcnt < maxblks) { int err; - /* skip over allocated data */ + /* find next allocated L1 indirect */ err = dnode_next_offset(dn, - DNODE_FIND_HOLE|DNODE_FIND_BACKWARDS, offset, 1, 1, 0); - if (err == ESRCH) - *offset = limit; - else if (err) - return (err); + DNODE_FIND_BACKWARDS, start, 2, 1, 0); - ASSERT3U(*offset, <=, initial_offset); - *offset = P2ALIGN(*offset, subchunk); - delta = initial_offset - *offset; - if (delta >= chunk_len) { - *offset += delta - chunk_len; + /* if there are no more, then we are done */ + if (err == ESRCH) { + *start = limit; return (0); - } - chunk_len -= delta; - - /* skip over unallocated data */ - err = dnode_next_offset(dn, - DNODE_FIND_BACKWARDS, offset, 1, 1, 0); - if (err == ESRCH) - *offset = limit; - else if (err) + } else if (err) { return (err); + } + blkcnt += 1; - if (*offset < limit) - *offset = limit; - ASSERT3U(*offset, <, initial_offset); + /* reset offset to end of "next" block back */ + *start = P2ALIGN(*start, iblkrange); + if (*start <= limit) + *start = limit; + else + *start -= 1; } return (0); } @@ -442,7 +441,8 @@ dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset, object_size = align == 1 ? 
dn->dn_datablksz : (dn->dn_maxblkid + 1) << dn->dn_datablkshift; - if (trunc || (end = offset + length) > object_size) + end = offset + length; + if (trunc || end > object_size) end = object_size; if (end <= offset) return (0); @@ -450,6 +450,7 @@ dmu_free_long_range_impl(objset_t *os, dnode_t *dn, uint64_t offset, while (length) { start = end; + /* assert(offset <= start) */ err = get_next_chunk(dn, &start, offset); if (err) return (err); @@ -485,7 +486,7 @@ dmu_free_long_range(objset_t *os, uint64_t object, dnode_t *dn; int err; - err = dnode_hold(os->os, object, FTAG, &dn); + err = dnode_hold(os, object, FTAG, &dn); if (err != 0) return (err); err = dmu_free_long_range_impl(os, dn, offset, length, FALSE); @@ -500,7 +501,7 @@ dmu_free_object(objset_t *os, uint64_t object) dmu_tx_t *tx; int err; - err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED, + err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, FTAG, &dn); if (err != 0) return (err); @@ -528,7 +529,7 @@ dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_tx_t *tx) { dnode_t *dn; - int err = dnode_hold(os->os, object, FTAG, &dn); + int err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); ASSERT(offset < UINT64_MAX); @@ -540,13 +541,13 @@ dmu_free_range(objset_t *os, uint64_t object, uint64_t offset, int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - void *buf) + void *buf, uint32_t flags) { dnode_t *dn; dmu_buf_t **dbp; - int numbufs, i, err; + int numbufs, err; - err = dnode_hold(os->os, object, FTAG, &dn); + err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); @@ -555,7 +556,7 @@ dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, * block. If we ever do the tail block optimization, we will need to * handle that here as well. */ - if (dn->dn_datablkshift == 0) { + if (dn->dn_maxblkid == 0) { int newsz = offset > dn->dn_datablksz ? 0 : MIN(size, dn->dn_datablksz - offset); bzero((char *)buf + newsz, size - newsz); @@ -564,13 +565,14 @@ dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, while (size > 0) { uint64_t mylen = MIN(size, DMU_MAX_ACCESS / 2); + int i; /* * NB: we could do this block-at-a-time, but it's nice * to be reading in parallel. 
*/ err = dmu_buf_hold_array_by_dnode(dn, offset, mylen, - TRUE, FTAG, &numbufs, &dbp); + TRUE, FTAG, &numbufs, &dbp, flags); if (err) break; @@ -659,12 +661,136 @@ dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, dmu_buf_rele_array(dbp, numbufs, FTAG); } +/* + * DMU support for xuio + */ +kstat_t *xuio_ksp = NULL; + +int +dmu_xuio_init(xuio_t *xuio, int nblk) +{ + dmu_xuio_t *priv; + uio_t *uio = &xuio->xu_uio; + + uio->uio_iovcnt = nblk; + uio->uio_iov = kmem_zalloc(nblk * sizeof (iovec_t), KM_SLEEP); + + priv = kmem_zalloc(sizeof (dmu_xuio_t), KM_SLEEP); + priv->cnt = nblk; + priv->bufs = kmem_zalloc(nblk * sizeof (arc_buf_t *), KM_SLEEP); + priv->iovp = uio->uio_iov; + XUIO_XUZC_PRIV(xuio) = priv; + + if (XUIO_XUZC_RW(xuio) == UIO_READ) + XUIOSTAT_INCR(xuiostat_onloan_rbuf, nblk); + else + XUIOSTAT_INCR(xuiostat_onloan_wbuf, nblk); + + return (0); +} + +void +dmu_xuio_fini(xuio_t *xuio) +{ + dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); + int nblk = priv->cnt; + + kmem_free(priv->iovp, nblk * sizeof (iovec_t)); + kmem_free(priv->bufs, nblk * sizeof (arc_buf_t *)); + kmem_free(priv, sizeof (dmu_xuio_t)); + + if (XUIO_XUZC_RW(xuio) == UIO_READ) + XUIOSTAT_INCR(xuiostat_onloan_rbuf, -nblk); + else + XUIOSTAT_INCR(xuiostat_onloan_wbuf, -nblk); +} + +/* + * Initialize iov[priv->next] and priv->bufs[priv->next] with { off, n, abuf } + * and increase priv->next by 1. + */ +int +dmu_xuio_add(xuio_t *xuio, arc_buf_t *abuf, offset_t off, size_t n) +{ + struct iovec *iov; + uio_t *uio = &xuio->xu_uio; + dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); + int i = priv->next++; + + ASSERT(i < priv->cnt); + ASSERT(off + n <= arc_buf_size(abuf)); + iov = uio->uio_iov + i; + iov->iov_base = (char *)abuf->b_data + off; + iov->iov_len = n; + priv->bufs[i] = abuf; + return (0); +} + +int +dmu_xuio_cnt(xuio_t *xuio) +{ + dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); + return (priv->cnt); +} + +arc_buf_t * +dmu_xuio_arcbuf(xuio_t *xuio, int i) +{ + dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); + + ASSERT(i < priv->cnt); + return (priv->bufs[i]); +} + +void +dmu_xuio_clear(xuio_t *xuio, int i) +{ + dmu_xuio_t *priv = XUIO_XUZC_PRIV(xuio); + + ASSERT(i < priv->cnt); + priv->bufs[i] = NULL; +} + +static void +xuio_stat_init(void) +{ + xuio_ksp = kstat_create("zfs", 0, "xuio_stats", "misc", + KSTAT_TYPE_NAMED, sizeof (xuio_stats) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + if (xuio_ksp != NULL) { + xuio_ksp->ks_data = &xuio_stats; + kstat_install(xuio_ksp); + } +} + +static void +xuio_stat_fini(void) +{ + if (xuio_ksp != NULL) { + kstat_delete(xuio_ksp); + xuio_ksp = NULL; + } +} + +void +xuio_stat_wbuf_copied() +{ + XUIOSTAT_BUMP(xuiostat_wbuf_copied); +} + +void +xuio_stat_wbuf_nocopy() +{ + XUIOSTAT_BUMP(xuiostat_wbuf_nocopy); +} + #ifdef _KERNEL int dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) { dmu_buf_t **dbp; int numbufs, i, err; + xuio_t *xuio = NULL; /* * NB: we could do this block-at-a-time, but it's nice @@ -675,6 +801,9 @@ dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) if (err) return (err); + if (uio->uio_extflg == UIO_XUIO) + xuio = (xuio_t *)uio; + for (i = 0; i < numbufs; i++) { int tocpy; int bufoff; @@ -685,8 +814,24 @@ dmu_read_uio(objset_t *os, uint64_t object, uio_t *uio, uint64_t size) bufoff = uio->uio_loffset - db->db_offset; tocpy = (int)MIN(db->db_size - bufoff, size); - err = uiomove((char *)db->db_data + bufoff, tocpy, - UIO_READ, uio); + if (xuio) { + dmu_buf_impl_t *dbi = (dmu_buf_impl_t *)db; + arc_buf_t *dbuf_abuf = 
dbi->db_buf; + arc_buf_t *abuf = dbuf_loan_arcbuf(dbi); + err = dmu_xuio_add(xuio, abuf, bufoff, tocpy); + if (!err) { + uio->uio_resid -= tocpy; + uio->uio_loffset += tocpy; + } + + if (abuf == dbuf_abuf) + XUIOSTAT_BUMP(xuiostat_rbuf_nocopy); + else + XUIOSTAT_BUMP(xuiostat_rbuf_copied); + } else { + err = uiomove((char *)db->db_data + bufoff, tocpy, + UIO_READ, uio); + } if (err) break; @@ -799,9 +944,6 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, if (tocpy == db->db_size) dmu_buf_fill_done(db, tx); - if (err) - break; - offset += tocpy; size -= tocpy; } @@ -810,48 +952,167 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, } #endif +/* + * Allocate a loaned anonymous arc buffer. + */ +arc_buf_t * +dmu_request_arcbuf(dmu_buf_t *handle, int size) +{ + dnode_t *dn = ((dmu_buf_impl_t *)handle)->db_dnode; + + return (arc_loan_buf(dn->dn_objset->os_spa, size)); +} + +/* + * Free a loaned arc buffer. + */ +void +dmu_return_arcbuf(arc_buf_t *buf) +{ + arc_return_buf(buf, FTAG); + VERIFY(arc_buf_remove_ref(buf, FTAG) == 1); +} + +/* + * When possible directly assign passed loaned arc buffer to a dbuf. + * If this is not possible copy the contents of passed arc buf via + * dmu_write(). + */ +void +dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, arc_buf_t *buf, + dmu_tx_t *tx) +{ + dnode_t *dn = ((dmu_buf_impl_t *)handle)->db_dnode; + dmu_buf_impl_t *db; + uint32_t blksz = (uint32_t)arc_buf_size(buf); + uint64_t blkid; + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + blkid = dbuf_whichblock(dn, offset); + VERIFY((db = dbuf_hold(dn, blkid, FTAG)) != NULL); + rw_exit(&dn->dn_struct_rwlock); + + if (offset == db->db.db_offset && blksz == db->db.db_size) { + dbuf_assign_arcbuf(db, buf, tx); + dbuf_rele(db, FTAG); + } else { + dbuf_rele(db, FTAG); + dmu_write(dn->dn_objset, dn->dn_object, offset, blksz, + buf->b_data, tx); + dmu_return_arcbuf(buf); + XUIOSTAT_BUMP(xuiostat_wbuf_copied); + } +} + typedef struct { - dbuf_dirty_record_t *dr; - dmu_sync_cb_t *done; - void *arg; + dbuf_dirty_record_t *dsa_dr; + dmu_sync_cb_t *dsa_done; + zgd_t *dsa_zgd; + dmu_tx_t *dsa_tx; } dmu_sync_arg_t; /* ARGSUSED */ static void dmu_sync_ready(zio_t *zio, arc_buf_t *buf, void *varg) { + dmu_sync_arg_t *dsa = varg; + dmu_buf_t *db = dsa->dsa_zgd->zgd_db; + dnode_t *dn = ((dmu_buf_impl_t *)db)->db_dnode; blkptr_t *bp = zio->io_bp; - if (!BP_IS_HOLE(bp)) { - dmu_sync_arg_t *in = varg; - dbuf_dirty_record_t *dr = in->dr; - dmu_buf_impl_t *db = dr->dr_dbuf; - ASSERT(BP_GET_TYPE(bp) == db->db_dnode->dn_type); - ASSERT(BP_GET_LEVEL(bp) == 0); - bp->blk_fill = 1; + if (zio->io_error == 0) { + if (BP_IS_HOLE(bp)) { + /* + * A block of zeros may compress to a hole, but the + * block size still needs to be known for replay. 
+ */ + BP_SET_LSIZE(bp, db->db_size); + } else { + ASSERT(BP_GET_TYPE(bp) == dn->dn_type); + ASSERT(BP_GET_LEVEL(bp) == 0); + bp->blk_fill = 1; + } } } +static void +dmu_sync_late_arrival_ready(zio_t *zio) +{ + dmu_sync_ready(zio, NULL, zio->io_private); +} + /* ARGSUSED */ static void dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) { - dmu_sync_arg_t *in = varg; - dbuf_dirty_record_t *dr = in->dr; + dmu_sync_arg_t *dsa = varg; + dbuf_dirty_record_t *dr = dsa->dsa_dr; dmu_buf_impl_t *db = dr->dr_dbuf; - dmu_sync_cb_t *done = in->done; mutex_enter(&db->db_mtx); ASSERT(dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC); - dr->dt.dl.dr_overridden_by = *zio->io_bp; /* structure assignment */ - dr->dt.dl.dr_override_state = DR_OVERRIDDEN; + if (zio->io_error == 0) { + dr->dt.dl.dr_overridden_by = *zio->io_bp; + dr->dt.dl.dr_override_state = DR_OVERRIDDEN; + dr->dt.dl.dr_copies = zio->io_prop.zp_copies; + if (BP_IS_HOLE(&dr->dt.dl.dr_overridden_by)) + BP_ZERO(&dr->dt.dl.dr_overridden_by); + } else { + dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; + } cv_broadcast(&db->db_changed); mutex_exit(&db->db_mtx); - if (done) - done(&(db->db), in->arg); + dsa->dsa_done(dsa->dsa_zgd, zio->io_error); + + kmem_free(dsa, sizeof (*dsa)); +} + +static void +dmu_sync_late_arrival_done(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + dmu_sync_arg_t *dsa = zio->io_private; + + if (zio->io_error == 0 && !BP_IS_HOLE(bp)) { + ASSERT(zio->io_bp->blk_birth == zio->io_txg); + ASSERT(zio->io_txg > spa_syncing_txg(zio->io_spa)); + zio_free(zio->io_spa, zio->io_txg, zio->io_bp); + } + + dmu_tx_commit(dsa->dsa_tx); + + dsa->dsa_done(dsa->dsa_zgd, zio->io_error); + + kmem_free(dsa, sizeof (*dsa)); +} + +static int +dmu_sync_late_arrival(zio_t *pio, objset_t *os, dmu_sync_cb_t *done, zgd_t *zgd, + zio_prop_t *zp, zbookmark_t *zb) +{ + dmu_sync_arg_t *dsa; + dmu_tx_t *tx; + + tx = dmu_tx_create(os); + dmu_tx_hold_space(tx, zgd->zgd_db->db_size); + if (dmu_tx_assign(tx, TXG_WAIT) != 0) { + dmu_tx_abort(tx); + return (EIO); /* Make zl_get_data do txg_waited_synced() */ + } - kmem_free(in, sizeof (dmu_sync_arg_t)); + dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); + dsa->dsa_dr = NULL; + dsa->dsa_done = done; + dsa->dsa_zgd = zgd; + dsa->dsa_tx = tx; + + zio_nowait(zio_write(pio, os->os_spa, dmu_tx_get_txg(tx), zgd->zgd_bp, + zgd->zgd_db->db_data, zgd->zgd_db->db_size, zp, + dmu_sync_late_arrival_ready, dmu_sync_late_arrival_done, dsa, + ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, zb)); + + return (0); } /* @@ -870,156 +1131,108 @@ dmu_sync_done(zio_t *zio, arc_buf_t *buf, void *varg) * EALREADY: this block is already in the process of being synced. * The caller should track its progress (somehow). * - * EINPROGRESS: the IO has been initiated. - * The caller should log this blkptr in the callback. + * EIO: could not do the I/O. + * The caller should do a txg_wait_synced(). * - * 0: completed. Sets *bp to the blkptr just written. - * The caller should log this blkptr immediately. + * 0: the I/O has been initiated. + * The caller should log this blkptr in the done callback. + * It is possible that the I/O will fail, in which case + * the error will be reported to the done callback and + * propagated to pio from zio_done(). 
*/ int -dmu_sync(zio_t *pio, dmu_buf_t *db_fake, - blkptr_t *bp, uint64_t txg, dmu_sync_cb_t *done, void *arg) +dmu_sync(zio_t *pio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd) { - dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; - objset_impl_t *os = db->db_objset; - dsl_pool_t *dp = os->os_dsl_dataset->ds_dir->dd_pool; - tx_state_t *tx = &dp->dp_tx; + blkptr_t *bp = zgd->zgd_bp; + dmu_buf_impl_t *db = (dmu_buf_impl_t *)zgd->zgd_db; + objset_t *os = db->db_objset; + dsl_dataset_t *ds = os->os_dsl_dataset; dbuf_dirty_record_t *dr; - dmu_sync_arg_t *in; + dmu_sync_arg_t *dsa; zbookmark_t zb; - writeprops_t wp = { 0 }; - zio_t *zio; - int err; + zio_prop_t zp; + ASSERT(pio != NULL); ASSERT(BP_IS_HOLE(bp)); ASSERT(txg != 0); - dprintf("dmu_sync txg=%llu, s,o,q %llu %llu %llu\n", - txg, tx->tx_synced_txg, tx->tx_open_txg, tx->tx_quiesced_txg); + SET_BOOKMARK(&zb, ds->ds_object, + db->db.db_object, db->db_level, db->db_blkid); + + dmu_write_policy(os, db->db_dnode, db->db_level, WP_DMU_SYNC, &zp); /* - * XXX - would be nice if we could do this without suspending... + * If we're frozen (running ziltest), we always need to generate a bp. */ - txg_suspend(dp); + if (txg > spa_freeze_txg(os->os_spa)) + return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb)); /* - * If this txg already synced, there's nothing to do. + * Grabbing db_mtx now provides a barrier between dbuf_sync_leaf() + * and us. If we determine that this txg is not yet syncing, + * but it begins to sync a moment later, that's OK because the + * sync thread will block in dbuf_sync_leaf() until we drop db_mtx. */ - if (txg <= tx->tx_synced_txg) { - txg_resume(dp); + mutex_enter(&db->db_mtx); + + if (txg <= spa_last_synced_txg(os->os_spa)) { /* - * If we're running ziltest, we need the blkptr regardless. + * This txg has already synced. There's nothing to do. */ - if (txg > spa_freeze_txg(dp->dp_spa)) { - /* if db_blkptr == NULL, this was an empty write */ - if (db->db_blkptr) - *bp = *db->db_blkptr; /* structure assignment */ - return (0); - } + mutex_exit(&db->db_mtx); return (EEXIST); } - mutex_enter(&db->db_mtx); - - if (txg == tx->tx_syncing_txg) { - while (db->db_data_pending) { - /* - * IO is in-progress. Wait for it to finish. - * XXX - would be nice to be able to somehow "attach" - * this zio to the parent zio passed in. - */ - cv_wait(&db->db_changed, &db->db_mtx); - if (!db->db_data_pending && - db->db_blkptr && BP_IS_HOLE(db->db_blkptr)) { - /* - * IO was compressed away - */ - *bp = *db->db_blkptr; /* structure assignment */ - mutex_exit(&db->db_mtx); - txg_resume(dp); - return (0); - } - ASSERT(db->db_data_pending || - (db->db_blkptr && db->db_blkptr->blk_birth == txg)); - } - - if (db->db_blkptr && db->db_blkptr->blk_birth == txg) { - /* - * IO is already completed. - */ - *bp = *db->db_blkptr; /* structure assignment */ - mutex_exit(&db->db_mtx); - txg_resume(dp); - return (0); - } + if (txg <= spa_syncing_txg(os->os_spa)) { + /* + * This txg is currently syncing, so we can't mess with + * the dirty record anymore; just write a new log block. + */ + mutex_exit(&db->db_mtx); + return (dmu_sync_late_arrival(pio, os, done, zgd, &zp, &zb)); } dr = db->db_last_dirty; - while (dr && dr->dr_txg > txg) + while (dr && dr->dr_txg != txg) dr = dr->dr_next; - if (dr == NULL || dr->dr_txg < txg) { + + if (dr == NULL) { /* - * This dbuf isn't dirty, must have been free_range'd. + * There's no dr for this dbuf, so it must have been freed. * There's no need to log writes to freed blocks, so we're done. 
*/ mutex_exit(&db->db_mtx); - txg_resume(dp); return (ENOENT); } ASSERT(dr->dr_txg == txg); - if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC) { + if (dr->dt.dl.dr_override_state == DR_IN_DMU_SYNC || + dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { /* - * We have already issued a sync write for this buffer. - */ - mutex_exit(&db->db_mtx); - txg_resume(dp); - return (EALREADY); - } else if (dr->dt.dl.dr_override_state == DR_OVERRIDDEN) { - /* - * This buffer has already been synced. It could not + * We have already issued a sync write for this buffer, + * or this buffer has already been synced. It could not * have been dirtied since, or we would have cleared the state. */ - *bp = dr->dt.dl.dr_overridden_by; /* structure assignment */ mutex_exit(&db->db_mtx); - txg_resume(dp); - return (0); + return (EALREADY); } + ASSERT(dr->dt.dl.dr_override_state == DR_NOT_OVERRIDDEN); dr->dt.dl.dr_override_state = DR_IN_DMU_SYNC; - in = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); - in->dr = dr; - in->done = done; - in->arg = arg; mutex_exit(&db->db_mtx); - txg_resume(dp); - - zb.zb_objset = os->os_dsl_dataset->ds_object; - zb.zb_object = db->db.db_object; - zb.zb_level = db->db_level; - zb.zb_blkid = db->db_blkid; - wp.wp_type = db->db_dnode->dn_type; - wp.wp_level = db->db_level; - wp.wp_copies = os->os_copies; - wp.wp_dnchecksum = db->db_dnode->dn_checksum; - wp.wp_oschecksum = os->os_checksum; - wp.wp_dncompress = db->db_dnode->dn_compress; - wp.wp_oscompress = os->os_compress; + dsa = kmem_alloc(sizeof (dmu_sync_arg_t), KM_SLEEP); + dsa->dsa_dr = dr; + dsa->dsa_done = done; + dsa->dsa_zgd = zgd; + dsa->dsa_tx = NULL; - ASSERT(BP_IS_HOLE(bp)); + zio_nowait(arc_write(pio, os->os_spa, txg, + bp, dr->dt.dl.dr_data, DBUF_IS_L2CACHEABLE(db), &zp, + dmu_sync_ready, dmu_sync_done, dsa, + ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_CANFAIL, &zb)); - zio = arc_write(pio, os->os_spa, &wp, DBUF_IS_L2CACHEABLE(db), - txg, bp, dr->dt.dl.dr_data, dmu_sync_ready, dmu_sync_done, in, - ZIO_PRIORITY_SYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); - if (pio) { - zio_nowait(zio); - err = EINPROGRESS; - } else { - err = zio_wait(zio); - ASSERT(err == 0); - } - return (err); + return (0); } int @@ -1029,7 +1242,7 @@ dmu_object_set_blocksize(objset_t *os, uint64_t object, uint64_t size, int ibs, dnode_t *dn; int err; - err = dnode_hold(os->os, object, FTAG, &dn); + err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); err = dnode_set_blksz(dn, size, ibs, tx); @@ -1044,7 +1257,7 @@ dmu_object_set_checksum(objset_t *os, uint64_t object, uint8_t checksum, dnode_t *dn; /* XXX assumes dnode_hold will not get an i/o error */ - (void) dnode_hold(os->os, object, FTAG, &dn); + (void) dnode_hold(os, object, FTAG, &dn); ASSERT(checksum < ZIO_CHECKSUM_FUNCTIONS); dn->dn_checksum = checksum; dnode_setdirty(dn, tx); @@ -1058,20 +1271,98 @@ dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, dnode_t *dn; /* XXX assumes dnode_hold will not get an i/o error */ - (void) dnode_hold(os->os, object, FTAG, &dn); + (void) dnode_hold(os, object, FTAG, &dn); ASSERT(compress < ZIO_COMPRESS_FUNCTIONS); dn->dn_compress = compress; dnode_setdirty(dn, tx); dnode_rele(dn, FTAG); } +int zfs_mdcomp_disable = 0; + +void +dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp) +{ + dmu_object_type_t type = dn ? 
dn->dn_type : DMU_OT_OBJSET; + boolean_t ismd = (level > 0 || dmu_ot[type].ot_metadata); + enum zio_checksum checksum = os->os_checksum; + enum zio_compress compress = os->os_compress; + enum zio_checksum dedup_checksum = os->os_dedup_checksum; + boolean_t dedup; + boolean_t dedup_verify = os->os_dedup_verify; + int copies = os->os_copies; + + /* + * Determine checksum setting. + */ + if (ismd) { + /* + * Metadata always gets checksummed. If the data + * checksum is multi-bit correctable, and it's not a + * ZBT-style checksum, then it's suitable for metadata + * as well. Otherwise, the metadata checksum defaults + * to fletcher4. + */ + if (zio_checksum_table[checksum].ci_correctable < 1 || + zio_checksum_table[checksum].ci_eck) + checksum = ZIO_CHECKSUM_FLETCHER_4; + } else { + checksum = zio_checksum_select(dn->dn_checksum, checksum); + } + + /* + * Determine compression setting. + */ + if (ismd) { + /* + * XXX -- we should design a compression algorithm + * that specializes in arrays of bps. + */ + compress = zfs_mdcomp_disable ? ZIO_COMPRESS_EMPTY : + ZIO_COMPRESS_LZJB; + } else { + compress = zio_compress_select(dn->dn_compress, compress); + } + + /* + * Determine dedup setting. If we are in dmu_sync(), we won't + * actually dedup now because that's all done in syncing context; + * but we do want to use the dedup checksum. If the checksum is not + * strong enough to ensure unique signatures, force dedup_verify. + */ + dedup = (!ismd && dedup_checksum != ZIO_CHECKSUM_OFF); + if (dedup) { + checksum = dedup_checksum; + if (!zio_checksum_table[checksum].ci_dedup) + dedup_verify = 1; + } + + if (wp & WP_DMU_SYNC) + dedup = 0; + + if (wp & WP_NOFILL) { + ASSERT(!ismd && level == 0); + checksum = ZIO_CHECKSUM_OFF; + compress = ZIO_COMPRESS_OFF; + dedup = B_FALSE; + } + + zp->zp_checksum = checksum; + zp->zp_compress = compress; + zp->zp_type = type; + zp->zp_level = level; + zp->zp_copies = MIN(copies + ismd, spa_max_replication(os->os_spa)); + zp->zp_dedup = dedup; + zp->zp_dedup_verify = dedup && dedup_verify; +} + int dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) { dnode_t *dn; int i, err; - err = dnode_hold(os->os, object, FTAG, &dn); + err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); /* @@ -1085,7 +1376,7 @@ dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) if (i != TXG_SIZE) { dnode_rele(dn, FTAG); txg_wait_synced(dmu_objset_pool(os), 0); - err = dnode_hold(os->os, object, FTAG, &dn); + err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); } @@ -1099,21 +1390,27 @@ dmu_offset_next(objset_t *os, uint64_t object, boolean_t hole, uint64_t *off) void dmu_object_info_from_dnode(dnode_t *dn, dmu_object_info_t *doi) { + dnode_phys_t *dnp; + rw_enter(&dn->dn_struct_rwlock, RW_READER); mutex_enter(&dn->dn_mtx); + dnp = dn->dn_phys; + doi->doi_data_block_size = dn->dn_datablksz; doi->doi_metadata_block_size = dn->dn_indblkshift ? 
1ULL << dn->dn_indblkshift : 0; + doi->doi_type = dn->dn_type; + doi->doi_bonus_type = dn->dn_bonustype; + doi->doi_bonus_size = dn->dn_bonuslen; doi->doi_indirection = dn->dn_nlevels; doi->doi_checksum = dn->dn_checksum; doi->doi_compress = dn->dn_compress; - doi->doi_physical_blks = (DN_USED_BYTES(dn->dn_phys) + - SPA_MINBLOCKSIZE/2) >> SPA_MINBLOCKSHIFT; - doi->doi_max_block_offset = dn->dn_phys->dn_maxblkid; - doi->doi_type = dn->dn_type; - doi->doi_bonus_size = dn->dn_bonuslen; - doi->doi_bonus_type = dn->dn_bonustype; + doi->doi_physical_blocks_512 = (DN_USED_BYTES(dnp) + 256) >> 9; + doi->doi_max_offset = (dnp->dn_maxblkid + 1) * dn->dn_datablksz; + doi->doi_fill_count = 0; + for (int i = 0; i < dnp->dn_nblkptr; i++) + doi->doi_fill_count += dnp->dn_blkptr[i].blk_fill; mutex_exit(&dn->dn_mtx); rw_exit(&dn->dn_struct_rwlock); @@ -1127,7 +1424,7 @@ int dmu_object_info(objset_t *os, uint64_t object, dmu_object_info_t *doi) { dnode_t *dn; - int err = dnode_hold(os->os, object, FTAG, &dn); + int err = dnode_hold(os, object, FTAG, &dn); if (err) return (err); @@ -1213,15 +1510,19 @@ dmu_init(void) { dbuf_init(); dnode_init(); + zfetch_init(); arc_init(); l2arc_init(); + xuio_stat_init(); } void dmu_fini(void) { arc_fini(); + zfetch_fini(); dnode_fini(); dbuf_fini(); l2arc_fini(); + xuio_stat_fini(); } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_object.c b/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_object.c index 1b9247d66e65e..06c0ee490b016 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_object.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_object.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -34,16 +32,15 @@ uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) { - objset_impl_t *osi = os->os; uint64_t object; uint64_t L2_dnode_count = DNODES_PER_BLOCK << - (osi->os_meta_dnode->dn_indblkshift - SPA_BLKPTRSHIFT); + (os->os_meta_dnode->dn_indblkshift - SPA_BLKPTRSHIFT); dnode_t *dn = NULL; int restarted = B_FALSE; - mutex_enter(&osi->os_obj_lock); + mutex_enter(&os->os_obj_lock); for (;;) { - object = osi->os_obj_next; + object = os->os_obj_next; /* * Each time we polish off an L2 bp worth of dnodes * (2^13 objects), move to another L2 bp that's still @@ -53,14 +50,14 @@ dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, */ if (P2PHASE(object, L2_dnode_count) == 0) { uint64_t offset = restarted ? object << DNODE_SHIFT : 0; - int error = dnode_next_offset(osi->os_meta_dnode, + int error = dnode_next_offset(os->os_meta_dnode, DNODE_FIND_HOLE, &offset, 2, DNODES_PER_BLOCK >> 2, 0); restarted = B_TRUE; if (error == 0) object = offset >> DNODE_SHIFT; } - osi->os_obj_next = ++object; + os->os_obj_next = ++object; /* * XXX We should check for an i/o error here and return @@ -68,19 +65,19 @@ dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int blocksize, * dmu_tx_assign(), but there is currently no mechanism * to do so. 
*/ - (void) dnode_hold_impl(os->os, object, DNODE_MUST_BE_FREE, + (void) dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, FTAG, &dn); if (dn) break; if (dmu_object_next(os, &object, B_TRUE, 0) == 0) - osi->os_obj_next = object - 1; + os->os_obj_next = object - 1; } dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx); dnode_rele(dn, FTAG); - mutex_exit(&osi->os_obj_lock); + mutex_exit(&os->os_obj_lock); dmu_tx_add_new_object(tx, os, object); return (object); @@ -96,7 +93,7 @@ dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx)) return (EBADF); - err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_FREE, FTAG, &dn); + err = dnode_hold_impl(os, object, DNODE_MUST_BE_FREE, FTAG, &dn); if (err) return (err); dnode_allocate(dn, ot, blocksize, 0, bonustype, bonuslen, tx); @@ -108,22 +105,56 @@ dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot, - int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) + int blocksize, dmu_object_type_t bonustype, int bonuslen) { dnode_t *dn; + dmu_tx_t *tx; + int nblkptr; int err; - if (object == DMU_META_DNODE_OBJECT && !dmu_tx_private_ok(tx)) + if (object == DMU_META_DNODE_OBJECT) return (EBADF); - err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED, + err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, FTAG, &dn); if (err) return (err); + + if (dn->dn_type == ot && dn->dn_datablksz == blocksize && + dn->dn_bonustype == bonustype && dn->dn_bonuslen == bonuslen) { + /* nothing is changing, this is a noop */ + dnode_rele(dn, FTAG); + return (0); + } + + nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT); + + /* + * If we are losing blkptrs or changing the block size this must + * be a new file instance. We must clear out the previous file + * contents before we can change this type of metadata in the dnode. + */ + if (dn->dn_nblkptr > nblkptr || dn->dn_datablksz != blocksize) { + err = dmu_free_long_range(os, object, 0, DMU_OBJECT_END); + if (err) + goto out; + } + + tx = dmu_tx_create(os); + dmu_tx_hold_bonus(tx, object); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) { + dmu_tx_abort(tx); + goto out; + } + dnode_reallocate(dn, ot, blocksize, bonustype, bonuslen, tx); + + dmu_tx_commit(tx); +out: dnode_rele(dn, FTAG); - return (0); + return (err); } int @@ -134,7 +165,7 @@ dmu_object_free(objset_t *os, uint64_t object, dmu_tx_t *tx) ASSERT(object != DMU_META_DNODE_OBJECT || dmu_tx_private_ok(tx)); - err = dnode_hold_impl(os->os, object, DNODE_MUST_BE_ALLOCATED, + err = dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, FTAG, &dn); if (err) return (err); @@ -153,7 +184,7 @@ dmu_object_next(objset_t *os, uint64_t *objectp, boolean_t hole, uint64_t txg) uint64_t offset = (*objectp + 1) << DNODE_SHIFT; int error; - error = dnode_next_offset(os->os->os_meta_dnode, + error = dnode_next_offset(os->os_meta_dnode, (hole ? DNODE_FIND_HOLE : 0), &offset, 0, DNODES_PER_BLOCK, txg); *objectp = offset >> DNODE_SHIFT; diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_objset.c b/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_objset.c index f37cc2fc56cb2..ac29deb6c1af2 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_objset.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_objset.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. 
All rights reserved. * Use is subject to license terms. */ @@ -36,22 +36,22 @@ #include #include #include -#include #include #include #include #include +#include spa_t * dmu_objset_spa(objset_t *os) { - return (os->os->os_spa); + return (os->os_spa); } zilog_t * dmu_objset_zil(objset_t *os) { - return (os->os->os_zil); + return (os->os_zil); } dsl_pool_t * @@ -59,82 +59,106 @@ dmu_objset_pool(objset_t *os) { dsl_dataset_t *ds; - if ((ds = os->os->os_dsl_dataset) != NULL && ds->ds_dir) + if ((ds = os->os_dsl_dataset) != NULL && ds->ds_dir) return (ds->ds_dir->dd_pool); else - return (spa_get_dsl(os->os->os_spa)); + return (spa_get_dsl(os->os_spa)); } dsl_dataset_t * dmu_objset_ds(objset_t *os) { - return (os->os->os_dsl_dataset); + return (os->os_dsl_dataset); } dmu_objset_type_t dmu_objset_type(objset_t *os) { - return (os->os->os_phys->os_type); + return (os->os_phys->os_type); } void dmu_objset_name(objset_t *os, char *buf) { - dsl_dataset_name(os->os->os_dsl_dataset, buf); + dsl_dataset_name(os->os_dsl_dataset, buf); } uint64_t dmu_objset_id(objset_t *os) { - dsl_dataset_t *ds = os->os->os_dsl_dataset; + dsl_dataset_t *ds = os->os_dsl_dataset; return (ds ? ds->ds_object : 0); } +uint64_t +dmu_objset_logbias(objset_t *os) +{ + return (os->os_logbias); +} + static void checksum_changed_cb(void *arg, uint64_t newval) { - objset_impl_t *osi = arg; + objset_t *os = arg; /* * Inheritance should have been done by now. */ ASSERT(newval != ZIO_CHECKSUM_INHERIT); - osi->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE); + os->os_checksum = zio_checksum_select(newval, ZIO_CHECKSUM_ON_VALUE); } static void compression_changed_cb(void *arg, uint64_t newval) { - objset_impl_t *osi = arg; + objset_t *os = arg; /* * Inheritance and range checking should have been done by now. */ ASSERT(newval != ZIO_COMPRESS_INHERIT); - osi->os_compress = zio_compress_select(newval, ZIO_COMPRESS_ON_VALUE); + os->os_compress = zio_compress_select(newval, ZIO_COMPRESS_ON_VALUE); } static void copies_changed_cb(void *arg, uint64_t newval) { - objset_impl_t *osi = arg; + objset_t *os = arg; /* * Inheritance and range checking should have been done by now. */ ASSERT(newval > 0); - ASSERT(newval <= spa_max_replication(osi->os_spa)); + ASSERT(newval <= spa_max_replication(os->os_spa)); + + os->os_copies = newval; +} + +static void +dedup_changed_cb(void *arg, uint64_t newval) +{ + objset_t *os = arg; + spa_t *spa = os->os_spa; + enum zio_checksum checksum; + + /* + * Inheritance should have been done by now. + */ + ASSERT(newval != ZIO_CHECKSUM_INHERIT); + + checksum = zio_checksum_dedup_select(spa, newval, ZIO_CHECKSUM_OFF); - osi->os_copies = newval; + os->os_dedup_checksum = checksum & ZIO_CHECKSUM_MASK; + os->os_dedup_verify = !!(checksum & ZIO_CHECKSUM_VERIFY); } static void primary_cache_changed_cb(void *arg, uint64_t newval) { - objset_impl_t *osi = arg; + objset_t *os = arg; /* * Inheritance and range checking should have been done by now. @@ -142,13 +166,13 @@ primary_cache_changed_cb(void *arg, uint64_t newval) ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE || newval == ZFS_CACHE_METADATA); - osi->os_primary_cache = newval; + os->os_primary_cache = newval; } static void secondary_cache_changed_cb(void *arg, uint64_t newval) { - objset_impl_t *osi = arg; + objset_t *os = arg; /* * Inheritance and range checking should have been done by now. 
@@ -156,7 +180,19 @@ secondary_cache_changed_cb(void *arg, uint64_t newval) ASSERT(newval == ZFS_CACHE_ALL || newval == ZFS_CACHE_NONE || newval == ZFS_CACHE_METADATA); - osi->os_secondary_cache = newval; + os->os_secondary_cache = newval; +} + +static void +logbias_changed_cb(void *arg, uint64_t newval) +{ + objset_t *os = arg; + + ASSERT(newval == ZFS_LOGBIAS_LATENCY || + newval == ZFS_LOGBIAS_THROUGHPUT); + os->os_logbias = newval; + if (os->os_zil) + zil_set_logbias(os->os_zil, newval); } void @@ -164,58 +200,79 @@ dmu_objset_byteswap(void *buf, size_t size) { objset_phys_t *osp = buf; - ASSERT(size == sizeof (objset_phys_t)); + ASSERT(size == OBJSET_OLD_PHYS_SIZE || size == sizeof (objset_phys_t)); dnode_byteswap(&osp->os_meta_dnode); byteswap_uint64_array(&osp->os_zil_header, sizeof (zil_header_t)); osp->os_type = BSWAP_64(osp->os_type); + osp->os_flags = BSWAP_64(osp->os_flags); + if (size == sizeof (objset_phys_t)) { + dnode_byteswap(&osp->os_userused_dnode); + dnode_byteswap(&osp->os_groupused_dnode); + } } int dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, - objset_impl_t **osip) + objset_t **osp) { - objset_impl_t *osi; + objset_t *os; int i, err; ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock)); - osi = kmem_zalloc(sizeof (objset_impl_t), KM_SLEEP); - osi->os.os = osi; - osi->os_dsl_dataset = ds; - osi->os_spa = spa; - osi->os_rootbp = bp; - if (!BP_IS_HOLE(osi->os_rootbp)) { + os = kmem_zalloc(sizeof (objset_t), KM_SLEEP); + os->os_dsl_dataset = ds; + os->os_spa = spa; + os->os_rootbp = bp; + if (!BP_IS_HOLE(os->os_rootbp)) { uint32_t aflags = ARC_WAIT; zbookmark_t zb; - zb.zb_objset = ds ? ds->ds_object : 0; - zb.zb_object = 0; - zb.zb_level = -1; - zb.zb_blkid = 0; - if (DMU_OS_IS_L2CACHEABLE(osi)) + SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, + ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); + + if (DMU_OS_IS_L2CACHEABLE(os)) aflags |= ARC_L2CACHE; - dprintf_bp(osi->os_rootbp, "reading %s", ""); + dprintf_bp(os->os_rootbp, "reading %s", ""); /* * NB: when bprewrite scrub can change the bp, * and this is called from dmu_objset_open_ds_os, the bp * could change, and we'll need a lock. */ - err = arc_read_nolock(NULL, spa, osi->os_rootbp, - arc_getbuf_func, &osi->os_phys_buf, + err = arc_read_nolock(NULL, spa, os->os_rootbp, + arc_getbuf_func, &os->os_phys_buf, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL, &aflags, &zb); if (err) { - kmem_free(osi, sizeof (objset_impl_t)); + kmem_free(os, sizeof (objset_t)); /* convert checksum errors into IO errors */ if (err == ECKSUM) err = EIO; return (err); } - osi->os_phys = osi->os_phys_buf->b_data; + + /* Increase the blocksize if we are permitted. */ + if (spa_version(spa) >= SPA_VERSION_USERSPACE && + arc_buf_size(os->os_phys_buf) < sizeof (objset_phys_t)) { + arc_buf_t *buf = arc_buf_alloc(spa, + sizeof (objset_phys_t), &os->os_phys_buf, + ARC_BUFC_METADATA); + bzero(buf->b_data, sizeof (objset_phys_t)); + bcopy(os->os_phys_buf->b_data, buf->b_data, + arc_buf_size(os->os_phys_buf)); + (void) arc_buf_remove_ref(os->os_phys_buf, + &os->os_phys_buf); + os->os_phys_buf = buf; + } + + os->os_phys = os->os_phys_buf->b_data; + os->os_flags = os->os_phys->os_flags; } else { - osi->os_phys_buf = arc_buf_alloc(spa, sizeof (objset_phys_t), - &osi->os_phys_buf, ARC_BUFC_METADATA); - osi->os_phys = osi->os_phys_buf->b_data; - bzero(osi->os_phys, sizeof (objset_phys_t)); + int size = spa_version(spa) >= SPA_VERSION_USERSPACE ? 
+ sizeof (objset_phys_t) : OBJSET_OLD_PHYS_SIZE; + os->os_phys_buf = arc_buf_alloc(spa, size, + &os->os_phys_buf, ARC_BUFC_METADATA); + os->os_phys = os->os_phys_buf->b_data; + bzero(os->os_phys, size); } /* @@ -226,173 +283,167 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, */ if (ds) { err = dsl_prop_register(ds, "primarycache", - primary_cache_changed_cb, osi); + primary_cache_changed_cb, os); if (err == 0) err = dsl_prop_register(ds, "secondarycache", - secondary_cache_changed_cb, osi); + secondary_cache_changed_cb, os); if (!dsl_dataset_is_snapshot(ds)) { if (err == 0) err = dsl_prop_register(ds, "checksum", - checksum_changed_cb, osi); + checksum_changed_cb, os); if (err == 0) err = dsl_prop_register(ds, "compression", - compression_changed_cb, osi); + compression_changed_cb, os); if (err == 0) err = dsl_prop_register(ds, "copies", - copies_changed_cb, osi); + copies_changed_cb, os); + if (err == 0) + err = dsl_prop_register(ds, "dedup", + dedup_changed_cb, os); + if (err == 0) + err = dsl_prop_register(ds, "logbias", + logbias_changed_cb, os); } if (err) { - VERIFY(arc_buf_remove_ref(osi->os_phys_buf, - &osi->os_phys_buf) == 1); - kmem_free(osi, sizeof (objset_impl_t)); + VERIFY(arc_buf_remove_ref(os->os_phys_buf, + &os->os_phys_buf) == 1); + kmem_free(os, sizeof (objset_t)); return (err); } } else if (ds == NULL) { /* It's the meta-objset. */ - osi->os_checksum = ZIO_CHECKSUM_FLETCHER_4; - osi->os_compress = ZIO_COMPRESS_LZJB; - osi->os_copies = spa_max_replication(spa); - osi->os_primary_cache = ZFS_CACHE_ALL; - osi->os_secondary_cache = ZFS_CACHE_ALL; + os->os_checksum = ZIO_CHECKSUM_FLETCHER_4; + os->os_compress = ZIO_COMPRESS_LZJB; + os->os_copies = spa_max_replication(spa); + os->os_dedup_checksum = ZIO_CHECKSUM_OFF; + os->os_dedup_verify = 0; + os->os_logbias = 0; + os->os_primary_cache = ZFS_CACHE_ALL; + os->os_secondary_cache = ZFS_CACHE_ALL; } - osi->os_zil_header = osi->os_phys->os_zil_header; - osi->os_zil = zil_alloc(&osi->os, &osi->os_zil_header); + os->os_zil_header = os->os_phys->os_zil_header; + os->os_zil = zil_alloc(os, &os->os_zil_header); for (i = 0; i < TXG_SIZE; i++) { - list_create(&osi->os_dirty_dnodes[i], sizeof (dnode_t), + list_create(&os->os_dirty_dnodes[i], sizeof (dnode_t), offsetof(dnode_t, dn_dirty_link[i])); - list_create(&osi->os_free_dnodes[i], sizeof (dnode_t), + list_create(&os->os_free_dnodes[i], sizeof (dnode_t), offsetof(dnode_t, dn_dirty_link[i])); } - list_create(&osi->os_dnodes, sizeof (dnode_t), + list_create(&os->os_dnodes, sizeof (dnode_t), offsetof(dnode_t, dn_link)); - list_create(&osi->os_downgraded_dbufs, sizeof (dmu_buf_impl_t), + list_create(&os->os_downgraded_dbufs, sizeof (dmu_buf_impl_t), offsetof(dmu_buf_impl_t, db_link)); - mutex_init(&osi->os_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&osi->os_obj_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&osi->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL); - - osi->os_meta_dnode = dnode_special_open(osi, - &osi->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT); + mutex_init(&os->os_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&os->os_obj_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&os->os_user_ptr_lock, NULL, MUTEX_DEFAULT, NULL); + + os->os_meta_dnode = dnode_special_open(os, + &os->os_phys->os_meta_dnode, DMU_META_DNODE_OBJECT); + if (arc_buf_size(os->os_phys_buf) >= sizeof (objset_phys_t)) { + os->os_userused_dnode = dnode_special_open(os, + &os->os_phys->os_userused_dnode, DMU_USERUSED_OBJECT); + os->os_groupused_dnode = dnode_special_open(os, + 
&os->os_phys->os_groupused_dnode, DMU_GROUPUSED_OBJECT); + } /* * We should be the only thread trying to do this because we * have ds_opening_lock */ if (ds) { - VERIFY(NULL == dsl_dataset_set_user_ptr(ds, osi, - dmu_objset_evict)); + mutex_enter(&ds->ds_lock); + ASSERT(ds->ds_objset == NULL); + ds->ds_objset = os; + mutex_exit(&ds->ds_lock); } - *osip = osi; + *osp = os; return (0); } -static int -dmu_objset_open_ds_os(dsl_dataset_t *ds, objset_t *os, dmu_objset_type_t type) +int +dmu_objset_from_ds(dsl_dataset_t *ds, objset_t **osp) { - objset_impl_t *osi; + int err = 0; mutex_enter(&ds->ds_opening_lock); - osi = dsl_dataset_get_user_ptr(ds); - if (osi == NULL) { - int err; - + *osp = ds->ds_objset; + if (*osp == NULL) { err = dmu_objset_open_impl(dsl_dataset_get_spa(ds), - ds, &ds->ds_phys->ds_bp, &osi); - if (err) { - mutex_exit(&ds->ds_opening_lock); - return (err); - } + ds, &ds->ds_phys->ds_bp, osp); } mutex_exit(&ds->ds_opening_lock); - - os->os = osi; - os->os_mode = DS_MODE_NOHOLD; - - if (type != DMU_OST_ANY && type != os->os->os_phys->os_type) - return (EINVAL); - return (0); + return (err); } +/* called from zpl */ int -dmu_objset_open_ds(dsl_dataset_t *ds, dmu_objset_type_t type, objset_t **osp) +dmu_objset_hold(const char *name, void *tag, objset_t **osp) { - objset_t *os; + dsl_dataset_t *ds; int err; - os = kmem_alloc(sizeof (objset_t), KM_SLEEP); - err = dmu_objset_open_ds_os(ds, os, type); + err = dsl_dataset_hold(name, tag, &ds); if (err) - kmem_free(os, sizeof (objset_t)); - else - *osp = os; + return (err); + + err = dmu_objset_from_ds(ds, osp); + if (err) + dsl_dataset_rele(ds, tag); + return (err); } /* called from zpl */ int -dmu_objset_open(const char *name, dmu_objset_type_t type, int mode, - objset_t **osp) +dmu_objset_own(const char *name, dmu_objset_type_t type, + boolean_t readonly, void *tag, objset_t **osp) { - objset_t *os; dsl_dataset_t *ds; int err; - ASSERT(DS_MODE_TYPE(mode) == DS_MODE_USER || - DS_MODE_TYPE(mode) == DS_MODE_OWNER); - - os = kmem_alloc(sizeof (objset_t), KM_SLEEP); - if (DS_MODE_TYPE(mode) == DS_MODE_USER) - err = dsl_dataset_hold(name, os, &ds); - else - err = dsl_dataset_own(name, mode, os, &ds); - if (err) { - kmem_free(os, sizeof (objset_t)); + err = dsl_dataset_own(name, B_FALSE, tag, &ds); + if (err) return (err); - } - err = dmu_objset_open_ds_os(ds, os, type); + err = dmu_objset_from_ds(ds, osp); if (err) { - if (DS_MODE_TYPE(mode) == DS_MODE_USER) - dsl_dataset_rele(ds, os); - else - dsl_dataset_disown(ds, os); - kmem_free(os, sizeof (objset_t)); - } else { - os->os_mode = mode; - *osp = os; + dsl_dataset_disown(ds, tag); + } else if (type != DMU_OST_ANY && type != (*osp)->os_phys->os_type) { + dmu_objset_disown(*osp, tag); + return (EINVAL); + } else if (!readonly && dsl_dataset_is_snapshot(ds)) { + dmu_objset_disown(*osp, tag); + return (EROFS); } return (err); } void -dmu_objset_close(objset_t *os) +dmu_objset_rele(objset_t *os, void *tag) { - ASSERT(DS_MODE_TYPE(os->os_mode) == DS_MODE_USER || - DS_MODE_TYPE(os->os_mode) == DS_MODE_OWNER || - DS_MODE_TYPE(os->os_mode) == DS_MODE_NOHOLD); - - if (DS_MODE_TYPE(os->os_mode) == DS_MODE_USER) - dsl_dataset_rele(os->os->os_dsl_dataset, os); - else if (DS_MODE_TYPE(os->os_mode) == DS_MODE_OWNER) - dsl_dataset_disown(os->os->os_dsl_dataset, os); - kmem_free(os, sizeof (objset_t)); + dsl_dataset_rele(os->os_dsl_dataset, tag); +} + +void +dmu_objset_disown(objset_t *os, void *tag) +{ + dsl_dataset_disown(os->os_dsl_dataset, tag); } int dmu_objset_evict_dbufs(objset_t *os) { - 
objset_impl_t *osi = os->os; dnode_t *dn; - mutex_enter(&osi->os_lock); + mutex_enter(&os->os_lock); /* process the mdn last, since the other dnodes have holds on it */ - list_remove(&osi->os_dnodes, osi->os_meta_dnode); - list_insert_tail(&osi->os_dnodes, osi->os_meta_dnode); + list_remove(&os->os_dnodes, os->os_meta_dnode); + list_insert_tail(&os->os_dnodes, os->os_meta_dnode); /* * Find the first dnode with holds. We have to do this dance @@ -400,91 +451,98 @@ dmu_objset_evict_dbufs(objset_t *os) * hold. If there are no holds then it has no dbufs so OK to * skip. */ - for (dn = list_head(&osi->os_dnodes); + for (dn = list_head(&os->os_dnodes); dn && !dnode_add_ref(dn, FTAG); - dn = list_next(&osi->os_dnodes, dn)) + dn = list_next(&os->os_dnodes, dn)) continue; while (dn) { dnode_t *next_dn = dn; do { - next_dn = list_next(&osi->os_dnodes, next_dn); + next_dn = list_next(&os->os_dnodes, next_dn); } while (next_dn && !dnode_add_ref(next_dn, FTAG)); - mutex_exit(&osi->os_lock); + mutex_exit(&os->os_lock); dnode_evict_dbufs(dn); dnode_rele(dn, FTAG); - mutex_enter(&osi->os_lock); + mutex_enter(&os->os_lock); dn = next_dn; } - mutex_exit(&osi->os_lock); - return (list_head(&osi->os_dnodes) != osi->os_meta_dnode); + mutex_exit(&os->os_lock); + return (list_head(&os->os_dnodes) != os->os_meta_dnode); } void -dmu_objset_evict(dsl_dataset_t *ds, void *arg) +dmu_objset_evict(objset_t *os) { - objset_impl_t *osi = arg; - objset_t os; - int i; + dsl_dataset_t *ds = os->os_dsl_dataset; - for (i = 0; i < TXG_SIZE; i++) { - ASSERT(list_head(&osi->os_dirty_dnodes[i]) == NULL); - ASSERT(list_head(&osi->os_free_dnodes[i]) == NULL); - } + for (int t = 0; t < TXG_SIZE; t++) + ASSERT(!dmu_objset_is_dirty(os, t)); if (ds) { if (!dsl_dataset_is_snapshot(ds)) { VERIFY(0 == dsl_prop_unregister(ds, "checksum", - checksum_changed_cb, osi)); + checksum_changed_cb, os)); VERIFY(0 == dsl_prop_unregister(ds, "compression", - compression_changed_cb, osi)); + compression_changed_cb, os)); VERIFY(0 == dsl_prop_unregister(ds, "copies", - copies_changed_cb, osi)); + copies_changed_cb, os)); + VERIFY(0 == dsl_prop_unregister(ds, "dedup", + dedup_changed_cb, os)); + VERIFY(0 == dsl_prop_unregister(ds, "logbias", + logbias_changed_cb, os)); } VERIFY(0 == dsl_prop_unregister(ds, "primarycache", - primary_cache_changed_cb, osi)); + primary_cache_changed_cb, os)); VERIFY(0 == dsl_prop_unregister(ds, "secondarycache", - secondary_cache_changed_cb, osi)); + secondary_cache_changed_cb, os)); } /* * We should need only a single pass over the dnode list, since * nothing can be added to the list at this point. 
*/ - os.os = osi; - (void) dmu_objset_evict_dbufs(&os); + (void) dmu_objset_evict_dbufs(os); - ASSERT3P(list_head(&osi->os_dnodes), ==, osi->os_meta_dnode); - ASSERT3P(list_tail(&osi->os_dnodes), ==, osi->os_meta_dnode); - ASSERT3P(list_head(&osi->os_meta_dnode->dn_dbufs), ==, NULL); + dnode_special_close(os->os_meta_dnode); + if (os->os_userused_dnode) { + dnode_special_close(os->os_userused_dnode); + dnode_special_close(os->os_groupused_dnode); + } + zil_free(os->os_zil); - dnode_special_close(osi->os_meta_dnode); - zil_free(osi->os_zil); + ASSERT3P(list_head(&os->os_dnodes), ==, NULL); - VERIFY(arc_buf_remove_ref(osi->os_phys_buf, &osi->os_phys_buf) == 1); - mutex_destroy(&osi->os_lock); - mutex_destroy(&osi->os_obj_lock); - mutex_destroy(&osi->os_user_ptr_lock); - kmem_free(osi, sizeof (objset_impl_t)); + VERIFY(arc_buf_remove_ref(os->os_phys_buf, &os->os_phys_buf) == 1); + mutex_destroy(&os->os_lock); + mutex_destroy(&os->os_obj_lock); + mutex_destroy(&os->os_user_ptr_lock); + kmem_free(os, sizeof (objset_t)); +} + +timestruc_t +dmu_objset_snap_cmtime(objset_t *os) +{ + return (dsl_dir_snap_cmtime(os->os_dsl_dataset->ds_dir)); } /* called from dsl for meta-objset */ -objset_impl_t * +objset_t * dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, dmu_objset_type_t type, dmu_tx_t *tx) { - objset_impl_t *osi; + objset_t *os; dnode_t *mdn; ASSERT(dmu_tx_is_syncing(tx)); if (ds) mutex_enter(&ds->ds_opening_lock); - VERIFY(0 == dmu_objset_open_impl(spa, ds, bp, &osi)); + VERIFY(0 == dmu_objset_open_impl(spa, ds, bp, &os)); if (ds) mutex_exit(&ds->ds_opening_lock); - mdn = osi->os_meta_dnode; + mdn = os->os_meta_dnode; dnode_allocate(mdn, DMU_OT_DNODE, 1 << DNODE_BLOCK_SHIFT, DN_MAX_INDBLKSHIFT, DMU_OT_NONE, 0, tx); @@ -519,17 +577,21 @@ dmu_objset_create_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, ASSERT(type != DMU_OST_NONE); ASSERT(type != DMU_OST_ANY); ASSERT(type < DMU_OST_NUMTYPES); - osi->os_phys->os_type = type; + os->os_phys->os_type = type; + if (dmu_objset_userused_enabled(os)) { + os->os_phys->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE; + os->os_flags = os->os_phys->os_flags; + } dsl_dataset_dirty(ds, tx); - return (osi); + return (os); } struct oscarg { void (*userfunc)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); void *userarg; - dsl_dataset_t *clone_parent; + dsl_dataset_t *clone_origin; const char *lastname; dmu_objset_type_t type; uint64_t flags; @@ -550,17 +612,13 @@ dmu_objset_create_check(void *arg1, void *arg2, dmu_tx_t *tx) if (err != ENOENT) return (err ? err : EEXIST); - if (oa->clone_parent != NULL) { - /* - * You can't clone across pools. - */ - if (oa->clone_parent->ds_dir->dd_pool != dd->dd_pool) + if (oa->clone_origin != NULL) { + /* You can't clone across pools. */ + if (oa->clone_origin->ds_dir->dd_pool != dd->dd_pool) return (EXDEV); - /* - * You can only clone snapshots, not the head datasets. - */ - if (oa->clone_parent->ds_phys->ds_num_children == 0) + /* You can only clone snapshots, not the head datasets. 
*/ + if (!dsl_dataset_is_snapshot(oa->clone_origin)) return (EINVAL); } @@ -572,37 +630,37 @@ dmu_objset_create_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) { dsl_dir_t *dd = arg1; struct oscarg *oa = arg2; - dsl_dataset_t *ds; - blkptr_t *bp; uint64_t dsobj; ASSERT(dmu_tx_is_syncing(tx)); dsobj = dsl_dataset_create_sync(dd, oa->lastname, - oa->clone_parent, oa->flags, cr, tx); + oa->clone_origin, oa->flags, cr, tx); - VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool, dsobj, FTAG, &ds)); - bp = dsl_dataset_get_blkptr(ds); - if (BP_IS_HOLE(bp)) { - objset_impl_t *osi; + if (oa->clone_origin == NULL) { + dsl_dataset_t *ds; + blkptr_t *bp; + objset_t *os; - /* This is an empty dmu_objset; not a clone. */ - osi = dmu_objset_create_impl(dsl_dataset_get_spa(ds), + VERIFY(0 == dsl_dataset_hold_obj(dd->dd_pool, dsobj, + FTAG, &ds)); + bp = dsl_dataset_get_blkptr(ds); + ASSERT(BP_IS_HOLE(bp)); + + os = dmu_objset_create_impl(dsl_dataset_get_spa(ds), ds, bp, oa->type, tx); if (oa->userfunc) - oa->userfunc(&osi->os, oa->userarg, cr, tx); + oa->userfunc(os, oa->userarg, cr, tx); + dsl_dataset_rele(ds, FTAG); } spa_history_internal_log(LOG_DS_CREATE, dd->dd_pool->dp_spa, tx, cr, "dataset = %llu", dsobj); - - dsl_dataset_rele(ds, FTAG); } int -dmu_objset_create(const char *name, dmu_objset_type_t type, - objset_t *clone_parent, uint64_t flags, +dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg) { dsl_dir_t *pdd; @@ -619,24 +677,12 @@ dmu_objset_create(const char *name, dmu_objset_type_t type, return (EEXIST); } - dprintf("name=%s\n", name); - oa.userfunc = func; oa.userarg = arg; oa.lastname = tail; oa.type = type; oa.flags = flags; - if (clone_parent != NULL) { - /* - * You can't clone to a different type. - */ - if (clone_parent->os->os_phys->os_type != type) { - dsl_dir_close(pdd, FTAG); - return (EINVAL); - } - oa.clone_parent = clone_parent->os->os_dsl_dataset; - } err = dsl_sync_task_do(pdd->dd_pool, dmu_objset_create_check, dmu_objset_create_sync, pdd, &oa, 5); dsl_dir_close(pdd, FTAG); @@ -644,100 +690,135 @@ dmu_objset_create(const char *name, dmu_objset_type_t type, } int -dmu_objset_destroy(const char *name) +dmu_objset_clone(const char *name, dsl_dataset_t *clone_origin, uint64_t flags) { - objset_t *os; - int error; - - /* - * If it looks like we'll be able to destroy it, and there's - * an unplayed replay log sitting around, destroy the log. - * It would be nicer to do this in dsl_dataset_destroy_sync(), - * but the replay log objset is modified in open context. - */ - error = dmu_objset_open(name, DMU_OST_ANY, - DS_MODE_OWNER|DS_MODE_READONLY|DS_MODE_INCONSISTENT, &os); - if (error == 0) { - dsl_dataset_t *ds = os->os->os_dsl_dataset; - zil_destroy(dmu_objset_zil(os), B_FALSE); + dsl_dir_t *pdd; + const char *tail; + int err = 0; + struct oscarg oa = { 0 }; - error = dsl_dataset_destroy(ds, os); - /* - * dsl_dataset_destroy() closes the ds. - */ - kmem_free(os, sizeof (objset_t)); + ASSERT(strchr(name, '@') == NULL); + err = dsl_dir_open(name, FTAG, &pdd, &tail); + if (err) + return (err); + if (tail == NULL) { + dsl_dir_close(pdd, FTAG); + return (EEXIST); } - return (error); + oa.lastname = tail; + oa.clone_origin = clone_origin; + oa.flags = flags; + + err = dsl_sync_task_do(pdd->dd_pool, dmu_objset_create_check, + dmu_objset_create_sync, pdd, &oa, 5); + dsl_dir_close(pdd, FTAG); + return (err); } -/* - * This will close the objset. 
- */ int -dmu_objset_rollback(objset_t *os) +dmu_objset_destroy(const char *name, boolean_t defer) { - int err; dsl_dataset_t *ds; - - ds = os->os->os_dsl_dataset; - - if (!dsl_dataset_tryown(ds, TRUE, os)) { - dmu_objset_close(os); - return (EBUSY); - } - - err = dsl_dataset_rollback(ds, os->os->os_phys->os_type); + int error; /* - * NB: we close the objset manually because the rollback - * actually implicitly called dmu_objset_evict(), thus freeing - * the objset_impl_t. + * dsl_dataset_destroy() can free any claimed-but-unplayed + * intent log, but if there is an active log, it has blocks that + * are allocated, but may not yet be reflected in the on-disk + * structure. Only the ZIL knows how to free them, so we have + * to call into it here. */ - dsl_dataset_disown(ds, os); - kmem_free(os, sizeof (objset_t)); - return (err); + error = dsl_dataset_own(name, B_TRUE, FTAG, &ds); + if (error == 0) { + objset_t *os; + if (dmu_objset_from_ds(ds, &os) == 0) + zil_destroy(dmu_objset_zil(os), B_FALSE); + error = dsl_dataset_destroy(ds, FTAG, defer); + /* dsl_dataset_destroy() closes the ds. */ + } + + return (error); } struct snaparg { dsl_sync_task_group_t *dstg; char *snapname; char failed[MAXPATHLEN]; - boolean_t checkperms; - list_t objsets; + boolean_t recursive; + nvlist_t *props; }; -struct osnode { - list_node_t node; - objset_t *os; -}; +static int +snapshot_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + objset_t *os = arg1; + struct snaparg *sn = arg2; + + /* The props have already been checked by zfs_check_userprops(). */ + + return (dsl_dataset_snapshot_check(os->os_dsl_dataset, + sn->snapname, tx)); +} + +static void +snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + objset_t *os = arg1; + dsl_dataset_t *ds = os->os_dsl_dataset; + struct snaparg *sn = arg2; + + dsl_dataset_snapshot_sync(ds, sn->snapname, cr, tx); + + if (sn->props) { + dsl_props_arg_t pa; + pa.pa_props = sn->props; + pa.pa_source = ZPROP_SRC_LOCAL; + dsl_props_set_sync(ds->ds_prev, &pa, cr, tx); + } +} static int -dmu_objset_snapshot_one(char *name, void *arg) +dmu_objset_snapshot_one(const char *name, void *arg) { struct snaparg *sn = arg; objset_t *os; int err; + char *cp; + + /* + * If the objset starts with a '%', then ignore it unless it was + * explicitly named (ie, not recursive). These hidden datasets + * are always inconsistent, and by not opening them here, we can + * avoid a race with dsl_dir_destroy_check(). + */ + cp = strrchr(name, '/'); + if (cp && cp[1] == '%' && sn->recursive) + return (0); (void) strcpy(sn->failed, name); /* - * Check permissions only when requested. This only applies when - * doing a recursive snapshot. The permission checks for the starting - * dataset have already been performed in zfs_secpolicy_snapshot() + * Check permissions if we are doing a recursive snapshot. 
The + * permission checks for the starting dataset have already been + * performed in zfs_secpolicy_snapshot() */ - if (sn->checkperms == B_TRUE && - (err = zfs_secpolicy_snapshot_perms(name, CRED()))) + if (sn->recursive && (err = zfs_secpolicy_snapshot_perms(name, CRED()))) return (err); - err = dmu_objset_open(name, DMU_OST_ANY, DS_MODE_USER, &os); + err = dmu_objset_hold(name, sn, &os); if (err != 0) return (err); - /* If the objset is in an inconsistent state, return busy */ - if (os->os->os_dsl_dataset->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) { - dmu_objset_close(os); - return (EBUSY); + /* + * If the objset is in an inconsistent state (eg, in the process + * of being destroyed), don't snapshot it. As with %hidden + * datasets, we return EBUSY if this name was explicitly + * requested (ie, not recursive), and otherwise ignore it. + */ + if (os->os_dsl_dataset->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) { + dmu_objset_rele(os, sn); + return (sn->recursive ? 0 : EBUSY); } /* @@ -747,26 +828,21 @@ dmu_objset_snapshot_one(char *name, void *arg) */ err = zil_suspend(dmu_objset_zil(os)); if (err == 0) { - struct osnode *osn; - dsl_sync_task_create(sn->dstg, dsl_dataset_snapshot_check, - dsl_dataset_snapshot_sync, os->os->os_dsl_dataset, - sn->snapname, 3); - osn = kmem_alloc(sizeof (struct osnode), KM_SLEEP); - osn->os = os; - list_insert_tail(&sn->objsets, osn); + dsl_sync_task_create(sn->dstg, snapshot_check, + snapshot_sync, os, sn, 3); } else { - dmu_objset_close(os); + dmu_objset_rele(os, sn); } return (err); } int -dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive) +dmu_objset_snapshot(char *fsname, char *snapname, + nvlist_t *props, boolean_t recursive) { dsl_sync_task_t *dst; - struct osnode *osn; - struct snaparg sn = { 0 }; + struct snaparg sn; spa_t *spa; int err; @@ -778,39 +854,29 @@ dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive) sn.dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); sn.snapname = snapname; - list_create(&sn.objsets, sizeof (struct osnode), - offsetof(struct osnode, node)); + sn.props = props; + sn.recursive = recursive; if (recursive) { - sn.checkperms = B_TRUE; err = dmu_objset_find(fsname, dmu_objset_snapshot_one, &sn, DS_FIND_CHILDREN); } else { - sn.checkperms = B_FALSE; err = dmu_objset_snapshot_one(fsname, &sn); } - if (err) - goto out; - - err = dsl_sync_task_group_wait(sn.dstg); + if (err == 0) + err = dsl_sync_task_group_wait(sn.dstg); for (dst = list_head(&sn.dstg->dstg_tasks); dst; dst = list_next(&sn.dstg->dstg_tasks, dst)) { - dsl_dataset_t *ds = dst->dst_arg1; + objset_t *os = dst->dst_arg1; + dsl_dataset_t *ds = os->os_dsl_dataset; if (dst->dst_err) dsl_dataset_name(ds, sn.failed); + zil_resume(dmu_objset_zil(os)); + dmu_objset_rele(os, &sn); } -out: - while (osn = list_head(&sn.objsets)) { - list_remove(&sn.objsets, osn); - zil_resume(dmu_objset_zil(osn->os)); - dmu_objset_close(osn->os); - kmem_free(osn, sizeof (struct osnode)); - } - list_destroy(&sn.objsets); - if (err) (void) strcpy(fsname, sn.failed); dsl_sync_task_group_destroy(sn.dstg); @@ -819,7 +885,7 @@ dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive) } static void -dmu_objset_sync_dnodes(list_t *list, dmu_tx_t *tx) +dmu_objset_sync_dnodes(list_t *list, list_t *newlist, dmu_tx_t *tx) { dnode_t *dn; @@ -827,25 +893,30 @@ dmu_objset_sync_dnodes(list_t *list, dmu_tx_t *tx) ASSERT(dn->dn_object != DMU_META_DNODE_OBJECT); ASSERT(dn->dn_dbuf->db_data_pending); /* - * Initialize dn_zio outside dnode_sync() - * to accomodate 
meta-dnode + * Initialize dn_zio outside dnode_sync() because the + * meta-dnode needs to set it outside dnode_sync(). */ dn->dn_zio = dn->dn_dbuf->db_data_pending->dr_zio; ASSERT(dn->dn_zio); ASSERT3U(dn->dn_nlevels, <=, DN_MAX_LEVELS); list_remove(list, dn); + + if (newlist) { + (void) dnode_add_ref(dn, newlist); + list_insert_tail(newlist, dn); + } + dnode_sync(dn, tx); } } /* ARGSUSED */ static void -ready(zio_t *zio, arc_buf_t *abuf, void *arg) +dmu_objset_write_ready(zio_t *zio, arc_buf_t *abuf, void *arg) { blkptr_t *bp = zio->io_bp; - blkptr_t *bp_orig = &zio->io_bp_orig; - objset_impl_t *os = arg; + objset_t *os = arg; dnode_phys_t *dnp = &os->os_phys->os_meta_dnode; ASSERT(bp == os->os_rootbp); @@ -853,31 +924,45 @@ ready(zio_t *zio, arc_buf_t *abuf, void *arg) ASSERT(BP_GET_LEVEL(bp) == 0); /* - * Update rootbp fill count. + * Update rootbp fill count: it should be the number of objects + * allocated in the object set (not counting the "special" + * objects that are stored in the objset_phys_t -- the meta + * dnode and user/group accounting objects). */ - bp->blk_fill = 1; /* count the meta-dnode */ + bp->blk_fill = 0; for (int i = 0; i < dnp->dn_nblkptr; i++) bp->blk_fill += dnp->dn_blkptr[i].blk_fill; +} + +/* ARGSUSED */ +static void +dmu_objset_write_done(zio_t *zio, arc_buf_t *abuf, void *arg) +{ + blkptr_t *bp = zio->io_bp; + blkptr_t *bp_orig = &zio->io_bp_orig; + objset_t *os = arg; if (zio->io_flags & ZIO_FLAG_IO_REWRITE) { - ASSERT(DVA_EQUAL(BP_IDENTITY(bp), BP_IDENTITY(bp_orig))); + ASSERT(BP_EQUAL(bp, bp_orig)); } else { - if (zio->io_bp_orig.blk_birth == os->os_synctx->tx_txg) - (void) dsl_dataset_block_kill(os->os_dsl_dataset, - &zio->io_bp_orig, zio, os->os_synctx); - dsl_dataset_block_born(os->os_dsl_dataset, bp, os->os_synctx); + dsl_dataset_t *ds = os->os_dsl_dataset; + dmu_tx_t *tx = os->os_synctx; + + (void) dsl_dataset_block_kill(ds, bp_orig, tx, B_TRUE); + dsl_dataset_block_born(ds, bp, tx); } } /* called from dsl */ void -dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx) +dmu_objset_sync(objset_t *os, zio_t *pio, dmu_tx_t *tx) { int txgoff; zbookmark_t zb; - writeprops_t wp = { 0 }; + zio_prop_t zp; zio_t *zio; list_t *list; + list_t *newlist = NULL; dbuf_dirty_record_t *dr; dprintf_ds(os->os_dsl_dataset, "txg=%llu\n", tx->tx_txg); @@ -898,37 +983,49 @@ dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx) /* * Create the root block IO */ - zb.zb_objset = os->os_dsl_dataset ? os->os_dsl_dataset->ds_object : 0; - zb.zb_object = 0; - zb.zb_level = -1; /* for block ordering; it's level 0 on disk */ - zb.zb_blkid = 0; - - wp.wp_type = DMU_OT_OBJSET; - wp.wp_level = 0; /* on-disk BP level; see above */ - wp.wp_copies = os->os_copies; - wp.wp_oschecksum = os->os_checksum; - wp.wp_oscompress = os->os_compress; - - if (BP_IS_OLDER(os->os_rootbp, tx->tx_txg)) { - (void) dsl_dataset_block_kill(os->os_dsl_dataset, - os->os_rootbp, pio, tx); - } - arc_release(os->os_phys_buf, &os->os_phys_buf); - zio = arc_write(pio, os->os_spa, &wp, DMU_OS_IS_L2CACHEABLE(os), - tx->tx_txg, os->os_rootbp, os->os_phys_buf, ready, NULL, os, + + SET_BOOKMARK(&zb, os->os_dsl_dataset ? 
+ os->os_dsl_dataset->ds_object : DMU_META_OBJSET, + ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); + + dmu_write_policy(os, NULL, 0, 0, &zp); + + zio = arc_write(pio, os->os_spa, tx->tx_txg, + os->os_rootbp, os->os_phys_buf, DMU_OS_IS_L2CACHEABLE(os), &zp, + dmu_objset_write_ready, dmu_objset_write_done, os, ZIO_PRIORITY_ASYNC_WRITE, ZIO_FLAG_MUSTSUCCEED, &zb); /* - * Sync meta-dnode - the parent IO for the sync is the root block + * Sync special dnodes - the parent IO for the sync is the root block */ os->os_meta_dnode->dn_zio = zio; dnode_sync(os->os_meta_dnode, tx); + os->os_phys->os_flags = os->os_flags; + + if (os->os_userused_dnode && + os->os_userused_dnode->dn_type != DMU_OT_NONE) { + os->os_userused_dnode->dn_zio = zio; + dnode_sync(os->os_userused_dnode, tx); + os->os_groupused_dnode->dn_zio = zio; + dnode_sync(os->os_groupused_dnode, tx); + } + txgoff = tx->tx_txg & TXG_MASK; - dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], tx); - dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], tx); + if (dmu_objset_userused_enabled(os)) { + newlist = &os->os_synced_dnodes; + /* + * We must create the list here because it uses the + * dn_dirty_link[] of this txg. + */ + list_create(newlist, sizeof (dnode_t), + offsetof(dnode_t, dn_dirty_link[txgoff])); + } + + dmu_objset_sync_dnodes(&os->os_free_dnodes[txgoff], newlist, tx); + dmu_objset_sync_dnodes(&os->os_dirty_dnodes[txgoff], newlist, tx); list = &os->os_meta_dnode->dn_dirty_records[txgoff]; while (dr = list_head(list)) { @@ -945,46 +1042,199 @@ dmu_objset_sync(objset_impl_t *os, zio_t *pio, dmu_tx_t *tx) zio_nowait(zio); } +boolean_t +dmu_objset_is_dirty(objset_t *os, uint64_t txg) +{ + return (!list_is_empty(&os->os_dirty_dnodes[txg & TXG_MASK]) || + !list_is_empty(&os->os_free_dnodes[txg & TXG_MASK])); +} + +static objset_used_cb_t *used_cbs[DMU_OST_NUMTYPES]; + +void +dmu_objset_register_type(dmu_objset_type_t ost, objset_used_cb_t *cb) +{ + used_cbs[ost] = cb; +} + +boolean_t +dmu_objset_userused_enabled(objset_t *os) +{ + return (spa_version(os->os_spa) >= SPA_VERSION_USERSPACE && + used_cbs[os->os_phys->os_type] && + os->os_userused_dnode); +} + +static void +do_userquota_callback(objset_t *os, dnode_phys_t *dnp, + boolean_t subtract, dmu_tx_t *tx) +{ + static const char zerobuf[DN_MAX_BONUSLEN] = {0}; + uint64_t user, group; + + ASSERT(dnp->dn_type != 0 || + (bcmp(DN_BONUS(dnp), zerobuf, DN_MAX_BONUSLEN) == 0 && + DN_USED_BYTES(dnp) == 0)); + + if ((dnp->dn_flags & DNODE_FLAG_USERUSED_ACCOUNTED) && + 0 == used_cbs[os->os_phys->os_type](dnp->dn_bonustype, + DN_BONUS(dnp), &user, &group)) { + int64_t delta = DNODE_SIZE + DN_USED_BYTES(dnp); + if (subtract) + delta = -delta; + VERIFY3U(0, ==, zap_increment_int(os, DMU_USERUSED_OBJECT, + user, delta, tx)); + VERIFY3U(0, ==, zap_increment_int(os, DMU_GROUPUSED_OBJECT, + group, delta, tx)); + } +} + +void +dmu_objset_do_userquota_callbacks(objset_t *os, dmu_tx_t *tx) +{ + dnode_t *dn; + list_t *list = &os->os_synced_dnodes; + + ASSERT(list_head(list) == NULL || dmu_objset_userused_enabled(os)); + + while (dn = list_head(list)) { + ASSERT(!DMU_OBJECT_IS_SPECIAL(dn->dn_object)); + ASSERT(dn->dn_oldphys); + ASSERT(dn->dn_phys->dn_type == DMU_OT_NONE || + dn->dn_phys->dn_flags & + DNODE_FLAG_USERUSED_ACCOUNTED); + + /* Allocate the user/groupused objects if necessary. 
*/ + if (os->os_userused_dnode->dn_type == DMU_OT_NONE) { + VERIFY(0 == zap_create_claim(os, + DMU_USERUSED_OBJECT, + DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx)); + VERIFY(0 == zap_create_claim(os, + DMU_GROUPUSED_OBJECT, + DMU_OT_USERGROUP_USED, DMU_OT_NONE, 0, tx)); + } + + /* + * We intentionally modify the zap object even if the + * net delta (due to phys-oldphys) is zero. Otherwise + * the block of the zap obj could be shared between + * datasets but need to be different between them after + * a bprewrite. + */ + do_userquota_callback(os, dn->dn_oldphys, B_TRUE, tx); + do_userquota_callback(os, dn->dn_phys, B_FALSE, tx); + + /* + * The mutex is needed here for interlock with dnode_allocate. + */ + mutex_enter(&dn->dn_mtx); + zio_buf_free(dn->dn_oldphys, sizeof (dnode_phys_t)); + dn->dn_oldphys = NULL; + mutex_exit(&dn->dn_mtx); + + list_remove(list, dn); + dnode_rele(dn, list); + } +} + +boolean_t +dmu_objset_userspace_present(objset_t *os) +{ + return (os->os_phys->os_flags & + OBJSET_FLAG_USERACCOUNTING_COMPLETE); +} + +int +dmu_objset_userspace_upgrade(objset_t *os) +{ + uint64_t obj; + int err = 0; + + if (dmu_objset_userspace_present(os)) + return (0); + if (!dmu_objset_userused_enabled(os)) + return (ENOTSUP); + if (dmu_objset_is_snapshot(os)) + return (EINVAL); + + /* + * We simply need to mark every object dirty, so that it will be + * synced out and now accounted. If this is called + * concurrently, or if we already did some work before crashing, + * that's fine, since we track each object's accounted state + * independently. + */ + + for (obj = 0; err == 0; err = dmu_object_next(os, &obj, FALSE, 0)) { + dmu_tx_t *tx; + dmu_buf_t *db; + int objerr; + + if (issig(JUSTLOOKING) && issig(FORREAL)) + return (EINTR); + + objerr = dmu_bonus_hold(os, obj, FTAG, &db); + if (objerr) + continue; + tx = dmu_tx_create(os); + dmu_tx_hold_bonus(tx, obj); + objerr = dmu_tx_assign(tx, TXG_WAIT); + if (objerr) { + dmu_tx_abort(tx); + continue; + } + dmu_buf_will_dirty(db, tx); + dmu_buf_rele(db, FTAG); + dmu_tx_commit(tx); + } + + os->os_flags |= OBJSET_FLAG_USERACCOUNTING_COMPLETE; + txg_wait_synced(dmu_objset_pool(os), 0); + return (0); +} + void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, uint64_t *usedobjsp, uint64_t *availobjsp) { - dsl_dataset_space(os->os->os_dsl_dataset, refdbytesp, availbytesp, + dsl_dataset_space(os->os_dsl_dataset, refdbytesp, availbytesp, usedobjsp, availobjsp); } uint64_t dmu_objset_fsid_guid(objset_t *os) { - return (dsl_dataset_fsid_guid(os->os->os_dsl_dataset)); + return (dsl_dataset_fsid_guid(os->os_dsl_dataset)); } void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat) { - stat->dds_type = os->os->os_phys->os_type; - if (os->os->os_dsl_dataset) - dsl_dataset_fast_stat(os->os->os_dsl_dataset, stat); + stat->dds_type = os->os_phys->os_type; + if (os->os_dsl_dataset) + dsl_dataset_fast_stat(os->os_dsl_dataset, stat); } void dmu_objset_stats(objset_t *os, nvlist_t *nv) { - ASSERT(os->os->os_dsl_dataset || - os->os->os_phys->os_type == DMU_OST_META); + ASSERT(os->os_dsl_dataset || + os->os_phys->os_type == DMU_OST_META); - if (os->os->os_dsl_dataset != NULL) - dsl_dataset_stats(os->os->os_dsl_dataset, nv); + if (os->os_dsl_dataset != NULL) + dsl_dataset_stats(os->os_dsl_dataset, nv); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_TYPE, - os->os->os_phys->os_type); + os->os_phys->os_type); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERACCOUNTING, + dmu_objset_userspace_present(os)); } int dmu_objset_is_snapshot(objset_t *os) { - 
if (os->os->os_dsl_dataset != NULL) - return (dsl_dataset_is_snapshot(os->os->os_dsl_dataset)); + if (os->os_dsl_dataset != NULL) + return (dsl_dataset_is_snapshot(os->os_dsl_dataset)); else return (B_FALSE); } @@ -993,7 +1243,7 @@ int dmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen, boolean_t *conflict) { - dsl_dataset_t *ds = os->os->os_dsl_dataset; + dsl_dataset_t *ds = os->os_dsl_dataset; uint64_t ignored; if (ds->ds_phys->ds_snapnames_zapobj == 0) @@ -1008,7 +1258,7 @@ int dmu_snapshot_list_next(objset_t *os, int namelen, char *name, uint64_t *idp, uint64_t *offp, boolean_t *case_conflict) { - dsl_dataset_t *ds = os->os->os_dsl_dataset; + dsl_dataset_t *ds = os->os_dsl_dataset; zap_cursor_t cursor; zap_attribute_t attr; @@ -1045,12 +1295,12 @@ int dmu_dir_list_next(objset_t *os, int namelen, char *name, uint64_t *idp, uint64_t *offp) { - dsl_dir_t *dd = os->os->os_dsl_dataset->ds_dir; + dsl_dir_t *dd = os->os_dsl_dataset->ds_dir; zap_cursor_t cursor; zap_attribute_t attr; /* there is no next dir on a snapshot! */ - if (os->os->os_dsl_dataset->ds_object != + if (os->os_dsl_dataset->ds_object != dd->dd_phys->dd_head_dataset_obj) return (ENOENT); @@ -1079,7 +1329,7 @@ dmu_dir_list_next(objset_t *os, int namelen, char *name, } struct findarg { - int (*func)(char *, void *); + int (*func)(const char *, void *); void *arg; }; @@ -1088,7 +1338,7 @@ static int findfunc(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) { struct findarg *fa = arg; - return (fa->func((char *)dsname, fa->arg)); + return (fa->func(dsname, fa->arg)); } /* @@ -1096,7 +1346,8 @@ findfunc(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) * Perhaps change all callers to use dmu_objset_find_spa()? */ int -dmu_objset_find(char *name, int func(char *, void *), void *arg, int flags) +dmu_objset_find(char *name, int func(const char *, void *), void *arg, + int flags) { struct findarg fa; fa.func = func; @@ -1147,12 +1398,9 @@ dmu_objset_find_spa(spa_t *spa, const char *name, ASSERT(attr->za_integer_length == sizeof (uint64_t)); ASSERT(attr->za_num_integers == 1); - child = kmem_alloc(MAXPATHLEN, KM_SLEEP); - (void) strcpy(child, name); - (void) strcat(child, "/"); - (void) strcat(child, attr->za_name); + child = kmem_asprintf("%s/%s", name, attr->za_name); err = dmu_objset_find_spa(spa, child, func, arg, flags); - kmem_free(child, MAXPATHLEN); + strfree(child); if (err) break; } @@ -1186,13 +1434,11 @@ dmu_objset_find_spa(spa_t *spa, const char *name, sizeof (uint64_t)); ASSERT(attr->za_num_integers == 1); - child = kmem_alloc(MAXPATHLEN, KM_SLEEP); - (void) strcpy(child, name); - (void) strcat(child, "@"); - (void) strcat(child, attr->za_name); + child = kmem_asprintf("%s@%s", + name, attr->za_name); err = func(spa, attr->za_first_integer, child, arg); - kmem_free(child, MAXPATHLEN); + strfree(child); if (err) break; } @@ -1215,46 +1461,45 @@ dmu_objset_find_spa(spa_t *spa, const char *name, /* ARGSUSED */ int -dmu_objset_prefetch(char *name, void *arg) +dmu_objset_prefetch(const char *name, void *arg) { - objset_t *os; dsl_dataset_t *ds; - os = kmem_alloc(sizeof (objset_t), KM_SLEEP); - if (dsl_dataset_hold(name, os, &ds)) { - kmem_free(os, sizeof (objset_t)); + if (dsl_dataset_hold(name, FTAG, &ds)) return (0); - } if (!BP_IS_HOLE(&ds->ds_phys->ds_bp)) { - uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; - zbookmark_t zb; - - zb.zb_objset = ds->ds_object; - zb.zb_object = 0; - zb.zb_level = -1; - zb.zb_blkid = 0; - - (void) arc_read_nolock(NULL, dsl_dataset_get_spa(ds), - 
&ds->ds_phys->ds_bp, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &aflags, &zb); + mutex_enter(&ds->ds_opening_lock); + if (ds->ds_objset == NULL) { + uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; + zbookmark_t zb; + + SET_BOOKMARK(&zb, ds->ds_object, ZB_ROOT_OBJECT, + ZB_ROOT_LEVEL, ZB_ROOT_BLKID); + + (void) arc_read_nolock(NULL, dsl_dataset_get_spa(ds), + &ds->ds_phys->ds_bp, NULL, NULL, + ZIO_PRIORITY_ASYNC_READ, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, + &aflags, &zb); + } + mutex_exit(&ds->ds_opening_lock); } - dsl_dataset_rele(ds, os); - kmem_free(os, sizeof (objset_t)); + dsl_dataset_rele(ds, FTAG); return (0); } void dmu_objset_set_user(objset_t *os, void *user_ptr) { - ASSERT(MUTEX_HELD(&os->os->os_user_ptr_lock)); - os->os->os_user_ptr = user_ptr; + ASSERT(MUTEX_HELD(&os->os_user_ptr_lock)); + os->os_user_ptr = user_ptr; } void * dmu_objset_get_user(objset_t *os) { - ASSERT(MUTEX_HELD(&os->os->os_user_ptr_lock)); - return (os->os->os_user_ptr); + ASSERT(MUTEX_HELD(&os->os_user_ptr_lock)); + return (os->os_user_ptr); } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_send.c b/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_send.c index 857b9a343fd2c..b23db0c83c2a9 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_send.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_send.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -33,21 +33,38 @@ #include #include #include +#include #include #include #include #include #include +#include +#include static char *dmu_recv_tag = "dmu_recv_tag"; +/* + * The list of data whose inclusion in a send stream can be pending from + * one call to backup_cb to another. Multiple calls to dump_free() and + * dump_freeobjects() can be aggregated into a single DRR_FREE or + * DRR_FREEOBJECTS replay record. + */ +typedef enum { + PENDING_NONE, + PENDING_FREE, + PENDING_FREEOBJECTS +} pendop_t; + struct backuparg { dmu_replay_record_t *drr; vnode_t *vp; offset_t *off; objset_t *os; zio_cksum_t zc; + uint64_t toguid; int err; + pendop_t pending_op; }; static int @@ -68,33 +85,99 @@ static int dump_free(struct backuparg *ba, uint64_t object, uint64_t offset, uint64_t length) { - /* write a FREE record */ + struct drr_free *drrf = &(ba->drr->drr_u.drr_free); + + /* + * If there is a pending op, but it's not PENDING_FREE, push it out, + * since free block aggregation can only be done for blocks of the + * same type (i.e., DRR_FREE records can only be aggregated with + * other DRR_FREE records. DRR_FREEOBJECTS records can only be + * aggregated with other DRR_FREEOBJECTS records. + */ + if (ba->pending_op != PENDING_NONE && ba->pending_op != PENDING_FREE) { + if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) + return (EINTR); + ba->pending_op = PENDING_NONE; + } + + if (ba->pending_op == PENDING_FREE) { + /* + * There should never be a PENDING_FREE if length is -1 + * (because dump_dnode is the only place where this + * function is called with a -1, and only after flushing + * any pending record). + */ + ASSERT(length != -1ULL); + /* + * Check to see whether this free block can be aggregated + * with pending one. + */ + if (drrf->drr_object == object && drrf->drr_offset + + drrf->drr_length == offset) { + drrf->drr_length += length; + return (0); + } else { + /* not a continuation. 
Push out pending record */ + if (dump_bytes(ba, ba->drr, + sizeof (dmu_replay_record_t)) != 0) + return (EINTR); + ba->pending_op = PENDING_NONE; + } + } + /* create a FREE record and make it pending */ bzero(ba->drr, sizeof (dmu_replay_record_t)); ba->drr->drr_type = DRR_FREE; - ba->drr->drr_u.drr_free.drr_object = object; - ba->drr->drr_u.drr_free.drr_offset = offset; - ba->drr->drr_u.drr_free.drr_length = length; + drrf->drr_object = object; + drrf->drr_offset = offset; + drrf->drr_length = length; + drrf->drr_toguid = ba->toguid; + if (length == -1ULL) { + if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) + return (EINTR); + } else { + ba->pending_op = PENDING_FREE; + } - if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t))) - return (EINTR); return (0); } static int dump_data(struct backuparg *ba, dmu_object_type_t type, - uint64_t object, uint64_t offset, int blksz, void *data) + uint64_t object, uint64_t offset, int blksz, const blkptr_t *bp, void *data) { + struct drr_write *drrw = &(ba->drr->drr_u.drr_write); + + + /* + * If there is any kind of pending aggregation (currently either + * a grouping of free objects or free blocks), push it out to + * the stream, since aggregation can't be done across operations + * of different types. + */ + if (ba->pending_op != PENDING_NONE) { + if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) + return (EINTR); + ba->pending_op = PENDING_NONE; + } /* write a DATA record */ bzero(ba->drr, sizeof (dmu_replay_record_t)); ba->drr->drr_type = DRR_WRITE; - ba->drr->drr_u.drr_write.drr_object = object; - ba->drr->drr_u.drr_write.drr_type = type; - ba->drr->drr_u.drr_write.drr_offset = offset; - ba->drr->drr_u.drr_write.drr_length = blksz; - - if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t))) + drrw->drr_object = object; + drrw->drr_type = type; + drrw->drr_offset = offset; + drrw->drr_length = blksz; + drrw->drr_toguid = ba->toguid; + drrw->drr_checksumtype = BP_GET_CHECKSUM(bp); + if (zio_checksum_table[drrw->drr_checksumtype].ci_dedup) + drrw->drr_checksumflags |= DRR_CHECKSUM_DEDUP; + DDK_SET_LSIZE(&drrw->drr_key, BP_GET_LSIZE(bp)); + DDK_SET_PSIZE(&drrw->drr_key, BP_GET_PSIZE(bp)); + DDK_SET_COMPRESS(&drrw->drr_key, BP_GET_COMPRESS(bp)); + drrw->drr_key.ddk_cksum = bp->blk_cksum; + + if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) return (EINTR); - if (dump_bytes(ba, data, blksz)) + if (dump_bytes(ba, data, blksz) != 0) return (EINTR); return (0); } @@ -102,39 +185,80 @@ dump_data(struct backuparg *ba, dmu_object_type_t type, static int dump_freeobjects(struct backuparg *ba, uint64_t firstobj, uint64_t numobjs) { + struct drr_freeobjects *drrfo = &(ba->drr->drr_u.drr_freeobjects); + + /* + * If there is a pending op, but it's not PENDING_FREEOBJECTS, + * push it out, since free block aggregation can only be done for + * blocks of the same type (i.e., DRR_FREE records can only be + * aggregated with other DRR_FREE records. DRR_FREEOBJECTS records + * can only be aggregated with other DRR_FREEOBJECTS records. + */ + if (ba->pending_op != PENDING_NONE && + ba->pending_op != PENDING_FREEOBJECTS) { + if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) + return (EINTR); + ba->pending_op = PENDING_NONE; + } + if (ba->pending_op == PENDING_FREEOBJECTS) { + /* + * See whether this free object array can be aggregated + * with pending one + */ + if (drrfo->drr_firstobj + drrfo->drr_numobjs == firstobj) { + drrfo->drr_numobjs += numobjs; + return (0); + } else { + /* can't be aggregated. 
Push out pending record */ + if (dump_bytes(ba, ba->drr, + sizeof (dmu_replay_record_t)) != 0) + return (EINTR); + ba->pending_op = PENDING_NONE; + } + } + /* write a FREEOBJECTS record */ bzero(ba->drr, sizeof (dmu_replay_record_t)); ba->drr->drr_type = DRR_FREEOBJECTS; - ba->drr->drr_u.drr_freeobjects.drr_firstobj = firstobj; - ba->drr->drr_u.drr_freeobjects.drr_numobjs = numobjs; + drrfo->drr_firstobj = firstobj; + drrfo->drr_numobjs = numobjs; + drrfo->drr_toguid = ba->toguid; + + ba->pending_op = PENDING_FREEOBJECTS; - if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t))) - return (EINTR); return (0); } static int dump_dnode(struct backuparg *ba, uint64_t object, dnode_phys_t *dnp) { + struct drr_object *drro = &(ba->drr->drr_u.drr_object); + if (dnp == NULL || dnp->dn_type == DMU_OT_NONE) return (dump_freeobjects(ba, object, 1)); + if (ba->pending_op != PENDING_NONE) { + if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) + return (EINTR); + ba->pending_op = PENDING_NONE; + } + /* write an OBJECT record */ bzero(ba->drr, sizeof (dmu_replay_record_t)); ba->drr->drr_type = DRR_OBJECT; - ba->drr->drr_u.drr_object.drr_object = object; - ba->drr->drr_u.drr_object.drr_type = dnp->dn_type; - ba->drr->drr_u.drr_object.drr_bonustype = dnp->dn_bonustype; - ba->drr->drr_u.drr_object.drr_blksz = - dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; - ba->drr->drr_u.drr_object.drr_bonuslen = dnp->dn_bonuslen; - ba->drr->drr_u.drr_object.drr_checksum = dnp->dn_checksum; - ba->drr->drr_u.drr_object.drr_compress = dnp->dn_compress; - - if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t))) + drro->drr_object = object; + drro->drr_type = dnp->dn_type; + drro->drr_bonustype = dnp->dn_bonustype; + drro->drr_blksz = dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT; + drro->drr_bonuslen = dnp->dn_bonuslen; + drro->drr_checksumtype = dnp->dn_checksum; + drro->drr_compress = dnp->dn_compress; + drro->drr_toguid = ba->toguid; + + if (dump_bytes(ba, ba->drr, sizeof (dmu_replay_record_t)) != 0) return (EINTR); - if (dump_bytes(ba, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8))) + if (dump_bytes(ba, DN_BONUS(dnp), P2ROUNDUP(dnp->dn_bonuslen, 8)) != 0) return (EINTR); /* free anything past the end of the file */ @@ -150,9 +274,10 @@ dump_dnode(struct backuparg *ba, uint64_t object, dnode_phys_t *dnp) (((uint64_t)dnp->dn_datablkszsec) << (SPA_MINBLOCKSHIFT + \ (level) * (dnp->dn_indblkshift - SPA_BLKPTRSHIFT))) +/* ARGSUSED */ static int -backup_cb(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb, - const dnode_phys_t *dnp, void *arg) +backup_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, + const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) { struct backuparg *ba = arg; dmu_object_type_t type = bp ? 
BP_GET_TYPE(bp) : DMU_OT_NONE; @@ -161,7 +286,10 @@ backup_cb(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb, if (issig(JUSTLOOKING) && issig(FORREAL)) return (EINTR); - if (bp == NULL && zb->zb_object == 0) { + if (zb->zb_object != DMU_META_DNODE_OBJECT && + DMU_OBJECT_IS_SPECIAL(zb->zb_object)) { + return (0); + } else if (bp == NULL && zb->zb_object == DMU_META_DNODE_OBJECT) { uint64_t span = BP_SPAN(dnp, zb->zb_level); uint64_t dnobj = (zb->zb_blkid * span) >> DNODE_SHIFT; err = dump_freeobjects(ba, dnobj, span >> DNODE_SHIFT); @@ -202,7 +330,7 @@ backup_cb(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb, return (EIO); err = dump_data(ba, type, zb->zb_object, zb->zb_blkid * blksz, - blksz, abuf->b_data); + blksz, bp, abuf->b_data); (void) arc_buf_remove_ref(abuf, &abuf); } @@ -214,8 +342,8 @@ int dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, vnode_t *vp, offset_t *off) { - dsl_dataset_t *ds = tosnap->os->os_dsl_dataset; - dsl_dataset_t *fromds = fromsnap ? fromsnap->os->os_dsl_dataset : NULL; + dsl_dataset_t *ds = tosnap->os_dsl_dataset; + dsl_dataset_t *fromds = fromsnap ? fromsnap->os_dsl_dataset : NULL; dmu_replay_record_t *drr; struct backuparg ba; int err; @@ -252,10 +380,11 @@ dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, drr = kmem_zalloc(sizeof (dmu_replay_record_t), KM_SLEEP); drr->drr_type = DRR_BEGIN; drr->drr_u.drr_begin.drr_magic = DMU_BACKUP_MAGIC; - drr->drr_u.drr_begin.drr_version = DMU_BACKUP_STREAM_VERSION; + DMU_SET_STREAM_HDRTYPE(drr->drr_u.drr_begin.drr_versioninfo, + DMU_SUBSTREAM); drr->drr_u.drr_begin.drr_creation_time = ds->ds_phys->ds_creation_time; - drr->drr_u.drr_begin.drr_type = tosnap->os->os_phys->os_type; + drr->drr_u.drr_begin.drr_type = tosnap->os_phys->os_type; if (fromorigin) drr->drr_u.drr_begin.drr_flags |= DRR_FLAG_CLONE; drr->drr_u.drr_begin.drr_toguid = ds->ds_phys->ds_guid; @@ -275,9 +404,11 @@ dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, ba.vp = vp; ba.os = tosnap; ba.off = off; + ba.toguid = ds->ds_phys->ds_guid; ZIO_SET_CHECKSUM(&ba.zc, 0, 0, 0, 0); + ba.pending_op = PENDING_NONE; - if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) { + if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t)) != 0) { kmem_free(drr, sizeof (dmu_replay_record_t)); return (ba.err); } @@ -285,6 +416,10 @@ dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, err = traverse_dataset(ds, fromtxg, TRAVERSE_PRE | TRAVERSE_PREFETCH, backup_cb, &ba); + if (ba.pending_op != PENDING_NONE) + if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t)) != 0) + err = EINTR; + if (err) { if (err == EINTR && ba.err) err = ba.err; @@ -295,8 +430,9 @@ dmu_sendbackup(objset_t *tosnap, objset_t *fromsnap, boolean_t fromorigin, bzero(drr, sizeof (dmu_replay_record_t)); drr->drr_type = DRR_END; drr->drr_u.drr_end.drr_checksum = ba.zc; + drr->drr_u.drr_end.drr_toguid = ba.toguid; - if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t))) { + if (dump_bytes(&ba, drr, sizeof (dmu_replay_record_t)) != 0) { kmem_free(drr, sizeof (dmu_replay_record_t)); return (ba.err); } @@ -319,31 +455,9 @@ struct recvbeginsyncarg { dsl_dataset_t *ds; /* the ds to recv into; returned from the syncfunc */ }; -static dsl_dataset_t * -recv_full_sync_impl(dsl_pool_t *dp, uint64_t dsobj, dmu_objset_type_t type, - cred_t *cr, dmu_tx_t *tx) -{ - dsl_dataset_t *ds; - - /* This should always work, since we just created it */ - /* XXX - create should return an owned ds */ - VERIFY(0 == 
dsl_dataset_own_obj(dp, dsobj, - DS_MODE_INCONSISTENT, dmu_recv_tag, &ds)); - - if (type != DMU_OST_NONE) { - (void) dmu_objset_create_impl(dp->dp_spa, - ds, &ds->ds_phys->ds_bp, type, tx); - } - - spa_history_internal_log(LOG_DS_REPLAY_FULL_SYNC, - dp->dp_spa, tx, cr, "dataset = %lld", dsobj); - - return (ds); -} - /* ARGSUSED */ static int -recv_full_check(void *arg1, void *arg2, dmu_tx_t *tx) +recv_new_check(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dir_t *dd = arg1; struct recvbeginsyncarg *rbsa = arg2; @@ -361,7 +475,7 @@ recv_full_check(void *arg1, void *arg2, dmu_tx_t *tx) /* make sure it's a snap in the same pool */ if (rbsa->origin->ds_dir->dd_pool != dd->dd_pool) return (EXDEV); - if (rbsa->origin->ds_phys->ds_num_children == 0) + if (!dsl_dataset_is_snapshot(rbsa->origin)) return (EINVAL); if (rbsa->origin->ds_phys->ds_guid != rbsa->fromguid) return (ENODEV); @@ -371,77 +485,31 @@ recv_full_check(void *arg1, void *arg2, dmu_tx_t *tx) } static void -recv_full_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +recv_new_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) { dsl_dir_t *dd = arg1; struct recvbeginsyncarg *rbsa = arg2; uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags; uint64_t dsobj; + /* Create and open new dataset. */ dsobj = dsl_dataset_create_sync(dd, strrchr(rbsa->tofs, '/') + 1, rbsa->origin, flags, cr, tx); + VERIFY(0 == dsl_dataset_own_obj(dd->dd_pool, dsobj, + B_TRUE, dmu_recv_tag, &rbsa->ds)); - rbsa->ds = recv_full_sync_impl(dd->dd_pool, dsobj, - rbsa->origin ? DMU_OST_NONE : rbsa->type, cr, tx); -} - -static int -recv_full_existing_check(void *arg1, void *arg2, dmu_tx_t *tx) -{ - dsl_dataset_t *ds = arg1; - struct recvbeginsyncarg *rbsa = arg2; - int err; - - /* must be a head ds */ - if (ds->ds_phys->ds_next_snap_obj != 0) - return (EINVAL); - - /* must not be a clone ds */ - if (dsl_dir_is_clone(ds->ds_dir)) - return (EINVAL); - - err = dsl_dataset_destroy_check(ds, rbsa->tag, tx); - if (err) - return (err); - - if (rbsa->origin) { - /* make sure it's a snap in the same pool */ - if (rbsa->origin->ds_dir->dd_pool != ds->ds_dir->dd_pool) - return (EXDEV); - if (rbsa->origin->ds_phys->ds_num_children == 0) - return (EINVAL); - if (rbsa->origin->ds_phys->ds_guid != rbsa->fromguid) - return (ENODEV); + if (rbsa->origin == NULL) { + (void) dmu_objset_create_impl(dd->dd_pool->dp_spa, + rbsa->ds, &rbsa->ds->ds_phys->ds_bp, rbsa->type, tx); } - return (0); -} - -static void -recv_full_existing_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) -{ - dsl_dataset_t *ds = arg1; - struct recvbeginsyncarg *rbsa = arg2; - dsl_dir_t *dd = ds->ds_dir; - uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags; - uint64_t dsobj; - - /* - * NB: caller must provide an extra hold on the dsl_dir_t, so it - * won't go away when dsl_dataset_destroy_sync() closes the - * dataset. - */ - dsl_dataset_destroy_sync(ds, rbsa->tag, cr, tx); - - dsobj = dsl_dataset_create_sync_dd(dd, rbsa->origin, flags, tx); - - rbsa->ds = recv_full_sync_impl(dd->dd_pool, dsobj, - rbsa->origin ? 
DMU_OST_NONE : rbsa->type, cr, tx); + spa_history_internal_log(LOG_DS_REPLAY_FULL_SYNC, + dd->dd_pool->dp_spa, tx, cr, "dataset = %lld", dsobj); } /* ARGSUSED */ static int -recv_incremental_check(void *arg1, void *arg2, dmu_tx_t *tx) +recv_existing_check(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; struct recvbeginsyncarg *rbsa = arg2; @@ -452,13 +520,43 @@ recv_incremental_check(void *arg1, void *arg2, dmu_tx_t *tx) if (!rbsa->force && dsl_dataset_modified_since_lastsnap(ds)) return (ETXTBSY); - /* must already be a snapshot of this fs */ - if (ds->ds_phys->ds_prev_snap_obj == 0) - return (ENODEV); + if (rbsa->fromguid) { + /* if incremental, most recent snapshot must match fromguid */ + if (ds->ds_prev == NULL) + return (ENODEV); - /* most recent snapshot must match fromguid */ - if (ds->ds_prev->ds_phys->ds_guid != rbsa->fromguid) - return (ENODEV); + /* + * most recent snapshot must match fromguid, or there are no + * changes since the fromguid one + */ + if (ds->ds_prev->ds_phys->ds_guid != rbsa->fromguid) { + uint64_t birth = ds->ds_prev->ds_phys->ds_bp.blk_birth; + uint64_t obj = ds->ds_prev->ds_phys->ds_prev_snap_obj; + while (obj != 0) { + dsl_dataset_t *snap; + err = dsl_dataset_hold_obj(ds->ds_dir->dd_pool, + obj, FTAG, &snap); + if (err) + return (ENODEV); + if (snap->ds_phys->ds_creation_txg < birth) { + dsl_dataset_rele(snap, FTAG); + return (ENODEV); + } + if (snap->ds_phys->ds_guid == rbsa->fromguid) { + dsl_dataset_rele(snap, FTAG); + break; /* it's ok */ + } + obj = snap->ds_phys->ds_prev_snap_obj; + dsl_dataset_rele(snap, FTAG); + } + if (obj == 0) + return (ENODEV); + } + } else { + /* if full, most recent snapshot must be $ORIGIN */ + if (ds->ds_phys->ds_prev_snap_txg >= TXG_INITIAL) + return (ENODEV); + } /* temporary clone name must not exist */ err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, @@ -481,29 +579,28 @@ recv_incremental_check(void *arg1, void *arg2, dmu_tx_t *tx) /* ARGSUSED */ static void -recv_online_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +recv_existing_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) { dsl_dataset_t *ohds = arg1; struct recvbeginsyncarg *rbsa = arg2; dsl_pool_t *dp = ohds->ds_dir->dd_pool; - dsl_dataset_t *ods, *cds; + dsl_dataset_t *cds; uint64_t flags = DS_FLAG_INCONSISTENT | rbsa->dsflags; uint64_t dsobj; - /* create the temporary clone */ - VERIFY(0 == dsl_dataset_hold_obj(dp, ohds->ds_phys->ds_prev_snap_obj, - FTAG, &ods)); - dsobj = dsl_dataset_create_sync(ohds->ds_dir, - rbsa->clonelastname, ods, flags, cr, tx); - dsl_dataset_rele(ods, FTAG); + /* create and open the temporary clone */ + dsobj = dsl_dataset_create_sync(ohds->ds_dir, rbsa->clonelastname, + ohds->ds_prev, flags, cr, tx); + VERIFY(0 == dsl_dataset_own_obj(dp, dsobj, B_TRUE, dmu_recv_tag, &cds)); - /* open the temporary clone */ - VERIFY(0 == dsl_dataset_own_obj(dp, dsobj, - DS_MODE_INCONSISTENT, dmu_recv_tag, &cds)); - - /* copy the refquota from the target fs to the clone */ - if (ohds->ds_quota > 0) - dsl_dataset_set_quota_sync(cds, &ohds->ds_quota, cr, tx); + /* + * If we actually created a non-clone, we need to create the + * objset in our new dataset. 
+ */ + if (BP_IS_HOLE(dsl_dataset_get_blkptr(cds))) { + (void) dmu_objset_create_impl(dp->dp_spa, + cds, dsl_dataset_get_blkptr(cds), rbsa->type, tx); + } rbsa->ds = cds; @@ -511,32 +608,18 @@ recv_online_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) dp->dp_spa, tx, cr, "dataset = %lld", dsobj); } -/* ARGSUSED */ -static void -recv_offline_incremental_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) -{ - dsl_dataset_t *ds = arg1; - - dmu_buf_will_dirty(ds->ds_dbuf, tx); - ds->ds_phys->ds_flags |= DS_FLAG_INCONSISTENT; - - spa_history_internal_log(LOG_DS_REPLAY_INC_SYNC, - ds->ds_dir->dd_pool->dp_spa, tx, cr, "dataset = %lld", - ds->ds_object); -} - /* * NB: callers *MUST* call dmu_recv_stream() if dmu_recv_begin() * succeeds; otherwise we will leak the holds on the datasets. */ int -dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb, - boolean_t force, objset_t *origin, boolean_t online, dmu_recv_cookie_t *drc) +dmu_recv_begin(char *tofs, char *tosnap, char *top_ds, struct drr_begin *drrb, + boolean_t force, objset_t *origin, dmu_recv_cookie_t *drc) { int err = 0; boolean_t byteswap; - struct recvbeginsyncarg rbsa; - uint64_t version; + struct recvbeginsyncarg rbsa = { 0 }; + uint64_t versioninfo; int flags; dsl_dataset_t *ds; @@ -549,22 +632,22 @@ dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb, rbsa.tofs = tofs; rbsa.tosnap = tosnap; - rbsa.origin = origin ? origin->os->os_dsl_dataset : NULL; + rbsa.origin = origin ? origin->os_dsl_dataset : NULL; rbsa.fromguid = drrb->drr_fromguid; rbsa.type = drrb->drr_type; rbsa.tag = FTAG; rbsa.dsflags = 0; - version = drrb->drr_version; + versioninfo = drrb->drr_versioninfo; flags = drrb->drr_flags; if (byteswap) { rbsa.type = BSWAP_32(rbsa.type); rbsa.fromguid = BSWAP_64(rbsa.fromguid); - version = BSWAP_64(version); + versioninfo = BSWAP_64(versioninfo); flags = BSWAP_32(flags); } - if (version != DMU_BACKUP_STREAM_VERSION || + if (DMU_GET_STREAM_HDRTYPE(versioninfo) == DMU_COMPOUNDSTREAM || rbsa.type >= DMU_OST_NUMTYPES || ((flags & DRR_FLAG_CLONE) && origin == NULL)) return (EINVAL); @@ -575,102 +658,72 @@ dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *drrb, bzero(drc, sizeof (dmu_recv_cookie_t)); drc->drc_drrb = drrb; drc->drc_tosnap = tosnap; + drc->drc_top_ds = top_ds; drc->drc_force = force; /* * Process the begin in syncing context. 
*/ - if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE) && !online) { - /* offline incremental receive */ - err = dsl_dataset_own(tofs, 0, dmu_recv_tag, &ds); - if (err) - return (err); - /* - * Only do the rollback if the most recent snapshot - * matches the incremental source - */ - if (force) { - if (ds->ds_prev == NULL || - ds->ds_prev->ds_phys->ds_guid != - rbsa.fromguid) { - dsl_dataset_disown(ds, dmu_recv_tag); - return (ENODEV); - } - (void) dsl_dataset_rollback(ds, DMU_OST_NONE); + /* open the dataset we are logically receiving into */ + err = dsl_dataset_hold(tofs, dmu_recv_tag, &ds); + if (err == 0) { + /* target fs already exists; recv into temp clone */ + + /* Can't recv a clone into an existing fs */ + if (flags & DRR_FLAG_CLONE) { + dsl_dataset_rele(ds, dmu_recv_tag); + return (EINVAL); } - rbsa.force = B_FALSE; - err = dsl_sync_task_do(ds->ds_dir->dd_pool, - recv_incremental_check, - recv_offline_incremental_sync, ds, &rbsa, 1); - if (err) { - dsl_dataset_disown(ds, dmu_recv_tag); - return (err); + + /* must not have an incremental recv already in progress */ + if (!mutex_tryenter(&ds->ds_recvlock)) { + dsl_dataset_rele(ds, dmu_recv_tag); + return (EBUSY); } - drc->drc_logical_ds = drc->drc_real_ds = ds; - } else if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE)) { - /* online incremental receive */ /* tmp clone name is: tofs/%tosnap" */ (void) snprintf(rbsa.clonelastname, sizeof (rbsa.clonelastname), "%%%s", tosnap); - - /* open the dataset we are logically receiving into */ - err = dsl_dataset_hold(tofs, dmu_recv_tag, &ds); - if (err) - return (err); - rbsa.force = force; err = dsl_sync_task_do(ds->ds_dir->dd_pool, - recv_incremental_check, - recv_online_incremental_sync, ds, &rbsa, 5); + recv_existing_check, recv_existing_sync, ds, &rbsa, 5); if (err) { + mutex_exit(&ds->ds_recvlock); dsl_dataset_rele(ds, dmu_recv_tag); return (err); } drc->drc_logical_ds = ds; drc->drc_real_ds = rbsa.ds; - } else { - /* create new fs -- full backup or clone */ - dsl_dir_t *dd = NULL; - const char *tail; + } else if (err == ENOENT) { + /* target fs does not exist; must be a full backup or clone */ + char *cp; - err = dsl_dir_open(tofs, FTAG, &dd, &tail); + /* + * If it's a non-clone incremental, we are missing the + * target fs, so fail the recv. 
+ */ + if (rbsa.fromguid && !(flags & DRR_FLAG_CLONE)) + return (ENOENT); + + /* Open the parent of tofs */ + cp = strrchr(tofs, '/'); + *cp = '\0'; + err = dsl_dataset_hold(tofs, FTAG, &ds); + *cp = '/'; if (err) return (err); - if (tail == NULL) { - if (!force) { - dsl_dir_close(dd, FTAG); - return (EEXIST); - } - rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); - err = dsl_dataset_own_obj(dd->dd_pool, - dd->dd_phys->dd_head_dataset_obj, - DS_MODE_INCONSISTENT, FTAG, &ds); - rw_exit(&dd->dd_pool->dp_config_rwlock); - if (err) { - dsl_dir_close(dd, FTAG); - return (err); - } - - dsl_dataset_make_exclusive(ds, FTAG); - err = dsl_sync_task_do(dd->dd_pool, - recv_full_existing_check, - recv_full_existing_sync, ds, &rbsa, 5); - dsl_dataset_disown(ds, FTAG); - } else { - err = dsl_sync_task_do(dd->dd_pool, recv_full_check, - recv_full_sync, dd, &rbsa, 5); - } - dsl_dir_close(dd, FTAG); + err = dsl_sync_task_do(ds->ds_dir->dd_pool, + recv_new_check, recv_new_sync, ds->ds_dir, &rbsa, 5); + dsl_dataset_rele(ds, FTAG); if (err) return (err); drc->drc_logical_ds = drc->drc_real_ds = rbsa.ds; drc->drc_newfs = B_TRUE; } - return (0); + return (err); } struct restorearg { @@ -681,8 +734,83 @@ struct restorearg { uint64_t voff; int bufsize; /* amount of memory allocated for buf */ zio_cksum_t cksum; + avl_tree_t guid_to_ds_map; }; +typedef struct guid_map_entry { + uint64_t guid; + dsl_dataset_t *gme_ds; + avl_node_t avlnode; +} guid_map_entry_t; + +static int +guid_compare(const void *arg1, const void *arg2) +{ + const guid_map_entry_t *gmep1 = arg1; + const guid_map_entry_t *gmep2 = arg2; + + if (gmep1->guid < gmep2->guid) + return (-1); + else if (gmep1->guid > gmep2->guid) + return (1); + return (0); +} + +/* + * This function is a callback used by dmu_objset_find() (which + * enumerates the object sets) to build an avl tree that maps guids + * to datasets. The resulting table is used when processing DRR_WRITE_BYREF + * send stream records. These records, which are used in dedup'ed + * streams, do not contain data themselves, but refer to a copy + * of the data block that has already been written because it was + * earlier in the stream. That previous copy is identified by the + * guid of the dataset with the referenced data. + */ +int +find_ds_by_guid(const char *name, void *arg) +{ + avl_tree_t *guid_map = arg; + dsl_dataset_t *ds, *snapds; + guid_map_entry_t *gmep; + dsl_pool_t *dp; + int err; + uint64_t lastobj, firstobj; + + if (dsl_dataset_hold(name, FTAG, &ds) != 0) + return (0); + + dp = ds->ds_dir->dd_pool; + rw_enter(&dp->dp_config_rwlock, RW_READER); + firstobj = ds->ds_dir->dd_phys->dd_origin_obj; + lastobj = ds->ds_phys->ds_prev_snap_obj; + + while (lastobj != firstobj) { + err = dsl_dataset_hold_obj(dp, lastobj, guid_map, &snapds); + if (err) { + /* + * Skip this snapshot and move on. It's not + * clear why this would ever happen, but the + * remainder of the snapshot streadm can be + * processed. 
+ */ + rw_exit(&dp->dp_config_rwlock); + dsl_dataset_rele(ds, FTAG); + return (0); + } + + gmep = kmem_alloc(sizeof (guid_map_entry_t), KM_SLEEP); + gmep->guid = snapds->ds_phys->ds_guid; + gmep->gme_ds = snapds; + avl_add(guid_map, gmep); + lastobj = snapds->ds_phys->ds_prev_snap_obj; + } + + rw_exit(&dp->dp_config_rwlock); + dsl_dataset_rele(ds, FTAG); + + return (0); +} + static void * restore_read(struct restorearg *ra, int len) { @@ -727,7 +855,7 @@ backup_byteswap(dmu_replay_record_t *drr) switch (drr->drr_type) { case DRR_BEGIN: DO64(drr_begin.drr_magic); - DO64(drr_begin.drr_version); + DO64(drr_begin.drr_versioninfo); DO64(drr_begin.drr_creation_time); DO32(drr_begin.drr_type); DO32(drr_begin.drr_flags); @@ -741,27 +869,51 @@ backup_byteswap(dmu_replay_record_t *drr) DO32(drr_object.drr_bonustype); DO32(drr_object.drr_blksz); DO32(drr_object.drr_bonuslen); + DO64(drr_object.drr_toguid); break; case DRR_FREEOBJECTS: DO64(drr_freeobjects.drr_firstobj); DO64(drr_freeobjects.drr_numobjs); + DO64(drr_freeobjects.drr_toguid); break; case DRR_WRITE: DO64(drr_write.drr_object); DO32(drr_write.drr_type); DO64(drr_write.drr_offset); DO64(drr_write.drr_length); + DO64(drr_write.drr_toguid); + DO64(drr_write.drr_key.ddk_cksum.zc_word[0]); + DO64(drr_write.drr_key.ddk_cksum.zc_word[1]); + DO64(drr_write.drr_key.ddk_cksum.zc_word[2]); + DO64(drr_write.drr_key.ddk_cksum.zc_word[3]); + DO64(drr_write.drr_key.ddk_prop); + break; + case DRR_WRITE_BYREF: + DO64(drr_write_byref.drr_object); + DO64(drr_write_byref.drr_offset); + DO64(drr_write_byref.drr_length); + DO64(drr_write_byref.drr_toguid); + DO64(drr_write_byref.drr_refguid); + DO64(drr_write_byref.drr_refobject); + DO64(drr_write_byref.drr_refoffset); + DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[0]); + DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[1]); + DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[2]); + DO64(drr_write_byref.drr_key.ddk_cksum.zc_word[3]); + DO64(drr_write_byref.drr_key.ddk_prop); break; case DRR_FREE: DO64(drr_free.drr_object); DO64(drr_free.drr_offset); DO64(drr_free.drr_length); + DO64(drr_free.drr_toguid); break; case DRR_END: DO64(drr_end.drr_checksum.zc_word[0]); DO64(drr_end.drr_checksum.zc_word[1]); DO64(drr_end.drr_checksum.zc_word[2]); DO64(drr_end.drr_checksum.zc_word[3]); + DO64(drr_end.drr_toguid); break; } #undef DO64 @@ -775,15 +927,10 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) dmu_tx_t *tx; void *data = NULL; - err = dmu_object_info(os, drro->drr_object, NULL); - - if (err != 0 && err != ENOENT) - return (EINVAL); - if (drro->drr_type == DMU_OT_NONE || drro->drr_type >= DMU_OT_NUMTYPES || drro->drr_bonustype >= DMU_OT_NUMTYPES || - drro->drr_checksum >= ZIO_CHECKSUM_FUNCTIONS || + drro->drr_checksumtype >= ZIO_CHECKSUM_FUNCTIONS || drro->drr_compress >= ZIO_COMPRESS_FUNCTIONS || P2PHASE(drro->drr_blksz, SPA_MINBLOCKSIZE) || drro->drr_blksz < SPA_MINBLOCKSIZE || @@ -792,18 +939,21 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) return (EINVAL); } + err = dmu_object_info(os, drro->drr_object, NULL); + + if (err != 0 && err != ENOENT) + return (EINVAL); + if (drro->drr_bonuslen) { data = restore_read(ra, P2ROUNDUP(drro->drr_bonuslen, 8)); if (ra->err) return (ra->err); } - tx = dmu_tx_create(os); - if (err == ENOENT) { /* currently free, want to be allocated */ + tx = dmu_tx_create(os); dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, 1); err = dmu_tx_assign(tx, TXG_WAIT); if (err) { dmu_tx_abort(tx); @@ 
-812,30 +962,26 @@ restore_object(struct restorearg *ra, objset_t *os, struct drr_object *drro) err = dmu_object_claim(os, drro->drr_object, drro->drr_type, drro->drr_blksz, drro->drr_bonustype, drro->drr_bonuslen, tx); + dmu_tx_commit(tx); } else { /* currently allocated, want to be allocated */ - dmu_tx_hold_bonus(tx, drro->drr_object); - /* - * We may change blocksize, so need to - * hold_write - */ - dmu_tx_hold_write(tx, drro->drr_object, 0, 1); - err = dmu_tx_assign(tx, TXG_WAIT); - if (err) { - dmu_tx_abort(tx); - return (err); - } - err = dmu_object_reclaim(os, drro->drr_object, drro->drr_type, drro->drr_blksz, - drro->drr_bonustype, drro->drr_bonuslen, tx); + drro->drr_bonustype, drro->drr_bonuslen); } - if (err) { - dmu_tx_commit(tx); + if (err) return (EINVAL); + + tx = dmu_tx_create(os); + dmu_tx_hold_bonus(tx, drro->drr_object); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) { + dmu_tx_abort(tx); + return (err); } - dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksum, tx); + dmu_object_set_checksum(os, drro->drr_object, drro->drr_checksumtype, + tx); dmu_object_set_compress(os, drro->drr_object, drro->drr_compress, tx); if (data != NULL) { @@ -917,6 +1063,64 @@ restore_write(struct restorearg *ra, objset_t *os, return (0); } +/* + * Handle a DRR_WRITE_BYREF record. This record is used in dedup'ed + * streams to refer to a copy of the data that is already on the + * system because it came in earlier in the stream. This function + * finds the earlier copy of the data, and uses that copy instead of + * data from the stream to fulfill this write. + */ +static int +restore_write_byref(struct restorearg *ra, objset_t *os, + struct drr_write_byref *drrwbr) +{ + dmu_tx_t *tx; + int err; + guid_map_entry_t gmesrch; + guid_map_entry_t *gmep; + avl_index_t where; + objset_t *ref_os = NULL; + dmu_buf_t *dbp; + + if (drrwbr->drr_offset + drrwbr->drr_length < drrwbr->drr_offset) + return (EINVAL); + + /* + * If the GUID of the referenced dataset is different from the + * GUID of the target dataset, find the referenced dataset. + */ + if (drrwbr->drr_toguid != drrwbr->drr_refguid) { + gmesrch.guid = drrwbr->drr_refguid; + if ((gmep = avl_find(&ra->guid_to_ds_map, &gmesrch, + &where)) == NULL) { + return (EINVAL); + } + if (dmu_objset_from_ds(gmep->gme_ds, &ref_os)) + return (EINVAL); + } else { + ref_os = os; + } + + if (err = dmu_buf_hold(ref_os, drrwbr->drr_refobject, + drrwbr->drr_refoffset, FTAG, &dbp)) + return (err); + + tx = dmu_tx_create(os); + + dmu_tx_hold_write(tx, drrwbr->drr_object, + drrwbr->drr_offset, drrwbr->drr_length); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) { + dmu_tx_abort(tx); + return (err); + } + dmu_write(os, drrwbr->drr_object, + drrwbr->drr_offset, drrwbr->drr_length, dbp->db_data, tx); + dmu_buf_rele(dbp, FTAG); + dmu_tx_commit(tx); + return (0); +} + /* ARGSUSED */ static int restore_free(struct restorearg *ra, objset_t *os, @@ -936,26 +1140,6 @@ restore_free(struct restorearg *ra, objset_t *os, return (err); } -void -dmu_recv_abort_cleanup(dmu_recv_cookie_t *drc) -{ - if (drc->drc_newfs || drc->drc_real_ds != drc->drc_logical_ds) { - /* - * online incremental or new fs: destroy the fs (which - * may be a clone) that we created - */ - (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag); - if (drc->drc_real_ds != drc->drc_logical_ds) - dsl_dataset_rele(drc->drc_logical_ds, dmu_recv_tag); - } else { - /* - * offline incremental: rollback to most recent snapshot. 
- */ - (void) dsl_dataset_rollback(drc->drc_real_ds, DMU_OST_NONE); - dsl_dataset_disown(drc->drc_real_ds, dmu_recv_tag); - } -} - /* * NB: callers *must* call dmu_recv_end() if this succeeds. */ @@ -966,6 +1150,8 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp) dmu_replay_record_t *drr; objset_t *os; zio_cksum_t pcksum; + guid_map_entry_t *gmep; + int featureflags; if (drc->drc_drrb->drr_magic == BSWAP_64(DMU_BACKUP_MAGIC)) ra.byteswap = TRUE; @@ -990,7 +1176,7 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp) if (ra.byteswap) { struct drr_begin *drrb = drc->drc_drrb; drrb->drr_magic = BSWAP_64(drrb->drr_magic); - drrb->drr_version = BSWAP_64(drrb->drr_version); + drrb->drr_versioninfo = BSWAP_64(drrb->drr_versioninfo); drrb->drr_creation_time = BSWAP_64(drrb->drr_creation_time); drrb->drr_type = BSWAP_32(drrb->drr_type); drrb->drr_toguid = BSWAP_64(drrb->drr_toguid); @@ -1003,16 +1189,29 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp) ra.buf = kmem_alloc(ra.bufsize, KM_SLEEP); /* these were verified in dmu_recv_begin */ - ASSERT(drc->drc_drrb->drr_version == DMU_BACKUP_STREAM_VERSION); + ASSERT(DMU_GET_STREAM_HDRTYPE(drc->drc_drrb->drr_versioninfo) == + DMU_SUBSTREAM); ASSERT(drc->drc_drrb->drr_type < DMU_OST_NUMTYPES); /* * Open the objset we are modifying. */ - VERIFY(dmu_objset_open_ds(drc->drc_real_ds, DMU_OST_ANY, &os) == 0); + VERIFY(dmu_objset_from_ds(drc->drc_real_ds, &os) == 0); ASSERT(drc->drc_real_ds->ds_phys->ds_flags & DS_FLAG_INCONSISTENT); + featureflags = DMU_GET_FEATUREFLAGS(drc->drc_drrb->drr_versioninfo); + + /* if this stream is dedup'ed, set up the avl tree for guid mapping */ + if (featureflags & DMU_BACKUP_FEATURE_DEDUP) { + avl_create(&ra.guid_to_ds_map, guid_compare, + sizeof (guid_map_entry_t), + offsetof(guid_map_entry_t, avlnode)); + (void) dmu_objset_find(drc->drc_top_ds, find_ds_by_guid, + (void *)&ra.guid_to_ds_map, + DS_FIND_CHILDREN); + } + /* * Read records and process them. */ @@ -1052,6 +1251,13 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp) ra.err = restore_write(&ra, os, &drrw); break; } + case DRR_WRITE_BYREF: + { + struct drr_write_byref drrwbr = + drr->drr_u.drr_write_byref; + ra.err = restore_write_byref(&ra, os, &drrwbr); + break; + } case DRR_FREE: { struct drr_free drrf = drr->drr_u.drr_free; @@ -1079,15 +1285,29 @@ dmu_recv_stream(dmu_recv_cookie_t *drc, vnode_t *vp, offset_t *voffp) ASSERT(ra.err != 0); out: - dmu_objset_close(os); - if (ra.err != 0) { /* - * rollback or destroy what we created, so we don't - * leave it in the restoring state. + * destroy what we created, so we don't leave it in the + * inconsistent restoring state. 
*/ txg_wait_synced(drc->drc_real_ds->ds_dir->dd_pool, 0); - dmu_recv_abort_cleanup(drc); + + (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, + B_FALSE); + if (drc->drc_real_ds != drc->drc_logical_ds) { + mutex_exit(&drc->drc_logical_ds->ds_recvlock); + dsl_dataset_rele(drc->drc_logical_ds, dmu_recv_tag); + } + } + + if (featureflags & DMU_BACKUP_FEATURE_DEDUP) { + void *cookie = NULL; + + while (gmep = avl_destroy_nodes(&ra.guid_to_ds_map, &cookie)) { + dsl_dataset_rele(gmep->gme_ds, &ra.guid_to_ds_map); + kmem_free(gmep, sizeof (guid_map_entry_t)); + } + avl_destroy(&ra.guid_to_ds_map); } kmem_free(ra.buf, ra.bufsize); @@ -1128,35 +1348,31 @@ recv_end_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) ds->ds_phys->ds_flags &= ~DS_FLAG_INCONSISTENT; } -int -dmu_recv_end(dmu_recv_cookie_t *drc) +static int +dmu_recv_existing_end(dmu_recv_cookie_t *drc) { struct recvendsyncarg resa; dsl_dataset_t *ds = drc->drc_logical_ds; int err; /* - * XXX hack; seems the ds is still dirty and - * dsl_pool_zil_clean() expects it to have a ds_user_ptr - * (and zil), but clone_swap() can close it. + * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean() + * expects it to have a ds_user_ptr (and zil), but clone_swap() + * can close it. */ txg_wait_synced(ds->ds_dir->dd_pool, 0); - if (ds != drc->drc_real_ds) { - /* we are doing an online recv */ - if (dsl_dataset_tryown(ds, FALSE, dmu_recv_tag)) { - err = dsl_dataset_clone_swap(drc->drc_real_ds, ds, - drc->drc_force); - if (err) - dsl_dataset_disown(ds, dmu_recv_tag); - } else { - err = EBUSY; - dsl_dataset_rele(ds, dmu_recv_tag); - } - /* dsl_dataset_destroy() will disown the ds */ - (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag); + if (dsl_dataset_tryown(ds, FALSE, dmu_recv_tag)) { + err = dsl_dataset_clone_swap(drc->drc_real_ds, ds, + drc->drc_force); if (err) - return (err); + goto out; + } else { + mutex_exit(&ds->ds_recvlock); + dsl_dataset_rele(ds, dmu_recv_tag); + (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, + B_FALSE); + return (EBUSY); } resa.creation_time = drc->drc_drrb->drr_creation_time; @@ -1166,16 +1382,52 @@ dmu_recv_end(dmu_recv_cookie_t *drc) err = dsl_sync_task_do(ds->ds_dir->dd_pool, recv_end_check, recv_end_sync, ds, &resa, 3); if (err) { - if (drc->drc_newfs) { - ASSERT(ds == drc->drc_real_ds); - (void) dsl_dataset_destroy(ds, dmu_recv_tag); - return (err); - } else { - (void) dsl_dataset_rollback(ds, DMU_OST_NONE); - } + /* swap back */ + (void) dsl_dataset_clone_swap(drc->drc_real_ds, ds, B_TRUE); } - /* release the hold from dmu_recv_begin */ +out: + mutex_exit(&ds->ds_recvlock); dsl_dataset_disown(ds, dmu_recv_tag); + (void) dsl_dataset_destroy(drc->drc_real_ds, dmu_recv_tag, B_FALSE); return (err); } + +static int +dmu_recv_new_end(dmu_recv_cookie_t *drc) +{ + struct recvendsyncarg resa; + dsl_dataset_t *ds = drc->drc_logical_ds; + int err; + + /* + * XXX hack; seems the ds is still dirty and dsl_pool_zil_clean() + * expects it to have a ds_user_ptr (and zil), but clone_swap() + * can close it. 
+ */ + txg_wait_synced(ds->ds_dir->dd_pool, 0); + + resa.creation_time = drc->drc_drrb->drr_creation_time; + resa.toguid = drc->drc_drrb->drr_toguid; + resa.tosnap = drc->drc_tosnap; + + err = dsl_sync_task_do(ds->ds_dir->dd_pool, + recv_end_check, recv_end_sync, ds, &resa, 3); + if (err) { + /* clean up the fs we just recv'd into */ + (void) dsl_dataset_destroy(ds, dmu_recv_tag, B_FALSE); + } else { + /* release the hold from dmu_recv_begin */ + dsl_dataset_disown(ds, dmu_recv_tag); + } + return (err); +} + +int +dmu_recv_end(dmu_recv_cookie_t *drc) +{ + if (drc->drc_logical_ds != drc->drc_real_ds) + return (dmu_recv_existing_end(drc)); + else + return (dmu_recv_new_end(drc)); +} diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_traverse.c b/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_traverse.c index 5124014707731..692feb6809b1b 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_traverse.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_traverse.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -35,14 +35,6 @@ #include #include -#define SET_BOOKMARK(zb, objset, object, level, blkid) \ -{ \ - (zb)->zb_objset = objset; \ - (zb)->zb_object = object; \ - (zb)->zb_level = level; \ - (zb)->zb_blkid = blkid; \ -} - struct prefetch_data { kmutex_t pd_mtx; kcondvar_t pd_cv; @@ -64,28 +56,32 @@ struct traverse_data { void *td_arg; }; +static int traverse_dnode(struct traverse_data *td, const dnode_phys_t *dnp, + arc_buf_t *buf, uint64_t objset, uint64_t object); + /* ARGSUSED */ -static void +static int traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) { struct traverse_data *td = arg; zbookmark_t zb; if (bp->blk_birth == 0) - return; + return (0); if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(td->td_spa)) - return; + return (0); - zb.zb_objset = td->td_objset; - zb.zb_object = 0; - zb.zb_level = -1; - zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ]; - VERIFY(0 == td->td_func(td->td_spa, bp, &zb, NULL, td->td_arg)); + SET_BOOKMARK(&zb, td->td_objset, ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, + bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); + + (void) td->td_func(td->td_spa, zilog, bp, &zb, NULL, td->td_arg); + + return (0); } /* ARGSUSED */ -static void +static int traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) { struct traverse_data *td = arg; @@ -96,17 +92,18 @@ traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) zbookmark_t zb; if (bp->blk_birth == 0) - return; + return (0); if (claim_txg == 0 || bp->blk_birth < claim_txg) - return; + return (0); + + SET_BOOKMARK(&zb, td->td_objset, lr->lr_foid, ZB_ZIL_LEVEL, + lr->lr_offset / BP_GET_LSIZE(bp)); - zb.zb_objset = td->td_objset; - zb.zb_object = lr->lr_foid; - zb.zb_level = BP_GET_LEVEL(bp); - zb.zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp); - VERIFY(0 == td->td_func(td->td_spa, bp, &zb, NULL, td->td_arg)); + (void) td->td_func(td->td_spa, zilog, bp, &zb, NULL, + td->td_arg); } + return (0); } static void @@ -117,9 +114,9 @@ traverse_zil(struct traverse_data *td, zil_header_t *zh) /* * We only want to visit blocks that have been claimed but not yet - * replayed (or, in read-only mode, blocks that *would* be claimed). + * replayed; plus, in read-only mode, blocks that are already stable. 
*/ - if (claim_txg == 0 && (spa_mode & FWRITE)) + if (claim_txg == 0 && spa_writeable(td->td_spa)) return; zilog = zil_alloc(spa_get_dsl(td->td_spa)->dp_meta_objset, zh); @@ -135,12 +132,13 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp, arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb) { zbookmark_t czb; - int err = 0; + int err = 0, lasterr = 0; arc_buf_t *buf = NULL; struct prefetch_data *pd = td->td_pfd; + boolean_t hard = td->td_flags & TRAVERSE_HARD; if (bp->blk_birth == 0) { - err = td->td_func(td->td_spa, NULL, zb, dnp, td->td_arg); + err = td->td_func(td->td_spa, NULL, NULL, zb, dnp, td->td_arg); return (err); } @@ -160,7 +158,7 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp, } if (td->td_flags & TRAVERSE_PRE) { - err = td->td_func(td->td_spa, bp, zb, dnp, td->td_arg); + err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg); if (err) return (err); } @@ -184,12 +182,15 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp, zb->zb_level - 1, zb->zb_blkid * epb + i); err = traverse_visitbp(td, dnp, buf, cbp, &czb); - if (err) - break; + if (err) { + if (!hard) + break; + lasterr = err; + } } } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { uint32_t flags = ARC_WAIT; - int i, j; + int i; int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; err = arc_read(NULL, td->td_spa, bp, pbuf, @@ -200,21 +201,19 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp, /* recursively visitbp() blocks below this */ dnp = buf->b_data; - for (i = 0; i < epb && err == 0; i++, dnp++) { - for (j = 0; j < dnp->dn_nblkptr; j++) { - SET_BOOKMARK(&czb, zb->zb_objset, - zb->zb_blkid * epb + i, - dnp->dn_nlevels - 1, j); - err = traverse_visitbp(td, dnp, buf, - (blkptr_t *)&dnp->dn_blkptr[j], &czb); - if (err) + for (i = 0; i < epb; i++, dnp++) { + err = traverse_dnode(td, dnp, buf, zb->zb_objset, + zb->zb_blkid * epb + i); + if (err) { + if (!hard) break; + lasterr = err; } } } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { uint32_t flags = ARC_WAIT; objset_phys_t *osp; - int j; + dnode_phys_t *dnp; err = arc_read_nolock(NULL, td->td_spa, bp, arc_getbuf_func, &buf, @@ -223,36 +222,65 @@ traverse_visitbp(struct traverse_data *td, const dnode_phys_t *dnp, return (err); osp = buf->b_data; - /* - * traverse_zil is just here for zdb's leak checking. - * For other consumers, there will be no ZIL blocks. 
- */ traverse_zil(td, &osp->os_zil_header); - for (j = 0; j < osp->os_meta_dnode.dn_nblkptr; j++) { - SET_BOOKMARK(&czb, zb->zb_objset, 0, - osp->os_meta_dnode.dn_nlevels - 1, j); - err = traverse_visitbp(td, &osp->os_meta_dnode, buf, - (blkptr_t *)&osp->os_meta_dnode.dn_blkptr[j], - &czb); - if (err) - break; + dnp = &osp->os_meta_dnode; + err = traverse_dnode(td, dnp, buf, zb->zb_objset, + DMU_META_DNODE_OBJECT); + if (err && hard) { + lasterr = err; + err = 0; + } + if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) { + dnp = &osp->os_userused_dnode; + err = traverse_dnode(td, dnp, buf, zb->zb_objset, + DMU_USERUSED_OBJECT); + } + if (err && hard) { + lasterr = err; + err = 0; + } + if (err == 0 && arc_buf_size(buf) >= sizeof (objset_phys_t)) { + dnp = &osp->os_groupused_dnode; + err = traverse_dnode(td, dnp, buf, zb->zb_objset, + DMU_GROUPUSED_OBJECT); } } if (buf) (void) arc_buf_remove_ref(buf, &buf); - if (err == 0 && (td->td_flags & TRAVERSE_POST)) - err = td->td_func(td->td_spa, bp, zb, dnp, td->td_arg); + if (err == 0 && lasterr == 0 && (td->td_flags & TRAVERSE_POST)) + err = td->td_func(td->td_spa, NULL, bp, zb, dnp, td->td_arg); - return (err); + return (err != 0 ? err : lasterr); +} + +static int +traverse_dnode(struct traverse_data *td, const dnode_phys_t *dnp, + arc_buf_t *buf, uint64_t objset, uint64_t object) +{ + int j, err = 0, lasterr = 0; + zbookmark_t czb; + boolean_t hard = (td->td_flags & TRAVERSE_HARD); + + for (j = 0; j < dnp->dn_nblkptr; j++) { + SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); + err = traverse_visitbp(td, dnp, buf, + (blkptr_t *)&dnp->dn_blkptr[j], &czb); + if (err) { + if (!hard) + break; + lasterr = err; + } + } + return (err != 0 ? err : lasterr); } /* ARGSUSED */ static int -traverse_prefetcher(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb, - const dnode_phys_t *dnp, void *arg) +traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, + const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) { struct prefetch_data *pfd = arg; uint32_t aflags = ARC_NOWAIT | ARC_PREFETCH; @@ -262,7 +290,8 @@ traverse_prefetcher(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb, return (EINTR); if (bp == NULL || !((pfd->pd_flags & TRAVERSE_PREFETCH_DATA) || - BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0)) + BP_GET_TYPE(bp) == DMU_OT_DNODE || BP_GET_LEVEL(bp) > 0) || + BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG) return (0); mutex_enter(&pfd->pd_mtx); @@ -291,7 +320,8 @@ traverse_prefetch_thread(void *arg) td.td_arg = td_main->td_pfd; td.td_pfd = NULL; - SET_BOOKMARK(&czb, td.td_objset, 0, -1, 0); + SET_BOOKMARK(&czb, td.td_objset, + ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); (void) traverse_visitbp(&td, NULL, NULL, td.td_rootbp, &czb); mutex_enter(&td_main->td_pfd->pd_mtx); @@ -332,7 +362,8 @@ traverse_impl(spa_t *spa, uint64_t objset, blkptr_t *rootbp, &td, TQ_NOQUEUE)) pd.pd_exited = B_TRUE; - SET_BOOKMARK(&czb, objset, 0, -1, 0); + SET_BOOKMARK(&czb, objset, + ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); err = traverse_visitbp(&td, NULL, NULL, rootbp, &czb); mutex_enter(&pd.pd_mtx); @@ -364,43 +395,59 @@ traverse_dataset(dsl_dataset_t *ds, uint64_t txg_start, int flags, * NB: pool must not be changing on-disk (eg, from zdb or sync context). 
*/ int -traverse_pool(spa_t *spa, blkptr_cb_t func, void *arg) +traverse_pool(spa_t *spa, uint64_t txg_start, int flags, + blkptr_cb_t func, void *arg) { - int err; + int err, lasterr = 0; uint64_t obj; dsl_pool_t *dp = spa_get_dsl(spa); objset_t *mos = dp->dp_meta_objset; + boolean_t hard = (flags & TRAVERSE_HARD); /* visit the MOS */ err = traverse_impl(spa, 0, spa_get_rootblkptr(spa), - 0, TRAVERSE_PRE, func, arg); + txg_start, flags, func, arg); if (err) return (err); /* visit each dataset */ - for (obj = 1; err == 0; err = dmu_object_next(mos, &obj, FALSE, 0)) { + for (obj = 1; err == 0 || (err != ESRCH && hard); + err = dmu_object_next(mos, &obj, FALSE, txg_start)) { dmu_object_info_t doi; err = dmu_object_info(mos, obj, &doi); - if (err) - return (err); + if (err) { + if (!hard) + return (err); + lasterr = err; + continue; + } if (doi.doi_type == DMU_OT_DSL_DATASET) { dsl_dataset_t *ds; + uint64_t txg = txg_start; + rw_enter(&dp->dp_config_rwlock, RW_READER); err = dsl_dataset_hold_obj(dp, obj, FTAG, &ds); rw_exit(&dp->dp_config_rwlock); - if (err) - return (err); - err = traverse_dataset(ds, - ds->ds_phys->ds_prev_snap_txg, TRAVERSE_PRE, - func, arg); + if (err) { + if (!hard) + return (err); + lasterr = err; + continue; + } + if (ds->ds_phys->ds_prev_snap_txg > txg) + txg = ds->ds_phys->ds_prev_snap_txg; + err = traverse_dataset(ds, txg, flags, func, arg); dsl_dataset_rele(ds, FTAG); - if (err) - return (err); + if (err) { + if (!hard) + return (err); + lasterr = err; + } } } if (err == ESRCH) err = 0; - return (err); + return (err != 0 ? err : lasterr); } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_tx.c b/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_tx.c index bf560e5657c1c..87907a6e33bae 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_tx.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_tx.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -48,6 +48,8 @@ dmu_tx_create_dd(dsl_dir_t *dd) tx->tx_pool = dd->dd_pool; list_create(&tx->tx_holds, sizeof (dmu_tx_hold_t), offsetof(dmu_tx_hold_t, txh_node)); + list_create(&tx->tx_callbacks, sizeof (dmu_tx_callback_t), + offsetof(dmu_tx_callback_t, dcb_node)); #ifdef ZFS_DEBUG refcount_create(&tx->tx_space_written); refcount_create(&tx->tx_space_freed); @@ -58,9 +60,9 @@ dmu_tx_create_dd(dsl_dir_t *dd) dmu_tx_t * dmu_tx_create(objset_t *os) { - dmu_tx_t *tx = dmu_tx_create_dd(os->os->os_dsl_dataset->ds_dir); + dmu_tx_t *tx = dmu_tx_create_dd(os->os_dsl_dataset->ds_dir); tx->tx_objset = os; - tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os->os_dsl_dataset); + tx->tx_lastsnap_txg = dsl_dataset_prev_snap_txg(os->os_dsl_dataset); return (tx); } @@ -98,7 +100,7 @@ dmu_tx_hold_object_impl(dmu_tx_t *tx, objset_t *os, uint64_t object, int err; if (object != DMU_NEW_OBJECT) { - err = dnode_hold(os->os, object, tx, &dn); + err = dnode_hold(os, object, tx, &dn); if (err) { tx->tx_err = err; return (NULL); @@ -160,6 +162,50 @@ dmu_tx_check_ioerr(zio_t *zio, dnode_t *dn, int level, uint64_t blkid) return (err); } +static void +dmu_tx_count_twig(dmu_tx_hold_t *txh, dnode_t *dn, dmu_buf_impl_t *db, + int level, uint64_t blkid, boolean_t freeable, uint64_t *history) +{ + objset_t *os = dn->dn_objset; + dsl_dataset_t *ds = os->os_dsl_dataset; + int epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + dmu_buf_impl_t *parent = NULL; + blkptr_t *bp = NULL; + uint64_t space; + + if (level >= dn->dn_nlevels || history[level] == blkid) + return; + + history[level] = blkid; + + space = (level == 0) ? dn->dn_datablksz : (1ULL << dn->dn_indblkshift); + + if (db == NULL || db == dn->dn_dbuf) { + ASSERT(level != 0); + db = NULL; + } else { + ASSERT(db->db_dnode == dn); + ASSERT(db->db_level == level); + ASSERT(db->db.db_size == space); + ASSERT(db->db_blkid == blkid); + bp = db->db_blkptr; + parent = db->db_parent; + } + + freeable = (bp && (freeable || + dsl_dataset_block_freeable(ds, bp->blk_birth))); + + if (freeable) + txh->txh_space_tooverwrite += space; + else + txh->txh_space_towrite += space; + if (bp) + txh->txh_space_tounref += bp_get_dsize(os->os_spa, bp); + + dmu_tx_count_twig(txh, dn, parent, level + 1, + blkid >> epbs, freeable, history); +} + /* ARGSUSED */ static void dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) @@ -177,18 +223,24 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) min_ibs = DN_MIN_INDBLKSHIFT; max_ibs = DN_MAX_INDBLKSHIFT; - /* - * For i/o error checking, read the first and last level-0 - * blocks (if they are not aligned), and all the level-1 blocks. - */ - if (dn) { + uint64_t history[DN_MAX_LEVELS]; + int nlvls = dn->dn_nlevels; + int delta; + + /* + * For i/o error checking, read the first and last level-0 + * blocks (if they are not aligned), and all the level-1 blocks. + */ if (dn->dn_maxblkid == 0) { - if ((off > 0 || len < dn->dn_datablksz) && - off < dn->dn_datablksz) { + delta = dn->dn_datablksz; + start = (off < dn->dn_datablksz) ? 0 : 1; + end = (off+len <= dn->dn_datablksz) ? 
0 : 1; + if (start == 0 && (off > 0 || len < dn->dn_datablksz)) { err = dmu_tx_check_ioerr(NULL, dn, 0, 0); if (err) goto out; + delta -= off; } } else { zio_t *zio = zio_root(dn->dn_objset->os_spa, @@ -213,10 +265,9 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) } /* level-1 blocks */ - if (dn->dn_nlevels > 1) { - start >>= dn->dn_indblkshift - SPA_BLKPTRSHIFT; - end >>= dn->dn_indblkshift - SPA_BLKPTRSHIFT; - for (i = start+1; i < end; i++) { + if (nlvls > 1) { + int shft = dn->dn_indblkshift - SPA_BLKPTRSHIFT; + for (i = (start>>shft)+1; i < end>>shft; i++) { err = dmu_tx_check_ioerr(zio, dn, 1, i); if (err) goto out; @@ -226,20 +277,59 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) err = zio_wait(zio); if (err) goto out; + delta = P2NPHASE(off, dn->dn_datablksz); } - } - /* - * If there's more than one block, the blocksize can't change, - * so we can make a more precise estimate. Alternatively, - * if the dnode's ibs is larger than max_ibs, always use that. - * This ensures that if we reduce DN_MAX_INDBLKSHIFT, - * the code will still work correctly on existing pools. - */ - if (dn && (dn->dn_maxblkid != 0 || dn->dn_indblkshift > max_ibs)) { - min_ibs = max_ibs = dn->dn_indblkshift; - if (dn->dn_datablkshift != 0) + if (dn->dn_maxblkid > 0) { + /* + * The blocksize can't change, + * so we can make a more precise estimate. + */ + ASSERT(dn->dn_datablkshift != 0); min_bs = max_bs = dn->dn_datablkshift; + min_ibs = max_ibs = dn->dn_indblkshift; + } else if (dn->dn_indblkshift > max_ibs) { + /* + * This ensures that if we reduce DN_MAX_INDBLKSHIFT, + * the code will still work correctly on older pools. + */ + min_ibs = max_ibs = dn->dn_indblkshift; + } + + /* + * If this write is not off the end of the file + * we need to account for overwrites/unref. + */ + if (start <= dn->dn_maxblkid) { + for (int l = 0; l < DN_MAX_LEVELS; l++) + history[l] = -1ULL; + } + while (start <= dn->dn_maxblkid) { + dmu_buf_impl_t *db; + + rw_enter(&dn->dn_struct_rwlock, RW_READER); + db = dbuf_hold_level(dn, 0, start, FTAG); + rw_exit(&dn->dn_struct_rwlock); + dmu_tx_count_twig(txh, dn, db, 0, start, B_FALSE, + history); + dbuf_rele(db, FTAG); + if (++start > end) { + /* + * Account for new indirects appearing + * before this IO gets assigned into a txg. + */ + bits = 64 - min_bs; + epbs = min_ibs - SPA_BLKPTRSHIFT; + for (bits -= epbs * (nlvls - 1); + bits >= 0; bits -= epbs) + txh->txh_fudge += 1ULL << max_ibs; + goto out; + } + off += delta; + if (len >= delta) + len -= delta; + delta = dn->dn_datablksz; + } } /* @@ -262,20 +352,22 @@ dmu_tx_count_write(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) for (bits = 64 - min_bs; bits >= 0; bits -= epbs) { start >>= epbs; end >>= epbs; - /* - * If we increase the number of levels of indirection, - * we'll need new blkid=0 indirect blocks. If start == 0, - * we're already accounting for that blocks; and if end == 0, - * we can't increase the number of levels beyond that. - */ - if (start != 0 && end != 0) - txh->txh_space_towrite += 1ULL << max_ibs; + ASSERT3U(end, >=, start); txh->txh_space_towrite += (end - start + 1) << max_ibs; + if (start != 0) { + /* + * We also need a new blkid=0 indirect block + * to reference any existing file data. 
+ */ + txh->txh_space_towrite += 1ULL << max_ibs; + } } - ASSERT(txh->txh_space_towrite < 2 * DMU_MAX_ACCESS); - out: + if (txh->txh_space_towrite + txh->txh_space_tooverwrite > + 2 * DMU_MAX_ACCESS) + err = EFBIG; + if (err) txh->txh_tx->tx_err = err; } @@ -284,7 +376,7 @@ static void dmu_tx_count_dnode(dmu_tx_hold_t *txh) { dnode_t *dn = txh->txh_dnode; - dnode_t *mdn = txh->txh_tx->tx_objset->os->os_meta_dnode; + dnode_t *mdn = txh->txh_tx->tx_objset->os_meta_dnode; uint64_t space = mdn->dn_datablksz + ((mdn->dn_nlevels-1) << mdn->dn_indblkshift); @@ -292,6 +384,7 @@ dmu_tx_count_dnode(dmu_tx_hold_t *txh) dsl_dataset_block_freeable(dn->dn_objset->os_dsl_dataset, dn->dn_dbuf->db_blkptr->blk_birth)) { txh->txh_space_tooverwrite += space; + txh->txh_space_tounref += space; } else { txh->txh_space_towrite += space; if (dn && dn->dn_dbuf->db_blkptr) @@ -366,7 +459,7 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) bp += blkid + i; if (dsl_dataset_block_freeable(ds, bp->blk_birth)) { dprintf_bp(bp, "can free old%s", ""); - space += bp_get_dasize(spa, bp); + space += bp_get_dsize(spa, bp); } unref += BP_GET_ASIZE(bp); } @@ -425,11 +518,15 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) dbuf = dbuf_hold_level(dn, 1, blkid >> epbs, FTAG); txh->txh_memory_tohold += dbuf->db.db_size; - if (txh->txh_memory_tohold > DMU_MAX_ACCESS) { - txh->txh_tx->tx_err = E2BIG; - dbuf_rele(dbuf, FTAG); - break; - } + + /* + * We don't check memory_tohold against DMU_MAX_ACCESS because + * memory_tohold is an over-estimation (especially the >L1 + * indirect blocks), so it could fail. Callers should have + * already verified that they will not be holding too much + * memory. + */ + err = dbuf_read(dbuf, NULL, DB_RF_HAVESTRUCT | DB_RF_CANFAIL); if (err != 0) { txh->txh_tx->tx_err = err; @@ -443,7 +540,7 @@ dmu_tx_count_free(dmu_tx_hold_t *txh, uint64_t off, uint64_t len) for (i = 0; i < tochk; i++) { if (dsl_dataset_block_freeable(ds, bp[i].blk_birth)) { dprintf_bp(&bp[i], "can free old%s", ""); - space += bp_get_dasize(spa, &bp[i]); + space += bp_get_dsize(spa, &bp[i]); } unref += BP_GET_ASIZE(bp); } @@ -488,6 +585,8 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) if (len != DMU_OBJECT_END) dmu_tx_count_write(txh, off+len, 1); + dmu_tx_count_dnode(txh); + if (off >= (dn->dn_maxblkid+1) * dn->dn_datablksz) return; if (len == DMU_OBJECT_END) @@ -530,12 +629,11 @@ dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len) } } - dmu_tx_count_dnode(txh); dmu_tx_count_free(txh, off, len); } void -dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name) +dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name) { dmu_tx_hold_t *txh; dnode_t *dn; @@ -584,9 +682,9 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name) txh->txh_space_tooverwrite += SPA_MAXBLOCKSIZE; } else { txh->txh_space_towrite += SPA_MAXBLOCKSIZE; - txh->txh_space_tounref += - BP_GET_ASIZE(dn->dn_phys->dn_blkptr); } + if (dn->dn_phys->dn_blkptr[0].blk_birth) + txh->txh_space_tounref += SPA_MAXBLOCKSIZE; return; } @@ -595,7 +693,7 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name) * access the name in this fat-zap so that we'll check * for i/o errors to the leaf blocks, etc. 
*/ - err = zap_lookup(&dn->dn_objset->os, dn->dn_object, name, + err = zap_lookup(dn->dn_objset, dn->dn_object, name, 8, 0, NULL); if (err == EIO) { tx->tx_err = err; @@ -603,12 +701,8 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name) } } - /* - * 3 blocks overwritten: target leaf, ptrtbl block, header block - * 3 new blocks written if adding: new split leaf, 2 grown ptrtbl blocks - */ - dmu_tx_count_write(txh, dn->dn_maxblkid * dn->dn_datablksz, - (3 + (add ? 3 : 0)) << dn->dn_datablkshift); + err = zap_count_write(dn->dn_objset, dn->dn_object, name, add, + &txh->txh_space_towrite, &txh->txh_space_tooverwrite); /* * If the modified blocks are scattered to the four winds, @@ -616,7 +710,10 @@ dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name) */ epbs = dn->dn_indblkshift - SPA_BLKPTRSHIFT; for (nblocks = dn->dn_maxblkid >> epbs; nblocks != 0; nblocks >>= epbs) - txh->txh_space_towrite += 3 << dn->dn_indblkshift; + if (dn->dn_objset->os_dsl_dataset->ds_phys->ds_prev_snap_obj) + txh->txh_space_towrite += 3 << dn->dn_indblkshift; + else + txh->txh_space_tooverwrite += 3 << dn->dn_indblkshift; } void @@ -679,7 +776,7 @@ dmu_tx_dirty_buf(dmu_tx_t *tx, dmu_buf_impl_t *db) dnode_t *dn = db->db_dnode; ASSERT(tx->tx_txg != 0); - ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset->os); + ASSERT(tx->tx_objset == NULL || dn->dn_objset == tx->tx_objset); ASSERT3U(dn->dn_object, ==, db->db.db_object); if (tx->tx_anyobj) @@ -839,7 +936,7 @@ dmu_tx_try_assign(dmu_tx_t *tx, uint64_t txg_how) * assume that we won't be able to free or overwrite anything. */ if (tx->tx_objset && - dsl_dataset_prev_snap_txg(tx->tx_objset->os->os_dsl_dataset) > + dsl_dataset_prev_snap_txg(tx->tx_objset->os_dsl_dataset) > tx->tx_lastsnap_txg) { towrite += tooverwrite; tooverwrite = tofree = 0; @@ -1020,8 +1117,13 @@ dmu_tx_commit(dmu_tx_t *tx) if (tx->tx_tempreserve_cookie) dsl_dir_tempreserve_clear(tx->tx_tempreserve_cookie, tx); + if (!list_is_empty(&tx->tx_callbacks)) + txg_register_callbacks(&tx->tx_txgh, &tx->tx_callbacks); + if (tx->tx_anyobj == FALSE) txg_rele_to_sync(&tx->tx_txgh); + + list_destroy(&tx->tx_callbacks); list_destroy(&tx->tx_holds); #ifdef ZFS_DEBUG dprintf("towrite=%llu written=%llu tofree=%llu freed=%llu\n", @@ -1050,6 +1152,14 @@ dmu_tx_abort(dmu_tx_t *tx) if (dn != NULL) dnode_rele(dn, tx); } + + /* + * Call any registered callbacks with an error code. + */ + if (!list_is_empty(&tx->tx_callbacks)) + dmu_tx_do_callbacks(&tx->tx_callbacks, ECANCELED); + + list_destroy(&tx->tx_callbacks); list_destroy(&tx->tx_holds); #ifdef ZFS_DEBUG refcount_destroy_many(&tx->tx_space_written, @@ -1066,3 +1176,31 @@ dmu_tx_get_txg(dmu_tx_t *tx) ASSERT(tx->tx_txg != 0); return (tx->tx_txg); } + +void +dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *func, void *data) +{ + dmu_tx_callback_t *dcb; + + dcb = kmem_alloc(sizeof (dmu_tx_callback_t), KM_SLEEP); + + dcb->dcb_func = func; + dcb->dcb_data = data; + + list_insert_tail(&tx->tx_callbacks, dcb); +} + +/* + * Call all the commit callbacks on a list, with a given error code. 
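The new tx_callbacks list, dmu_tx_callback_register() and dmu_tx_do_callbacks() shown above let a caller find out when its transaction either commits (error 0) or is abandoned (ECANCELED from dmu_tx_abort()). A self-contained sketch of that register-once/fire-once pattern, using a plain singly linked list instead of the kernel list_t; the names here are illustrative, not the ZFS API:

#include <stdlib.h>

typedef void tx_cb_func_t(void *arg, int error);

typedef struct tx_cb {
	struct tx_cb	*next;
	tx_cb_func_t	*func;
	void		*arg;
} tx_cb_t;

/* Open context: remember a callback; it will run exactly once. */
static void
tx_cb_register(tx_cb_t **head, tx_cb_func_t *func, void *arg)
{
	tx_cb_t *cb = malloc(sizeof (*cb));

	cb->func = func;
	cb->arg = arg;
	cb->next = *head;
	*head = cb;
}

/* Commit or abort path: fire every callback with the outcome, then free it. */
static void
tx_cb_fire(tx_cb_t **head, int error)
{
	while (*head != NULL) {
		tx_cb_t *cb = *head;

		*head = cb->next;
		cb->func(cb->arg, error);
		free(cb);
	}
}

In the patch the commit side hands the list to txg_register_callbacks(), so on the success path the calls are deferred until the transaction group has synced.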
+ */ +void +dmu_tx_do_callbacks(list_t *cb_list, int error) +{ + dmu_tx_callback_t *dcb; + + while (dcb = list_head(cb_list)) { + list_remove(cb_list, dcb); + dcb->dcb_func(dcb->dcb_data, error); + kmem_free(dcb, sizeof (dmu_tx_callback_t)); + } +} diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_zfetch.c b/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_zfetch.c index 4d79fe98e17ee..37037c30f6235 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_zfetch.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/dmu_zfetch.c @@ -19,18 +19,17 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include #include #include #include +#include /* * I'm against tune-ables, but these should probably exist as tweakable globals @@ -59,6 +58,41 @@ static zstream_t *dmu_zfetch_stream_reclaim(zfetch_t *); static void dmu_zfetch_stream_remove(zfetch_t *, zstream_t *); static int dmu_zfetch_streams_equal(zstream_t *, zstream_t *); +typedef struct zfetch_stats { + kstat_named_t zfetchstat_hits; + kstat_named_t zfetchstat_misses; + kstat_named_t zfetchstat_colinear_hits; + kstat_named_t zfetchstat_colinear_misses; + kstat_named_t zfetchstat_stride_hits; + kstat_named_t zfetchstat_stride_misses; + kstat_named_t zfetchstat_reclaim_successes; + kstat_named_t zfetchstat_reclaim_failures; + kstat_named_t zfetchstat_stream_resets; + kstat_named_t zfetchstat_stream_noresets; + kstat_named_t zfetchstat_bogus_streams; +} zfetch_stats_t; + +static zfetch_stats_t zfetch_stats = { + { "hits", KSTAT_DATA_UINT64 }, + { "misses", KSTAT_DATA_UINT64 }, + { "colinear_hits", KSTAT_DATA_UINT64 }, + { "colinear_misses", KSTAT_DATA_UINT64 }, + { "stride_hits", KSTAT_DATA_UINT64 }, + { "stride_misses", KSTAT_DATA_UINT64 }, + { "reclaim_successes", KSTAT_DATA_UINT64 }, + { "reclaim_failures", KSTAT_DATA_UINT64 }, + { "streams_resets", KSTAT_DATA_UINT64 }, + { "streams_noresets", KSTAT_DATA_UINT64 }, + { "bogus_streams", KSTAT_DATA_UINT64 }, +}; + +#define ZFETCHSTAT_INCR(stat, val) \ + atomic_add_64(&zfetch_stats.stat.value.ui64, (val)); + +#define ZFETCHSTAT_BUMP(stat) ZFETCHSTAT_INCR(stat, 1); + +kstat_t *zfetch_ksp; + /* * Given a zfetch structure and a zstream structure, determine whether the * blocks to be read are part of a co-linear pair of existing prefetch @@ -192,7 +226,30 @@ dmu_zfetch_dofetch(zfetch_t *zf, zstream_t *zs) break; } zs->zst_ph_offset = prefetch_tail; - zs->zst_last = lbolt; + zs->zst_last = ddi_get_lbolt(); +} + +void +zfetch_init(void) +{ + + zfetch_ksp = kstat_create("zfs", 0, "zfetchstats", "misc", + KSTAT_TYPE_NAMED, sizeof (zfetch_stats) / sizeof (kstat_named_t), + KSTAT_FLAG_VIRTUAL); + + if (zfetch_ksp != NULL) { + zfetch_ksp->ks_data = &zfetch_stats; + kstat_install(zfetch_ksp); + } +} + +void +zfetch_fini(void) +{ + if (zfetch_ksp != NULL) { + kstat_delete(zfetch_ksp); + zfetch_ksp = NULL; + } } /* @@ -265,7 +322,7 @@ dmu_zfetch_fetchsz(dnode_t *dn, uint64_t blkid, uint64_t nblks) } /* - * given a zfetch and a zsearch structure, see if there is an associated zstream + * given a zfetch and a zstream structure, see if there is an associated zstream * for this block read. 
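The dmu_zfetch.c changes above add a zfetch_stats_t of kstat_named_t counters, bump them with atomic_add_64() through the ZFETCHSTAT_BUMP/ZFETCHSTAT_INCR macros, and publish them as a virtual kstat in zfetch_init(). Roughly the same counter pattern in portable C11, without the kstat framework and with made-up field names:

#include <stdatomic.h>
#include <stdio.h>

/* One atomic per statistic; readers may sample them at any time. */
typedef struct prefetch_stats {
	atomic_ulong	hits;
	atomic_ulong	misses;
	atomic_ulong	reclaim_failures;
} prefetch_stats_t;

static prefetch_stats_t pstats;

#define	PSTAT_BUMP(field)	atomic_fetch_add(&pstats.field, 1)

static void
pstat_print(void)
{
	printf("hits=%lu misses=%lu reclaim_failures=%lu\n",
	    atomic_load(&pstats.hits), atomic_load(&pstats.misses),
	    atomic_load(&pstats.reclaim_failures));
}

The hit/miss bumps added to dmu_zfetch() and dmu_zfetch_find() are the interesting part: they make it possible to see from outside whether the prefetcher is earning its keep.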
If so, it starts a prefetch for the stream it * located and returns true, otherwise it returns false */ @@ -297,6 +354,7 @@ dmu_zfetch_find(zfetch_t *zf, zstream_t *zh, int prefetched) */ if (zs->zst_len == 0) { /* bogus stream */ + ZFETCHSTAT_BUMP(zfetchstat_bogus_streams); continue; } @@ -306,9 +364,14 @@ dmu_zfetch_find(zfetch_t *zf, zstream_t *zh, int prefetched) */ if (zh->zst_offset >= zs->zst_offset && zh->zst_offset < zs->zst_offset + zs->zst_len) { - /* already fetched */ - rc = 1; - goto out; + if (prefetched) { + /* already fetched */ + ZFETCHSTAT_BUMP(zfetchstat_stride_hits); + rc = 1; + goto out; + } else { + ZFETCHSTAT_BUMP(zfetchstat_stride_misses); + } } /* @@ -413,6 +476,7 @@ dmu_zfetch_find(zfetch_t *zf, zstream_t *zh, int prefetched) if (reset) { zstream_t *remove = zs; + ZFETCHSTAT_BUMP(zfetchstat_stream_resets); rc = 0; mutex_exit(&zs->zst_lock); rw_exit(&zf->zf_rwlock); @@ -431,6 +495,7 @@ dmu_zfetch_find(zfetch_t *zf, zstream_t *zh, int prefetched) } } } else { + ZFETCHSTAT_BUMP(zfetchstat_stream_noresets); rc = 1; dmu_zfetch_dofetch(zf, zs); mutex_exit(&zs->zst_lock); @@ -487,13 +552,12 @@ dmu_zfetch_stream_insert(zfetch_t *zf, zstream_t *zs) zs_next = list_next(&zf->zf_stream, zs_walk); if (dmu_zfetch_streams_equal(zs_walk, zs)) { - return (0); + return (0); } } list_insert_head(&zf->zf_stream, zs); zf->zf_stream_cnt++; - return (1); } @@ -513,7 +577,7 @@ dmu_zfetch_stream_reclaim(zfetch_t *zf) for (zs = list_head(&zf->zf_stream); zs; zs = list_next(&zf->zf_stream, zs)) { - if (((lbolt - zs->zst_last) / hz) > zfetch_min_sec_reap) + if (((ddi_get_lbolt() - zs->zst_last)/hz) > zfetch_min_sec_reap) break; } @@ -597,8 +661,15 @@ dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched) P2ALIGN(offset, blksz)) >> blkshft; fetched = dmu_zfetch_find(zf, &zst, prefetched); - if (!fetched) { - fetched = dmu_zfetch_colinear(zf, &zst); + if (fetched) { + ZFETCHSTAT_BUMP(zfetchstat_hits); + } else { + ZFETCHSTAT_BUMP(zfetchstat_misses); + if (fetched = dmu_zfetch_colinear(zf, &zst)) { + ZFETCHSTAT_BUMP(zfetchstat_colinear_hits); + } else { + ZFETCHSTAT_BUMP(zfetchstat_colinear_misses); + } } if (!fetched) { @@ -608,11 +679,14 @@ dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched) * we still couldn't find a stream, drop the lock, and allocate * one if possible. Otherwise, give up and go home. 
*/ - if (newstream == NULL) { + if (newstream) { + ZFETCHSTAT_BUMP(zfetchstat_reclaim_successes); + } else { uint64_t maxblocks; uint32_t max_streams; uint32_t cur_streams; + ZFETCHSTAT_BUMP(zfetchstat_reclaim_failures); cur_streams = zf->zf_stream_cnt; maxblocks = zf->zf_dnode->dn_maxblkid; @@ -625,7 +699,6 @@ dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched) if (cur_streams >= max_streams) { return; } - newstream = kmem_zalloc(sizeof (zstream_t), KM_SLEEP); } @@ -635,7 +708,7 @@ dmu_zfetch(zfetch_t *zf, uint64_t offset, uint64_t size, int prefetched) newstream->zst_ph_offset = zst.zst_len + zst.zst_offset; newstream->zst_cap = zst.zst_len; newstream->zst_direction = ZFETCH_FORWARD; - newstream->zst_last = lbolt; + newstream->zst_last = ddi_get_lbolt(); mutex_init(&newstream->zst_lock, NULL, MUTEX_DEFAULT, NULL); diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/dnode.c b/external/cddl/osnet/dist/uts/common/fs/zfs/dnode.c index e77834d60dcc3..d15fe8d86243b 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/dnode.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/dnode.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -56,6 +56,8 @@ dnode_cons(void *arg, void *unused, int kmflag) rw_init(&dn->dn_struct_rwlock, NULL, RW_DEFAULT, NULL); mutex_init(&dn->dn_mtx, NULL, MUTEX_DEFAULT, NULL); mutex_init(&dn->dn_dbufs_mtx, NULL, MUTEX_DEFAULT, NULL); + cv_init(&dn->dn_notxholds, NULL, CV_DEFAULT, NULL); + refcount_create(&dn->dn_holds); refcount_create(&dn->dn_tx_holds); @@ -84,6 +86,7 @@ dnode_dest(void *arg, void *unused) rw_destroy(&dn->dn_struct_rwlock); mutex_destroy(&dn->dn_mtx); mutex_destroy(&dn->dn_dbufs_mtx); + cv_destroy(&dn->dn_notxholds); refcount_destroy(&dn->dn_holds); refcount_destroy(&dn->dn_tx_holds); @@ -153,7 +156,7 @@ dnode_verify(dnode_t *dn) } if (dn->dn_phys->dn_type != DMU_OT_NONE) ASSERT3U(dn->dn_phys->dn_nlevels, <=, dn->dn_nlevels); - ASSERT(dn->dn_object == DMU_META_DNODE_OBJECT || dn->dn_dbuf != NULL); + ASSERT(DMU_OBJECT_IS_SPECIAL(dn->dn_object) || dn->dn_dbuf != NULL); if (dn->dn_dbuf != NULL) { ASSERT3P(dn->dn_phys, ==, (dnode_phys_t *)dn->dn_dbuf->db.db_data + @@ -269,7 +272,7 @@ dnode_setdblksz(dnode_t *dn, int size) } static dnode_t * -dnode_create(objset_impl_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db, +dnode_create(objset_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db, uint64_t object) { dnode_t *dn = kmem_cache_alloc(dnode_cache, KM_SLEEP); @@ -299,14 +302,14 @@ dnode_create(objset_impl_t *os, dnode_phys_t *dnp, dmu_buf_impl_t *db, list_insert_head(&os->os_dnodes, dn); mutex_exit(&os->os_lock); - arc_space_consume(sizeof (dnode_t)); + arc_space_consume(sizeof (dnode_t), ARC_SPACE_OTHER); return (dn); } static void dnode_destroy(dnode_t *dn) { - objset_impl_t *os = dn->dn_objset; + objset_t *os = dn->dn_objset; #ifdef ZFS_DEBUG int i; @@ -318,6 +321,7 @@ dnode_destroy(dnode_t *dn) } ASSERT(NULL == list_head(&dn->dn_dbufs)); #endif + ASSERT(dn->dn_oldphys == NULL); mutex_enter(&os->os_lock); list_remove(&os->os_dnodes, dn); @@ -334,7 +338,7 @@ dnode_destroy(dnode_t *dn) dn->dn_bonus = NULL; } kmem_cache_free(dnode_cache, dn); - arc_space_return(sizeof (dnode_t)); + arc_space_return(sizeof (dnode_t), ARC_SPACE_OTHER); } void @@ -414,8 +418,7 @@ void dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t 
*tx) { - int i, old_nblkptr; - dmu_buf_impl_t *db = NULL; + int nblkptr; ASSERT3U(blocksize, >=, SPA_MINBLOCKSIZE); ASSERT3U(blocksize, <=, SPA_MAXBLOCKSIZE); @@ -427,57 +430,40 @@ dnode_reallocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, ASSERT3U(bonustype, <, DMU_OT_NUMTYPES); ASSERT3U(bonuslen, <=, DN_MAX_BONUSLEN); - for (i = 0; i < TXG_SIZE; i++) - ASSERT(!list_link_active(&dn->dn_dirty_link[i])); - /* clean up any unreferenced dbufs */ dnode_evict_dbufs(dn); - ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL); - /* - * XXX I should really have a generation number to tell if we - * need to do this... - */ - if (blocksize != dn->dn_datablksz || - dn->dn_bonustype != bonustype || dn->dn_bonuslen != bonuslen) { - /* free all old data */ - dnode_free_range(dn, 0, -1ULL, tx); - } - - /* change blocksize */ rw_enter(&dn->dn_struct_rwlock, RW_WRITER); - if (blocksize != dn->dn_datablksz && - (!BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) || - list_head(&dn->dn_dbufs) != NULL)) { - db = dbuf_hold(dn, 0, FTAG); - dbuf_new_size(db, blocksize, tx); - } - dnode_setdblksz(dn, blocksize); dnode_setdirty(dn, tx); - dn->dn_next_bonuslen[tx->tx_txg&TXG_MASK] = bonuslen; - dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = blocksize; + if (dn->dn_datablksz != blocksize) { + /* change blocksize */ + ASSERT(dn->dn_maxblkid == 0 && + (BP_IS_HOLE(&dn->dn_phys->dn_blkptr[0]) || + dnode_block_freed(dn, 0))); + dnode_setdblksz(dn, blocksize); + dn->dn_next_blksz[tx->tx_txg&TXG_MASK] = blocksize; + } + if (dn->dn_bonuslen != bonuslen) + dn->dn_next_bonuslen[tx->tx_txg&TXG_MASK] = bonuslen; + nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT); + if (dn->dn_nblkptr != nblkptr) + dn->dn_next_nblkptr[tx->tx_txg&TXG_MASK] = nblkptr; rw_exit(&dn->dn_struct_rwlock); - if (db) - dbuf_rele(db, FTAG); /* change type */ dn->dn_type = ot; /* change bonus size and type */ mutex_enter(&dn->dn_mtx); - old_nblkptr = dn->dn_nblkptr; dn->dn_bonustype = bonustype; dn->dn_bonuslen = bonuslen; - dn->dn_nblkptr = 1 + ((DN_MAX_BONUSLEN - bonuslen) >> SPA_BLKPTRSHIFT); + dn->dn_nblkptr = nblkptr; dn->dn_checksum = ZIO_CHECKSUM_INHERIT; dn->dn_compress = ZIO_COMPRESS_INHERIT; ASSERT3U(dn->dn_nblkptr, <=, DN_MAX_NBLKPTR); - /* XXX - for now, we can't make nblkptr smaller */ - ASSERT3U(dn->dn_nblkptr, >=, old_nblkptr); - - /* fix up the bonus db_size if dn_nblkptr has changed */ - if (dn->dn_bonus && dn->dn_bonuslen != old_nblkptr) { + /* fix up the bonus db_size */ + if (dn->dn_bonus) { dn->dn_bonus->db.db_size = DN_MAX_BONUSLEN - (dn->dn_nblkptr-1) * sizeof (blkptr_t); ASSERT(dn->dn_bonuslen <= dn->dn_bonus->db.db_size); @@ -502,7 +488,7 @@ dnode_special_close(dnode_t *dn) } dnode_t * -dnode_special_open(objset_impl_t *os, dnode_phys_t *dnp, uint64_t object) +dnode_special_open(objset_t *os, dnode_phys_t *dnp, uint64_t object) { dnode_t *dn = dnode_create(os, dnp, NULL, object); DNODE_VERIFY(dn); @@ -549,7 +535,7 @@ dnode_buf_pageout(dmu_buf_t *db, void *arg) * succeeds even for free dnodes. */ int -dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag, +dnode_hold_impl(objset_t *os, uint64_t object, int flag, void *tag, dnode_t **dnp) { int epb, idx, err; @@ -566,6 +552,22 @@ dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag, */ ASSERT(spa_config_held(os->os_spa, SCL_ALL, RW_WRITER) == 0); + if (object == DMU_USERUSED_OBJECT || object == DMU_GROUPUSED_OBJECT) { + dn = (object == DMU_USERUSED_OBJECT) ? 
+ os->os_userused_dnode : os->os_groupused_dnode; + if (dn == NULL) + return (ENOENT); + type = dn->dn_type; + if ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) + return (ENOENT); + if ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE) + return (EEXIST); + DNODE_VERIFY(dn); + (void) refcount_add(&dn->dn_holds, tag); + *dnp = dn; + return (0); + } + if (object == 0 || object >= DN_MAX_OBJECT) return (EINVAL); @@ -624,7 +626,8 @@ dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag, type = dn->dn_type; if (dn->dn_free_txg || ((flag & DNODE_MUST_BE_ALLOCATED) && type == DMU_OT_NONE) || - ((flag & DNODE_MUST_BE_FREE) && type != DMU_OT_NONE)) { + ((flag & DNODE_MUST_BE_FREE) && + (type != DMU_OT_NONE || dn->dn_oldphys))) { mutex_exit(&dn->dn_mtx); dbuf_rele(db, FTAG); return (type == DMU_OT_NONE ? ENOENT : EEXIST); @@ -647,7 +650,7 @@ dnode_hold_impl(objset_impl_t *os, uint64_t object, int flag, * Return held dnode if the object is allocated, NULL if not. */ int -dnode_hold(objset_impl_t *os, uint64_t object, void *tag, dnode_t **dnp) +dnode_hold(objset_t *os, uint64_t object, void *tag, dnode_t **dnp) { return (dnode_hold_impl(os, object, DNODE_MUST_BE_ALLOCATED, tag, dnp)); } @@ -686,11 +689,13 @@ dnode_rele(dnode_t *dn, void *tag) void dnode_setdirty(dnode_t *dn, dmu_tx_t *tx) { - objset_impl_t *os = dn->dn_objset; + objset_t *os = dn->dn_objset; uint64_t txg = tx->tx_txg; - if (dn->dn_object == DMU_META_DNODE_OBJECT) + if (DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { + dsl_dataset_dirty(os->os_dsl_dataset, tx); return; + } DNODE_VERIFY(dn); @@ -1186,11 +1191,6 @@ dnode_block_freed(dnode_t *dn, uint64_t blkid) if (dn->dn_free_txg) return (TRUE); - /* - * If dn_datablkshift is not set, then there's only a single - * block, in which case there will never be a free range so it - * won't matter. - */ range_tofind.fr_blkid = blkid; mutex_enter(&dn->dn_mtx); for (i = 0; i < TXG_SIZE; i++) { @@ -1248,7 +1248,7 @@ dnode_diduse_space(dnode_t *dn, int64_t delta) void dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx) { - objset_impl_t *os = dn->dn_objset; + objset_t *os = dn->dn_objset; dsl_dataset_t *ds = os->os_dsl_dataset; if (space > 0) @@ -1260,6 +1260,22 @@ dnode_willuse_space(dnode_t *dn, int64_t space, dmu_tx_t *tx) dmu_tx_willuse_space(tx, space); } +/* + * This function scans a block at the indicated "level" looking for + * a hole or data (depending on 'flags'). If level > 0, then we are + * scanning an indirect block looking at its pointers. If level == 0, + * then we are looking at a block of dnodes. If we don't find what we + * are looking for in the block, we return ESRCH. Otherwise, return + * with *offset pointing to the beginning (if searching forwards) or + * end (if searching backwards) of the range covered by the block + * pointer we matched on (or dnode). + * + * The basic search algorithm used below by dnode_next_offset() is to + * use this function to search up the block tree (widen the search) until + * we find something (i.e., we don't return ESRCH) and then search back + * down the tree (narrow the search) until we reach our original search + * level. 
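The block comment added above for dnode_next_offset_level() describes the overall search: widen up the indirect tree until a match is found, then narrow back down to the requested level, with ESRCH meaning nothing in the current block matched. The level-0 case boils down to scanning a block of dnodes in either direction until one matches the hole/data test ((dnp[i].dn_type == DMU_OT_NONE) == hole). A toy version of just that scan over an array of in-use flags:

#include <stdbool.h>

/*
 * Walk 'nslots' flags starting at 'start', forwards or backwards, and
 * return the first index whose state matches what we want: a free slot
 * when 'hole' is set, an in-use slot otherwise.  -1 plays the role of
 * ESRCH when the block holds no match.
 */
static int
scan_block(const bool *in_use, int nslots, int start, bool hole,
    bool backwards)
{
	int inc = backwards ? -1 : 1;

	for (int i = start; i >= 0 && i < nslots; i += inc) {
		if (in_use[i] != hole)
			return (i);
	}
	return (-1);
}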
+ */ static int dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, int lvl, uint64_t blkfill, uint64_t txg) @@ -1275,7 +1291,7 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, dprintf("probing object %llu offset %llx level %d of %u\n", dn->dn_object, *offset, lvl, dn->dn_phys->dn_nlevels); - hole = flags & DNODE_FIND_HOLE; + hole = ((flags & DNODE_FIND_HOLE) != 0); inc = (flags & DNODE_FIND_BACKWARDS) ? -1 : 1; ASSERT(txg == 0 || !hole); @@ -1322,16 +1338,7 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, for (i = (*offset >> span) & (blkfill - 1); i >= 0 && i < blkfill; i += inc) { - boolean_t newcontents = B_TRUE; - if (txg) { - int j; - newcontents = B_FALSE; - for (j = 0; j < dnp[i].dn_nblkptr; j++) { - if (dnp[i].dn_blkptr[j].blk_birth > txg) - newcontents = B_TRUE; - } - } - if (!dnp[i].dn_type == hole && newcontents) + if ((dnp[i].dn_type == DMU_OT_NONE) == hole) break; *offset += (1ULL << span) * inc; } @@ -1339,6 +1346,7 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, error = ESRCH; } else { blkptr_t *bp = data; + uint64_t start = *offset; span = (lvl - 1) * epbs + dn->dn_datablkshift; minfill = 0; maxfill = blkfill << ((lvl - 1) * epbs); @@ -1348,18 +1356,25 @@ dnode_next_offset_level(dnode_t *dn, int flags, uint64_t *offset, else minfill++; - for (i = (*offset >> span) & ((1ULL << epbs) - 1); + *offset = *offset >> span; + for (i = BF64_GET(*offset, 0, epbs); i >= 0 && i < epb; i += inc) { if (bp[i].blk_fill >= minfill && bp[i].blk_fill <= maxfill && (hole || bp[i].blk_birth > txg)) break; - if (inc < 0 && *offset < (1ULL << span)) - *offset = 0; - else - *offset += (1ULL << span) * inc; + if (inc > 0 || *offset > 0) + *offset += inc; + } + *offset = *offset << span; + if (inc < 0) { + /* traversing backwards; position offset at the end */ + ASSERT3U(*offset, <=, start); + *offset = MIN(*offset + (1ULL << span) - 1, start); + } else if (*offset < start) { + *offset = start; } - if (i < 0 || i == epb) + if (i < 0 || i >= epb) error = ESRCH; } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/dnode_sync.c b/external/cddl/osnet/dist/uts/common/fs/zfs/dnode_sync.c index 779cfc96f9e3c..b2d121ee60483 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/dnode_sync.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/dnode_sync.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -122,7 +120,7 @@ free_blocks(dnode_t *dn, blkptr_t *bp, int num, dmu_tx_t *tx) if (BP_IS_HOLE(bp)) continue; - bytesfreed += dsl_dataset_block_kill(ds, bp, dn->dn_zio, tx); + bytesfreed += dsl_dataset_block_kill(ds, bp, tx, B_FALSE); ASSERT3U(bytesfreed, <=, DN_USED_BYTES(dn->dn_phys)); bzero(bp, sizeof (blkptr_t)); blocks_freed += 1; @@ -426,6 +424,9 @@ dnode_undirty_dbufs(list_t *list) dmu_buf_impl_t *db = dr->dr_dbuf; uint64_t txg = dr->dr_txg; + if (db->db_level != 0) + dnode_undirty_dbufs(&dr->dt.di.dr_children); + mutex_enter(&db->db_mtx); /* XXX - use dbuf_undirty()? 
*/ list_remove(list, dr); @@ -436,13 +437,9 @@ dnode_undirty_dbufs(list_t *list) ASSERT(db->db_blkid == DB_BONUS_BLKID || dr->dt.dl.dr_data == db->db_buf); dbuf_unoverride(dr); - mutex_exit(&db->db_mtx); - } else { - mutex_exit(&db->db_mtx); - dnode_undirty_dbufs(&dr->dt.di.dr_children); } kmem_free(dr, sizeof (dbuf_dirty_record_t)); - dbuf_rele(db, (void *)(uintptr_t)txg); + dbuf_rele_and_unlock(db, (void *)(uintptr_t)txg); } } @@ -506,9 +503,6 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx) /* * Write out the dnode's dirty buffers. - * - * NOTE: The dnode is kept in memory by being dirty. Once the - * dirty bit is cleared, it may be evicted. Beware of this! */ void dnode_sync(dnode_t *dn, dmu_tx_t *tx) @@ -517,33 +511,40 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) dnode_phys_t *dnp = dn->dn_phys; int txgoff = tx->tx_txg & TXG_MASK; list_t *list = &dn->dn_dirty_records[txgoff]; + static const dnode_phys_t zerodn = { 0 }; ASSERT(dmu_tx_is_syncing(tx)); ASSERT(dnp->dn_type != DMU_OT_NONE || dn->dn_allocated_txg); + ASSERT(dnp->dn_type != DMU_OT_NONE || + bcmp(dnp, &zerodn, DNODE_SIZE) == 0); DNODE_VERIFY(dn); ASSERT(dn->dn_dbuf == NULL || arc_released(dn->dn_dbuf->db_buf)); + if (dmu_objset_userused_enabled(dn->dn_objset) && + !DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { + ASSERT(dn->dn_oldphys == NULL); + dn->dn_oldphys = zio_buf_alloc(sizeof (dnode_phys_t)); + *dn->dn_oldphys = *dn->dn_phys; /* struct assignment */ + dn->dn_phys->dn_flags |= DNODE_FLAG_USERUSED_ACCOUNTED; + } else { + /* Once we account for it, we should always account for it. */ + ASSERT(!(dn->dn_phys->dn_flags & + DNODE_FLAG_USERUSED_ACCOUNTED)); + } + mutex_enter(&dn->dn_mtx); if (dn->dn_allocated_txg == tx->tx_txg) { /* The dnode is newly allocated or reallocated */ if (dnp->dn_type == DMU_OT_NONE) { /* this is a first alloc, not a realloc */ - /* XXX shouldn't the phys already be zeroed? 
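dnode_reallocate() above now records blocksize, bonus length and block-pointer count changes in per-txg slots (dn_next_blksz[txg & TXG_MASK], dn_next_nblkptr[txgoff], and so on), and dnode_sync() applies whichever slot belongs to the txg it is syncing. A stripped-down sketch of that stage-in-open-context, apply-in-syncing-context pattern, reduced to a single field with an invented struct name:

#include <stdint.h>

#define	TXG_SIZE	4
#define	TXG_MASK	(TXG_SIZE - 1)

typedef struct obj {
	uint64_t	blksz;			/* current on-disk value */
	uint64_t	next_blksz[TXG_SIZE];	/* staged change per open txg */
} obj_t;

/* Open context: stage the change in the slot for this transaction's txg. */
static void
obj_set_blocksize(obj_t *o, uint64_t txg, uint64_t newsize)
{
	o->next_blksz[txg & TXG_MASK] = newsize;
}

/* Syncing context: apply and clear the slot; zero means nothing staged. */
static void
obj_sync(obj_t *o, uint64_t txg)
{
	int slot = txg & TXG_MASK;

	if (o->next_blksz[slot] != 0) {
		o->blksz = o->next_blksz[slot];
		o->next_blksz[slot] = 0;
	}
}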
*/ - bzero(dnp, DNODE_CORE_SIZE); dnp->dn_nlevels = 1; + dnp->dn_nblkptr = dn->dn_nblkptr; } - if (dn->dn_nblkptr > dnp->dn_nblkptr) { - /* zero the new blkptrs we are gaining */ - bzero(dnp->dn_blkptr + dnp->dn_nblkptr, - sizeof (blkptr_t) * - (dn->dn_nblkptr - dnp->dn_nblkptr)); - } dnp->dn_type = dn->dn_type; dnp->dn_bonustype = dn->dn_bonustype; dnp->dn_bonuslen = dn->dn_bonuslen; - dnp->dn_nblkptr = dn->dn_nblkptr; } ASSERT(dnp->dn_nlevels > 1 || @@ -603,6 +604,30 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) return; } + if (dn->dn_next_nblkptr[txgoff]) { + /* this should only happen on a realloc */ + ASSERT(dn->dn_allocated_txg == tx->tx_txg); + if (dn->dn_next_nblkptr[txgoff] > dnp->dn_nblkptr) { + /* zero the new blkptrs we are gaining */ + bzero(dnp->dn_blkptr + dnp->dn_nblkptr, + sizeof (blkptr_t) * + (dn->dn_next_nblkptr[txgoff] - dnp->dn_nblkptr)); +#ifdef ZFS_DEBUG + } else { + int i; + ASSERT(dn->dn_next_nblkptr[txgoff] < dnp->dn_nblkptr); + /* the blkptrs we are losing better be unallocated */ + for (i = dn->dn_next_nblkptr[txgoff]; + i < dnp->dn_nblkptr; i++) + ASSERT(BP_IS_HOLE(&dnp->dn_blkptr[i])); +#endif + } + mutex_enter(&dn->dn_mtx); + dnp->dn_nblkptr = dn->dn_next_nblkptr[txgoff]; + dn->dn_next_nblkptr[txgoff] = 0; + mutex_exit(&dn->dn_mtx); + } + if (dn->dn_next_nlevels[txgoff]) { dnode_increase_indirection(dn, tx); dn->dn_next_nlevels[txgoff] = 0; @@ -610,7 +635,7 @@ dnode_sync(dnode_t *dn, dmu_tx_t *tx) dbuf_sync_list(list, tx); - if (dn->dn_object != DMU_META_DNODE_OBJECT) { + if (!DMU_OBJECT_IS_SPECIAL(dn->dn_object)) { ASSERT3P(list_head(list), ==, NULL); dnode_rele(dn, (void *)(uintptr_t)tx->tx_txg); } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_dataset.c b/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_dataset.c index 93ea8aa111731..30b3811a8ae9c 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_dataset.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_dataset.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -38,14 +38,12 @@ #include #include #include -#include +#include static char *dsl_reaper = "the grim reaper"; static dsl_checkfunc_t dsl_dataset_destroy_begin_check; static dsl_syncfunc_t dsl_dataset_destroy_begin_sync; -static dsl_checkfunc_t dsl_dataset_rollback_check; -static dsl_syncfunc_t dsl_dataset_rollback_sync; static dsl_syncfunc_t dsl_dataset_set_reservation_sync; #define DS_REF_MAX (1ULL << 62) @@ -76,9 +74,9 @@ parent_delta(dsl_dataset_t *ds, int64_t delta) } void -dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) +dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx) { - int used = bp_get_dasize(tx->tx_pool->dp_spa, bp); + int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp); int compressed = BP_GET_PSIZE(bp); int uncompressed = BP_GET_UCSIZE(bp); int64_t delta; @@ -119,29 +117,26 @@ dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) } int -dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio, - dmu_tx_t *tx) +dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx, + boolean_t async) { - int used = bp_get_dasize(tx->tx_pool->dp_spa, bp); - int compressed = BP_GET_PSIZE(bp); - int uncompressed = BP_GET_UCSIZE(bp); - - ASSERT(pio != NULL); - ASSERT(dmu_tx_is_syncing(tx)); - /* No block pointer => nothing to free */ if (BP_IS_HOLE(bp)) return (0); + ASSERT(dmu_tx_is_syncing(tx)); + ASSERT(bp->blk_birth <= tx->tx_txg); + + int used = bp_get_dsize_sync(tx->tx_pool->dp_spa, bp); + int compressed = BP_GET_PSIZE(bp); + int uncompressed = BP_GET_UCSIZE(bp); + ASSERT(used > 0); if (ds == NULL) { - int err; /* * Account for the meta-objset space in its placeholder * dataset. */ - err = dsl_free(pio, tx->tx_pool, - tx->tx_txg, bp, NULL, NULL, ARC_NOWAIT); - ASSERT(err == 0); + dsl_free(tx->tx_pool, tx->tx_txg, bp); dsl_dir_diduse_space(tx->tx_pool->dp_mos_dir, DD_USED_HEAD, -used, -compressed, -uncompressed, tx); @@ -154,13 +149,10 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio, dmu_buf_will_dirty(ds->ds_dbuf, tx); if (bp->blk_birth > ds->ds_phys->ds_prev_snap_txg) { - int err; int64_t delta; dprintf_bp(bp, "freeing: %s", ""); - err = dsl_free(pio, tx->tx_pool, - tx->tx_txg, bp, NULL, NULL, ARC_NOWAIT); - ASSERT(err == 0); + dsl_free(tx->tx_pool, tx->tx_txg, bp); mutex_enter(&ds->ds_dir->dd_lock); mutex_enter(&ds->ds_lock); @@ -176,7 +168,18 @@ dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio, mutex_exit(&ds->ds_dir->dd_lock); } else { dprintf_bp(bp, "putting on dead list: %s", ""); - VERIFY(0 == bplist_enqueue(&ds->ds_deadlist, bp, tx)); + if (async) { + /* + * We are here as part of zio's write done callback, + * which means we're a zio interrupt thread. We can't + * call bplist_enqueue() now because it may block + * waiting for I/O. Instead, put bp on the deferred + * queue and let dsl_pool_sync() finish the job. 
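dsl_dataset_block_kill() above grows an 'async' argument because it can now run from a zio done callback, that is, interrupt context, where bplist_enqueue() must not block on I/O; instead the dead block pointer is parked on a deferred queue that dsl_pool_sync() drains later in syncing context. A generic userland sketch of that split (a pthread mutex stands in for the kernel lock, dl->lock must be set up with pthread_mutex_init() before use, and the element type is invented):

#include <pthread.h>
#include <stdint.h>
#include <stdlib.h>

typedef struct dead_blk {
	struct dead_blk	*next;
	uint64_t	birth_txg;	/* stand-in for the real block pointer */
} dead_blk_t;

typedef struct deadlist {
	pthread_mutex_t	lock;
	dead_blk_t	*deferred;
} deadlist_t;

/* "Interrupt" path: only link the entry; never sleep or start I/O here. */
static void
deadlist_enqueue_deferred(deadlist_t *dl, dead_blk_t *db)
{
	pthread_mutex_lock(&dl->lock);
	db->next = dl->deferred;
	dl->deferred = db;
	pthread_mutex_unlock(&dl->lock);
}

/* Sync path: take the whole list, then do the potentially blocking work. */
static void
deadlist_drain(deadlist_t *dl, void (*commit)(dead_blk_t *))
{
	pthread_mutex_lock(&dl->lock);
	dead_blk_t *list = dl->deferred;
	dl->deferred = NULL;
	pthread_mutex_unlock(&dl->lock);

	while (list != NULL) {
		dead_blk_t *db = list;

		list = db->next;
		commit(db);	/* e.g. append to the on-disk deadlist */
		free(db);
	}
}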
+ */ + bplist_enqueue_deferred(&ds->ds_deadlist, bp); + } else { + VERIFY(0 == bplist_enqueue(&ds->ds_deadlist, bp, tx)); + } ASSERT3U(ds->ds_prev->ds_object, ==, ds->ds_phys->ds_prev_snap_obj); ASSERT(ds->ds_prev->ds_phys->ds_num_children > 0); @@ -229,7 +232,7 @@ dsl_dataset_prev_snap_txg(dsl_dataset_t *ds) return (MAX(ds->ds_phys->ds_prev_snap_txg, trysnap)); } -int +boolean_t dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth) { return (blk_birth > dsl_dataset_prev_snap_txg(ds)); @@ -243,12 +246,10 @@ dsl_dataset_evict(dmu_buf_t *db, void *dsv) ASSERT(ds->ds_owner == NULL || DSL_DATASET_IS_DESTROYED(ds)); - dprintf_ds(ds, "evicting %s\n", ""); - unique_remove(ds->ds_fsid_guid); - if (ds->ds_user_ptr != NULL) - ds->ds_user_evict_func(ds, ds->ds_user_ptr); + if (ds->ds_objset != NULL) + dmu_objset_evict(ds->ds_objset); if (ds->ds_prev) { dsl_dataset_drop_ref(ds->ds_prev, ds); @@ -262,10 +263,11 @@ dsl_dataset_evict(dmu_buf_t *db, void *dsv) ASSERT(!list_link_active(&ds->ds_synced_link)); mutex_destroy(&ds->ds_lock); + mutex_destroy(&ds->ds_recvlock); mutex_destroy(&ds->ds_opening_lock); - mutex_destroy(&ds->ds_deadlist.bpl_lock); rw_destroy(&ds->ds_rwlock); cv_destroy(&ds->ds_exclusive_cv); + bplist_fini(&ds->ds_deadlist); kmem_free(ds, sizeof (dsl_dataset_t)); } @@ -323,6 +325,8 @@ dsl_dataset_snap_remove(dsl_dataset_t *ds, char *name, dmu_tx_t *tx) matchtype_t mt; int err; + dsl_dir_snap_cmtime_update(ds->ds_dir); + if (ds->ds_phys->ds_flags & DS_FLAG_CI_DATASET) mt = MT_FIRST; else @@ -359,11 +363,11 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, ds->ds_phys = dbuf->db_data; mutex_init(&ds->ds_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&ds->ds_recvlock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&ds->ds_opening_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&ds->ds_deadlist.bpl_lock, NULL, MUTEX_DEFAULT, - NULL); rw_init(&ds->ds_rwlock, 0, 0, 0); cv_init(&ds->ds_exclusive_cv, NULL, CV_DEFAULT, NULL); + bplist_init(&ds->ds_deadlist); err = bplist_open(&ds->ds_deadlist, mos, ds->ds_phys->ds_deadlist_obj); @@ -377,10 +381,11 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, * just opened it. 
*/ mutex_destroy(&ds->ds_lock); + mutex_destroy(&ds->ds_recvlock); mutex_destroy(&ds->ds_opening_lock); - mutex_destroy(&ds->ds_deadlist.bpl_lock); rw_destroy(&ds->ds_rwlock); cv_destroy(&ds->ds_exclusive_cv); + bplist_fini(&ds->ds_deadlist); kmem_free(ds, sizeof (dsl_dataset_t)); dmu_buf_rele(dbuf, tag); return (err); @@ -406,8 +411,15 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, dsl_dataset_rele(origin, FTAG); } } - } else if (zfs_flags & ZFS_DEBUG_SNAPNAMES) { - err = dsl_dataset_get_snapname(ds); + } else { + if (zfs_flags & ZFS_DEBUG_SNAPNAMES) + err = dsl_dataset_get_snapname(ds); + if (err == 0 && ds->ds_phys->ds_userrefs_obj != 0) { + err = zap_count( + ds->ds_dir->dd_pool->dp_meta_objset, + ds->ds_phys->ds_userrefs_obj, + &ds->ds_userrefs); + } } if (err == 0 && !dsl_dataset_is_snapshot(ds)) { @@ -448,10 +460,11 @@ dsl_dataset_get_ref(dsl_pool_t *dp, uint64_t dsobj, void *tag, dsl_dataset_drop_ref(ds->ds_prev, ds); dsl_dir_close(ds->ds_dir, ds); mutex_destroy(&ds->ds_lock); + mutex_destroy(&ds->ds_recvlock); mutex_destroy(&ds->ds_opening_lock); - mutex_destroy(&ds->ds_deadlist.bpl_lock); rw_destroy(&ds->ds_rwlock); cv_destroy(&ds->ds_exclusive_cv); + bplist_fini(&ds->ds_deadlist); kmem_free(ds, sizeof (dsl_dataset_t)); if (err) { dmu_buf_rele(dbuf, tag); @@ -519,7 +532,15 @@ dsl_dataset_hold_ref(dsl_dataset_t *ds, void *tag) rw_enter(&dp->dp_config_rwlock, RW_READER); return (ENOENT); } + /* + * The dp_config_rwlock lives above the ds_lock. And + * we need to check DSL_DATASET_IS_DESTROYED() while + * holding the ds_lock, so we have to drop and reacquire + * the ds_lock here. + */ + mutex_exit(&ds->ds_lock); rw_enter(&dp->dp_config_rwlock, RW_READER); + mutex_enter(&ds->ds_lock); } mutex_exit(&ds->ds_lock); return (0); @@ -537,17 +558,15 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag, } int -dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, int flags, void *owner, - dsl_dataset_t **dsp) +dsl_dataset_own_obj(dsl_pool_t *dp, uint64_t dsobj, boolean_t inconsistentok, + void *tag, dsl_dataset_t **dsp) { - int err = dsl_dataset_hold_obj(dp, dsobj, owner, dsp); - - ASSERT(DS_MODE_TYPE(flags) != DS_MODE_USER); - + int err = dsl_dataset_hold_obj(dp, dsobj, tag, dsp); if (err) return (err); - if (!dsl_dataset_tryown(*dsp, DS_MODE_IS_INCONSISTENT(flags), owner)) { - dsl_dataset_rele(*dsp, owner); + if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) { + dsl_dataset_rele(*dsp, tag); + *dsp = NULL; return (EBUSY); } return (0); @@ -613,18 +632,14 @@ dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp) } int -dsl_dataset_own(const char *name, int flags, void *owner, dsl_dataset_t **dsp) +dsl_dataset_own(const char *name, boolean_t inconsistentok, + void *tag, dsl_dataset_t **dsp) { - int err = dsl_dataset_hold(name, owner, dsp); + int err = dsl_dataset_hold(name, tag, dsp); if (err) return (err); - if ((*dsp)->ds_phys->ds_num_children > 0 && - !DS_MODE_IS_READONLY(flags)) { - dsl_dataset_rele(*dsp, owner); - return (EROFS); - } - if (!dsl_dataset_tryown(*dsp, DS_MODE_IS_INCONSISTENT(flags), owner)) { - dsl_dataset_rele(*dsp, owner); + if (!dsl_dataset_tryown(*dsp, inconsistentok, tag)) { + dsl_dataset_rele(*dsp, tag); return (EBUSY); } return (0); @@ -696,9 +711,9 @@ dsl_dataset_rele(dsl_dataset_t *ds, void *tag) } void -dsl_dataset_disown(dsl_dataset_t *ds, void *owner) +dsl_dataset_disown(dsl_dataset_t *ds, void *tag) { - ASSERT((ds->ds_owner == owner && ds->ds_dbuf) || + ASSERT((ds->ds_owner == tag && ds->ds_dbuf) || 
(DSL_DATASET_IS_DESTROYED(ds) && ds->ds_dbuf == NULL)); mutex_enter(&ds->ds_lock); @@ -709,20 +724,20 @@ dsl_dataset_disown(dsl_dataset_t *ds, void *owner) } mutex_exit(&ds->ds_lock); if (ds->ds_dbuf) - dsl_dataset_drop_ref(ds, owner); + dsl_dataset_drop_ref(ds, tag); else dsl_dataset_evict(ds->ds_dbuf, ds); } boolean_t -dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, void *owner) +dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, void *tag) { boolean_t gotit = FALSE; mutex_enter(&ds->ds_lock); if (ds->ds_owner == NULL && (!DS_IS_INCONSISTENT(ds) || inconsistentok)) { - ds->ds_owner = owner; + ds->ds_owner = tag; if (!dsl_pool_sync_context(ds->ds_dir->dd_pool)) rw_exit(&ds->ds_rwlock); gotit = TRUE; @@ -844,30 +859,33 @@ struct destroyarg { dsl_sync_task_group_t *dstg; char *snapname; char *failed; + boolean_t defer; }; static int -dsl_snapshot_destroy_one(char *name, void *arg) +dsl_snapshot_destroy_one(const char *name, void *arg) { struct destroyarg *da = arg; dsl_dataset_t *ds; - char *cp; int err; + char *dsname; - (void) strcat(name, "@"); - (void) strcat(name, da->snapname); - err = dsl_dataset_own(name, DS_MODE_READONLY | DS_MODE_INCONSISTENT, - da->dstg, &ds); - cp = strchr(name, '@'); - *cp = '\0'; + dsname = kmem_asprintf("%s@%s", name, da->snapname); + err = dsl_dataset_own(dsname, B_TRUE, da->dstg, &ds); + strfree(dsname); if (err == 0) { + struct dsl_ds_destroyarg *dsda; + dsl_dataset_make_exclusive(ds, da->dstg); - if (ds->ds_user_ptr) { - ds->ds_user_evict_func(ds, ds->ds_user_ptr); - ds->ds_user_ptr = NULL; + if (ds->ds_objset != NULL) { + dmu_objset_evict(ds->ds_objset); + ds->ds_objset = NULL; } + dsda = kmem_zalloc(sizeof (struct dsl_ds_destroyarg), KM_SLEEP); + dsda->ds = ds; + dsda->defer = da->defer; dsl_sync_task_create(da->dstg, dsl_dataset_destroy_check, - dsl_dataset_destroy_sync, ds, da->dstg, 0); + dsl_dataset_destroy_sync, dsda, da->dstg, 0); } else if (err == ENOENT) { err = 0; } else { @@ -881,7 +899,7 @@ dsl_snapshot_destroy_one(char *name, void *arg) */ #pragma weak dmu_snapshots_destroy = dsl_snapshots_destroy int -dsl_snapshots_destroy(char *fsname, char *snapname) +dsl_snapshots_destroy(char *fsname, char *snapname, boolean_t defer) { int err; struct destroyarg da; @@ -894,6 +912,7 @@ dsl_snapshots_destroy(char *fsname, char *snapname) da.dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); da.snapname = snapname; da.failed = fsname; + da.defer = defer; err = dmu_objset_find(fsname, dsl_snapshot_destroy_one, &da, DS_FIND_CHILDREN); @@ -903,7 +922,9 @@ dsl_snapshots_destroy(char *fsname, char *snapname) for (dst = list_head(&da.dstg->dstg_tasks); dst; dst = list_next(&da.dstg->dstg_tasks, dst)) { - dsl_dataset_t *ds = dst->dst_arg1; + struct dsl_ds_destroyarg *dsda = dst->dst_arg1; + dsl_dataset_t *ds = dsda->ds; + /* * Return the file system name that triggered the error */ @@ -911,7 +932,9 @@ dsl_snapshots_destroy(char *fsname, char *snapname) dsl_dataset_name(ds, fsname); *strchr(fsname, '@') = '\0'; } + ASSERT3P(dsda->rm_origin, ==, NULL); dsl_dataset_disown(ds, da.dstg); + kmem_free(dsda, sizeof (struct dsl_ds_destroyarg)); } dsl_sync_task_group_destroy(da.dstg); @@ -919,34 +942,103 @@ dsl_snapshots_destroy(char *fsname, char *snapname) return (err); } +static boolean_t +dsl_dataset_might_destroy_origin(dsl_dataset_t *ds) +{ + boolean_t might_destroy = B_FALSE; + + mutex_enter(&ds->ds_lock); + if (ds->ds_phys->ds_num_children == 2 && ds->ds_userrefs == 0 && + DS_IS_DEFER_DESTROY(ds)) + might_destroy = B_TRUE; + 
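The deferred-destroy machinery above (DS_FLAG_DEFER_DESTROY, ds_userrefs, dsl_dataset_might_destroy_origin()) backs "zfs destroy -d": if user holds or clones still pin a snapshot, the destroy only marks it, and the snapshot is actually removed once the last hold or clone goes away. A toy model of the decision dsl_dataset_destroy_sync() makes in its defer branch; the field and function names are simplified stand-ins:

#include <stdbool.h>
#include <stdint.h>

typedef struct snap {
	uint64_t	num_children;	/* > 1 means clones still reference it */
	uint64_t	userrefs;	/* holds taken with "zfs hold" */
	bool		defer_destroy;	/* DS_FLAG_DEFER_DESTROY analogue */
} snap_t;

/*
 * Returns true when the snapshot can be destroyed right now; otherwise
 * it is only marked, to be reaped when the last hold/clone disappears.
 */
static bool
snap_destroy_deferred(snap_t *s)
{
	if (s->userrefs > 0 || s->num_children > 1) {
		s->defer_destroy = true;
		return (false);
	}
	return (true);
}

dsl_dataset_might_destroy_origin() is the mirror image: when a clone is destroyed, an origin left with no other children, no holds and this flag set gets cleaned up as part of the same sync task group.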
mutex_exit(&ds->ds_lock); + + return (might_destroy); +} + +/* + * If we're removing a clone, and these three conditions are true: + * 1) the clone's origin has no other children + * 2) the clone's origin has no user references + * 3) the clone's origin has been marked for deferred destruction + * Then, prepare to remove the origin as part of this sync task group. + */ +static int +dsl_dataset_origin_rm_prep(struct dsl_ds_destroyarg *dsda, void *tag) +{ + dsl_dataset_t *ds = dsda->ds; + dsl_dataset_t *origin = ds->ds_prev; + + if (dsl_dataset_might_destroy_origin(origin)) { + char *name; + int namelen; + int error; + + namelen = dsl_dataset_namelen(origin) + 1; + name = kmem_alloc(namelen, KM_SLEEP); + dsl_dataset_name(origin, name); +#ifdef _KERNEL + error = zfs_unmount_snap(name, NULL); + if (error) { + kmem_free(name, namelen); + return (error); + } +#endif + error = dsl_dataset_own(name, B_TRUE, tag, &origin); + kmem_free(name, namelen); + if (error) + return (error); + dsda->rm_origin = origin; + dsl_dataset_make_exclusive(origin, tag); + + if (origin->ds_objset != NULL) { + dmu_objset_evict(origin->ds_objset); + origin->ds_objset = NULL; + } + } + + return (0); +} + /* * ds must be opened as OWNER. On return (whether successful or not), * ds will be closed and caller can no longer dereference it. */ int -dsl_dataset_destroy(dsl_dataset_t *ds, void *tag) +dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer) { int err; dsl_sync_task_group_t *dstg; objset_t *os; dsl_dir_t *dd; uint64_t obj; + struct dsl_ds_destroyarg dsda = { 0 }; + dsl_dataset_t dummy_ds = { 0 }; + + dsda.ds = ds; if (dsl_dataset_is_snapshot(ds)) { /* Destroying a snapshot is simpler */ dsl_dataset_make_exclusive(ds, tag); - if (ds->ds_user_ptr) { - ds->ds_user_evict_func(ds, ds->ds_user_ptr); - ds->ds_user_ptr = NULL; + if (ds->ds_objset != NULL) { + dmu_objset_evict(ds->ds_objset); + ds->ds_objset = NULL; } + dsda.defer = defer; err = dsl_sync_task_do(ds->ds_dir->dd_pool, dsl_dataset_destroy_check, dsl_dataset_destroy_sync, - ds, tag, 0); + &dsda, tag, 0); + ASSERT3P(dsda.rm_origin, ==, NULL); + goto out; + } else if (defer) { + err = EINVAL; goto out; } dd = ds->ds_dir; + dummy_ds.ds_dir = dd; + dummy_ds.ds_object = ds->ds_object; /* * Check for errors and mark this ds as inconsistent, in @@ -957,7 +1049,7 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag) if (err) goto out; - err = dmu_objset_open_ds(ds, DMU_OST_ANY, &os); + err = dmu_objset_from_ds(ds, &os); if (err) goto out; @@ -974,7 +1066,27 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag) (void) dmu_free_object(os, obj); } - dmu_objset_close(os); + /* + * We need to sync out all in-flight IO before we try to evict + * (the dataset evict func is trying to clear the cached entries + * for this dataset in the ARC). + */ + txg_wait_synced(dd->dd_pool, 0); + + /* + * If we managed to free all the objects in open + * context, the user space accounting should be zero. 
+ */ + if (ds->ds_phys->ds_bp.blk_fill == 0 && + dmu_objset_userused_enabled(os)) { + uint64_t count; + + ASSERT(zap_count(os, DMU_USERUSED_OBJECT, &count) != 0 || + count == 0); + ASSERT(zap_count(os, DMU_GROUPUSED_OBJECT, &count) != 0 || + count == 0); + } + if (err != ESRCH) goto out; @@ -985,7 +1097,7 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag) if (err) goto out; - if (ds->ds_user_ptr) { + if (ds->ds_objset) { /* * We need to sync out all in-flight IO before we try * to evict (the dataset evict func is trying to clear @@ -998,17 +1110,49 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag) * Blow away the dsl_dir + head dataset. */ dsl_dataset_make_exclusive(ds, tag); - if (ds->ds_user_ptr) { - ds->ds_user_evict_func(ds, ds->ds_user_ptr); - ds->ds_user_ptr = NULL; - } - dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool); - dsl_sync_task_create(dstg, dsl_dataset_destroy_check, - dsl_dataset_destroy_sync, ds, tag, 0); - dsl_sync_task_create(dstg, dsl_dir_destroy_check, - dsl_dir_destroy_sync, dd, FTAG, 0); - err = dsl_sync_task_group_wait(dstg); - dsl_sync_task_group_destroy(dstg); + if (ds->ds_objset) { + dmu_objset_evict(ds->ds_objset); + ds->ds_objset = NULL; + } + + /* + * If we're removing a clone, we might also need to remove its + * origin. + */ + do { + dsda.need_prep = B_FALSE; + if (dsl_dir_is_clone(dd)) { + err = dsl_dataset_origin_rm_prep(&dsda, tag); + if (err) { + dsl_dir_close(dd, FTAG); + goto out; + } + } + + dstg = dsl_sync_task_group_create(ds->ds_dir->dd_pool); + dsl_sync_task_create(dstg, dsl_dataset_destroy_check, + dsl_dataset_destroy_sync, &dsda, tag, 0); + dsl_sync_task_create(dstg, dsl_dir_destroy_check, + dsl_dir_destroy_sync, &dummy_ds, FTAG, 0); + err = dsl_sync_task_group_wait(dstg); + dsl_sync_task_group_destroy(dstg); + + /* + * We could be racing against 'zfs release' or 'zfs destroy -d' + * on the origin snap, in which case we can get EBUSY if we + * needed to destroy the origin snap but were not ready to + * do so. 
+ */ + if (dsda.need_prep) { + ASSERT(err == EBUSY); + ASSERT(dsl_dir_is_clone(dd)); + ASSERT(dsda.rm_origin == NULL); + } + } while (dsda.need_prep); + + if (dsda.rm_origin != NULL) + dsl_dataset_disown(dsda.rm_origin, tag); + /* if it is successful, dsl_dir_destroy_sync will close the dd */ if (err) dsl_dir_close(dd, FTAG); @@ -1017,48 +1161,6 @@ dsl_dataset_destroy(dsl_dataset_t *ds, void *tag) return (err); } -int -dsl_dataset_rollback(dsl_dataset_t *ds, dmu_objset_type_t ost) -{ - int err; - - ASSERT(ds->ds_owner); - - dsl_dataset_make_exclusive(ds, ds->ds_owner); - err = dsl_sync_task_do(ds->ds_dir->dd_pool, - dsl_dataset_rollback_check, dsl_dataset_rollback_sync, - ds, &ost, 0); - /* drop exclusive access */ - mutex_enter(&ds->ds_lock); - rw_exit(&ds->ds_rwlock); - cv_broadcast(&ds->ds_exclusive_cv); - mutex_exit(&ds->ds_lock); - return (err); -} - -void * -dsl_dataset_set_user_ptr(dsl_dataset_t *ds, - void *p, dsl_dataset_evict_func_t func) -{ - void *old; - - mutex_enter(&ds->ds_lock); - old = ds->ds_user_ptr; - if (old == NULL) { - ds->ds_user_ptr = p; - ds->ds_user_evict_func = func; - } - mutex_exit(&ds->ds_lock); - return (old); -} - -void * -dsl_dataset_get_user_ptr(dsl_dataset_t *ds) -{ - return (ds->ds_user_ptr); -} - - blkptr_t * dsl_dataset_get_blkptr(dsl_dataset_t *ds) { @@ -1092,7 +1194,7 @@ dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx) if (ds == NULL) /* this is the meta-objset */ return; - ASSERT(ds->ds_user_ptr != NULL); + ASSERT(ds->ds_objset != NULL); if (ds->ds_phys->ds_next_snap_obj != 0) panic("dirtying snapshot!"); @@ -1150,165 +1252,34 @@ dsl_dataset_unique(dsl_dataset_t *ds) struct killarg { dsl_dataset_t *ds; - zio_t *zio; dmu_tx_t *tx; }; /* ARGSUSED */ static int -kill_blkptr(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb, - const dnode_phys_t *dnp, void *arg) +kill_blkptr(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, + const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) { struct killarg *ka = arg; + dmu_tx_t *tx = ka->tx; if (bp == NULL) return (0); - ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg); - (void) dsl_dataset_block_kill(ka->ds, bp, ka->zio, ka->tx); - - return (0); -} - -/* ARGSUSED */ -static int -dsl_dataset_rollback_check(void *arg1, void *arg2, dmu_tx_t *tx) -{ - dsl_dataset_t *ds = arg1; - dmu_objset_type_t *ost = arg2; - - /* - * We can only roll back to emptyness if it is a ZPL objset. - */ - if (*ost != DMU_OST_ZFS && ds->ds_phys->ds_prev_snap_txg == 0) - return (EINVAL); - - /* - * This must not be a snapshot. - */ - if (ds->ds_phys->ds_next_snap_obj != 0) - return (EINVAL); - - /* - * If we made changes this txg, traverse_dataset won't find - * them. Try again. - */ - if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg) - return (EAGAIN); - - return (0); -} - -/* ARGSUSED */ -static void -dsl_dataset_rollback_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) -{ - dsl_dataset_t *ds = arg1; - dmu_objset_type_t *ost = arg2; - objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; - - dmu_buf_will_dirty(ds->ds_dbuf, tx); - - /* - * Before the roll back destroy the zil. - */ - if (ds->ds_user_ptr != NULL) { - zil_rollback_destroy( - ((objset_impl_t *)ds->ds_user_ptr)->os_zil, tx); - + if (zb->zb_level == ZB_ZIL_LEVEL) { + ASSERT(zilog != NULL); /* - * We need to make sure that the objset_impl_t is reopened after - * we do the rollback, otherwise it will have the wrong - * objset_phys_t. Normally this would happen when this - * dataset-open is closed, thus causing the - * dataset to be immediately evicted. 
But when doing "zfs recv - * -F", we reopen the objset before that, so that there is no - * window where the dataset is closed and inconsistent. + * It's a block in the intent log. It has no + * accounting, so just free it. */ - ds->ds_user_evict_func(ds, ds->ds_user_ptr); - ds->ds_user_ptr = NULL; - } - - /* Transfer space that was freed since last snap back to the head. */ - { - uint64_t used; - - VERIFY(0 == bplist_space_birthrange(&ds->ds_deadlist, - ds->ds_origin_txg, UINT64_MAX, &used)); - dsl_dir_transfer_space(ds->ds_dir, used, - DD_USED_SNAP, DD_USED_HEAD, tx); - } - - /* Zero out the deadlist. */ - bplist_close(&ds->ds_deadlist); - bplist_destroy(mos, ds->ds_phys->ds_deadlist_obj, tx); - ds->ds_phys->ds_deadlist_obj = - bplist_create(mos, DSL_DEADLIST_BLOCKSIZE, tx); - VERIFY(0 == bplist_open(&ds->ds_deadlist, mos, - ds->ds_phys->ds_deadlist_obj)); - - { - /* Free blkptrs that we gave birth to */ - zio_t *zio; - struct killarg ka; - - zio = zio_root(tx->tx_pool->dp_spa, NULL, NULL, - ZIO_FLAG_MUSTSUCCEED); - ka.ds = ds; - ka.zio = zio; - ka.tx = tx; - (void) traverse_dataset(ds, ds->ds_phys->ds_prev_snap_txg, - TRAVERSE_POST, kill_blkptr, &ka); - (void) zio_wait(zio); - } - - ASSERT(!(ds->ds_phys->ds_flags & DS_FLAG_UNIQUE_ACCURATE) || - ds->ds_phys->ds_unique_bytes == 0); - - if (ds->ds_prev && ds->ds_prev != ds->ds_dir->dd_pool->dp_origin_snap) { - /* Change our contents to that of the prev snapshot */ - - ASSERT3U(ds->ds_prev->ds_object, ==, - ds->ds_phys->ds_prev_snap_obj); - ASSERT3U(ds->ds_phys->ds_used_bytes, <=, - ds->ds_prev->ds_phys->ds_used_bytes); - - ds->ds_phys->ds_bp = ds->ds_prev->ds_phys->ds_bp; - ds->ds_phys->ds_used_bytes = - ds->ds_prev->ds_phys->ds_used_bytes; - ds->ds_phys->ds_compressed_bytes = - ds->ds_prev->ds_phys->ds_compressed_bytes; - ds->ds_phys->ds_uncompressed_bytes = - ds->ds_prev->ds_phys->ds_uncompressed_bytes; - ds->ds_phys->ds_flags = ds->ds_prev->ds_phys->ds_flags; - - if (ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) { - dmu_buf_will_dirty(ds->ds_prev->ds_dbuf, tx); - ds->ds_prev->ds_phys->ds_unique_bytes = 0; - } + dsl_free(ka->tx->tx_pool, ka->tx->tx_txg, bp); } else { - objset_impl_t *osi; - - ASSERT3U(ds->ds_phys->ds_used_bytes, ==, 0); - ASSERT3U(ds->ds_phys->ds_compressed_bytes, ==, 0); - ASSERT3U(ds->ds_phys->ds_uncompressed_bytes, ==, 0); - - bzero(&ds->ds_phys->ds_bp, sizeof (blkptr_t)); - ds->ds_phys->ds_flags = 0; - ds->ds_phys->ds_unique_bytes = 0; - if (spa_version(ds->ds_dir->dd_pool->dp_spa) >= - SPA_VERSION_UNIQUE_ACCURATE) - ds->ds_phys->ds_flags |= DS_FLAG_UNIQUE_ACCURATE; - - osi = dmu_objset_create_impl(ds->ds_dir->dd_pool->dp_spa, ds, - &ds->ds_phys->ds_bp, *ost, tx); -#ifdef _KERNEL - zfs_create_fs(&osi->os, kcred, NULL, tx); -#endif + ASSERT(zilog == NULL); + ASSERT3U(bp->blk_birth, >, ka->ds->ds_phys->ds_prev_snap_txg); + (void) dsl_dataset_block_kill(ka->ds, bp, tx, B_FALSE); } - spa_history_internal_log(LOG_DS_ROLLBACK, ds->ds_dir->dd_pool->dp_spa, - tx, cr, "dataset = %llu", ds->ds_object); + return (0); } /* ARGSUSED */ @@ -1327,7 +1298,7 @@ dsl_dataset_destroy_begin_check(void *arg1, void *arg2, dmu_tx_t *tx) */ if (ds->ds_prev != NULL && ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) - return (EINVAL); + return (EBUSY); /* * This is really a dsl_dir thing, but check it here so that @@ -1358,18 +1329,63 @@ dsl_dataset_destroy_begin_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) cr, "dataset = %llu", ds->ds_object); } +static int +dsl_dataset_origin_check(struct dsl_ds_destroyarg 
*dsda, void *tag, + dmu_tx_t *tx) +{ + dsl_dataset_t *ds = dsda->ds; + dsl_dataset_t *ds_prev = ds->ds_prev; + + if (dsl_dataset_might_destroy_origin(ds_prev)) { + struct dsl_ds_destroyarg ndsda = {0}; + + /* + * If we're not prepared to remove the origin, don't remove + * the clone either. + */ + if (dsda->rm_origin == NULL) { + dsda->need_prep = B_TRUE; + return (EBUSY); + } + + ndsda.ds = ds_prev; + ndsda.is_origin_rm = B_TRUE; + return (dsl_dataset_destroy_check(&ndsda, tag, tx)); + } + + /* + * If we're not going to remove the origin after all, + * undo the open context setup. + */ + if (dsda->rm_origin != NULL) { + dsl_dataset_disown(dsda->rm_origin, tag); + dsda->rm_origin = NULL; + } + + return (0); +} + /* ARGSUSED */ int dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx) { - dsl_dataset_t *ds = arg1; + struct dsl_ds_destroyarg *dsda = arg1; + dsl_dataset_t *ds = dsda->ds; /* we have an owner hold, so noone else can destroy us */ ASSERT(!DSL_DATASET_IS_DESTROYED(ds)); - /* Can't delete a branch point. */ - if (ds->ds_phys->ds_num_children > 1) - return (EEXIST); + /* + * Only allow deferred destroy on pools that support it. + * NOTE: deferred destroy is only supported on snapshots. + */ + if (dsda->defer) { + if (spa_version(ds->ds_dir->dd_pool->dp_spa) < + SPA_VERSION_USERREFS) + return (ENOTSUP); + ASSERT(dsl_dataset_is_snapshot(ds)); + return (0); + } /* * Can't delete a head dataset if there are snapshots of it. @@ -1378,7 +1394,7 @@ dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx) */ if (ds->ds_prev != NULL && ds->ds_prev->ds_phys->ds_next_snap_obj == ds->ds_object) - return (EINVAL); + return (EBUSY); /* * If we made changes this txg, traverse_dsl_dataset won't find @@ -1387,6 +1403,31 @@ dsl_dataset_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx) if (ds->ds_phys->ds_bp.blk_birth >= tx->tx_txg) return (EAGAIN); + if (dsl_dataset_is_snapshot(ds)) { + /* + * If this snapshot has an elevated user reference count, + * we can't destroy it yet. + */ + if (ds->ds_userrefs > 0 && !dsda->releasing) + return (EBUSY); + + mutex_enter(&ds->ds_lock); + /* + * Can't delete a branch point. However, if we're destroying + * a clone and removing its origin due to it having a user + * hold count of 0 and having been marked for deferred destroy, + * it's OK for the origin to have a single clone. + */ + if (ds->ds_phys->ds_num_children > + (dsda->is_origin_rm ? 2 : 1)) { + mutex_exit(&ds->ds_lock); + return (EEXIST); + } + mutex_exit(&ds->ds_lock); + } else if (dsl_dir_is_clone(ds->ds_dir)) { + return (dsl_dataset_origin_check(dsda, arg2, tx)); + } + /* XXX we should do some i/o error checking... */ return (0); } @@ -1431,11 +1472,38 @@ dsl_dataset_drain_refs(dsl_dataset_t *ds, void *tag) cv_destroy(&arg.cv); } +static void +remove_from_next_clones(dsl_dataset_t *ds, uint64_t obj, dmu_tx_t *tx) +{ + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + uint64_t count; + int err; + + ASSERT(ds->ds_phys->ds_num_children >= 2); + err = zap_remove_int(mos, ds->ds_phys->ds_next_clones_obj, obj, tx); + /* + * The err should not be ENOENT, but a bug in a previous version + * of the code could cause upgrade_clones_cb() to not set + * ds_next_snap_obj when it should, leading to a missing entry. + * If we knew that the pool was created after + * SPA_VERSION_NEXT_CLONES, we could assert that it isn't + * ENOENT. However, at least we can check that we don't have + * too many entries in the next_clones_obj even after failing to + * remove this one. 
+ */ + if (err != ENOENT) { + VERIFY3U(err, ==, 0); + } + ASSERT3U(0, ==, zap_count(mos, ds->ds_phys->ds_next_clones_obj, + &count)); + ASSERT3U(count, <=, ds->ds_phys->ds_num_children - 2); +} + void dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) { - dsl_dataset_t *ds = arg1; - zio_t *zio; + struct dsl_ds_destroyarg *dsda = arg1; + dsl_dataset_t *ds = dsda->ds; int err; int after_branch_point = FALSE; dsl_pool_t *dp = ds->ds_dir->dd_pool; @@ -1444,11 +1512,20 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) uint64_t obj; ASSERT(ds->ds_owner); - ASSERT3U(ds->ds_phys->ds_num_children, <=, 1); + ASSERT(dsda->defer || ds->ds_phys->ds_num_children <= 1); ASSERT(ds->ds_prev == NULL || ds->ds_prev->ds_phys->ds_next_snap_obj != ds->ds_object); ASSERT3U(ds->ds_phys->ds_bp.blk_birth, <=, tx->tx_txg); + if (dsda->defer) { + ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); + if (ds->ds_userrefs > 0 || ds->ds_phys->ds_num_children > 1) { + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ds->ds_phys->ds_flags |= DS_FLAG_DEFER_DESTROY; + return; + } + } + /* signal any waiters that this dataset is going away */ mutex_enter(&ds->ds_lock); ds->ds_owner = dsl_reaper; @@ -1457,8 +1534,15 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) /* Remove our reservation */ if (ds->ds_reserved != 0) { - uint64_t val = 0; - dsl_dataset_set_reservation_sync(ds, &val, cr, tx); + dsl_prop_setarg_t psa; + uint64_t value = 0; + + dsl_prop_setarg_init_uint64(&psa, "refreservation", + (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED), + &value); + psa.psa_effective_value = 0; /* predict default value */ + + dsl_dataset_set_reservation_sync(ds, &psa, cr, tx); ASSERT3U(ds->ds_reserved, ==, 0); } @@ -1481,8 +1565,7 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) dmu_buf_will_dirty(ds_prev->ds_dbuf, tx); if (after_branch_point && ds_prev->ds_phys->ds_next_clones_obj != 0) { - VERIFY(0 == zap_remove_int(mos, - ds_prev->ds_phys->ds_next_clones_obj, obj, tx)); + remove_from_next_clones(ds_prev, obj, tx); if (ds->ds_phys->ds_next_snap_obj != 0) { VERIFY(0 == zap_add_int(mos, ds_prev->ds_phys->ds_next_clones_obj, @@ -1494,14 +1577,26 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) /* This clone is toast. */ ASSERT(ds_prev->ds_phys->ds_num_children > 1); ds_prev->ds_phys->ds_num_children--; + + /* + * If the clone's origin has no other clones, no + * user holds, and has been marked for deferred + * deletion, then we should have done the necessary + * destroy setup for it. + */ + if (ds_prev->ds_phys->ds_num_children == 1 && + ds_prev->ds_userrefs == 0 && + DS_IS_DEFER_DESTROY(ds_prev)) { + ASSERT3P(dsda->rm_origin, !=, NULL); + } else { + ASSERT3P(dsda->rm_origin, ==, NULL); + } } else if (!after_branch_point) { ds_prev->ds_phys->ds_next_snap_obj = ds->ds_phys->ds_next_snap_obj; } } - zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); - if (ds->ds_phys->ds_next_snap_obj != 0) { blkptr_t bp; dsl_dataset_t *ds_next; @@ -1539,15 +1634,13 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) bp.blk_birth > ds_prev->ds_phys->ds_prev_snap_txg) { ds_prev->ds_phys->ds_unique_bytes += - bp_get_dasize(dp->dp_spa, &bp); + bp_get_dsize_sync(dp->dp_spa, &bp); } } else { - used += bp_get_dasize(dp->dp_spa, &bp); + used += bp_get_dsize_sync(dp->dp_spa, &bp); compressed += BP_GET_PSIZE(&bp); uncompressed += BP_GET_UCSIZE(&bp); - /* XXX check return value? 
*/ - (void) dsl_free(zio, dp, tx->tx_txg, - &bp, NULL, NULL, ARC_NOWAIT); + dsl_free(dp, tx->tx_txg, &bp); } } @@ -1649,17 +1742,18 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) * freed all the objects in open context. */ ka.ds = ds; - ka.zio = zio; ka.tx = tx; err = traverse_dataset(ds, ds->ds_phys->ds_prev_snap_txg, TRAVERSE_POST, kill_blkptr, &ka); ASSERT3U(err, ==, 0); - ASSERT(spa_version(dp->dp_spa) < SPA_VERSION_UNIQUE_ACCURATE || + ASSERT(!DS_UNIQUE_IS_ACCURATE(ds) || ds->ds_phys->ds_unique_bytes == 0); - } - err = zio_wait(zio); - ASSERT3U(err, ==, 0); + if (ds->ds_prev != NULL) { + dsl_dataset_rele(ds->ds_prev, ds); + ds->ds_prev = ds_prev = NULL; + } + } if (ds->ds_dir->dd_phys->dd_head_dataset_obj == ds->ds_object) { /* Erase the link in the dir */ @@ -1706,10 +1800,22 @@ dsl_dataset_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) } if (ds->ds_phys->ds_props_obj != 0) VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_props_obj, tx)); + if (ds->ds_phys->ds_userrefs_obj != 0) + VERIFY(0 == zap_destroy(mos, ds->ds_phys->ds_userrefs_obj, tx)); dsl_dir_close(ds->ds_dir, ds); ds->ds_dir = NULL; dsl_dataset_drain_refs(ds, tag); VERIFY(0 == dmu_object_free(mos, obj, tx)); + + if (dsda->rm_origin) { + /* + * Remove the origin of the clone we just destroyed. + */ + struct dsl_ds_destroyarg ndsda = {0}; + + ndsda.ds = dsda->rm_origin; + dsl_dataset_destroy_sync(&ndsda, tag, cr, tx); + } } static int @@ -1838,8 +1944,8 @@ dsl_dataset_snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) ds->ds_prev->ds_phys->ds_creation_txg); ds->ds_prev->ds_phys->ds_next_snap_obj = dsobj; } else if (next_clones_obj != 0) { - VERIFY3U(0, ==, zap_remove_int(mos, - next_clones_obj, dsphys->ds_next_snap_obj, tx)); + remove_from_next_clones(ds->ds_prev, + dsphys->ds_next_snap_obj, tx); VERIFY3U(0, ==, zap_add_int(mos, next_clones_obj, dsobj, tx)); } @@ -1881,6 +1987,8 @@ dsl_dataset_snapshot_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) dsl_pool_ds_snapshotted(ds, tx); + dsl_dir_snap_cmtime_update(ds->ds_dir); + spa_history_internal_log(LOG_DS_SNAPSHOT, dp->dp_spa, tx, cr, "dataset = %llu", dsobj); } @@ -1889,7 +1997,7 @@ void dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) { ASSERT(dmu_tx_is_syncing(tx)); - ASSERT(ds->ds_user_ptr != NULL); + ASSERT(ds->ds_objset != NULL); ASSERT(ds->ds_phys->ds_next_snap_obj == 0); /* @@ -1900,7 +2008,7 @@ dsl_dataset_sync(dsl_dataset_t *ds, zio_t *zio, dmu_tx_t *tx) ds->ds_phys->ds_fsid_guid = ds->ds_fsid_guid; dsl_dir_dirty(ds->ds_dir, tx); - dmu_objset_sync(ds->ds_user_ptr, zio, tx); + dmu_objset_sync(ds->ds_objset, zio, tx); } void @@ -1924,6 +2032,14 @@ dsl_dataset_stats(dsl_dataset_t *ds, nvlist_t *nv) ds->ds_reserved); dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_GUID, ds->ds_phys->ds_guid); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_UNIQUE, + dsl_dataset_unique(ds)); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_OBJSETID, + ds->ds_object); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_USERREFS, + ds->ds_userrefs); + dsl_prop_nvlist_add_uint64(nv, ZFS_PROP_DEFER_DESTROY, + DS_IS_DEFER_DESTROY(ds) ? 1 : 0); if (ds->ds_phys->ds_next_snap_obj) { /* @@ -1948,6 +2064,9 @@ dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat) if (ds->ds_phys->ds_next_snap_obj) { stat->dds_is_snapshot = B_TRUE; stat->dds_num_clones = ds->ds_phys->ds_num_children - 1; + } else { + stat->dds_is_snapshot = B_FALSE; + stat->dds_num_clones = 0; } /* clone origin is really a dsl_dir thing... 
*/ @@ -1959,6 +2078,8 @@ dsl_dataset_fast_stat(dsl_dataset_t *ds, dmu_objset_stats_t *stat) ds->ds_dir->dd_phys->dd_origin_obj, FTAG, &ods)); dsl_dataset_name(ods, stat->dds_origin); dsl_dataset_drop_ref(ods, FTAG); + } else { + stat->dds_origin[0] = '\0'; } rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); } @@ -2078,43 +2199,36 @@ struct renamesnaparg { }; static int -dsl_snapshot_rename_one(char *name, void *arg) +dsl_snapshot_rename_one(const char *name, void *arg) { struct renamesnaparg *ra = arg; dsl_dataset_t *ds = NULL; - char *cp; + char *snapname; int err; - cp = name + strlen(name); - *cp = '@'; - (void) strcpy(cp + 1, ra->oldsnap); + snapname = kmem_asprintf("%s@%s", name, ra->oldsnap); + (void) strlcpy(ra->failed, snapname, sizeof (ra->failed)); /* * For recursive snapshot renames the parent won't be changing * so we just pass name for both the to/from argument. */ - err = zfs_secpolicy_rename_perms(name, name, CRED()); - if (err == ENOENT) { - return (0); - } else if (err) { - (void) strcpy(ra->failed, name); - return (err); + err = zfs_secpolicy_rename_perms(snapname, snapname, CRED()); + if (err != 0) { + strfree(snapname); + return (err == ENOENT ? 0 : err); } #ifdef _KERNEL /* * For all filesystems undergoing rename, we'll need to unmount it. */ - (void) zfs_unmount_snap(name, NULL); + (void) zfs_unmount_snap(snapname, NULL); #endif - err = dsl_dataset_hold(name, ra->dstg, &ds); - *cp = '\0'; - if (err == ENOENT) { - return (0); - } else if (err) { - (void) strcpy(ra->failed, name); - return (err); - } + err = dsl_dataset_hold(snapname, ra->dstg, &ds); + strfree(snapname); + if (err != 0) + return (err == ENOENT ? 0 : err); dsl_sync_task_create(ra->dstg, dsl_dataset_snapshot_rename_check, dsl_dataset_snapshot_rename_sync, ds, ra->newsnap, 0); @@ -2130,7 +2244,7 @@ dsl_recursive_rename(char *oldname, const char *newname) dsl_sync_task_t *dst; spa_t *spa; char *cp, *fsname = spa_strdup(oldname); - int len = strlen(oldname); + int len = strlen(oldname) + 1; /* truncate the snapshot name to get the fsname */ cp = strchr(fsname, '@'); @@ -2138,7 +2252,7 @@ dsl_recursive_rename(char *oldname, const char *newname) err = spa_open(fsname, &spa, FTAG); if (err) { - kmem_free(fsname, len + 1); + kmem_free(fsname, len); return (err); } ra = kmem_alloc(sizeof (struct renamesnaparg), KM_SLEEP); @@ -2150,7 +2264,7 @@ dsl_recursive_rename(char *oldname, const char *newname) err = dmu_objset_find(fsname, dsl_snapshot_rename_one, ra, DS_FIND_CHILDREN); - kmem_free(fsname, len + 1); + kmem_free(fsname, len); if (err == 0) { err = dsl_sync_task_group_wait(ra->dstg); @@ -2161,14 +2275,15 @@ dsl_recursive_rename(char *oldname, const char *newname) dsl_dataset_t *ds = dst->dst_arg1; if (dst->dst_err) { dsl_dir_name(ds->ds_dir, ra->failed); - (void) strcat(ra->failed, "@"); - (void) strcat(ra->failed, ra->newsnap); + (void) strlcat(ra->failed, "@", sizeof (ra->failed)); + (void) strlcat(ra->failed, ra->newsnap, + sizeof (ra->failed)); } dsl_dataset_rele(ds, ra->dstg); } if (err) - (void) strcpy(oldname, ra->failed); + (void) strlcpy(oldname, ra->failed, sizeof (ra->failed)); dsl_sync_task_group_destroy(ra->dstg); kmem_free(ra, sizeof (struct renamesnaparg)); @@ -2177,7 +2292,7 @@ dsl_recursive_rename(char *oldname, const char *newname) } static int -dsl_valid_rename(char *oldname, void *arg) +dsl_valid_rename(const char *oldname, void *arg) { int delta = *(int *)arg; @@ -2199,6 +2314,7 @@ dsl_dataset_rename(char *oldname, const char *newname, boolean_t recursive) err = dsl_dir_open(oldname, FTAG, 
&dd, &tail); if (err) return (err); + if (tail == NULL) { int delta = strlen(newname) - strlen(oldname); @@ -2207,13 +2323,23 @@ dsl_dataset_rename(char *oldname, const char *newname, boolean_t recursive) err = dmu_objset_find(oldname, dsl_valid_rename, &delta, DS_FIND_CHILDREN | DS_FIND_SNAPSHOTS); - if (!err) + if (!err) { + /* + * If there are more than 2 references there may be + * holds hanging around that haven't been cleared + * out yet. + */ + if (dmu_buf_refcount(dd->dd_dbuf) > 2) + txg_wait_synced(dd->dd_pool, 0); + err = dsl_dir_rename(dd, newname); + } dsl_dir_close(dd, FTAG); return (err); } + if (tail[0] != '@') { - /* the name ended in a nonexistant component */ + /* the name ended in a nonexistent component */ dsl_dir_close(dd, FTAG); return (ENOENT); } @@ -2254,6 +2380,7 @@ struct promotearg { list_t shared_snaps, origin_snaps, clone_snaps; dsl_dataset_t *origin_origin, *origin_head; uint64_t used, comp, uncomp, unique, cloneusedsnap, originusedsnap; + char *err_ds; }; static int snaplist_space(list_t *l, uint64_t mintxg, uint64_t *spacep); @@ -2313,10 +2440,12 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) /* Check that the snapshot name does not conflict */ VERIFY(0 == dsl_dataset_get_snapname(ds)); err = dsl_dataset_snap_lookup(hds, ds->ds_snapname, &val); - if (err == 0) - return (EEXIST); + if (err == 0) { + err = EEXIST; + goto out; + } if (err != ENOENT) - return (err); + goto out; /* The very first snapshot does not have a deadlist */ if (ds->ds_phys->ds_prev_snap_obj == 0) @@ -2324,7 +2453,7 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) if (err = bplist_space(&ds->ds_deadlist, &dlused, &dlcomp, &dluncomp)) - return (err); + goto out; pa->used += dlused; pa->comp += dlcomp; pa->uncomp += dluncomp; @@ -2382,6 +2511,9 @@ dsl_dataset_promote_check(void *arg1, void *arg2, dmu_tx_t *tx) } return (0); +out: + pa->err_ds = snap->ds->ds_snapname; + return (err); } static void @@ -2419,9 +2551,7 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) /* change the origin's next clone */ if (origin_ds->ds_phys->ds_next_clones_obj) { - VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, - origin_ds->ds_phys->ds_next_clones_obj, - origin_ds->ds_phys->ds_next_snap_obj, tx)); + remove_from_next_clones(origin_ds, snap->ds->ds_object, tx); VERIFY3U(0, ==, zap_add_int(dp->dp_meta_objset, origin_ds->ds_phys->ds_next_clones_obj, oldnext_obj, tx)); @@ -2442,9 +2572,9 @@ dsl_dataset_promote_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) dsl_dataset_t *ds = snap->ds; /* unregister props as dsl_dir is changing */ - if (ds->ds_user_ptr) { - ds->ds_user_evict_func(ds, ds->ds_user_ptr); - ds->ds_user_ptr = NULL; + if (ds->ds_objset) { + dmu_objset_evict(ds->ds_objset); + ds->ds_objset = NULL; } /* move snap name entry */ VERIFY(0 == dsl_dataset_get_snapname(ds)); @@ -2572,7 +2702,7 @@ snaplist_destroy(list_t *l, boolean_t own) { struct promotenode *snap; - if (!list_link_active(&l->list_head)) + if (!l || !list_link_active(&l->list_head)) return; while ((snap = list_tail(l)) != NULL) { @@ -2596,7 +2726,7 @@ snaplist_destroy(list_t *l, boolean_t own) * NULL, indicating that the clone is not a clone of a clone). 
*/ int -dsl_dataset_promote(const char *name) +dsl_dataset_promote(const char *name, char *conflsnap) { dsl_dataset_t *ds; dsl_dir_t *dd; @@ -2667,7 +2797,9 @@ dsl_dataset_promote(const char *name) if (err == 0) { err = dsl_sync_task_do(dp, dsl_dataset_promote_check, dsl_dataset_promote_sync, ds, &pa, - 2 + 2 * doi.doi_physical_blks); + 2 + 2 * doi.doi_physical_blocks_512); + if (err && pa.err_ds && conflsnap) + (void) strncpy(conflsnap, pa.err_ds, MAXNAMELEN); } snaplist_destroy(&pa.shared_snaps, B_TRUE); @@ -2701,9 +2833,11 @@ dsl_dataset_clone_swap_check(void *arg1, void *arg2, dmu_tx_t *tx) if (csa->cds->ds_prev != csa->ohds->ds_prev) return (EINVAL); - /* cds should be the clone */ - if (csa->cds->ds_prev->ds_phys->ds_next_snap_obj != - csa->ohds->ds_object) + /* cds should be the clone (unless they are unrelated) */ + if (csa->cds->ds_prev != NULL && + csa->cds->ds_prev != csa->cds->ds_dir->dd_pool->dp_origin_snap && + csa->ohds->ds_object != + csa->cds->ds_prev->ds_phys->ds_next_snap_obj) return (EINVAL); /* the clone should be a child of the origin */ @@ -2726,6 +2860,10 @@ dsl_dataset_clone_swap_check(void *arg1, void *arg2, dmu_tx_t *tx) dsl_dir_space_available(csa->ohds->ds_dir, NULL, 0, TRUE)) return (ENOSPC); + if (csa->ohds->ds_quota != 0 && + csa->cds->ds_phys->ds_unique_bytes > csa->ohds->ds_quota) + return (EDQUOT); + return (0); } @@ -2737,27 +2875,32 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) dsl_pool_t *dp = csa->cds->ds_dir->dd_pool; ASSERT(csa->cds->ds_reserved == 0); - ASSERT(csa->cds->ds_quota == csa->ohds->ds_quota); + ASSERT(csa->ohds->ds_quota == 0 || + csa->cds->ds_phys->ds_unique_bytes <= csa->ohds->ds_quota); dmu_buf_will_dirty(csa->cds->ds_dbuf, tx); dmu_buf_will_dirty(csa->ohds->ds_dbuf, tx); - dmu_buf_will_dirty(csa->cds->ds_prev->ds_dbuf, tx); - if (csa->cds->ds_user_ptr != NULL) { - csa->cds->ds_user_evict_func(csa->cds, csa->cds->ds_user_ptr); - csa->cds->ds_user_ptr = NULL; + if (csa->cds->ds_objset != NULL) { + dmu_objset_evict(csa->cds->ds_objset); + csa->cds->ds_objset = NULL; } - if (csa->ohds->ds_user_ptr != NULL) { - csa->ohds->ds_user_evict_func(csa->ohds, - csa->ohds->ds_user_ptr); - csa->ohds->ds_user_ptr = NULL; + if (csa->ohds->ds_objset != NULL) { + dmu_objset_evict(csa->ohds->ds_objset); + csa->ohds->ds_objset = NULL; } - /* reset origin's unique bytes */ - VERIFY(0 == bplist_space_birthrange(&csa->cds->ds_deadlist, - csa->cds->ds_prev->ds_phys->ds_prev_snap_txg, UINT64_MAX, - &csa->cds->ds_prev->ds_phys->ds_unique_bytes)); + /* + * Reset origin's unique bytes, if it exists. + */ + if (csa->cds->ds_prev) { + dsl_dataset_t *origin = csa->cds->ds_prev; + dmu_buf_will_dirty(origin->ds_dbuf, tx); + VERIFY(0 == bplist_space_birthrange(&csa->cds->ds_deadlist, + origin->ds_phys->ds_prev_snap_txg, UINT64_MAX, + &origin->ds_phys->ds_unique_bytes)); + } /* swap blkptrs */ { @@ -2843,8 +2986,10 @@ dsl_dataset_clone_swap_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) } /* - * Swap 'clone' with its origin head file system. Used at the end - * of "online recv" to swizzle the file system to the new version. + * Swap 'clone' with its origin head datasets. Used at the end of "zfs + * recv" into an existing fs to swizzle the file system to the new + * version, and by "zfs rollback". Can also be used to swap two + * independent head datasets if neither has any snapshots. 
*/ int dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head, @@ -2953,62 +3098,70 @@ static int dsl_dataset_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; - uint64_t *quotap = arg2; - uint64_t new_quota = *quotap; + dsl_prop_setarg_t *psa = arg2; + int err; if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_REFQUOTA) return (ENOTSUP); - if (new_quota == 0) + if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0) + return (err); + + if (psa->psa_effective_value == 0) return (0); - if (new_quota < ds->ds_phys->ds_used_bytes || - new_quota < ds->ds_reserved) + if (psa->psa_effective_value < ds->ds_phys->ds_used_bytes || + psa->psa_effective_value < ds->ds_reserved) return (ENOSPC); return (0); } -/* ARGSUSED */ +extern void dsl_prop_set_sync(void *, void *, cred_t *, dmu_tx_t *); + void dsl_dataset_set_quota_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; - uint64_t *quotap = arg2; - uint64_t new_quota = *quotap; + dsl_prop_setarg_t *psa = arg2; + uint64_t effective_value = psa->psa_effective_value; - dmu_buf_will_dirty(ds->ds_dbuf, tx); + dsl_prop_set_sync(ds, psa, cr, tx); + DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa); - ds->ds_quota = new_quota; - - dsl_prop_set_uint64_sync(ds->ds_dir, "refquota", new_quota, cr, tx); + if (ds->ds_quota != effective_value) { + dmu_buf_will_dirty(ds->ds_dbuf, tx); + ds->ds_quota = effective_value; - spa_history_internal_log(LOG_DS_REFQUOTA, ds->ds_dir->dd_pool->dp_spa, - tx, cr, "%lld dataset = %llu ", - (longlong_t)new_quota, ds->ds_object); + spa_history_internal_log(LOG_DS_REFQUOTA, + ds->ds_dir->dd_pool->dp_spa, tx, cr, "%lld dataset = %llu ", + (longlong_t)ds->ds_quota, ds->ds_object); + } } int -dsl_dataset_set_quota(const char *dsname, uint64_t quota) +dsl_dataset_set_quota(const char *dsname, zprop_source_t source, uint64_t quota) { dsl_dataset_t *ds; + dsl_prop_setarg_t psa; int err; + dsl_prop_setarg_init_uint64(&psa, "refquota", source, "a); + err = dsl_dataset_hold(dsname, FTAG, &ds); if (err) return (err); - if (quota != ds->ds_quota) { - /* - * If someone removes a file, then tries to set the quota, we - * want to make sure the file freeing takes effect. - */ - txg_wait_open(ds->ds_dir->dd_pool, 0); + /* + * If someone removes a file, then tries to set the quota, we + * want to make sure the file freeing takes effect. 
+ */ + txg_wait_open(ds->ds_dir->dd_pool, 0); + + err = dsl_sync_task_do(ds->ds_dir->dd_pool, + dsl_dataset_set_quota_check, dsl_dataset_set_quota_sync, + ds, &psa, 0); - err = dsl_sync_task_do(ds->ds_dir->dd_pool, - dsl_dataset_set_quota_check, dsl_dataset_set_quota_sync, - ds, "a, 0); - } dsl_dataset_rele(ds, FTAG); return (err); } @@ -3017,13 +3170,10 @@ static int dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; - uint64_t *reservationp = arg2; - uint64_t new_reservation = *reservationp; - int64_t delta; + dsl_prop_setarg_t *psa = arg2; + uint64_t effective_value; uint64_t unique; - - if (new_reservation > INT64_MAX) - return (EOVERFLOW); + int err; if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_REFRESERVATION) @@ -3032,6 +3182,11 @@ dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) if (dsl_dataset_is_snapshot(ds)) return (EINVAL); + if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0) + return (err); + + effective_value = psa->psa_effective_value; + /* * If we are doing the preliminary check in open context, the * space estimates may be inaccurate. @@ -3041,15 +3196,18 @@ dsl_dataset_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) mutex_enter(&ds->ds_lock); unique = dsl_dataset_unique(ds); - delta = MAX(unique, new_reservation) - MAX(unique, ds->ds_reserved); mutex_exit(&ds->ds_lock); - if (delta > 0 && - delta > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) - return (ENOSPC); - if (delta > 0 && ds->ds_quota > 0 && - new_reservation > ds->ds_quota) - return (ENOSPC); + if (MAX(unique, effective_value) > MAX(unique, ds->ds_reserved)) { + uint64_t delta = MAX(unique, effective_value) - + MAX(unique, ds->ds_reserved); + + if (delta > dsl_dir_space_available(ds->ds_dir, NULL, 0, TRUE)) + return (ENOSPC); + if (ds->ds_quota > 0 && + effective_value > ds->ds_quota) + return (ENOSPC); + } return (0); } @@ -3060,44 +3218,546 @@ dsl_dataset_set_reservation_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; - uint64_t *reservationp = arg2; - uint64_t new_reservation = *reservationp; + dsl_prop_setarg_t *psa = arg2; + uint64_t effective_value = psa->psa_effective_value; uint64_t unique; int64_t delta; + dsl_prop_set_sync(ds, psa, cr, tx); + DSL_PROP_CHECK_PREDICTION(ds->ds_dir, psa); + dmu_buf_will_dirty(ds->ds_dbuf, tx); mutex_enter(&ds->ds_dir->dd_lock); mutex_enter(&ds->ds_lock); unique = dsl_dataset_unique(ds); - delta = MAX(0, (int64_t)(new_reservation - unique)) - + delta = MAX(0, (int64_t)(effective_value - unique)) - MAX(0, (int64_t)(ds->ds_reserved - unique)); - ds->ds_reserved = new_reservation; + ds->ds_reserved = effective_value; mutex_exit(&ds->ds_lock); dsl_dir_diduse_space(ds->ds_dir, DD_USED_REFRSRV, delta, 0, 0, tx); mutex_exit(&ds->ds_dir->dd_lock); - dsl_prop_set_uint64_sync(ds->ds_dir, "refreservation", - new_reservation, cr, tx); spa_history_internal_log(LOG_DS_REFRESERV, ds->ds_dir->dd_pool->dp_spa, tx, cr, "%lld dataset = %llu", - (longlong_t)new_reservation, ds->ds_object); + (longlong_t)effective_value, ds->ds_object); } int -dsl_dataset_set_reservation(const char *dsname, uint64_t reservation) +dsl_dataset_set_reservation(const char *dsname, zprop_source_t source, + uint64_t reservation) { dsl_dataset_t *ds; + dsl_prop_setarg_t psa; int err; + dsl_prop_setarg_init_uint64(&psa, "refreservation", source, + &reservation); + err = dsl_dataset_hold(dsname, FTAG, &ds); if (err) return (err); err = dsl_sync_task_do(ds->ds_dir->dd_pool, 
dsl_dataset_set_reservation_check, - dsl_dataset_set_reservation_sync, ds, &reservation, 0); + dsl_dataset_set_reservation_sync, ds, &psa, 0); + dsl_dataset_rele(ds, FTAG); return (err); } + +struct dsl_ds_holdarg { + dsl_sync_task_group_t *dstg; + char *htag; + char *snapname; + boolean_t recursive; + boolean_t gotone; + boolean_t temphold; + char failed[MAXPATHLEN]; +}; + +/* + * The max length of a temporary tag prefix is the number of hex digits + * required to express UINT64_MAX plus one for the hyphen. + */ +#define MAX_TAG_PREFIX_LEN 17 + +static int +dsl_dataset_user_hold_check(void *arg1, void *arg2, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + struct dsl_ds_holdarg *ha = arg2; + char *htag = ha->htag; + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + int error = 0; + + if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS) + return (ENOTSUP); + + if (!dsl_dataset_is_snapshot(ds)) + return (EINVAL); + + /* tags must be unique */ + mutex_enter(&ds->ds_lock); + if (ds->ds_phys->ds_userrefs_obj) { + error = zap_lookup(mos, ds->ds_phys->ds_userrefs_obj, htag, + 8, 1, tx); + if (error == 0) + error = EEXIST; + else if (error == ENOENT) + error = 0; + } + mutex_exit(&ds->ds_lock); + + if (error == 0 && ha->temphold && + strlen(htag) + MAX_TAG_PREFIX_LEN >= MAXNAMELEN) + error = E2BIG; + + return (error); +} + +static void +dsl_dataset_user_hold_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + struct dsl_ds_holdarg *ha = arg2; + char *htag = ha->htag; + dsl_pool_t *dp = ds->ds_dir->dd_pool; + objset_t *mos = dp->dp_meta_objset; + uint64_t now = gethrestime_sec(); + uint64_t zapobj; + + mutex_enter(&ds->ds_lock); + if (ds->ds_phys->ds_userrefs_obj == 0) { + /* + * This is the first user hold for this dataset. Create + * the userrefs zap object. 
+ */ + dmu_buf_will_dirty(ds->ds_dbuf, tx); + zapobj = ds->ds_phys->ds_userrefs_obj = + zap_create(mos, DMU_OT_USERREFS, DMU_OT_NONE, 0, tx); + } else { + zapobj = ds->ds_phys->ds_userrefs_obj; + } + ds->ds_userrefs++; + mutex_exit(&ds->ds_lock); + + VERIFY(0 == zap_add(mos, zapobj, htag, 8, 1, &now, tx)); + + if (ha->temphold) { + VERIFY(0 == dsl_pool_user_hold(dp, ds->ds_object, + htag, &now, tx)); + } + + spa_history_internal_log(LOG_DS_USER_HOLD, + dp->dp_spa, tx, cr, "<%s> temp = %d dataset = %llu", htag, + (int)ha->temphold, ds->ds_object); +} + +static int +dsl_dataset_user_hold_one(const char *dsname, void *arg) +{ + struct dsl_ds_holdarg *ha = arg; + dsl_dataset_t *ds; + int error; + char *name; + + /* alloc a buffer to hold dsname@snapname plus terminating NULL */ + name = kmem_asprintf("%s@%s", dsname, ha->snapname); + error = dsl_dataset_hold(name, ha->dstg, &ds); + strfree(name); + if (error == 0) { + ha->gotone = B_TRUE; + dsl_sync_task_create(ha->dstg, dsl_dataset_user_hold_check, + dsl_dataset_user_hold_sync, ds, ha, 0); + } else if (error == ENOENT && ha->recursive) { + error = 0; + } else { + (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); + } + return (error); +} + +int +dsl_dataset_user_hold(char *dsname, char *snapname, char *htag, + boolean_t recursive, boolean_t temphold) +{ + struct dsl_ds_holdarg *ha; + dsl_sync_task_t *dst; + spa_t *spa; + int error; + + ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP); + + (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); + + error = spa_open(dsname, &spa, FTAG); + if (error) { + kmem_free(ha, sizeof (struct dsl_ds_holdarg)); + return (error); + } + + ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); + ha->htag = htag; + ha->snapname = snapname; + ha->recursive = recursive; + ha->temphold = temphold; + if (recursive) { + error = dmu_objset_find(dsname, dsl_dataset_user_hold_one, + ha, DS_FIND_CHILDREN); + } else { + error = dsl_dataset_user_hold_one(dsname, ha); + } + if (error == 0) + error = dsl_sync_task_group_wait(ha->dstg); + + for (dst = list_head(&ha->dstg->dstg_tasks); dst; + dst = list_next(&ha->dstg->dstg_tasks, dst)) { + dsl_dataset_t *ds = dst->dst_arg1; + + if (dst->dst_err) { + dsl_dataset_name(ds, ha->failed); + *strchr(ha->failed, '@') = '\0'; + } + dsl_dataset_rele(ds, ha->dstg); + } + + if (error == 0 && recursive && !ha->gotone) + error = ENOENT; + + if (error) + (void) strlcpy(dsname, ha->failed, sizeof (ha->failed)); + + dsl_sync_task_group_destroy(ha->dstg); + kmem_free(ha, sizeof (struct dsl_ds_holdarg)); + spa_close(spa, FTAG); + return (error); +} + +struct dsl_ds_releasearg { + dsl_dataset_t *ds; + const char *htag; + boolean_t own; /* do we own or just hold ds? 
*/ +}; + +static int +dsl_dataset_release_might_destroy(dsl_dataset_t *ds, const char *htag, + boolean_t *might_destroy) +{ + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + uint64_t zapobj; + uint64_t tmp; + int error; + + *might_destroy = B_FALSE; + + mutex_enter(&ds->ds_lock); + zapobj = ds->ds_phys->ds_userrefs_obj; + if (zapobj == 0) { + /* The tag can't possibly exist */ + mutex_exit(&ds->ds_lock); + return (ESRCH); + } + + /* Make sure the tag exists */ + error = zap_lookup(mos, zapobj, htag, 8, 1, &tmp); + if (error) { + mutex_exit(&ds->ds_lock); + if (error == ENOENT) + error = ESRCH; + return (error); + } + + if (ds->ds_userrefs == 1 && ds->ds_phys->ds_num_children == 1 && + DS_IS_DEFER_DESTROY(ds)) + *might_destroy = B_TRUE; + + mutex_exit(&ds->ds_lock); + return (0); +} + +static int +dsl_dataset_user_release_check(void *arg1, void *tag, dmu_tx_t *tx) +{ + struct dsl_ds_releasearg *ra = arg1; + dsl_dataset_t *ds = ra->ds; + boolean_t might_destroy; + int error; + + if (spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_USERREFS) + return (ENOTSUP); + + error = dsl_dataset_release_might_destroy(ds, ra->htag, &might_destroy); + if (error) + return (error); + + if (might_destroy) { + struct dsl_ds_destroyarg dsda = {0}; + + if (dmu_tx_is_syncing(tx)) { + /* + * If we're not prepared to remove the snapshot, + * we can't allow the release to happen right now. + */ + if (!ra->own) + return (EBUSY); + if (ds->ds_objset) { + dmu_objset_evict(ds->ds_objset); + ds->ds_objset = NULL; + } + } + dsda.ds = ds; + dsda.releasing = B_TRUE; + return (dsl_dataset_destroy_check(&dsda, tag, tx)); + } + + return (0); +} + +static void +dsl_dataset_user_release_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) +{ + struct dsl_ds_releasearg *ra = arg1; + dsl_dataset_t *ds = ra->ds; + dsl_pool_t *dp = ds->ds_dir->dd_pool; + objset_t *mos = dp->dp_meta_objset; + uint64_t zapobj; + uint64_t dsobj = ds->ds_object; + uint64_t refs; + int error; + + mutex_enter(&ds->ds_lock); + ds->ds_userrefs--; + refs = ds->ds_userrefs; + mutex_exit(&ds->ds_lock); + error = dsl_pool_user_release(dp, ds->ds_object, ra->htag, tx); + VERIFY(error == 0 || error == ENOENT); + zapobj = ds->ds_phys->ds_userrefs_obj; + VERIFY(0 == zap_remove(mos, zapobj, ra->htag, tx)); + if (ds->ds_userrefs == 0 && ds->ds_phys->ds_num_children == 1 && + DS_IS_DEFER_DESTROY(ds)) { + struct dsl_ds_destroyarg dsda = {0}; + + ASSERT(ra->own); + dsda.ds = ds; + dsda.releasing = B_TRUE; + /* We already did the destroy_check */ + dsl_dataset_destroy_sync(&dsda, tag, cr, tx); + } + + spa_history_internal_log(LOG_DS_USER_RELEASE, + dp->dp_spa, tx, cr, "<%s> %lld dataset = %llu", + ra->htag, (longlong_t)refs, dsobj); +} + +static int +dsl_dataset_user_release_one(const char *dsname, void *arg) +{ + struct dsl_ds_holdarg *ha = arg; + struct dsl_ds_releasearg *ra; + dsl_dataset_t *ds; + int error; + void *dtag = ha->dstg; + char *name; + boolean_t own = B_FALSE; + boolean_t might_destroy; + + /* alloc a buffer to hold dsname@snapname, plus the terminating NULL */ + name = kmem_asprintf("%s@%s", dsname, ha->snapname); + error = dsl_dataset_hold(name, dtag, &ds); + strfree(name); + if (error == ENOENT && ha->recursive) + return (0); + (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); + if (error) + return (error); + + ha->gotone = B_TRUE; + + ASSERT(dsl_dataset_is_snapshot(ds)); + + error = dsl_dataset_release_might_destroy(ds, ha->htag, &might_destroy); + if (error) { + dsl_dataset_rele(ds, dtag); + return (error); + } + + if 
(might_destroy) { +#ifdef _KERNEL + error = zfs_unmount_snap(name, NULL); + if (error) { + dsl_dataset_rele(ds, dtag); + return (error); + } +#endif + if (!dsl_dataset_tryown(ds, B_TRUE, dtag)) { + dsl_dataset_rele(ds, dtag); + return (EBUSY); + } else { + own = B_TRUE; + dsl_dataset_make_exclusive(ds, dtag); + } + } + + ra = kmem_alloc(sizeof (struct dsl_ds_releasearg), KM_SLEEP); + ra->ds = ds; + ra->htag = ha->htag; + ra->own = own; + dsl_sync_task_create(ha->dstg, dsl_dataset_user_release_check, + dsl_dataset_user_release_sync, ra, dtag, 0); + + return (0); +} + +int +dsl_dataset_user_release(char *dsname, char *snapname, char *htag, + boolean_t recursive) +{ + struct dsl_ds_holdarg *ha; + dsl_sync_task_t *dst; + spa_t *spa; + int error; + +top: + ha = kmem_zalloc(sizeof (struct dsl_ds_holdarg), KM_SLEEP); + + (void) strlcpy(ha->failed, dsname, sizeof (ha->failed)); + + error = spa_open(dsname, &spa, FTAG); + if (error) { + kmem_free(ha, sizeof (struct dsl_ds_holdarg)); + return (error); + } + + ha->dstg = dsl_sync_task_group_create(spa_get_dsl(spa)); + ha->htag = htag; + ha->snapname = snapname; + ha->recursive = recursive; + if (recursive) { + error = dmu_objset_find(dsname, dsl_dataset_user_release_one, + ha, DS_FIND_CHILDREN); + } else { + error = dsl_dataset_user_release_one(dsname, ha); + } + if (error == 0) + error = dsl_sync_task_group_wait(ha->dstg); + + for (dst = list_head(&ha->dstg->dstg_tasks); dst; + dst = list_next(&ha->dstg->dstg_tasks, dst)) { + struct dsl_ds_releasearg *ra = dst->dst_arg1; + dsl_dataset_t *ds = ra->ds; + + if (dst->dst_err) + dsl_dataset_name(ds, ha->failed); + + if (ra->own) + dsl_dataset_disown(ds, ha->dstg); + else + dsl_dataset_rele(ds, ha->dstg); + + kmem_free(ra, sizeof (struct dsl_ds_releasearg)); + } + + if (error == 0 && recursive && !ha->gotone) + error = ENOENT; + + if (error && error != EBUSY) + (void) strlcpy(dsname, ha->failed, sizeof (ha->failed)); + + dsl_sync_task_group_destroy(ha->dstg); + kmem_free(ha, sizeof (struct dsl_ds_holdarg)); + spa_close(spa, FTAG); + + /* + * We can get EBUSY if we were racing with deferred destroy and + * dsl_dataset_user_release_check() hadn't done the necessary + * open context setup. We can also get EBUSY if we're racing + * with destroy and that thread is the ds_owner. Either way + * the busy condition should be transient, and we should retry + * the release operation. + */ + if (error == EBUSY) + goto top; + + return (error); +} + +/* + * Called at spa_load time to release a stale temporary user hold. 
+ */ +int +dsl_dataset_user_release_tmp(dsl_pool_t *dp, uint64_t dsobj, char *htag) +{ + dsl_dataset_t *ds; + char *snap; + char *name; + int namelen; + int error; + + rw_enter(&dp->dp_config_rwlock, RW_READER); + error = dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds); + rw_exit(&dp->dp_config_rwlock); + if (error) + return (error); + namelen = dsl_dataset_namelen(ds)+1; + name = kmem_alloc(namelen, KM_SLEEP); + dsl_dataset_name(ds, name); + dsl_dataset_rele(ds, FTAG); + + snap = strchr(name, '@'); + *snap = '\0'; + ++snap; + return (dsl_dataset_user_release(name, snap, htag, B_FALSE)); +} + +int +dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp) +{ + dsl_dataset_t *ds; + int err; + + err = dsl_dataset_hold(dsname, FTAG, &ds); + if (err) + return (err); + + VERIFY(0 == nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP)); + if (ds->ds_phys->ds_userrefs_obj != 0) { + zap_attribute_t *za; + zap_cursor_t zc; + + za = kmem_alloc(sizeof (zap_attribute_t), KM_SLEEP); + for (zap_cursor_init(&zc, ds->ds_dir->dd_pool->dp_meta_objset, + ds->ds_phys->ds_userrefs_obj); + zap_cursor_retrieve(&zc, za) == 0; + zap_cursor_advance(&zc)) { + VERIFY(0 == nvlist_add_uint64(*nvp, za->za_name, + za->za_first_integer)); + } + zap_cursor_fini(&zc); + kmem_free(za, sizeof (zap_attribute_t)); + } + dsl_dataset_rele(ds, FTAG); + return (0); +} + +/* + * Note, this fuction is used as the callback for dmu_objset_find(). We + * always return 0 so that we will continue to find and process + * inconsistent datasets, even if we encounter an error trying to + * process one of them. + */ +/* ARGSUSED */ +int +dsl_destroy_inconsistent(const char *dsname, void *arg) +{ + dsl_dataset_t *ds; + + if (dsl_dataset_own(dsname, B_TRUE, FTAG, &ds) == 0) { + if (DS_IS_INCONSISTENT(ds)) + (void) dsl_dataset_destroy(ds, FTAG, B_FALSE); + else + dsl_dataset_disown(ds, FTAG); + } + return (0); +} diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_deleg.c b/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_deleg.c index da5d15787570f..04053fdf206ec 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_deleg.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_deleg.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -66,8 +66,6 @@ * The ZAP OBJ is referred to as the jump object. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -77,8 +75,6 @@ #include #include #include -#include -#include /* for the default checksum value */ #include #include #include @@ -540,7 +536,7 @@ dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr) dsl_pool_t *dp; void *cookie; int error; - char checkflag = ZFS_DELEG_LOCAL; + char checkflag; objset_t *mos; avl_tree_t permsets; perm_set_t *setnode; @@ -563,6 +559,16 @@ dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr) return (EPERM); } + if (dsl_dataset_is_snapshot(ds)) { + /* + * Snapshots are treated as descendents only, + * local permissions do not apply. 
+ */ + checkflag = ZFS_DELEG_DESCENDENT; + } else { + checkflag = ZFS_DELEG_LOCAL; + } + avl_create(&permsets, perm_set_compare, sizeof (perm_set_t), offsetof(perm_set_t, p_node)); @@ -581,7 +587,7 @@ dsl_deleg_access(const char *dsname, const char *perm, cred_t *cr) if (dsl_prop_get_dd(dd, zfs_prop_to_name(ZFS_PROP_ZONED), - 8, 1, &zoned, NULL) != 0) + 8, 1, &zoned, NULL, B_FALSE) != 0) break; if (!zoned) break; @@ -731,5 +737,5 @@ dsl_deleg_destroy(objset_t *mos, uint64_t zapobj, dmu_tx_t *tx) boolean_t dsl_delegation_on(objset_t *os) { - return (os->os->os_spa->spa_delegation); + return (!!spa_delegation(os->os_spa)); } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_dir.c b/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_dir.c index 48d87f97f6698..a70fa8e4e9c11 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_dir.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_dir.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -96,7 +97,6 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, #endif if (dd == NULL) { dsl_dir_t *winner; - int err; dd = kmem_zalloc(sizeof (dsl_dir_t), KM_SLEEP); dd->dd_object = ddobj; @@ -108,6 +108,8 @@ dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, list_create(&dd->dd_prop_cbs, sizeof (dsl_prop_cb_record_t), offsetof(dsl_prop_cb_record_t, cbr_node)); + dsl_dir_snap_cmtime_update(dd); + if (dd->dd_phys->dd_parent_obj) { err = dsl_dir_open_obj(dp, dd->dd_phys->dd_parent_obj, NULL, dd, &dd->dd_parent); @@ -227,24 +229,11 @@ dsl_dir_namelen(dsl_dir_t *dd) return (result); } -int -dsl_dir_is_private(dsl_dir_t *dd) -{ - int rv = FALSE; - - if (dd->dd_parent && dsl_dir_is_private(dd->dd_parent)) - rv = TRUE; - if (dataset_name_hidden(dd->dd_myname)) - rv = TRUE; - return (rv); -} - - static int getcomponent(const char *path, char *component, const char **nextp) { char *p; - if (path == NULL) + if ((path == NULL) || (path[0] == '\0')) return (ENOENT); /* This would be a good place to reserve some namespace... */ p = strpbrk(path, "/@"); @@ -441,7 +430,8 @@ dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name, int dsl_dir_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx) { - dsl_dir_t *dd = arg1; + dsl_dataset_t *ds = arg1; + dsl_dir_t *dd = ds->ds_dir; dsl_pool_t *dp = dd->dd_pool; objset_t *mos = dp->dp_meta_objset; int err; @@ -470,17 +460,25 @@ dsl_dir_destroy_check(void *arg1, void *arg2, dmu_tx_t *tx) void dsl_dir_destroy_sync(void *arg1, void *tag, cred_t *cr, dmu_tx_t *tx) { - dsl_dir_t *dd = arg1; + dsl_dataset_t *ds = arg1; + dsl_dir_t *dd = ds->ds_dir; objset_t *mos = dd->dd_pool->dp_meta_objset; - uint64_t val, obj; + dsl_prop_setarg_t psa; + uint64_t value = 0; + uint64_t obj; dd_used_t t; ASSERT(RW_WRITE_HELD(&dd->dd_pool->dp_config_rwlock)); ASSERT(dd->dd_phys->dd_head_dataset_obj == 0); /* Remove our reservation. 
*/ - val = 0; - dsl_dir_set_reservation_sync(dd, &val, cr, tx); + dsl_prop_setarg_init_uint64(&psa, "reservation", + (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED), + &value); + psa.psa_effective_value = 0; /* predict default value */ + + dsl_dir_set_reservation_sync(ds, &psa, cr, tx); + ASSERT3U(dd->dd_phys->dd_used_bytes, ==, 0); ASSERT3U(dd->dd_phys->dd_reserved, ==, 0); for (t = 0; t < DD_USED_NUM; t++) @@ -662,7 +660,7 @@ dsl_dir_space_available(dsl_dir_t *dd, * dsl_pool_adjustedsize()), something is very * wrong. */ - ASSERT3U(used, <=, spa_get_space(dd->dd_pool->dp_spa)); + ASSERT3U(used, <=, spa_get_dspace(dd->dd_pool->dp_spa)); } else { /* * the lesser of the space provided by our parent and @@ -690,8 +688,9 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, { uint64_t txg = tx->tx_txg; uint64_t est_inflight, used_on_disk, quota, parent_rsrv; + uint64_t deferred = 0; struct tempreserve *tr; - int enospc = EDQUOT; + int retval = EDQUOT; int txgidx = txg & TXG_MASK; int i; uint64_t ref_rsrv = 0; @@ -717,7 +716,7 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, */ if (first && tx->tx_objset) { int error; - dsl_dataset_t *ds = tx->tx_objset->os->os_dsl_dataset; + dsl_dataset_t *ds = tx->tx_objset->os_dsl_dataset; error = dsl_dataset_check_quota(ds, checkrefquota, asize, est_inflight, &used_on_disk, &ref_rsrv); @@ -737,7 +736,8 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, quota = dd->dd_phys->dd_quota; /* - * Adjust the quota against the actual pool size at the root. + * Adjust the quota against the actual pool size at the root + * minus any outstanding deferred frees. * To ensure that it's possible to remove files from a full * pool without inducing transient overcommits, we throttle * netfree transactions against a quota that is slightly larger, @@ -746,10 +746,12 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, * removes to get through. */ if (dd->dd_parent == NULL) { + spa_t *spa = dd->dd_pool->dp_spa; uint64_t poolsize = dsl_pool_adjustedsize(dd->dd_pool, netfree); - if (poolsize < quota) { - quota = poolsize; - enospc = ENOSPC; + deferred = metaslab_class_get_deferred(spa_normal_class(spa)); + if (poolsize - deferred < quota) { + quota = poolsize - deferred; + retval = ENOSPC; } } @@ -759,15 +761,16 @@ dsl_dir_tempreserve_impl(dsl_dir_t *dd, uint64_t asize, boolean_t netfree, * on-disk is over quota and there are no pending changes (which * may free up space for us). 
*/ - if (used_on_disk + est_inflight > quota) { - if (est_inflight > 0 || used_on_disk < quota) - enospc = ERESTART; + if (used_on_disk + est_inflight >= quota) { + if (est_inflight > 0 || used_on_disk < quota || + (retval == ENOSPC && used_on_disk < quota + deferred)) + retval = ERESTART; dprintf_dd(dd, "failing: used=%lluK inflight = %lluK " "quota=%lluK tr=%lluK err=%d\n", used_on_disk>>10, est_inflight>>10, - quota>>10, asize>>10, enospc); + quota>>10, asize>>10, retval); mutex_exit(&dd->dd_lock); - return (enospc); + return (retval); } /* We need to up our estimated delta before dropping dd_lock */ @@ -1001,13 +1004,16 @@ dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta, static int dsl_dir_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx) { - dsl_dir_t *dd = arg1; - uint64_t *quotap = arg2; - uint64_t new_quota = *quotap; - int err = 0; + dsl_dataset_t *ds = arg1; + dsl_dir_t *dd = ds->ds_dir; + dsl_prop_setarg_t *psa = arg2; + int err; uint64_t towrite; - if (new_quota == 0) + if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0) + return (err); + + if (psa->psa_effective_value == 0) return (0); mutex_enter(&dd->dd_lock); @@ -1019,68 +1025,89 @@ dsl_dir_set_quota_check(void *arg1, void *arg2, dmu_tx_t *tx) */ towrite = dsl_dir_space_towrite(dd); if ((dmu_tx_is_syncing(tx) || towrite == 0) && - (new_quota < dd->dd_phys->dd_reserved || - new_quota < dd->dd_phys->dd_used_bytes + towrite)) { + (psa->psa_effective_value < dd->dd_phys->dd_reserved || + psa->psa_effective_value < dd->dd_phys->dd_used_bytes + towrite)) { err = ENOSPC; } mutex_exit(&dd->dd_lock); return (err); } +extern void dsl_prop_set_sync(void *, void *, cred_t *, dmu_tx_t *); + /* ARGSUSED */ static void dsl_dir_set_quota_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) { - dsl_dir_t *dd = arg1; - uint64_t *quotap = arg2; - uint64_t new_quota = *quotap; + dsl_dataset_t *ds = arg1; + dsl_dir_t *dd = ds->ds_dir; + dsl_prop_setarg_t *psa = arg2; + uint64_t effective_value = psa->psa_effective_value; + + dsl_prop_set_sync(ds, psa, cr, tx); + DSL_PROP_CHECK_PREDICTION(dd, psa); dmu_buf_will_dirty(dd->dd_dbuf, tx); mutex_enter(&dd->dd_lock); - dd->dd_phys->dd_quota = new_quota; + dd->dd_phys->dd_quota = effective_value; mutex_exit(&dd->dd_lock); spa_history_internal_log(LOG_DS_QUOTA, dd->dd_pool->dp_spa, tx, cr, "%lld dataset = %llu ", - (longlong_t)new_quota, dd->dd_phys->dd_head_dataset_obj); + (longlong_t)effective_value, dd->dd_phys->dd_head_dataset_obj); } int -dsl_dir_set_quota(const char *ddname, uint64_t quota) +dsl_dir_set_quota(const char *ddname, zprop_source_t source, uint64_t quota) { dsl_dir_t *dd; + dsl_dataset_t *ds; + dsl_prop_setarg_t psa; int err; - err = dsl_dir_open(ddname, FTAG, &dd, NULL); + dsl_prop_setarg_init_uint64(&psa, "quota", source, "a); + + err = dsl_dataset_hold(ddname, FTAG, &ds); if (err) return (err); - if (quota != dd->dd_phys->dd_quota) { - /* - * If someone removes a file, then tries to set the quota, we - * want to make sure the file freeing takes effect. - */ - txg_wait_open(dd->dd_pool, 0); - - err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_quota_check, - dsl_dir_set_quota_sync, dd, "a, 0); + err = dsl_dir_open(ddname, FTAG, &dd, NULL); + if (err) { + dsl_dataset_rele(ds, FTAG); + return (err); } + + ASSERT(ds->ds_dir == dd); + + /* + * If someone removes a file, then tries to set the quota, we want to + * make sure the file freeing takes effect. 
+ */ + txg_wait_open(dd->dd_pool, 0); + + err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_quota_check, + dsl_dir_set_quota_sync, ds, &psa, 0); + dsl_dir_close(dd, FTAG); + dsl_dataset_rele(ds, FTAG); return (err); } int dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) { - dsl_dir_t *dd = arg1; - uint64_t *reservationp = arg2; - uint64_t new_reservation = *reservationp; + dsl_dataset_t *ds = arg1; + dsl_dir_t *dd = ds->ds_dir; + dsl_prop_setarg_t *psa = arg2; + uint64_t effective_value; uint64_t used, avail; - int64_t delta; + int err; - if (new_reservation > INT64_MAX) - return (EOVERFLOW); + if ((err = dsl_prop_predict_sync(ds->ds_dir, psa)) != 0) + return (err); + + effective_value = psa->psa_effective_value; /* * If we are doing the preliminary check in open context, the @@ -1091,8 +1118,6 @@ dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) mutex_enter(&dd->dd_lock); used = dd->dd_phys->dd_used_bytes; - delta = MAX(used, new_reservation) - - MAX(used, dd->dd_phys->dd_reserved); mutex_exit(&dd->dd_lock); if (dd->dd_parent) { @@ -1102,11 +1127,17 @@ dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) avail = dsl_pool_adjustedsize(dd->dd_pool, B_FALSE) - used; } - if (delta > 0 && delta > avail) - return (ENOSPC); - if (delta > 0 && dd->dd_phys->dd_quota > 0 && - new_reservation > dd->dd_phys->dd_quota) - return (ENOSPC); + if (MAX(used, effective_value) > MAX(used, dd->dd_phys->dd_reserved)) { + uint64_t delta = MAX(used, effective_value) - + MAX(used, dd->dd_phys->dd_reserved); + + if (delta > avail) + return (ENOSPC); + if (dd->dd_phys->dd_quota > 0 && + effective_value > dd->dd_phys->dd_quota) + return (ENOSPC); + } + return (0); } @@ -1114,19 +1145,23 @@ dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx) static void dsl_dir_set_reservation_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) { - dsl_dir_t *dd = arg1; - uint64_t *reservationp = arg2; - uint64_t new_reservation = *reservationp; + dsl_dataset_t *ds = arg1; + dsl_dir_t *dd = ds->ds_dir; + dsl_prop_setarg_t *psa = arg2; + uint64_t effective_value = psa->psa_effective_value; uint64_t used; int64_t delta; + dsl_prop_set_sync(ds, psa, cr, tx); + DSL_PROP_CHECK_PREDICTION(dd, psa); + dmu_buf_will_dirty(dd->dd_dbuf, tx); mutex_enter(&dd->dd_lock); used = dd->dd_phys->dd_used_bytes; - delta = MAX(used, new_reservation) - + delta = MAX(used, effective_value) - MAX(used, dd->dd_phys->dd_reserved); - dd->dd_phys->dd_reserved = new_reservation; + dd->dd_phys->dd_reserved = effective_value; if (dd->dd_parent != NULL) { /* Roll up this additional usage into our ancestors */ @@ -1137,21 +1172,37 @@ dsl_dir_set_reservation_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) spa_history_internal_log(LOG_DS_RESERVATION, dd->dd_pool->dp_spa, tx, cr, "%lld dataset = %llu", - (longlong_t)new_reservation, dd->dd_phys->dd_head_dataset_obj); + (longlong_t)effective_value, dd->dd_phys->dd_head_dataset_obj); } int -dsl_dir_set_reservation(const char *ddname, uint64_t reservation) +dsl_dir_set_reservation(const char *ddname, zprop_source_t source, + uint64_t reservation) { dsl_dir_t *dd; + dsl_dataset_t *ds; + dsl_prop_setarg_t psa; int err; - err = dsl_dir_open(ddname, FTAG, &dd, NULL); + dsl_prop_setarg_init_uint64(&psa, "reservation", source, &reservation); + + err = dsl_dataset_hold(ddname, FTAG, &ds); if (err) return (err); + + err = dsl_dir_open(ddname, FTAG, &dd, NULL); + if (err) { + dsl_dataset_rele(ds, FTAG); + return (err); + } + + ASSERT(ds->ds_dir == dd); + 
err = dsl_sync_task_do(dd->dd_pool, dsl_dir_set_reservation_check, - dsl_dir_set_reservation_sync, dd, &reservation, 0); + dsl_dir_set_reservation_sync, ds, &psa, 0); + dsl_dir_close(dd, FTAG); + dsl_dataset_rele(ds, FTAG); return (err); } @@ -1329,3 +1380,26 @@ dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space) return (0); } + +timestruc_t +dsl_dir_snap_cmtime(dsl_dir_t *dd) +{ + timestruc_t t; + + mutex_enter(&dd->dd_lock); + t = dd->dd_snap_cmtime; + mutex_exit(&dd->dd_lock); + + return (t); +} + +void +dsl_dir_snap_cmtime_update(dsl_dir_t *dd) +{ + timestruc_t t; + + gethrestime(&t); + mutex_enter(&dd->dd_lock); + dd->dd_snap_cmtime = t; + mutex_exit(&dd->dd_lock); +} diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_pool.c b/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_pool.c index dacc57c81c254..a4ca02e54fa83 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_pool.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_pool.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -39,7 +39,7 @@ int zfs_no_write_throttle = 0; int zfs_write_limit_shift = 3; /* 1/8th of physical memory */ -int zfs_txg_synctime = 5; /* target secs to sync a txg */ +int zfs_txg_synctime_ms = 5000; /* target millisecs to sync a txg */ uint64_t zfs_write_limit_min = 32 << 20; /* min write limit is 32MB */ uint64_t zfs_write_limit_max = 0; /* max data payload per txg */ @@ -90,6 +90,9 @@ dsl_pool_open_impl(spa_t *spa, uint64_t txg) mutex_init(&dp->dp_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&dp->dp_scrub_cancel_lock, NULL, MUTEX_DEFAULT, NULL); + dp->dp_vnrele_taskq = taskq_create("zfs_vn_rele_taskq", 1, minclsyspri, + 1, 4, 0); + return (dp); } @@ -100,13 +103,12 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) dsl_pool_t *dp = dsl_pool_open_impl(spa, txg); dsl_dir_t *dd; dsl_dataset_t *ds; - objset_impl_t *osi; rw_enter(&dp->dp_config_rwlock, RW_WRITER); - err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp, &osi); + err = dmu_objset_open_impl(spa, NULL, &dp->dp_meta_rootbp, + &dp->dp_meta_objset); if (err) goto out; - dp->dp_meta_objset = &osi->os; err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ROOT_DATASET, sizeof (uint64_t), 1, @@ -129,16 +131,25 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) goto out; err = dsl_dataset_hold_obj(dp, dd->dd_phys->dd_head_dataset_obj, FTAG, &ds); + if (err == 0) { + err = dsl_dataset_hold_obj(dp, + ds->ds_phys->ds_prev_snap_obj, dp, + &dp->dp_origin_snap); + dsl_dataset_rele(ds, FTAG); + } + dsl_dir_close(dd, dp); if (err) goto out; - err = dsl_dataset_hold_obj(dp, ds->ds_phys->ds_prev_snap_obj, - dp, &dp->dp_origin_snap); - if (err) - goto out; - dsl_dataset_rele(ds, FTAG); - dsl_dir_close(dd, dp); } + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_TMP_USERREFS, sizeof (uint64_t), 1, + &dp->dp_tmp_userrefs_obj); + if (err == ENOENT) + err = 0; + if (err) + goto out; + /* get scrub status */ err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCRUB_FUNC, sizeof (uint32_t), 1, @@ -160,10 +171,22 @@ dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp) if (err) goto out; err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4, + DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), + sizeof (dp->dp_scrub_bookmark) / sizeof 
(uint64_t), &dp->dp_scrub_bookmark); if (err) goto out; + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_DDT_BOOKMARK, sizeof (uint64_t), + sizeof (dp->dp_scrub_ddt_bookmark) / sizeof (uint64_t), + &dp->dp_scrub_ddt_bookmark); + if (err && err != ENOENT) + goto out; + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_DDT_CLASS_MAX, sizeof (uint64_t), 1, + &dp->dp_scrub_ddt_class_max); + if (err && err != ENOENT) + goto out; err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1, &spa->spa_scrub_errors); @@ -215,7 +238,7 @@ dsl_pool_close(dsl_pool_t *dp) /* undo the dmu_objset_open_impl(mos) from dsl_pool_open() */ if (dp->dp_meta_objset) - dmu_objset_evict(NULL, dp->dp_meta_objset->os); + dmu_objset_evict(dp->dp_meta_objset); txg_list_destroy(&dp->dp_dirty_datasets); txg_list_destroy(&dp->dp_dirty_dirs); @@ -226,6 +249,7 @@ dsl_pool_close(dsl_pool_t *dp) rw_destroy(&dp->dp_config_rwlock); mutex_destroy(&dp->dp_lock); mutex_destroy(&dp->dp_scrub_cancel_lock); + taskq_destroy(dp->dp_vnrele_taskq); if (dp->dp_blkstats) kmem_free(dp->dp_blkstats, sizeof (zfs_all_blkstats_t)); kmem_free(dp, sizeof (dsl_pool_t)); @@ -237,13 +261,13 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg) int err; dsl_pool_t *dp = dsl_pool_open_impl(spa, txg); dmu_tx_t *tx = dmu_tx_create_assigned(dp, txg); - objset_impl_t *osip; + objset_t *os; dsl_dataset_t *ds; uint64_t dsobj; /* create and open the MOS (meta-objset) */ - dp->dp_meta_objset = &dmu_objset_create_impl(spa, - NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx)->os; + dp->dp_meta_objset = dmu_objset_create_impl(spa, + NULL, &dp->dp_meta_rootbp, DMU_OST_META, tx); /* create the pool directory */ err = zap_create_claim(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, @@ -268,10 +292,10 @@ dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg) /* create the root objset */ VERIFY(0 == dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); - osip = dmu_objset_create_impl(dp->dp_spa, ds, + os = dmu_objset_create_impl(dp->dp_spa, ds, dsl_dataset_get_blkptr(ds), DMU_OST_ZFS, tx); #ifdef _KERNEL - zfs_create_fs(&osip->os, kcred, zplprops, tx); + zfs_create_fs(os, kcred, zplprops, tx); #endif dsl_dataset_rele(ds, FTAG); @@ -288,7 +312,7 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) dsl_dir_t *dd; dsl_dataset_t *ds; dsl_sync_task_group_t *dstg; - objset_impl_t *mosi = dp->dp_meta_objset->os; + objset_t *mos = dp->dp_meta_objset; hrtime_t start, write_time; uint64_t data_written; int err; @@ -296,24 +320,61 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) tx = dmu_tx_create_assigned(dp, txg); dp->dp_read_overhead = 0; + start = gethrtime(); + zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) { - if (!list_link_active(&ds->ds_synced_link)) - list_insert_tail(&dp->dp_synced_datasets, ds); - else - dmu_buf_rele(ds->ds_dbuf, ds); + /* + * We must not sync any non-MOS datasets twice, because + * we may have taken a snapshot of them. However, we + * may sync newly-created datasets on pass 2. 
+ */ + ASSERT(!list_link_active(&ds->ds_synced_link)); + list_insert_tail(&dp->dp_synced_datasets, ds); dsl_dataset_sync(ds, zio, tx); } DTRACE_PROBE(pool_sync__1setup); - - start = gethrtime(); err = zio_wait(zio); + write_time = gethrtime() - start; ASSERT(err == 0); DTRACE_PROBE(pool_sync__2rootzio); - while (dstg = txg_list_remove(&dp->dp_sync_tasks, txg)) + for (ds = list_head(&dp->dp_synced_datasets); ds; + ds = list_next(&dp->dp_synced_datasets, ds)) + dmu_objset_do_userquota_callbacks(ds->ds_objset, tx); + + /* + * Sync the datasets again to push out the changes due to + * userquota updates. This must be done before we process the + * sync tasks, because that could cause a snapshot of a dataset + * whose ds_bp will be rewritten when we do this 2nd sync. + */ + zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); + while (ds = txg_list_remove(&dp->dp_dirty_datasets, txg)) { + ASSERT(list_link_active(&ds->ds_synced_link)); + dmu_buf_rele(ds->ds_dbuf, ds); + dsl_dataset_sync(ds, zio, tx); + } + err = zio_wait(zio); + + /* + * If anything was added to a deadlist during a zio done callback, + * it had to be put on the deferred queue. Enqueue it for real now. + */ + for (ds = list_head(&dp->dp_synced_datasets); ds; + ds = list_next(&dp->dp_synced_datasets, ds)) + bplist_sync(&ds->ds_deadlist, + bplist_enqueue_cb, &ds->ds_deadlist, tx); + + while (dstg = txg_list_remove(&dp->dp_sync_tasks, txg)) { + /* + * No more sync tasks should have been added while we + * were syncing. + */ + ASSERT(spa_sync_pass(dp->dp_spa) == 1); dsl_sync_task_group_sync(dstg, tx); + } DTRACE_PROBE(pool_sync__3task); start = gethrtime(); @@ -321,14 +382,18 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) dsl_dir_sync(dd, tx); write_time += gethrtime() - start; - if (spa_sync_pass(dp->dp_spa) == 1) + if (spa_sync_pass(dp->dp_spa) == 1) { + dp->dp_scrub_prefetch_zio_root = zio_root(dp->dp_spa, NULL, + NULL, ZIO_FLAG_CANFAIL); dsl_pool_scrub_sync(dp, tx); + (void) zio_wait(dp->dp_scrub_prefetch_zio_root); + } start = gethrtime(); - if (list_head(&mosi->os_dirty_dnodes[txg & TXG_MASK]) != NULL || - list_head(&mosi->os_free_dnodes[txg & TXG_MASK]) != NULL) { + if (list_head(&mos->os_dirty_dnodes[txg & TXG_MASK]) != NULL || + list_head(&mos->os_free_dnodes[txg & TXG_MASK]) != NULL) { zio = zio_root(dp->dp_spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); - dmu_objset_sync(mosi, zio, tx); + dmu_objset_sync(mos, zio, tx); err = zio_wait(zio); ASSERT(err == 0); dprintf_bp(&dp->dp_meta_rootbp, "meta objset rootbp is %s", ""); @@ -366,10 +431,14 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) * amount of write traffic allowed into each transaction group. 
* Weight the throughput calculation towards the current value: * thru = 3/4 old_thru + 1/4 new_thru + * + * Note: write_time is in nanosecs, so write_time/MICROSEC + * yields millisecs */ ASSERT(zfs_write_limit_min > 0); - if (data_written > zfs_write_limit_min / 8 && write_time > 0) { - uint64_t throughput = (data_written * NANOSEC) / write_time; + if (data_written > zfs_write_limit_min / 8 && write_time > MICROSEC) { + uint64_t throughput = data_written / (write_time / MICROSEC); + if (dp->dp_throughput) dp->dp_throughput = throughput / 4 + 3 * dp->dp_throughput / 4; @@ -377,21 +446,24 @@ dsl_pool_sync(dsl_pool_t *dp, uint64_t txg) dp->dp_throughput = throughput; dp->dp_write_limit = MIN(zfs_write_limit_inflated, MAX(zfs_write_limit_min, - dp->dp_throughput * zfs_txg_synctime)); + dp->dp_throughput * zfs_txg_synctime_ms)); } } void -dsl_pool_zil_clean(dsl_pool_t *dp) +dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg) { dsl_dataset_t *ds; + objset_t *os; while (ds = list_head(&dp->dp_synced_datasets)) { list_remove(&dp->dp_synced_datasets, ds); - ASSERT(ds->ds_user_ptr != NULL); - zil_clean(((objset_impl_t *)ds->ds_user_ptr)->os_zil); + os = ds->ds_objset; + zil_clean(os->os_zil); + ASSERT(!dmu_objset_is_dirty(os, txg)); dmu_buf_rele(ds->ds_dbuf, ds); } + ASSERT(!dmu_objset_is_dirty(dp->dp_meta_objset, txg)); } /* @@ -568,6 +640,7 @@ upgrade_clones_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) ASSERT(ds->ds_phys->ds_prev_snap_obj == prev->ds_object); if (prev->ds_phys->ds_next_clones_obj == 0) { + dmu_buf_will_dirty(prev->ds_dbuf, tx); prev->ds_phys->ds_next_clones_obj = zap_create(dp->dp_meta_objset, DMU_OT_NEXT_CLONES, DMU_OT_NONE, 0, tx); @@ -587,8 +660,8 @@ dsl_pool_upgrade_clones(dsl_pool_t *dp, dmu_tx_t *tx) ASSERT(dmu_tx_is_syncing(tx)); ASSERT(dp->dp_origin_snap != NULL); - (void) dmu_objset_find_spa(dp->dp_spa, NULL, upgrade_clones_cb, - tx, DS_FIND_CHILDREN); + VERIFY3U(0, ==, dmu_objset_find_spa(dp->dp_spa, NULL, upgrade_clones_cb, + tx, DS_FIND_CHILDREN)); } void @@ -611,3 +684,114 @@ dsl_pool_create_origin(dsl_pool_t *dp, dmu_tx_t *tx) dsl_dataset_rele(ds, FTAG); rw_exit(&dp->dp_config_rwlock); } + +taskq_t * +dsl_pool_vnrele_taskq(dsl_pool_t *dp) +{ + return (dp->dp_vnrele_taskq); +} + +/* + * Walk through the pool-wide zap object of temporary snapshot user holds + * and release them. + */ +void +dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp) +{ + zap_attribute_t za; + zap_cursor_t zc; + objset_t *mos = dp->dp_meta_objset; + uint64_t zapobj = dp->dp_tmp_userrefs_obj; + + if (zapobj == 0) + return; + ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); + + for (zap_cursor_init(&zc, mos, zapobj); + zap_cursor_retrieve(&zc, &za) == 0; + zap_cursor_advance(&zc)) { + char *htag; + uint64_t dsobj; + + htag = strchr(za.za_name, '-'); + *htag = '\0'; + ++htag; + dsobj = strtonum(za.za_name, NULL); + (void) dsl_dataset_user_release_tmp(dp, dsobj, htag); + } + zap_cursor_fini(&zc); +} + +/* + * Create the pool-wide zap object for storing temporary snapshot holds. 
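+ *
+ * A sketch of an entry's layout (hypothetical name, matching the
+ * "%llx-%s" format used by dsl_pool_user_hold_rele_impl() below):
+ *
+ *	"1a3-mytag" -> caller-supplied uint64_t (the hold's timestamp)
+ *
+ * dsl_pool_clean_tmp_userrefs() above splits such a name at the first
+ * '-' to recover the dataset object number and the tag.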
+ */ +void +dsl_pool_user_hold_create_obj(dsl_pool_t *dp, dmu_tx_t *tx) +{ + objset_t *mos = dp->dp_meta_objset; + + ASSERT(dp->dp_tmp_userrefs_obj == 0); + ASSERT(dmu_tx_is_syncing(tx)); + + dp->dp_tmp_userrefs_obj = zap_create(mos, DMU_OT_USERREFS, + DMU_OT_NONE, 0, tx); + + VERIFY(zap_add(mos, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_TMP_USERREFS, + sizeof (uint64_t), 1, &dp->dp_tmp_userrefs_obj, tx) == 0); +} + +static int +dsl_pool_user_hold_rele_impl(dsl_pool_t *dp, uint64_t dsobj, + const char *tag, uint64_t *now, dmu_tx_t *tx, boolean_t holding) +{ + objset_t *mos = dp->dp_meta_objset; + uint64_t zapobj = dp->dp_tmp_userrefs_obj; + char *name; + int error; + + ASSERT(spa_version(dp->dp_spa) >= SPA_VERSION_USERREFS); + ASSERT(dmu_tx_is_syncing(tx)); + + /* + * If the pool was created prior to SPA_VERSION_USERREFS, the + * zap object for temporary holds might not exist yet. + */ + if (zapobj == 0) { + if (holding) { + dsl_pool_user_hold_create_obj(dp, tx); + zapobj = dp->dp_tmp_userrefs_obj; + } else { + return (ENOENT); + } + } + + name = kmem_asprintf("%llx-%s", (u_longlong_t)dsobj, tag); + if (holding) + error = zap_add(mos, zapobj, name, 8, 1, now, tx); + else + error = zap_remove(mos, zapobj, name, tx); + strfree(name); + + return (error); +} + +/* + * Add a temporary hold for the given dataset object and tag. + */ +int +dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, const char *tag, + uint64_t *now, dmu_tx_t *tx) +{ + return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, now, tx, B_TRUE)); +} + +/* + * Release a temporary hold for the given dataset object and tag. + */ +int +dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, const char *tag, + dmu_tx_t *tx) +{ + return (dsl_pool_user_hold_rele_impl(dp, dsobj, tag, NULL, + tx, B_FALSE)); +} diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_prop.c b/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_prop.c index 212acbbc59688..f27305c953229 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_prop.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_prop.c @@ -19,12 +19,11 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - +#include #include #include #include @@ -33,14 +32,16 @@ #include #include #include -#include /* for the default checksum value */ #include #include #include "zfs_prop.h" +#define ZPROP_INHERIT_SUFFIX "$inherit" +#define ZPROP_RECVD_SUFFIX "$recvd" + static int -dodefault(const char *propname, int intsz, int numint, void *buf) +dodefault(const char *propname, int intsz, int numints, void *buf) { zfs_prop_t prop; @@ -57,9 +58,9 @@ dodefault(const char *propname, int intsz, int numint, void *buf) if (intsz != 1) return (EOVERFLOW); (void) strncpy(buf, zfs_prop_default_string(prop), - numint); + numints); } else { - if (intsz != 8 || numint < 1) + if (intsz != 8 || numints < 1) return (EOVERFLOW); *(uint64_t *)buf = zfs_prop_default_numeric(prop); @@ -70,11 +71,16 @@ dodefault(const char *propname, int intsz, int numint, void *buf) int dsl_prop_get_dd(dsl_dir_t *dd, const char *propname, - int intsz, int numint, void *buf, char *setpoint) + int intsz, int numints, void *buf, char *setpoint, boolean_t snapshot) { int err = ENOENT; + dsl_dir_t *target = dd; objset_t *mos = dd->dd_pool->dp_meta_objset; zfs_prop_t prop; + boolean_t inheritable; + boolean_t inheriting = B_FALSE; + char *inheritstr; + char *recvdstr; ASSERT(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock)); @@ -82,51 +88,135 @@ dsl_prop_get_dd(dsl_dir_t *dd, const char *propname, setpoint[0] = '\0'; prop = zfs_name_to_prop(propname); + inheritable = (prop == ZPROP_INVAL || zfs_prop_inheritable(prop)); + inheritstr = kmem_asprintf("%s%s", propname, ZPROP_INHERIT_SUFFIX); + recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX); /* - * Note: dd may be NULL, therefore we shouldn't dereference it - * ouside this loop. + * Note: dd may become NULL, therefore we shouldn't dereference it + * after this loop. */ for (; dd != NULL; dd = dd->dd_parent) { ASSERT(RW_LOCK_HELD(&dd->dd_pool->dp_config_rwlock)); - err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj, - propname, intsz, numint, buf); + + if (dd != target || snapshot) { + if (!inheritable) + break; + inheriting = B_TRUE; + } + + /* Check for a local value. */ + err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj, propname, + intsz, numints, buf); if (err != ENOENT) { - if (setpoint) + if (setpoint != NULL && err == 0) dsl_dir_name(dd, setpoint); break; } /* - * Break out of this loop for non-inheritable properties. + * Skip the check for a received value if there is an explicit + * inheritance entry. */ - if (prop != ZPROP_INVAL && !zfs_prop_inheritable(prop)) + err = zap_contains(mos, dd->dd_phys->dd_props_zapobj, + inheritstr); + if (err != 0 && err != ENOENT) break; + + if (err == ENOENT) { + /* Check for a received value. */ + err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj, + recvdstr, intsz, numints, buf); + if (err != ENOENT) { + if (setpoint != NULL && err == 0) { + if (inheriting) { + dsl_dir_name(dd, setpoint); + } else { + (void) strcpy(setpoint, + ZPROP_SOURCE_VAL_RECVD); + } + } + break; + } + } + + /* + * If we found an explicit inheritance entry, err is zero even + * though we haven't yet found the value, so reinitializing err + * at the end of the loop (instead of at the beginning) ensures + * that err has a valid post-loop value. 
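+ *
+ * To summarize the loop (a restatement, not a behavior change), the
+ * lookup order at each dsl_dir is:
+ *
+ *	1. <prop>		local value		-> done
+ *	2. <prop>$inherit	explicitly inherited	-> skip $recvd here
+ *	3. <prop>$recvd		received value		-> done
+ *	4. otherwise walk up to dd_parent (inheritable properties only),
+ *	   falling back to dodefault() if nothing is found anywhere.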
+ */ + err = ENOENT; } + if (err == ENOENT) - err = dodefault(propname, intsz, numint, buf); + err = dodefault(propname, intsz, numints, buf); + + strfree(inheritstr); + strfree(recvdstr); return (err); } int dsl_prop_get_ds(dsl_dataset_t *ds, const char *propname, - int intsz, int numint, void *buf, char *setpoint) + int intsz, int numints, void *buf, char *setpoint) { + zfs_prop_t prop = zfs_name_to_prop(propname); + boolean_t inheritable; + boolean_t snapshot; + uint64_t zapobj; + ASSERT(RW_LOCK_HELD(&ds->ds_dir->dd_pool->dp_config_rwlock)); + inheritable = (prop == ZPROP_INVAL || zfs_prop_inheritable(prop)); + snapshot = (ds->ds_phys != NULL && dsl_dataset_is_snapshot(ds)); + zapobj = (ds->ds_phys == NULL ? 0 : ds->ds_phys->ds_props_obj); - if (ds->ds_phys->ds_props_obj) { - int err = zap_lookup(ds->ds_dir->dd_pool->dp_meta_objset, - ds->ds_phys->ds_props_obj, propname, intsz, numint, buf); + if (zapobj != 0) { + objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; + int err; + + ASSERT(snapshot); + + /* Check for a local value. */ + err = zap_lookup(mos, zapobj, propname, intsz, numints, buf); if (err != ENOENT) { - if (setpoint) + if (setpoint != NULL && err == 0) dsl_dataset_name(ds, setpoint); return (err); } + + /* + * Skip the check for a received value if there is an explicit + * inheritance entry. + */ + if (inheritable) { + char *inheritstr = kmem_asprintf("%s%s", propname, + ZPROP_INHERIT_SUFFIX); + err = zap_contains(mos, zapobj, inheritstr); + strfree(inheritstr); + if (err != 0 && err != ENOENT) + return (err); + } + + if (err == ENOENT) { + /* Check for a received value. */ + char *recvdstr = kmem_asprintf("%s%s", propname, + ZPROP_RECVD_SUFFIX); + err = zap_lookup(mos, zapobj, recvdstr, + intsz, numints, buf); + strfree(recvdstr); + if (err != ENOENT) { + if (setpoint != NULL && err == 0) + (void) strcpy(setpoint, + ZPROP_SOURCE_VAL_RECVD); + return (err); + } + } } return (dsl_prop_get_dd(ds->ds_dir, propname, - intsz, numint, buf, setpoint)); + intsz, numints, buf, setpoint, snapshot)); } /* @@ -212,6 +302,137 @@ dsl_prop_get_integer(const char *ddname, const char *propname, return (dsl_prop_get(ddname, propname, 8, 1, valuep, setpoint)); } +void +dsl_prop_setarg_init_uint64(dsl_prop_setarg_t *psa, const char *propname, + zprop_source_t source, uint64_t *value) +{ + psa->psa_name = propname; + psa->psa_source = source; + psa->psa_intsz = 8; + psa->psa_numints = 1; + psa->psa_value = value; + + psa->psa_effective_value = -1ULL; +} + +/* + * Predict the effective value of the given special property if it were set with + * the given value and source. This is not a general purpose function. It exists + * only to handle the special requirements of the quota and reservation + * properties. The fact that these properties are non-inheritable greatly + * simplifies the prediction logic. + * + * Returns 0 on success, a positive error code on failure, or -1 if called with + * a property not handled by this function. 
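+ *
+ * For example (restating the switch below with hypothetical values):
+ * receiving quota=10G on a dataset that already has a local quota=20G
+ * predicts an effective value of 20G, since the local setting still
+ * wins; "zfs inherit -S quota" (ZPROP_SRC_NONE) predicts the received
+ * value if one exists and 0 otherwise.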
+ */ +int +dsl_prop_predict_sync(dsl_dir_t *dd, dsl_prop_setarg_t *psa) +{ + const char *propname = psa->psa_name; + zfs_prop_t prop = zfs_name_to_prop(propname); + zprop_source_t source = psa->psa_source; + objset_t *mos; + uint64_t zapobj; + uint64_t version; + char *recvdstr; + int err = 0; + + switch (prop) { + case ZFS_PROP_QUOTA: + case ZFS_PROP_RESERVATION: + case ZFS_PROP_REFQUOTA: + case ZFS_PROP_REFRESERVATION: + break; + default: + return (-1); + } + + mos = dd->dd_pool->dp_meta_objset; + zapobj = dd->dd_phys->dd_props_zapobj; + recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX); + + version = spa_version(dd->dd_pool->dp_spa); + if (version < SPA_VERSION_RECVD_PROPS) { + if (source & ZPROP_SRC_NONE) + source = ZPROP_SRC_NONE; + else if (source & ZPROP_SRC_RECEIVED) + source = ZPROP_SRC_LOCAL; + } + + switch (source) { + case ZPROP_SRC_NONE: + /* Revert to the received value, if any. */ + err = zap_lookup(mos, zapobj, recvdstr, 8, 1, + &psa->psa_effective_value); + if (err == ENOENT) + psa->psa_effective_value = 0; + break; + case ZPROP_SRC_LOCAL: + psa->psa_effective_value = *(uint64_t *)psa->psa_value; + break; + case ZPROP_SRC_RECEIVED: + /* + * If there's no local setting, then the new received value will + * be the effective value. + */ + err = zap_lookup(mos, zapobj, propname, 8, 1, + &psa->psa_effective_value); + if (err == ENOENT) + psa->psa_effective_value = *(uint64_t *)psa->psa_value; + break; + case (ZPROP_SRC_NONE | ZPROP_SRC_RECEIVED): + /* + * We're clearing the received value, so the local setting (if + * it exists) remains the effective value. + */ + err = zap_lookup(mos, zapobj, propname, 8, 1, + &psa->psa_effective_value); + if (err == ENOENT) + psa->psa_effective_value = 0; + break; + default: + cmn_err(CE_PANIC, "unexpected property source: %d", source); + } + + strfree(recvdstr); + + if (err == ENOENT) + return (0); + + return (err); +} + +#ifdef ZFS_DEBUG +void +dsl_prop_check_prediction(dsl_dir_t *dd, dsl_prop_setarg_t *psa) +{ + zfs_prop_t prop = zfs_name_to_prop(psa->psa_name); + uint64_t intval; + char setpoint[MAXNAMELEN]; + uint64_t version = spa_version(dd->dd_pool->dp_spa); + int err; + + if (version < SPA_VERSION_RECVD_PROPS) { + switch (prop) { + case ZFS_PROP_QUOTA: + case ZFS_PROP_RESERVATION: + return; + } + } + + err = dsl_prop_get_dd(dd, psa->psa_name, 8, 1, &intval, + setpoint, B_FALSE); + if (err == 0 && intval != psa->psa_effective_value) { + cmn_err(CE_PANIC, "%s property, source: %x, " + "predicted effective value: %llu, " + "actual effective value: %llu (setpoint: %s)", + psa->psa_name, psa->psa_source, + (unsigned long long)psa->psa_effective_value, + (unsigned long long)intval, setpoint); + } +} +#endif + /* * Unregister this callback. Return 0 on success, ENOENT if ddname is * invalid, ENOMSG if no matching callback registered. @@ -279,7 +500,6 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj, zap_cursor_t zc; zap_attribute_t *za; int err; - uint64_t dummyval; ASSERT(RW_WRITE_HELD(&dp->dp_config_rwlock)); err = dsl_dir_open_obj(dp, ddobj, NULL, FTAG, &dd); @@ -291,8 +511,7 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj, * If the prop is set here, then this change is not * being inherited here or below; stop the recursion. 
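 *
 * For example (hypothetical names): a change on pool/a is propagated
 * to callbacks registered on pool/a/b and pool/a/c, but if pool/a/b
 * has its own local setting the recursion stops there and pool/a/b's
 * descendants are not notified.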
*/ - err = zap_lookup(mos, dd->dd_phys->dd_props_zapobj, propname, - 8, 1, &dummyval); + err = zap_contains(mos, dd->dd_phys->dd_props_zapobj, propname); if (err == 0) { dsl_dir_close(dd, FTAG); return; @@ -312,8 +531,7 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj, * If the property is set on this ds, then it is not * inherited here; don't call the callback. */ - if (propobj && 0 == zap_lookup(mos, propobj, propname, - 8, 1, &dummyval)) + if (propobj && 0 == zap_contains(mos, propobj, propname)) continue; cbr->cbr_func(cbr->cbr_arg, value); @@ -333,30 +551,28 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj, dsl_dir_close(dd, FTAG); } -struct prop_set_arg { - const char *name; - int intsz; - int numints; - const void *buf; -}; - - -static void +void dsl_prop_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) { dsl_dataset_t *ds = arg1; - struct prop_set_arg *psa = arg2; + dsl_prop_setarg_t *psa = arg2; objset_t *mos = ds->ds_dir->dd_pool->dp_meta_objset; - uint64_t zapobj, intval; + uint64_t zapobj, intval, dummy; int isint; char valbuf[32]; - char *valstr; + char *valstr = NULL; + char *inheritstr; + char *recvdstr; + char *tbuf = NULL; + int err; + uint64_t version = spa_version(ds->ds_dir->dd_pool->dp_spa); + const char *propname = psa->psa_name; + zprop_source_t source = psa->psa_source; - isint = (dodefault(psa->name, 8, 1, &intval) == 0); + isint = (dodefault(propname, 8, 1, &intval) == 0); - if (dsl_dataset_is_snapshot(ds)) { - ASSERT(spa_version(ds->ds_dir->dd_pool->dp_spa) >= - SPA_VERSION_SNAP_PROPS); + if (ds->ds_phys != NULL && dsl_dataset_is_snapshot(ds)) { + ASSERT(version >= SPA_VERSION_SNAP_PROPS); if (ds->ds_phys->ds_props_obj == 0) { dmu_buf_will_dirty(ds->ds_dbuf, tx); ds->ds_phys->ds_props_obj = @@ -368,22 +584,96 @@ dsl_prop_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) zapobj = ds->ds_dir->dd_phys->dd_props_zapobj; } - if (psa->numints == 0) { - int err = zap_remove(mos, zapobj, psa->name, tx); + if (version < SPA_VERSION_RECVD_PROPS) { + zfs_prop_t prop = zfs_name_to_prop(propname); + if (prop == ZFS_PROP_QUOTA || prop == ZFS_PROP_RESERVATION) + return; + + if (source & ZPROP_SRC_NONE) + source = ZPROP_SRC_NONE; + else if (source & ZPROP_SRC_RECEIVED) + source = ZPROP_SRC_LOCAL; + } + + inheritstr = kmem_asprintf("%s%s", propname, ZPROP_INHERIT_SUFFIX); + recvdstr = kmem_asprintf("%s%s", propname, ZPROP_RECVD_SUFFIX); + + switch (source) { + case ZPROP_SRC_NONE: + /* + * revert to received value, if any (inherit -S) + * - remove propname + * - remove propname$inherit + */ + err = zap_remove(mos, zapobj, propname, tx); + ASSERT(err == 0 || err == ENOENT); + err = zap_remove(mos, zapobj, inheritstr, tx); + ASSERT(err == 0 || err == ENOENT); + break; + case ZPROP_SRC_LOCAL: + /* + * remove propname$inherit + * set propname -> value + */ + err = zap_remove(mos, zapobj, inheritstr, tx); ASSERT(err == 0 || err == ENOENT); - if (isint) { - VERIFY(0 == dsl_prop_get_ds(ds, - psa->name, 8, 1, &intval, NULL)); + VERIFY(0 == zap_update(mos, zapobj, propname, + psa->psa_intsz, psa->psa_numints, psa->psa_value, tx)); + break; + case ZPROP_SRC_INHERITED: + /* + * explicitly inherit + * - remove propname + * - set propname$inherit + */ + err = zap_remove(mos, zapobj, propname, tx); + ASSERT(err == 0 || err == ENOENT); + if (version >= SPA_VERSION_RECVD_PROPS && + zap_contains(mos, zapobj, ZPROP_HAS_RECVD) == 0) { + dummy = 0; + err = zap_update(mos, zapobj, inheritstr, + 8, 1, &dummy, tx); + ASSERT(err == 0); } - } else { - VERIFY(0 == 
zap_update(mos, zapobj, psa->name, - psa->intsz, psa->numints, psa->buf, tx)); - if (isint) - intval = *(uint64_t *)psa->buf; + break; + case ZPROP_SRC_RECEIVED: + /* + * set propname$recvd -> value + */ + err = zap_update(mos, zapobj, recvdstr, + psa->psa_intsz, psa->psa_numints, psa->psa_value, tx); + ASSERT(err == 0); + break; + case (ZPROP_SRC_NONE | ZPROP_SRC_LOCAL | ZPROP_SRC_RECEIVED): + /* + * clear local and received settings + * - remove propname + * - remove propname$inherit + * - remove propname$recvd + */ + err = zap_remove(mos, zapobj, propname, tx); + ASSERT(err == 0 || err == ENOENT); + err = zap_remove(mos, zapobj, inheritstr, tx); + ASSERT(err == 0 || err == ENOENT); + /* FALLTHRU */ + case (ZPROP_SRC_NONE | ZPROP_SRC_RECEIVED): + /* + * remove propname$recvd + */ + err = zap_remove(mos, zapobj, recvdstr, tx); + ASSERT(err == 0 || err == ENOENT); + break; + default: + cmn_err(CE_PANIC, "unexpected property source: %d", source); } + strfree(inheritstr); + strfree(recvdstr); + if (isint) { - if (dsl_dataset_is_snapshot(ds)) { + VERIFY(0 == dsl_prop_get_ds(ds, propname, 8, 1, &intval, NULL)); + + if (ds->ds_phys != NULL && dsl_dataset_is_snapshot(ds)) { dsl_prop_cb_record_t *cbr; /* * It's a snapshot; nothing can inherit this @@ -394,29 +684,84 @@ dsl_prop_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) for (cbr = list_head(&ds->ds_dir->dd_prop_cbs); cbr; cbr = list_next(&ds->ds_dir->dd_prop_cbs, cbr)) { if (cbr->cbr_ds == ds && - strcmp(cbr->cbr_propname, psa->name) == 0) + strcmp(cbr->cbr_propname, propname) == 0) cbr->cbr_func(cbr->cbr_arg, intval); } mutex_exit(&ds->ds_dir->dd_lock); } else { dsl_prop_changed_notify(ds->ds_dir->dd_pool, - ds->ds_dir->dd_object, psa->name, intval, TRUE); + ds->ds_dir->dd_object, propname, intval, TRUE); } - } - if (isint) { + (void) snprintf(valbuf, sizeof (valbuf), "%lld", (longlong_t)intval); valstr = valbuf; } else { - valstr = (char *)psa->buf; + if (source == ZPROP_SRC_LOCAL) { + valstr = (char *)psa->psa_value; + } else { + tbuf = kmem_alloc(ZAP_MAXVALUELEN, KM_SLEEP); + if (dsl_prop_get_ds(ds, propname, 1, + ZAP_MAXVALUELEN, tbuf, NULL) == 0) + valstr = tbuf; + } } - spa_history_internal_log((psa->numints == 0) ? LOG_DS_INHERIT : + + spa_history_internal_log((source == ZPROP_SRC_NONE || + source == ZPROP_SRC_INHERITED) ? LOG_DS_INHERIT : LOG_DS_PROPSET, ds->ds_dir->dd_pool->dp_spa, tx, cr, - "%s=%s dataset = %llu", psa->name, valstr, ds->ds_object); + "%s=%s dataset = %llu", propname, + (valstr == NULL ? "" : valstr), ds->ds_object); + + if (tbuf != NULL) + kmem_free(tbuf, ZAP_MAXVALUELEN); } void -dsl_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val, +dsl_props_set_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) +{ + dsl_dataset_t *ds = arg1; + dsl_props_arg_t *pa = arg2; + nvlist_t *props = pa->pa_props; + dsl_prop_setarg_t psa; + nvpair_t *elem = NULL; + + psa.psa_source = pa->pa_source; + + while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { + nvpair_t *pair = elem; + + psa.psa_name = nvpair_name(pair); + + if (nvpair_type(pair) == DATA_TYPE_NVLIST) { + /* + * dsl_prop_get_all_impl() returns properties in this + * format. 
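+ *
+ * That is, a pair may be a nested nvlist of the form (sketch):
+ *
+ *	"compression" -> { ZPROP_VALUE -> "on", ZPROP_SOURCE -> <setpoint> }
+ *
+ * in which case the embedded ZPROP_VALUE pair is the one applied below.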
+ */ + nvlist_t *attrs; + VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); + VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, + &pair) == 0); + } + + if (nvpair_type(pair) == DATA_TYPE_STRING) { + VERIFY(nvpair_value_string(pair, + (char **)&psa.psa_value) == 0); + psa.psa_intsz = 1; + psa.psa_numints = strlen(psa.psa_value) + 1; + } else { + uint64_t intval; + VERIFY(nvpair_value_uint64(pair, &intval) == 0); + psa.psa_intsz = sizeof (intval); + psa.psa_numints = 1; + psa.psa_value = &intval; + } + dsl_prop_set_sync(ds, &psa, cr, tx); + } +} + +void +dsl_dir_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val, cred_t *cr, dmu_tx_t *tx) { objset_t *mos = dd->dd_pool->dp_meta_objset; @@ -434,12 +779,13 @@ dsl_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val, } int -dsl_prop_set(const char *dsname, const char *propname, +dsl_prop_set(const char *dsname, const char *propname, zprop_source_t source, int intsz, int numints, const void *buf) { dsl_dataset_t *ds; + uint64_t version; int err; - struct prop_set_arg psa; + dsl_prop_setarg_t psa; /* * We must do these checks before we get to the syncfunc, since @@ -447,23 +793,30 @@ dsl_prop_set(const char *dsname, const char *propname, */ if (strlen(propname) >= ZAP_MAXNAMELEN) return (ENAMETOOLONG); - if (intsz * numints >= ZAP_MAXVALUELEN) - return (E2BIG); err = dsl_dataset_hold(dsname, FTAG, &ds); if (err) return (err); + version = spa_version(ds->ds_dir->dd_pool->dp_spa); + if (intsz * numints >= (version < SPA_VERSION_STMF_PROP ? + ZAP_OLDMAXVALUELEN : ZAP_MAXVALUELEN)) { + dsl_dataset_rele(ds, FTAG); + return (E2BIG); + } if (dsl_dataset_is_snapshot(ds) && - spa_version(ds->ds_dir->dd_pool->dp_spa) < SPA_VERSION_SNAP_PROPS) { + version < SPA_VERSION_SNAP_PROPS) { dsl_dataset_rele(ds, FTAG); return (ENOTSUP); } - psa.name = propname; - psa.intsz = intsz; - psa.numints = numints; - psa.buf = buf; + psa.psa_name = propname; + psa.psa_source = source; + psa.psa_intsz = intsz; + psa.psa_numints = numints; + psa.psa_value = buf; + psa.psa_effective_value = -1ULL; + err = dsl_sync_task_do(ds->ds_dir->dd_pool, NULL, dsl_prop_set_sync, ds, &psa, 2); @@ -471,122 +824,319 @@ dsl_prop_set(const char *dsname, const char *propname, return (err); } -/* - * Iterate over all properties for this dataset and return them in an nvlist. - */ int -dsl_prop_get_all(objset_t *os, nvlist_t **nvp, boolean_t local) +dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *props) { - dsl_dataset_t *ds = os->os->os_dsl_dataset; - dsl_dir_t *dd = ds->ds_dir; - boolean_t snapshot = dsl_dataset_is_snapshot(ds); - int err = 0; - dsl_pool_t *dp = dd->dd_pool; - objset_t *mos = dp->dp_meta_objset; - uint64_t propobj = ds->ds_phys->ds_props_obj; + dsl_dataset_t *ds; + uint64_t version; + nvpair_t *elem = NULL; + dsl_props_arg_t pa; + int err; - VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); + if (err = dsl_dataset_hold(dsname, FTAG, &ds)) + return (err); + /* + * Do these checks before the syncfunc, since it can't fail. + */ + version = spa_version(ds->ds_dir->dd_pool->dp_spa); + while ((elem = nvlist_next_nvpair(props, elem)) != NULL) { + if (strlen(nvpair_name(elem)) >= ZAP_MAXNAMELEN) { + dsl_dataset_rele(ds, FTAG); + return (ENAMETOOLONG); + } + if (nvpair_type(elem) == DATA_TYPE_STRING) { + char *valstr; + VERIFY(nvpair_value_string(elem, &valstr) == 0); + if (strlen(valstr) >= (version < + SPA_VERSION_STMF_PROP ? 
+ ZAP_OLDMAXVALUELEN : ZAP_MAXVALUELEN)) { + dsl_dataset_rele(ds, FTAG); + return (E2BIG); + } + } + } - if (local && snapshot && !propobj) - return (0); + if (dsl_dataset_is_snapshot(ds) && + version < SPA_VERSION_SNAP_PROPS) { + dsl_dataset_rele(ds, FTAG); + return (ENOTSUP); + } - rw_enter(&dp->dp_config_rwlock, RW_READER); - while (dd != NULL) { - char setpoint[MAXNAMELEN]; - zap_cursor_t zc; - zap_attribute_t za; - dsl_dir_t *dd_next; - - if (propobj) { - dsl_dataset_name(ds, setpoint); - dd_next = dd; - } else { - dsl_dir_name(dd, setpoint); - propobj = dd->dd_phys->dd_props_zapobj; - dd_next = dd->dd_parent; - } + pa.pa_props = props; + pa.pa_source = source; - for (zap_cursor_init(&zc, mos, propobj); - (err = zap_cursor_retrieve(&zc, &za)) == 0; - zap_cursor_advance(&zc)) { - nvlist_t *propval; - zfs_prop_t prop = zfs_name_to_prop(za.za_name); + err = dsl_sync_task_do(ds->ds_dir->dd_pool, + NULL, dsl_props_set_sync, ds, &pa, 2); - /* Skip non-inheritable properties. */ - if (prop != ZPROP_INVAL && - !zfs_prop_inheritable(prop) && - (dd != ds->ds_dir || (snapshot && dd != dd_next))) - continue; + dsl_dataset_rele(ds, FTAG); + return (err); +} - /* Skip properties not valid for this type. */ - if (snapshot && prop != ZPROP_INVAL && - !zfs_prop_valid_for_type(prop, ZFS_TYPE_SNAPSHOT)) +typedef enum dsl_prop_getflags { + DSL_PROP_GET_INHERITING = 0x1, /* searching parent of target ds */ + DSL_PROP_GET_SNAPSHOT = 0x2, /* snapshot dataset */ + DSL_PROP_GET_LOCAL = 0x4, /* local properties */ + DSL_PROP_GET_RECEIVED = 0x8 /* received properties */ +} dsl_prop_getflags_t; + +static int +dsl_prop_get_all_impl(objset_t *mos, uint64_t propobj, + const char *setpoint, dsl_prop_getflags_t flags, nvlist_t *nv) +{ + zap_cursor_t zc; + zap_attribute_t za; + int err = 0; + + for (zap_cursor_init(&zc, mos, propobj); + (err = zap_cursor_retrieve(&zc, &za)) == 0; + zap_cursor_advance(&zc)) { + nvlist_t *propval; + zfs_prop_t prop; + char buf[ZAP_MAXNAMELEN]; + char *valstr; + const char *suffix; + const char *propname; + const char *source; + + suffix = strchr(za.za_name, '$'); + + if (suffix == NULL) { + /* + * Skip local properties if we only want received + * properties. + */ + if (flags & DSL_PROP_GET_RECEIVED) continue; - /* Skip properties already defined */ - if (nvlist_lookup_nvlist(*nvp, za.za_name, - &propval) == 0) + propname = za.za_name; + source = setpoint; + } else if (strcmp(suffix, ZPROP_INHERIT_SUFFIX) == 0) { + /* Skip explicitly inherited entries. */ + continue; + } else if (strcmp(suffix, ZPROP_RECVD_SUFFIX) == 0) { + if (flags & DSL_PROP_GET_LOCAL) continue; - VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, - KM_SLEEP) == 0); - if (za.za_integer_length == 1) { - /* - * String property - */ - char *tmp = kmem_alloc(za.za_num_integers, - KM_SLEEP); - err = zap_lookup(mos, propobj, - za.za_name, 1, za.za_num_integers, tmp); - if (err != 0) { - kmem_free(tmp, za.za_num_integers); + (void) strncpy(buf, za.za_name, (suffix - za.za_name)); + buf[suffix - za.za_name] = '\0'; + propname = buf; + + if (!(flags & DSL_PROP_GET_RECEIVED)) { + /* Skip if locally overridden. */ + err = zap_contains(mos, propobj, propname); + if (err == 0) + continue; + if (err != ENOENT) + break; + + /* Skip if explicitly inherited. 
*/ + valstr = kmem_asprintf("%s%s", propname, + ZPROP_INHERIT_SUFFIX); + err = zap_contains(mos, propobj, valstr); + strfree(valstr); + if (err == 0) + continue; + if (err != ENOENT) break; - } - VERIFY(nvlist_add_string(propval, ZPROP_VALUE, - tmp) == 0); - kmem_free(tmp, za.za_num_integers); - } else { - /* - * Integer property - */ - ASSERT(za.za_integer_length == 8); - (void) nvlist_add_uint64(propval, ZPROP_VALUE, - za.za_first_integer); } - VERIFY(nvlist_add_string(propval, ZPROP_SOURCE, - setpoint) == 0); - VERIFY(nvlist_add_nvlist(*nvp, za.za_name, - propval) == 0); - nvlist_free(propval); + source = ((flags & DSL_PROP_GET_INHERITING) ? + setpoint : ZPROP_SOURCE_VAL_RECVD); + } else { + /* + * For backward compatibility, skip suffixes we don't + * recognize. + */ + continue; } - zap_cursor_fini(&zc); - if (err != ENOENT) - break; + prop = zfs_name_to_prop(propname); + + /* Skip non-inheritable properties. */ + if ((flags & DSL_PROP_GET_INHERITING) && prop != ZPROP_INVAL && + !zfs_prop_inheritable(prop)) + continue; + + /* Skip properties not valid for this type. */ + if ((flags & DSL_PROP_GET_SNAPSHOT) && prop != ZPROP_INVAL && + !zfs_prop_valid_for_type(prop, ZFS_TYPE_SNAPSHOT)) + continue; + + /* Skip properties already defined. */ + if (nvlist_exists(nv, propname)) + continue; + + VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); + if (za.za_integer_length == 1) { + /* + * String property + */ + char *tmp = kmem_alloc(za.za_num_integers, + KM_SLEEP); + err = zap_lookup(mos, propobj, + za.za_name, 1, za.za_num_integers, tmp); + if (err != 0) { + kmem_free(tmp, za.za_num_integers); + break; + } + VERIFY(nvlist_add_string(propval, ZPROP_VALUE, + tmp) == 0); + kmem_free(tmp, za.za_num_integers); + } else { + /* + * Integer property + */ + ASSERT(za.za_integer_length == 8); + (void) nvlist_add_uint64(propval, ZPROP_VALUE, + za.za_first_integer); + } + + VERIFY(nvlist_add_string(propval, ZPROP_SOURCE, source) == 0); + VERIFY(nvlist_add_nvlist(nv, propname, propval) == 0); + nvlist_free(propval); + } + zap_cursor_fini(&zc); + if (err == ENOENT) err = 0; - /* - * If we are just after the props that have been set - * locally, then we are done after the first iteration. - */ - if (local) + return (err); +} + +/* + * Iterate over all properties for this dataset and return them in an nvlist. 
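+ *
+ * A sketch of the walk (matching the code below): for a snapshot the
+ * per-snapshot props object is visited first, then the containing
+ * dsl_dir and each of its ancestors.  DSL_PROP_GET_INHERITING is set
+ * as soon as the walk leaves the dataset's own dsl_dir, so that
+ * non-inheritable ancestor properties are skipped, and the LOCAL or
+ * RECEIVED flags stop the walk at that same point.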
+ */ +static int +dsl_prop_get_all_ds(dsl_dataset_t *ds, nvlist_t **nvp, + dsl_prop_getflags_t flags) +{ + dsl_dir_t *dd = ds->ds_dir; + dsl_pool_t *dp = dd->dd_pool; + objset_t *mos = dp->dp_meta_objset; + int err = 0; + char setpoint[MAXNAMELEN]; + + VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + if (dsl_dataset_is_snapshot(ds)) + flags |= DSL_PROP_GET_SNAPSHOT; + + rw_enter(&dp->dp_config_rwlock, RW_READER); + + if (ds->ds_phys->ds_props_obj != 0) { + ASSERT(flags & DSL_PROP_GET_SNAPSHOT); + dsl_dataset_name(ds, setpoint); + err = dsl_prop_get_all_impl(mos, ds->ds_phys->ds_props_obj, + setpoint, flags, *nvp); + if (err) + goto out; + } + + for (; dd != NULL; dd = dd->dd_parent) { + if (dd != ds->ds_dir || (flags & DSL_PROP_GET_SNAPSHOT)) { + if (flags & (DSL_PROP_GET_LOCAL | + DSL_PROP_GET_RECEIVED)) + break; + flags |= DSL_PROP_GET_INHERITING; + } + dsl_dir_name(dd, setpoint); + err = dsl_prop_get_all_impl(mos, dd->dd_phys->dd_props_zapobj, + setpoint, flags, *nvp); + if (err) break; - dd = dd_next; - propobj = 0; } +out: rw_exit(&dp->dp_config_rwlock); - return (err); } +boolean_t +dsl_prop_get_hasrecvd(objset_t *os) +{ + dsl_dataset_t *ds = os->os_dsl_dataset; + int rc; + uint64_t dummy; + + rw_enter(&ds->ds_dir->dd_pool->dp_config_rwlock, RW_READER); + rc = dsl_prop_get_ds(ds, ZPROP_HAS_RECVD, 8, 1, &dummy, NULL); + rw_exit(&ds->ds_dir->dd_pool->dp_config_rwlock); + ASSERT(rc != 0 || spa_version(os->os_spa) >= SPA_VERSION_RECVD_PROPS); + return (rc == 0); +} + +static void +dsl_prop_set_hasrecvd_impl(objset_t *os, zprop_source_t source) +{ + dsl_dataset_t *ds = os->os_dsl_dataset; + uint64_t dummy = 0; + dsl_prop_setarg_t psa; + + if (spa_version(os->os_spa) < SPA_VERSION_RECVD_PROPS) + return; + + dsl_prop_setarg_init_uint64(&psa, ZPROP_HAS_RECVD, source, &dummy); + + (void) dsl_sync_task_do(ds->ds_dir->dd_pool, NULL, + dsl_prop_set_sync, ds, &psa, 2); +} + +/* + * Call after successfully receiving properties to ensure that only the first + * receive on or after SPA_VERSION_RECVD_PROPS blows away local properties. + */ +void +dsl_prop_set_hasrecvd(objset_t *os) +{ + if (dsl_prop_get_hasrecvd(os)) { + ASSERT(spa_version(os->os_spa) >= SPA_VERSION_RECVD_PROPS); + return; + } + dsl_prop_set_hasrecvd_impl(os, ZPROP_SRC_LOCAL); +} + +void +dsl_prop_unset_hasrecvd(objset_t *os) +{ + dsl_prop_set_hasrecvd_impl(os, ZPROP_SRC_NONE); +} + +int +dsl_prop_get_all(objset_t *os, nvlist_t **nvp) +{ + return (dsl_prop_get_all_ds(os->os_dsl_dataset, nvp, 0)); +} + +int +dsl_prop_get_received(objset_t *os, nvlist_t **nvp) +{ + /* + * Received properties are not distinguishable from local properties + * until the dataset has received properties on or after + * SPA_VERSION_RECVD_PROPS. + */ + dsl_prop_getflags_t flags = (dsl_prop_get_hasrecvd(os) ? + DSL_PROP_GET_RECEIVED : DSL_PROP_GET_LOCAL); + return (dsl_prop_get_all_ds(os->os_dsl_dataset, nvp, flags)); +} + void dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value) { nvlist_t *propval; + const char *propname = zfs_prop_to_name(prop); + uint64_t default_value; + + if (nvlist_lookup_nvlist(nv, propname, &propval) == 0) { + VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, value) == 0); + return; + } VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_uint64(propval, ZPROP_VALUE, value) == 0); - VERIFY(nvlist_add_nvlist(nv, zfs_prop_to_name(prop), propval) == 0); + /* Indicate the default source if we can. 
*/ + if (dodefault(propname, 8, 1, &default_value) == 0 && + value == default_value) { + VERIFY(nvlist_add_string(propval, ZPROP_SOURCE, "") == 0); + } + VERIFY(nvlist_add_nvlist(nv, propname, propval) == 0); nvlist_free(propval); } @@ -594,9 +1144,15 @@ void dsl_prop_nvlist_add_string(nvlist_t *nv, zfs_prop_t prop, const char *value) { nvlist_t *propval; + const char *propname = zfs_prop_to_name(prop); + + if (nvlist_lookup_nvlist(nv, propname, &propval) == 0) { + VERIFY(nvlist_add_string(propval, ZPROP_VALUE, value) == 0); + return; + } VERIFY(nvlist_alloc(&propval, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_string(propval, ZPROP_VALUE, value) == 0); - VERIFY(nvlist_add_nvlist(nv, zfs_prop_to_name(prop), propval) == 0); + VERIFY(nvlist_add_nvlist(nv, propname, propval) == 0); nvlist_free(propval); } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_scrub.c b/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_scrub.c index 950a91f783a47..cf7f0f42684b7 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_scrub.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_scrub.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -40,15 +40,21 @@ #include #include #include +#include +#include typedef int (scrub_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_t *); static scrub_cb_t dsl_pool_scrub_clean_cb; static dsl_syncfunc_t dsl_pool_scrub_cancel_sync; +static void scrub_visitdnode(dsl_pool_t *dp, dnode_phys_t *dnp, arc_buf_t *buf, + uint64_t objset, uint64_t object); -int zfs_scrub_min_time = 1; /* scrub for at least 1 sec each txg */ -int zfs_resilver_min_time = 3; /* resilver for at least 3 sec each txg */ +int zfs_scrub_min_time_ms = 1000; /* min millisecs to scrub per txg */ +int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */ boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */ +boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable srub prefetching */ +enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE; extern int zfs_txg_timeout; @@ -57,14 +63,6 @@ static scrub_cb_t *scrub_funcs[SCRUB_FUNC_NUMFUNCS] = { dsl_pool_scrub_clean_cb }; -#define SET_BOOKMARK(zb, objset, object, level, blkid) \ -{ \ - (zb)->zb_objset = objset; \ - (zb)->zb_object = object; \ - (zb)->zb_level = level; \ - (zb)->zb_blkid = blkid; \ -} - /* ARGSUSED */ static void dsl_pool_scrub_setup_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) @@ -82,6 +80,7 @@ dsl_pool_scrub_setup_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) dp->dp_scrub_min_txg = 0; dp->dp_scrub_max_txg = tx->tx_txg; + dp->dp_scrub_ddt_class_max = zfs_scrub_ddt_class_max; if (*funcp == SCRUB_FUNC_CLEAN) { vdev_t *rvd = dp->dp_spa->spa_root_vdev; @@ -95,6 +94,9 @@ dsl_pool_scrub_setup_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) ESC_ZFS_RESILVER_START); dp->dp_scrub_max_txg = MIN(dp->dp_scrub_max_txg, tx->tx_txg); + } else { + spa_event_notify(dp->dp_spa, NULL, + ESC_ZFS_SCRUB_START); } /* zero out the scrub stats in all vdev_stat_t's */ @@ -102,6 +104,14 @@ dsl_pool_scrub_setup_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) dp->dp_scrub_min_txg ? POOL_SCRUB_RESILVER : POOL_SCRUB_EVERYTHING, B_FALSE); + /* + * If this is an incremental scrub, limit the DDT scrub phase + * to just the auto-ditto class (for correctness); the rest + * of the scrub should go faster using top-down pruning. 
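+ *
+ * (enum ddt_class places DDT_CLASS_DITTO first, i.e. the highest
+ * replication class, and DDT_CLASS_UNIQUE last, so clamping
+ * dp_scrub_ddt_class_max to DDT_CLASS_DITTO restricts the DDT walk,
+ * which visits classes <= dp_scrub_ddt_class_max, to just the ditto
+ * blocks.)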
+ */ + if (dp->dp_scrub_min_txg > TXG_INITIAL) + dp->dp_scrub_ddt_class_max = DDT_CLASS_DITTO; + dp->dp_spa->spa_scrub_started = B_TRUE; } @@ -120,6 +130,7 @@ dsl_pool_scrub_setup_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) dp->dp_scrub_queue_obj = zap_create(dp->dp_meta_objset, ot ? ot : DMU_OT_SCRUB_QUEUE, DMU_OT_NONE, 0, tx); bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t)); + bzero(&dp->dp_scrub_ddt_bookmark, sizeof (ddt_bookmark_t)); dp->dp_scrub_restart = B_FALSE; dp->dp_spa->spa_scrub_errors = 0; @@ -136,8 +147,16 @@ dsl_pool_scrub_setup_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) DMU_POOL_SCRUB_MAX_TXG, sizeof (uint64_t), 1, &dp->dp_scrub_max_txg, tx)); VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4, + DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), + sizeof (dp->dp_scrub_bookmark) / sizeof (uint64_t), &dp->dp_scrub_bookmark, tx)); + VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_DDT_BOOKMARK, sizeof (uint64_t), + sizeof (dp->dp_scrub_ddt_bookmark) / sizeof (uint64_t), + &dp->dp_scrub_ddt_bookmark, tx)); + VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_DDT_CLASS_MAX, sizeof (uint64_t), 1, + &dp->dp_scrub_ddt_class_max, tx)); VERIFY(0 == zap_add(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1, &dp->dp_spa->spa_scrub_errors, tx)); @@ -186,6 +205,7 @@ dsl_pool_scrub_cancel_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) dp->dp_scrub_queue_obj, tx)); dp->dp_scrub_queue_obj = 0; bzero(&dp->dp_scrub_bookmark, sizeof (zbookmark_t)); + bzero(&dp->dp_scrub_ddt_bookmark, sizeof (ddt_bookmark_t)); VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCRUB_QUEUE, tx)); @@ -200,6 +220,11 @@ dsl_pool_scrub_cancel_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) VERIFY(0 == zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCRUB_ERRORS, tx)); + (void) zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_DDT_BOOKMARK, tx); + (void) zap_remove(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_DDT_CLASS_MAX, tx); + spa_history_internal_log(LOG_POOL_SCRUB_DONE, dp->dp_spa, tx, cr, "complete=%u", *completep); @@ -212,8 +237,9 @@ dsl_pool_scrub_cancel_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx) */ vdev_dtl_reassess(dp->dp_spa->spa_root_vdev, tx->tx_txg, *completep ? dp->dp_scrub_max_txg : 0, B_TRUE); - if (dp->dp_scrub_min_txg && *completep) - spa_event_notify(dp->dp_spa, NULL, ESC_ZFS_RESILVER_FINISH); + if (*completep) + spa_event_notify(dp->dp_spa, NULL, dp->dp_scrub_min_txg ? + ESC_ZFS_RESILVER_FINISH : ESC_ZFS_SCRUB_FINISH); spa_errlog_rotate(dp->dp_spa); /* @@ -235,15 +261,13 @@ dsl_pool_scrub_cancel(dsl_pool_t *dp) dsl_pool_scrub_cancel_sync, dp, &complete, 3)); } -int -dsl_free(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp, - zio_done_func_t *done, void *private, uint32_t arc_flags) +void +dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp) { /* * This function will be used by bp-rewrite wad to intercept frees. 
*/ - return (arc_free(pio, dp->dp_spa, txg, (blkptr_t *)bpp, - done, private, arc_flags)); + zio_free(dp->dp_spa, txg, bpp); } static boolean_t @@ -261,14 +285,14 @@ bookmark_is_before(dnode_phys_t *dnp, const zbookmark_t *zb1, uint64_t zb1nextL0, zb2thisobj; ASSERT(zb1->zb_objset == zb2->zb_objset); - ASSERT(zb1->zb_object != -1ULL); + ASSERT(zb1->zb_object != DMU_DEADLIST_OBJECT); ASSERT(zb2->zb_level == 0); /* * A bookmark in the deadlist is considered to be after * everything else. */ - if (zb2->zb_object == -1ULL) + if (zb2->zb_object == DMU_DEADLIST_OBJECT) return (B_TRUE); /* The objset_phys_t isn't before anything. */ @@ -281,7 +305,7 @@ bookmark_is_before(dnode_phys_t *dnp, const zbookmark_t *zb1, zb2thisobj = zb2->zb_object ? zb2->zb_object : zb2->zb_blkid << (DNODE_BLOCK_SHIFT - DNODE_SHIFT); - if (zb1->zb_object == 0) { + if (zb1->zb_object == DMU_META_DNODE_OBJECT) { uint64_t nextobj = zb1nextL0 * (dnp->dn_datablkszsec << SPA_MINBLOCKSHIFT) >> DNODE_SHIFT; return (nextobj <= zb2thisobj); @@ -291,15 +315,15 @@ bookmark_is_before(dnode_phys_t *dnp, const zbookmark_t *zb1, return (B_TRUE); if (zb1->zb_object > zb2thisobj) return (B_FALSE); - if (zb2->zb_object == 0) + if (zb2->zb_object == DMU_META_DNODE_OBJECT) return (B_FALSE); return (zb1nextL0 <= zb2->zb_blkid); } static boolean_t -scrub_pause(dsl_pool_t *dp, const zbookmark_t *zb) +scrub_pause(dsl_pool_t *dp, const zbookmark_t *zb, const ddt_bookmark_t *ddb) { - int elapsed_ticks; + uint64_t elapsed_nanosecs; int mintime; if (dp->dp_scrub_pausing) @@ -309,19 +333,31 @@ scrub_pause(dsl_pool_t *dp, const zbookmark_t *zb) return (B_FALSE); /* we're resuming */ /* We only know how to resume from level-0 blocks. */ - if (zb->zb_level != 0) + if (zb != NULL && zb->zb_level != 0) return (B_FALSE); - mintime = dp->dp_scrub_isresilver ? zfs_resilver_min_time : - zfs_scrub_min_time; - elapsed_ticks = lbolt64 - dp->dp_scrub_start_time; - if (elapsed_ticks > hz * zfs_txg_timeout || - (elapsed_ticks > hz * mintime && txg_sync_waiting(dp))) { - dprintf("pausing at %llx/%llx/%llx/%llx\n", - (longlong_t)zb->zb_objset, (longlong_t)zb->zb_object, - (longlong_t)zb->zb_level, (longlong_t)zb->zb_blkid); + mintime = dp->dp_scrub_isresilver ? 
zfs_resilver_min_time_ms : + zfs_scrub_min_time_ms; + elapsed_nanosecs = gethrtime() - dp->dp_scrub_start_time; + if (elapsed_nanosecs / NANOSEC > zfs_txg_timeout || + (elapsed_nanosecs / MICROSEC > mintime && txg_sync_waiting(dp))) { + if (zb) { + dprintf("pausing at bookmark %llx/%llx/%llx/%llx\n", + (longlong_t)zb->zb_objset, + (longlong_t)zb->zb_object, + (longlong_t)zb->zb_level, + (longlong_t)zb->zb_blkid); + dp->dp_scrub_bookmark = *zb; + } + if (ddb) { + dprintf("pausing at DDT bookmark %llx/%llx/%llx/%llx\n", + (longlong_t)ddb->ddb_class, + (longlong_t)ddb->ddb_type, + (longlong_t)ddb->ddb_checksum, + (longlong_t)ddb->ddb_cursor); + ASSERT(&dp->dp_scrub_ddt_bookmark == ddb); + } dp->dp_scrub_pausing = B_TRUE; - dp->dp_scrub_bookmark = *zb; return (B_TRUE); } return (B_FALSE); @@ -333,7 +369,7 @@ typedef struct zil_traverse_arg { } zil_traverse_arg_t; /* ARGSUSED */ -static void +static int traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) { zil_traverse_arg_t *zta = arg; @@ -342,20 +378,26 @@ traverse_zil_block(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) zbookmark_t zb; if (bp->blk_birth <= dp->dp_scrub_min_txg) - return; + return (0); + /* + * One block ("stubby") can be allocated a long time ago; we + * want to visit that one because it has been allocated + * (on-disk) even if it hasn't been claimed (even though for + * plain scrub there's nothing to do to it). + */ if (claim_txg == 0 && bp->blk_birth >= spa_first_txg(dp->dp_spa)) - return; + return (0); + + SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], + ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); - zb.zb_objset = zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET]; - zb.zb_object = 0; - zb.zb_level = -1; - zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ]; VERIFY(0 == scrub_funcs[dp->dp_scrub_func](dp, bp, &zb)); + return (0); } /* ARGSUSED */ -static void +static int traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) { if (lrc->lrc_txtype == TX_WRITE) { @@ -367,17 +409,23 @@ traverse_zil_record(zilog_t *zilog, lr_t *lrc, void *arg, uint64_t claim_txg) zbookmark_t zb; if (bp->blk_birth <= dp->dp_scrub_min_txg) - return; + return (0); + /* + * birth can be < claim_txg if this record's txg is + * already txg sync'ed (but this log block contains + * other records that are not synced) + */ if (claim_txg == 0 || bp->blk_birth < claim_txg) - return; + return (0); + + SET_BOOKMARK(&zb, zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET], + lr->lr_foid, ZB_ZIL_LEVEL, + lr->lr_offset / BP_GET_LSIZE(bp)); - zb.zb_objset = zh->zh_log.blk_cksum.zc_word[ZIL_ZC_OBJSET]; - zb.zb_object = lr->lr_foid; - zb.zb_level = BP_GET_LEVEL(bp); - zb.zb_blkid = lr->lr_offset / BP_GET_LSIZE(bp); VERIFY(0 == scrub_funcs[dp->dp_scrub_func](dp, bp, &zb)); } + return (0); } static void @@ -391,7 +439,7 @@ traverse_zil(dsl_pool_t *dp, zil_header_t *zh) * We only want to visit blocks that have been claimed but not yet * replayed (or, in read-only mode, blocks that *would* be claimed). 
*/ - if (claim_txg == 0 && (spa_mode & FWRITE)) + if (claim_txg == 0 && spa_writeable(dp->dp_spa)) return; zilog = zil_alloc(dp->dp_meta_objset, zh); @@ -402,6 +450,27 @@ traverse_zil(dsl_pool_t *dp, zil_header_t *zh) zil_free(zilog); } +static void +scrub_prefetch(dsl_pool_t *dp, arc_buf_t *buf, blkptr_t *bp, uint64_t objset, + uint64_t object, uint64_t blkid) +{ + zbookmark_t czb; + uint32_t flags = ARC_NOWAIT | ARC_PREFETCH; + + if (zfs_no_scrub_prefetch) + return; + + if (BP_IS_HOLE(bp) || bp->blk_birth <= dp->dp_scrub_min_txg || + (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE)) + return; + + SET_BOOKMARK(&czb, objset, object, BP_GET_LEVEL(bp), blkid); + + (void) arc_read(dp->dp_scrub_prefetch_zio_root, dp->dp_spa, bp, + buf, NULL, NULL, ZIO_PRIORITY_ASYNC_READ, ZIO_FLAG_CANFAIL, + &flags, &czb); +} + static void scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp, arc_buf_t *pbuf, blkptr_t *bp, const zbookmark_t *zb) @@ -409,13 +478,10 @@ scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp, int err; arc_buf_t *buf = NULL; - if (bp->blk_birth == 0) - return; - if (bp->blk_birth <= dp->dp_scrub_min_txg) return; - if (scrub_pause(dp, zb)) + if (scrub_pause(dp, zb, NULL)) return; if (!bookmark_is_zero(&dp->dp_scrub_bookmark)) { @@ -443,6 +509,13 @@ scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp, } } + /* + * If dsl_pool_scrub_ddt() has aready scrubbed this block, + * don't scrub it again. + */ + if (!ddt_class_contains(dp->dp_spa, dp->dp_scrub_ddt_class_max, bp)) + (void) scrub_funcs[dp->dp_scrub_func](dp, bp, zb); + if (BP_GET_LEVEL(bp) > 0) { uint32_t flags = ARC_WAIT; int i; @@ -458,9 +531,11 @@ scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp, mutex_exit(&dp->dp_spa->spa_scrub_lock); return; } - cbp = buf->b_data; - - for (i = 0; i < epb; i++, cbp++) { + for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) { + scrub_prefetch(dp, buf, cbp, zb->zb_objset, + zb->zb_object, zb->zb_blkid * epb + i); + } + for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) { zbookmark_t czb; SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, @@ -470,7 +545,7 @@ scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp, } } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { uint32_t flags = ARC_WAIT; - dnode_phys_t *child_dnp; + dnode_phys_t *cdnp; int i, j; int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; @@ -483,23 +558,20 @@ scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp, mutex_exit(&dp->dp_spa->spa_scrub_lock); return; } - child_dnp = buf->b_data; - - for (i = 0; i < epb; i++, child_dnp++) { - for (j = 0; j < child_dnp->dn_nblkptr; j++) { - zbookmark_t czb; - - SET_BOOKMARK(&czb, zb->zb_objset, - zb->zb_blkid * epb + i, - child_dnp->dn_nlevels - 1, j); - scrub_visitbp(dp, child_dnp, buf, - &child_dnp->dn_blkptr[j], &czb); + for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) { + for (j = 0; j < cdnp->dn_nblkptr; j++) { + blkptr_t *cbp = &cdnp->dn_blkptr[j]; + scrub_prefetch(dp, buf, cbp, zb->zb_objset, + zb->zb_blkid * epb + i, j); } } + for (i = 0, cdnp = buf->b_data; i < epb; i++, cdnp++) { + scrub_visitdnode(dp, cdnp, buf, zb->zb_objset, + zb->zb_blkid * epb + i); + } } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { uint32_t flags = ARC_WAIT; objset_phys_t *osp; - int j; err = arc_read_nolock(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf, @@ -515,27 +587,41 @@ scrub_visitbp(dsl_pool_t *dp, dnode_phys_t *dnp, traverse_zil(dp, &osp->os_zil_header); - for (j = 0; j < osp->os_meta_dnode.dn_nblkptr; j++) { - zbookmark_t czb; - - SET_BOOKMARK(&czb, zb->zb_objset, 0, - osp->os_meta_dnode.dn_nlevels - 1, j); - scrub_visitbp(dp, 
&osp->os_meta_dnode, buf, - &osp->os_meta_dnode.dn_blkptr[j], &czb); + scrub_visitdnode(dp, &osp->os_meta_dnode, + buf, zb->zb_objset, DMU_META_DNODE_OBJECT); + if (arc_buf_size(buf) >= sizeof (objset_phys_t)) { + scrub_visitdnode(dp, &osp->os_userused_dnode, + buf, zb->zb_objset, DMU_USERUSED_OBJECT); + scrub_visitdnode(dp, &osp->os_groupused_dnode, + buf, zb->zb_objset, DMU_GROUPUSED_OBJECT); } } - (void) scrub_funcs[dp->dp_scrub_func](dp, bp, zb); if (buf) (void) arc_buf_remove_ref(buf, &buf); } +static void +scrub_visitdnode(dsl_pool_t *dp, dnode_phys_t *dnp, arc_buf_t *buf, + uint64_t objset, uint64_t object) +{ + int j; + + for (j = 0; j < dnp->dn_nblkptr; j++) { + zbookmark_t czb; + + SET_BOOKMARK(&czb, objset, object, dnp->dn_nlevels - 1, j); + scrub_visitbp(dp, dnp, buf, &dnp->dn_blkptr[j], &czb); + } +} + static void scrub_visit_rootbp(dsl_pool_t *dp, dsl_dataset_t *ds, blkptr_t *bp) { zbookmark_t zb; - SET_BOOKMARK(&zb, ds ? ds->ds_object : 0, 0, -1, 0); + SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, + ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); scrub_visitbp(dp, NULL, NULL, bp, &zb); } @@ -548,7 +634,8 @@ dsl_pool_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx) return; if (dp->dp_scrub_bookmark.zb_objset == ds->ds_object) { - SET_BOOKMARK(&dp->dp_scrub_bookmark, -1, 0, 0, 0); + SET_BOOKMARK(&dp->dp_scrub_bookmark, + ZB_DESTROYED_OBJSET, 0, 0, 0); } else if (zap_remove_int(dp->dp_meta_objset, dp->dp_scrub_queue_obj, ds->ds_object, tx) != 0) { return; @@ -677,17 +764,34 @@ scrub_visitds(dsl_pool_t *dp, uint64_t dsobj, dmu_tx_t *tx) ds->ds_phys->ds_next_snap_obj, tx) == 0); } if (ds->ds_phys->ds_num_children > 1) { - if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) { + boolean_t usenext = B_FALSE; + if (ds->ds_phys->ds_next_clones_obj != 0) { + uint64_t count; + /* + * A bug in a previous version of the code could + * cause upgrade_clones_cb() to not set + * ds_next_snap_obj when it should, leading to a + * missing entry. Therefore we can only use the + * next_clones_obj when its count is correct. + */ + int err = zap_count(dp->dp_meta_objset, + ds->ds_phys->ds_next_clones_obj, &count); + if (err == 0 && + count == ds->ds_phys->ds_num_children - 1) + usenext = B_TRUE; + } + + if (usenext) { + VERIFY(zap_join(dp->dp_meta_objset, + ds->ds_phys->ds_next_clones_obj, + dp->dp_scrub_queue_obj, tx) == 0); + } else { struct enqueue_clones_arg eca; eca.tx = tx; eca.originobj = ds->ds_object; (void) dmu_objset_find_spa(ds->ds_dir->dd_pool->dp_spa, NULL, enqueue_clones_cb, &eca, DS_FIND_CHILDREN); - } else { - VERIFY(zap_join(dp->dp_meta_objset, - ds->ds_phys->ds_next_clones_obj, - dp->dp_scrub_queue_obj, tx) == 0); } } @@ -737,9 +841,78 @@ enqueue_cb(spa_t *spa, uint64_t dsobj, const char *dsname, void *arg) return (0); } +/* + * Scrub/dedup interaction. + * + * If there are N references to a deduped block, we don't want to scrub it + * N times -- ideally, we should scrub it exactly once. + * + * We leverage the fact that the dde's replication class (enum ddt_class) + * is ordered from highest replication class (DDT_CLASS_DITTO) to lowest + * (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order. + * + * To prevent excess scrubbing, the scrub begins by walking the DDT + * to find all blocks with refcnt > 1, and scrubs each of these once. + * Since there are two replication classes which contain blocks with + * refcnt > 1, we scrub the highest replication class (DDT_CLASS_DITTO) first. + * Finally the top-down scrub begins, only visiting blocks with refcnt == 1. 
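+ *
+ * Putting the two phases together, the overall visit order is: all
+ * DDT_CLASS_DITTO entries, then all DDT_CLASS_DUPLICATE entries
+ * (dsl_pool_scrub_ddt()), then the normal top-down walk, in which
+ * scrub_visitbp() consults ddt_class_contains() so that blocks
+ * already covered by the DDT phase are not scrubbed a second time.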
+ * + * There would be nothing more to say if a block's refcnt couldn't change + * during a scrub, but of course it can so we must account for changes + * in a block's replication class. + * + * Here's an example of what can occur: + * + * If a block has refcnt > 1 during the DDT scrub phase, but has refcnt == 1 + * when visited during the top-down scrub phase, it will be scrubbed twice. + * This negates our scrub optimization, but is otherwise harmless. + * + * If a block has refcnt == 1 during the DDT scrub phase, but has refcnt > 1 + * on each visit during the top-down scrub phase, it will never be scrubbed. + * To catch this, ddt_sync_entry() notifies the scrub code whenever a block's + * reference class transitions to a higher level (i.e DDT_CLASS_UNIQUE to + * DDT_CLASS_DUPLICATE); if it transitions from refcnt == 1 to refcnt > 1 + * while a scrub is in progress, it scrubs the block right then. + */ +static void +dsl_pool_scrub_ddt(dsl_pool_t *dp) +{ + ddt_bookmark_t *ddb = &dp->dp_scrub_ddt_bookmark; + ddt_entry_t dde; + int error; + + while ((error = ddt_walk(dp->dp_spa, ddb, &dde)) == 0) { + if (ddb->ddb_class > dp->dp_scrub_ddt_class_max) + return; + dsl_pool_scrub_ddt_entry(dp, ddb->ddb_checksum, &dde); + if (scrub_pause(dp, NULL, ddb)) + return; + } + ASSERT(error == ENOENT); + ASSERT(ddb->ddb_class > dp->dp_scrub_ddt_class_max); +} + +void +dsl_pool_scrub_ddt_entry(dsl_pool_t *dp, enum zio_checksum checksum, + const ddt_entry_t *dde) +{ + const ddt_key_t *ddk = &dde->dde_key; + const ddt_phys_t *ddp = dde->dde_phys; + blkptr_t blk; + zbookmark_t zb = { 0 }; + + for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { + if (ddp->ddp_phys_birth == 0) + continue; + ddt_bp_create(checksum, ddk, ddp, &blk); + scrub_funcs[dp->dp_scrub_func](dp, &blk, &zb); + } +} + void dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx) { + spa_t *spa = dp->dp_spa; zap_cursor_t zc; zap_attribute_t za; boolean_t complete = B_TRUE; @@ -747,8 +920,10 @@ dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx) if (dp->dp_scrub_func == SCRUB_FUNC_NONE) return; - /* If the spa is not fully loaded, don't bother. */ - if (dp->dp_spa->spa_load_state != SPA_LOAD_NONE) + /* + * If the pool is not loaded, or is trying to unload, leave it alone. + */ + if (spa_load_state(spa) != SPA_LOAD_NONE || spa_shutting_down(spa)) return; if (dp->dp_scrub_restart) { @@ -757,41 +932,47 @@ dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx) dsl_pool_scrub_setup_sync(dp, &func, kcred, tx); } - if (dp->dp_spa->spa_root_vdev->vdev_stat.vs_scrub_type == 0) { + if (spa->spa_root_vdev->vdev_stat.vs_scrub_type == 0) { /* * We must have resumed after rebooting; reset the vdev * stats to know that we're doing a scrub (although it * will think we're just starting now). */ - vdev_scrub_stat_update(dp->dp_spa->spa_root_vdev, + vdev_scrub_stat_update(spa->spa_root_vdev, dp->dp_scrub_min_txg ? 
POOL_SCRUB_RESILVER : POOL_SCRUB_EVERYTHING, B_FALSE); } dp->dp_scrub_pausing = B_FALSE; - dp->dp_scrub_start_time = lbolt64; + dp->dp_scrub_start_time = gethrtime(); dp->dp_scrub_isresilver = (dp->dp_scrub_min_txg != 0); - dp->dp_spa->spa_scrub_active = B_TRUE; + spa->spa_scrub_active = B_TRUE; + + if (dp->dp_scrub_ddt_bookmark.ddb_class <= dp->dp_scrub_ddt_class_max) { + dsl_pool_scrub_ddt(dp); + if (dp->dp_scrub_pausing) + goto out; + } - if (dp->dp_scrub_bookmark.zb_objset == 0) { + if (dp->dp_scrub_bookmark.zb_objset == DMU_META_OBJSET) { /* First do the MOS & ORIGIN */ scrub_visit_rootbp(dp, NULL, &dp->dp_meta_rootbp); if (dp->dp_scrub_pausing) goto out; - if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) { - VERIFY(0 == dmu_objset_find_spa(dp->dp_spa, + if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) { + VERIFY(0 == dmu_objset_find_spa(spa, NULL, enqueue_cb, tx, DS_FIND_CHILDREN)); } else { scrub_visitds(dp, dp->dp_origin_snap->ds_object, tx); } ASSERT(!dp->dp_scrub_pausing); - } else if (dp->dp_scrub_bookmark.zb_objset != -1ULL) { + } else if (dp->dp_scrub_bookmark.zb_objset != ZB_DESTROYED_OBJSET) { /* - * If we were paused, continue from here. Note if the - * ds we were paused on was deleted, the zb_objset will - * be -1, so we will skip this and find a new objset - * below. + * If we were paused, continue from here. Note if the ds + * we were paused on was destroyed, the zb_objset will be + * ZB_DESTROYED_OBJSET, so we will skip this and find a new + * objset below. */ scrub_visitds(dp, dp->dp_scrub_bookmark.zb_objset, tx); if (dp->dp_scrub_pausing) @@ -823,22 +1004,20 @@ dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx) dsl_pool_scrub_cancel_sync(dp, &complete, kcred, tx); return; out: - VERIFY(0 == zap_update(dp->dp_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), 4, + VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_BOOKMARK, sizeof (uint64_t), + sizeof (dp->dp_scrub_bookmark) / sizeof (uint64_t), &dp->dp_scrub_bookmark, tx)); - VERIFY(0 == zap_update(dp->dp_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, + VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_DDT_BOOKMARK, sizeof (uint64_t), + sizeof (dp->dp_scrub_ddt_bookmark) / sizeof (uint64_t), + &dp->dp_scrub_ddt_bookmark, tx)); + VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCRUB_DDT_CLASS_MAX, sizeof (uint64_t), 1, + &dp->dp_scrub_ddt_class_max, tx)); + VERIFY(0 == zap_update(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SCRUB_ERRORS, sizeof (uint64_t), 1, - &dp->dp_spa->spa_scrub_errors, tx)); - - /* XXX this is scrub-clean specific */ - mutex_enter(&dp->dp_spa->spa_scrub_lock); - while (dp->dp_spa->spa_scrub_inflight > 0) { - cv_wait(&dp->dp_spa->spa_scrub_io_cv, - &dp->dp_spa->spa_scrub_lock); - } - mutex_exit(&dp->dp_spa->spa_scrub_lock); + &spa->spa_scrub_errors, tx)); } void @@ -920,13 +1099,17 @@ static int dsl_pool_scrub_clean_cb(dsl_pool_t *dp, const blkptr_t *bp, const zbookmark_t *zb) { - size_t size = BP_GET_LSIZE(bp); - int d; + size_t size = BP_GET_PSIZE(bp); spa_t *spa = dp->dp_spa; + uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp); boolean_t needs_io; - int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_CANFAIL; + int zio_flags = ZIO_FLAG_SCRUB_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL; int zio_priority; + if (phys_birth <= dp->dp_scrub_min_txg || + phys_birth >= dp->dp_scrub_max_txg) + return (0); + count_block(dp->dp_blkstats, bp); if (dp->dp_scrub_isresilver 
== 0) { @@ -942,10 +1125,10 @@ dsl_pool_scrub_clean_cb(dsl_pool_t *dp, } /* If it's an intent log block, failure is expected. */ - if (zb->zb_level == -1 && BP_GET_TYPE(bp) != DMU_OT_OBJSET) + if (zb->zb_level == ZB_ZIL_LEVEL) zio_flags |= ZIO_FLAG_SPECULATIVE; - for (d = 0; d < BP_GET_NDVAS(bp); d++) { + for (int d = 0; d < BP_GET_NDVAS(bp); d++) { vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[d])); @@ -963,16 +1146,17 @@ dsl_pool_scrub_clean_cb(dsl_pool_t *dp, if (DVA_GET_GANG(&bp->blk_dva[d])) { /* * Gang members may be spread across multiple - * vdevs, so the best we can do is look at the - * pool-wide DTL. + * vdevs, so the best estimate we have is the + * scrub range, which has already been checked. * XXX -- it would be better to change our - * allocation policy to ensure that this can't - * happen. + * allocation policy to ensure that all + * gang members reside on the same vdev. */ - vd = spa->spa_root_vdev; + needs_io = B_TRUE; + } else { + needs_io = vdev_dtl_contains(vd, DTL_PARTIAL, + phys_birth, 1); } - needs_io = vdev_dtl_contains(&vd->vdev_dtl_map, - bp->blk_birth, 1); } } @@ -997,18 +1181,20 @@ dsl_pool_scrub_clean_cb(dsl_pool_t *dp, int dsl_pool_scrub_clean(dsl_pool_t *dp) { + spa_t *spa = dp->dp_spa; + /* - * Purge all vdev caches. We do this here rather than in sync - * context because this requires a writer lock on the spa_config - * lock, which we can't do from sync context. The + * Purge all vdev caches and probe all devices. We do this here + * rather than in sync context because this requires a writer lock + * on the spa_config lock, which we can't do from sync context. The * spa_scrub_reopen flag indicates that vdev_open() should not * attempt to start another scrub. */ - spa_config_enter(dp->dp_spa, SCL_ALL, FTAG, RW_WRITER); - dp->dp_spa->spa_scrub_reopen = B_TRUE; - vdev_reopen(dp->dp_spa->spa_root_vdev); - dp->dp_spa->spa_scrub_reopen = B_FALSE; - spa_config_exit(dp->dp_spa, SCL_ALL, FTAG); + spa_vdev_state_enter(spa, SCL_NONE); + spa->spa_scrub_reopen = B_TRUE; + vdev_reopen(spa->spa_root_vdev); + spa->spa_scrub_reopen = B_FALSE; + (void) spa_vdev_state_exit(spa, NULL, 0); return (dsl_pool_scrub_setup(dp, SCRUB_FUNC_CLEAN)); } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_synctask.c b/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_synctask.c index 21100225abf73..cdea979890ffa 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_synctask.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/dsl_synctask.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -118,8 +116,10 @@ dsl_sync_task_group_wait(dsl_sync_task_group_t *dstg) txg_wait_synced(dstg->dstg_pool, txg); - if (dstg->dstg_err == EAGAIN) + if (dstg->dstg_err == EAGAIN) { + txg_wait_synced(dstg->dstg_pool, txg + TXG_DEFER_SIZE); goto top; + } return (dstg->dstg_err); } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/lzjb.c b/external/cddl/osnet/dist/uts/common/fs/zfs/lzjb.c index 7fcde8475e003..10952f472b333 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/lzjb.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/lzjb.c @@ -20,18 +20,18 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * We keep our own copy of this algorithm for 2 main reasons: - * 1. If we didn't, anyone modifying common/os/compress.c would + * 1. If we didn't, anyone modifying common/os/compress.c would * directly break our on disk format - * 2. Our version of lzjb does not have a number of checks that the + * 2. Our version of lzjb does not have a number of checks that the * common/os version needs and uses + * 3. We initialize the lempel to ensure deterministic results, + * so that identical blocks can always be deduplicated. * In particular, we are adding the "feature" that compress() can * take a destination buffer size and return -1 if the data will not * compress to d_len or less. @@ -43,7 +43,7 @@ #define MATCH_MIN 3 #define MATCH_MAX ((1 << MATCH_BITS) + (MATCH_MIN - 1)) #define OFFSET_MASK ((1 << (16 - MATCH_BITS)) - 1) -#define LEMPEL_SIZE 256 +#define LEMPEL_SIZE 1024 /*ARGSUSED*/ size_t @@ -53,20 +53,14 @@ lzjb_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) uchar_t *dst = d_start; uchar_t *cpy, *copymap; int copymask = 1 << (NBBY - 1); - int mlen, offset; + int mlen, offset, hash; uint16_t *hp; - uint16_t lempel[LEMPEL_SIZE]; /* uninitialized; see above */ + uint16_t lempel[LEMPEL_SIZE] = { 0 }; while (src < (uchar_t *)s_start + s_len) { if ((copymask <<= 1) == (1 << NBBY)) { - if (dst >= (uchar_t *)d_start + d_len - 1 - 2 * NBBY) { - if (d_len != s_len) - return (s_len); - mlen = s_len; - for (src = s_start, dst = d_start; mlen; mlen--) - *dst++ = *src++; + if (dst >= (uchar_t *)d_start + d_len - 1 - 2 * NBBY) return (s_len); - } copymask = 1; copymap = dst; *dst++ = 0; @@ -75,8 +69,10 @@ lzjb_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) *dst++ = *src++; continue; } - hp = &lempel[((src[0] + 13) ^ (src[1] - 13) ^ src[2]) & - (LEMPEL_SIZE - 1)]; + hash = (src[0] << 16) + (src[1] << 8) + src[2]; + hash += hash >> 9; + hash += hash >> 5; + hp = &lempel[hash & (LEMPEL_SIZE - 1)]; offset = (intptr_t)(src - *hp) & OFFSET_MASK; *hp = (uint16_t)(uintptr_t)src; cpy = src - offset; diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/metaslab.c b/external/cddl/osnet/dist/uts/common/fs/zfs/metaslab.c index 87727fac2dbed..233fd9b336158 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/metaslab.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/metaslab.c @@ -19,12 +19,11 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #include -#include #include #include #include @@ -35,19 +34,58 @@ uint64_t metaslab_aliquot = 512ULL << 10; uint64_t metaslab_gang_bang = SPA_MAXBLOCKSIZE + 1; /* force gang blocks */ +/* + * Metaslab debugging: when set, keeps all space maps in core to verify frees. + */ +static int metaslab_debug = 0; + +/* + * Minimum size which forces the dynamic allocator to change + * its allocation strategy. Once the space map cannot satisfy + * an allocation of this size then it switches to using a more + * aggressive strategy (i.e. search by size rather than offset). + */ +uint64_t metaslab_df_alloc_threshold = SPA_MAXBLOCKSIZE; + +/* + * The minimum free space, in percent, which must be available + * in a space map to continue allocations in a first-fit fashion. + * Once the space_map's free space drops below this level we dynamically + * switch to using best-fit allocations.
+ */ +int metaslab_df_free_pct = 4; + +/* + * A metaslab is considered "free" if it contains a contiguous + * segment which is greater than metaslab_min_alloc_size. + */ +uint64_t metaslab_min_alloc_size = DMU_MAX_ACCESS; + +/* + * Max number of space_maps to prefetch. + */ +int metaslab_prefetch_limit = SPA_DVAS_PER_BP; + +/* + * Percentage bonus multiplier for metaslabs that are in the bonus area. + */ +int metaslab_smo_bonus_pct = 150; + /* * ========================================================================== * Metaslab classes * ========================================================================== */ metaslab_class_t * -metaslab_class_create(void) +metaslab_class_create(spa_t *spa, space_map_ops_t *ops) { metaslab_class_t *mc; mc = kmem_zalloc(sizeof (metaslab_class_t), KM_SLEEP); + mc->mc_spa = spa; mc->mc_rotor = NULL; + mc->mc_ops = ops; return (mc); } @@ -55,58 +93,73 @@ metaslab_class_create(void) void metaslab_class_destroy(metaslab_class_t *mc) { - metaslab_group_t *mg; - - while ((mg = mc->mc_rotor) != NULL) { - metaslab_class_remove(mc, mg); - metaslab_group_destroy(mg); - } + ASSERT(mc->mc_rotor == NULL); + ASSERT(mc->mc_alloc == 0); + ASSERT(mc->mc_deferred == 0); + ASSERT(mc->mc_space == 0); + ASSERT(mc->mc_dspace == 0); kmem_free(mc, sizeof (metaslab_class_t)); } -void -metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg) +int +metaslab_class_validate(metaslab_class_t *mc) { - metaslab_group_t *mgprev, *mgnext; + metaslab_group_t *mg; + vdev_t *vd; - ASSERT(mg->mg_class == NULL); + /* + * Must hold one of the spa_config locks. + */ + ASSERT(spa_config_held(mc->mc_spa, SCL_ALL, RW_READER) || + spa_config_held(mc->mc_spa, SCL_ALL, RW_WRITER)); - if ((mgprev = mc->mc_rotor) == NULL) { - mg->mg_prev = mg; - mg->mg_next = mg; - } else { - mgnext = mgprev->mg_next; - mg->mg_prev = mgprev; - mg->mg_next = mgnext; - mgprev->mg_next = mg; - mgnext->mg_prev = mg; - } - mc->mc_rotor = mg; - mg->mg_class = mc; + if ((mg = mc->mc_rotor) == NULL) + return (0); + + do { + vd = mg->mg_vd; + ASSERT(vd->vdev_mg != NULL); + ASSERT3P(vd->vdev_top, ==, vd); + ASSERT3P(mg->mg_class, ==, mc); + ASSERT3P(vd->vdev_ops, !=, &vdev_hole_ops); + } while ((mg = mg->mg_next) != mc->mc_rotor); + + return (0); } void -metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg) +metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta, + int64_t defer_delta, int64_t space_delta, int64_t dspace_delta) { - metaslab_group_t *mgprev, *mgnext; + atomic_add_64(&mc->mc_alloc, alloc_delta); + atomic_add_64(&mc->mc_deferred, defer_delta); + atomic_add_64(&mc->mc_space, space_delta); + atomic_add_64(&mc->mc_dspace, dspace_delta); +} - ASSERT(mg->mg_class == mc); +uint64_t +metaslab_class_get_alloc(metaslab_class_t *mc) +{ + return (mc->mc_alloc); +} - mgprev = mg->mg_prev; - mgnext = mg->mg_next; +uint64_t +metaslab_class_get_deferred(metaslab_class_t *mc) +{ + return (mc->mc_deferred); +} - if (mg == mgnext) { - mc->mc_rotor = NULL; - } else { - mc->mc_rotor = mgnext; - mgprev->mg_next = mgnext; - mgnext->mg_prev = mgprev; - } +uint64_t +metaslab_class_get_space(metaslab_class_t *mc) +{ + return (mc->mc_space); +} - mg->mg_prev = NULL; - mg->mg_next = NULL; - mg->mg_class = NULL; +uint64_t +metaslab_class_get_dspace(metaslab_class_t *mc) +{ + return (spa_deflate(mc->mc_spa) ? 
mc->mc_dspace : mc->mc_space); } /* @@ -147,9 +200,9 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd) mutex_init(&mg->mg_lock, NULL, MUTEX_DEFAULT, NULL); avl_create(&mg->mg_metaslab_tree, metaslab_compare, sizeof (metaslab_t), offsetof(struct metaslab, ms_group_node)); - mg->mg_aliquot = metaslab_aliquot * MAX(1, vd->vdev_children); mg->mg_vd = vd; - metaslab_class_add(mc, mg); + mg->mg_class = mc; + mg->mg_activation_count = 0; return (mg); } @@ -157,11 +210,82 @@ metaslab_group_create(metaslab_class_t *mc, vdev_t *vd) void metaslab_group_destroy(metaslab_group_t *mg) { + ASSERT(mg->mg_prev == NULL); + ASSERT(mg->mg_next == NULL); + /* + * We may have gone below zero with the activation count + * either because we never activated in the first place or + * because we're done, and possibly removing the vdev. + */ + ASSERT(mg->mg_activation_count <= 0); + avl_destroy(&mg->mg_metaslab_tree); mutex_destroy(&mg->mg_lock); kmem_free(mg, sizeof (metaslab_group_t)); } +void +metaslab_group_activate(metaslab_group_t *mg) +{ + metaslab_class_t *mc = mg->mg_class; + metaslab_group_t *mgprev, *mgnext; + + ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER)); + + ASSERT(mc->mc_rotor != mg); + ASSERT(mg->mg_prev == NULL); + ASSERT(mg->mg_next == NULL); + ASSERT(mg->mg_activation_count <= 0); + + if (++mg->mg_activation_count <= 0) + return; + + mg->mg_aliquot = metaslab_aliquot * MAX(1, mg->mg_vd->vdev_children); + + if ((mgprev = mc->mc_rotor) == NULL) { + mg->mg_prev = mg; + mg->mg_next = mg; + } else { + mgnext = mgprev->mg_next; + mg->mg_prev = mgprev; + mg->mg_next = mgnext; + mgprev->mg_next = mg; + mgnext->mg_prev = mg; + } + mc->mc_rotor = mg; +} + +void +metaslab_group_passivate(metaslab_group_t *mg) +{ + metaslab_class_t *mc = mg->mg_class; + metaslab_group_t *mgprev, *mgnext; + + ASSERT(spa_config_held(mc->mc_spa, SCL_ALLOC, RW_WRITER)); + + if (--mg->mg_activation_count != 0) { + ASSERT(mc->mc_rotor != mg); + ASSERT(mg->mg_prev == NULL); + ASSERT(mg->mg_next == NULL); + ASSERT(mg->mg_activation_count < 0); + return; + } + + mgprev = mg->mg_prev; + mgnext = mg->mg_next; + + if (mg == mgnext) { + mc->mc_rotor = NULL; + } else { + mc->mc_rotor = mgnext; + mgprev->mg_next = mgnext; + mgnext->mg_prev = mgprev; + } + + mg->mg_prev = NULL; + mg->mg_next = NULL; +} + static void metaslab_group_add(metaslab_group_t *mg, metaslab_t *msp) { @@ -203,29 +327,39 @@ metaslab_group_sort(metaslab_group_t *mg, metaslab_t *msp, uint64_t weight) /* * ========================================================================== - * The first-fit block allocator + * Common allocator routines * ========================================================================== */ -static void -metaslab_ff_load(space_map_t *sm) +static int +metaslab_segsize_compare(const void *x1, const void *x2) { - ASSERT(sm->sm_ppd == NULL); - sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP); -} + const space_seg_t *s1 = x1; + const space_seg_t *s2 = x2; + uint64_t ss_size1 = s1->ss_end - s1->ss_start; + uint64_t ss_size2 = s2->ss_end - s2->ss_start; -static void -metaslab_ff_unload(space_map_t *sm) -{ - kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t)); - sm->sm_ppd = NULL; + if (ss_size1 < ss_size2) + return (-1); + if (ss_size1 > ss_size2) + return (1); + + if (s1->ss_start < s2->ss_start) + return (-1); + if (s1->ss_start > s2->ss_start) + return (1); + + return (0); } +/* + * This is a helper function that can be used by the allocator to find + * a suitable block to allocate. 
This will search the specified AVL + * tree looking for a block that matches the specified criteria. + */ static uint64_t -metaslab_ff_alloc(space_map_t *sm, uint64_t size) +metaslab_block_picker(avl_tree_t *t, uint64_t *cursor, uint64_t size, + uint64_t align) { - avl_tree_t *t = &sm->sm_root; - uint64_t align = size & -size; - uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1; space_seg_t *ss, ssearch; avl_index_t where; @@ -254,31 +388,291 @@ metaslab_ff_alloc(space_map_t *sm, uint64_t size) return (-1ULL); *cursor = 0; - return (metaslab_ff_alloc(sm, size)); + return (metaslab_block_picker(t, cursor, size, align)); +} + +static void +metaslab_pp_load(space_map_t *sm) +{ + space_seg_t *ss; + + ASSERT(sm->sm_ppd == NULL); + sm->sm_ppd = kmem_zalloc(64 * sizeof (uint64_t), KM_SLEEP); + + sm->sm_pp_root = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP); + avl_create(sm->sm_pp_root, metaslab_segsize_compare, + sizeof (space_seg_t), offsetof(struct space_seg, ss_pp_node)); + + for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss)) + avl_add(sm->sm_pp_root, ss); +} + +static void +metaslab_pp_unload(space_map_t *sm) +{ + void *cookie = NULL; + + kmem_free(sm->sm_ppd, 64 * sizeof (uint64_t)); + sm->sm_ppd = NULL; + + while (avl_destroy_nodes(sm->sm_pp_root, &cookie) != NULL) { + /* tear down the tree */ + } + + avl_destroy(sm->sm_pp_root); + kmem_free(sm->sm_pp_root, sizeof (avl_tree_t)); + sm->sm_pp_root = NULL; } /* ARGSUSED */ static void -metaslab_ff_claim(space_map_t *sm, uint64_t start, uint64_t size) +metaslab_pp_claim(space_map_t *sm, uint64_t start, uint64_t size) { /* No need to update cursor */ } /* ARGSUSED */ static void -metaslab_ff_free(space_map_t *sm, uint64_t start, uint64_t size) +metaslab_pp_free(space_map_t *sm, uint64_t start, uint64_t size) { /* No need to update cursor */ } +/* + * Return the maximum contiguous segment within the metaslab. + */ +uint64_t +metaslab_pp_maxsize(space_map_t *sm) +{ + avl_tree_t *t = sm->sm_pp_root; + space_seg_t *ss; + + if (t == NULL || (ss = avl_last(t)) == NULL) + return (0ULL); + + return (ss->ss_end - ss->ss_start); +} + +/* + * ========================================================================== + * The first-fit block allocator + * ========================================================================== + */ +static uint64_t +metaslab_ff_alloc(space_map_t *sm, uint64_t size) +{ + avl_tree_t *t = &sm->sm_root; + uint64_t align = size & -size; + uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1; + + return (metaslab_block_picker(t, cursor, size, align)); +} + +/* ARGSUSED */ +boolean_t +metaslab_ff_fragmented(space_map_t *sm) +{ + return (B_TRUE); +} + static space_map_ops_t metaslab_ff_ops = { - metaslab_ff_load, - metaslab_ff_unload, + metaslab_pp_load, + metaslab_pp_unload, metaslab_ff_alloc, - metaslab_ff_claim, - metaslab_ff_free + metaslab_pp_claim, + metaslab_pp_free, + metaslab_pp_maxsize, + metaslab_ff_fragmented }; +/* + * ========================================================================== + * Dynamic block allocator - + * Uses the first fit allocation scheme until space get low and then + * adjusts to a best fit allocation method. Uses metaslab_df_alloc_threshold + * and metaslab_df_free_pct to determine when to switch the allocation scheme. 
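[Editor's aside; illustration only, not part of the patch. The dynamic ("df") allocator described above stays in first-fit mode until either the largest free segment drops below metaslab_df_alloc_threshold or the map's free percentage drops below metaslab_df_free_pct, and only then falls back to the size-sorted (best-fit) tree; the real implementation follows below. A minimal standalone model of just that decision, in which use_best_fit, df_alloc_threshold and df_free_pct are made-up stand-ins for the space-map state and tunables:]

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-ins for the tunables introduced by the patch. */
static uint64_t df_alloc_threshold = 128ULL << 10;	/* largest-segment cutoff */
static int df_free_pct = 4;				/* free-space cutoff, in percent */

/* Return 1 when allocation should fall back to the size-sorted (best-fit) tree. */
static int
use_best_fit(uint64_t max_free_seg, uint64_t free_space, uint64_t map_size)
{
	int free_pct = (int)(free_space * 100 / map_size);

	return (max_free_seg < df_alloc_threshold || free_pct < df_free_pct);
}

int
main(void)
{
	/* Nearly full map: 2% free, largest run 64K -> best-fit (prints 1). */
	printf("%d\n", use_best_fit(64ULL << 10, 2ULL << 20, 100ULL << 20));
	/* Roomy map: 40% free, largest run 8M -> first-fit (prints 0). */
	printf("%d\n", use_best_fit(8ULL << 20, 40ULL << 20, 100ULL << 20));
	return (0);
}

[End of aside.]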
+ * ========================================================================== + */ +static uint64_t +metaslab_df_alloc(space_map_t *sm, uint64_t size) +{ + avl_tree_t *t = &sm->sm_root; + uint64_t align = size & -size; + uint64_t *cursor = (uint64_t *)sm->sm_ppd + highbit(align) - 1; + uint64_t max_size = metaslab_pp_maxsize(sm); + int free_pct = sm->sm_space * 100 / sm->sm_size; + + ASSERT(MUTEX_HELD(sm->sm_lock)); + ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root)); + + if (max_size < size) + return (-1ULL); + + /* + * If we're running low on space switch to using the size + * sorted AVL tree (best-fit). + */ + if (max_size < metaslab_df_alloc_threshold || + free_pct < metaslab_df_free_pct) { + t = sm->sm_pp_root; + *cursor = 0; + } + + return (metaslab_block_picker(t, cursor, size, 1ULL)); +} + +static boolean_t +metaslab_df_fragmented(space_map_t *sm) +{ + uint64_t max_size = metaslab_pp_maxsize(sm); + int free_pct = sm->sm_space * 100 / sm->sm_size; + + if (max_size >= metaslab_df_alloc_threshold && + free_pct >= metaslab_df_free_pct) + return (B_FALSE); + + return (B_TRUE); +} + +static space_map_ops_t metaslab_df_ops = { + metaslab_pp_load, + metaslab_pp_unload, + metaslab_df_alloc, + metaslab_pp_claim, + metaslab_pp_free, + metaslab_pp_maxsize, + metaslab_df_fragmented +}; + +/* + * ========================================================================== + * Other experimental allocators + * ========================================================================== + */ +static uint64_t +metaslab_cdf_alloc(space_map_t *sm, uint64_t size) +{ + avl_tree_t *t = &sm->sm_root; + uint64_t *cursor = (uint64_t *)sm->sm_ppd; + uint64_t *extent_end = (uint64_t *)sm->sm_ppd + 1; + uint64_t max_size = metaslab_pp_maxsize(sm); + uint64_t rsize = size; + uint64_t offset = 0; + + ASSERT(MUTEX_HELD(sm->sm_lock)); + ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root)); + + if (max_size < size) + return (-1ULL); + + ASSERT3U(*extent_end, >=, *cursor); + + /* + * If we're running low on space switch to using the size + * sorted AVL tree (best-fit). 
+ */ + if ((*cursor + size) > *extent_end) { + + t = sm->sm_pp_root; + *cursor = *extent_end = 0; + + if (max_size > 2 * SPA_MAXBLOCKSIZE) + rsize = MIN(metaslab_min_alloc_size, max_size); + offset = metaslab_block_picker(t, extent_end, rsize, 1ULL); + if (offset != -1) + *cursor = offset + size; + } else { + offset = metaslab_block_picker(t, cursor, rsize, 1ULL); + } + ASSERT3U(*cursor, <=, *extent_end); + return (offset); +} + +static boolean_t +metaslab_cdf_fragmented(space_map_t *sm) +{ + uint64_t max_size = metaslab_pp_maxsize(sm); + + if (max_size > (metaslab_min_alloc_size * 10)) + return (B_FALSE); + return (B_TRUE); +} + +static space_map_ops_t metaslab_cdf_ops = { + metaslab_pp_load, + metaslab_pp_unload, + metaslab_cdf_alloc, + metaslab_pp_claim, + metaslab_pp_free, + metaslab_pp_maxsize, + metaslab_cdf_fragmented +}; + +static uint64_t +metaslab_ndf_alloc(space_map_t *sm, uint64_t size) +{ + avl_tree_t *t = &sm->sm_root; + avl_index_t where; + space_seg_t *ss, ssearch; + uint64_t *cursor = (uint64_t *)sm->sm_ppd; + uint64_t max_size = metaslab_pp_maxsize(sm); + + ASSERT(MUTEX_HELD(sm->sm_lock)); + ASSERT3U(avl_numnodes(&sm->sm_root), ==, avl_numnodes(sm->sm_pp_root)); + + if (max_size < size) + return (-1ULL); + + ssearch.ss_start = *cursor; + ssearch.ss_end = *cursor + size; + + ss = avl_find(t, &ssearch, &where); + if (ss == NULL || (ss->ss_start + size > ss->ss_end)) { + t = sm->sm_pp_root; + + if (max_size > 2 * SPA_MAXBLOCKSIZE) + size = MIN(metaslab_min_alloc_size, max_size); + + ssearch.ss_start = 0; + ssearch.ss_end = size; + ss = avl_find(t, &ssearch, &where); + if (ss == NULL) + ss = avl_nearest(t, where, AVL_AFTER); + ASSERT(ss != NULL); + } + + if (ss != NULL) { + if (ss->ss_start + size <= ss->ss_end) { + *cursor = ss->ss_start + size; + return (ss->ss_start); + } + } + return (-1ULL); +} + +static boolean_t +metaslab_ndf_fragmented(space_map_t *sm) +{ + uint64_t max_size = metaslab_pp_maxsize(sm); + + if (max_size > (metaslab_min_alloc_size * 10)) + return (B_FALSE); + return (B_TRUE); +} + + +static space_map_ops_t metaslab_ndf_ops = { + metaslab_pp_load, + metaslab_pp_unload, + metaslab_ndf_alloc, + metaslab_pp_claim, + metaslab_pp_free, + metaslab_pp_maxsize, + metaslab_ndf_fragmented +}; + +space_map_ops_t *zfs_metaslab_ops = &metaslab_df_ops; + /* * ========================================================================== * Metaslabs @@ -308,6 +702,13 @@ metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo, metaslab_group_add(mg, msp); + if (metaslab_debug && smo->smo_object != 0) { + mutex_enter(&msp->ms_lock); + VERIFY(space_map_load(&msp->ms_map, mg->mg_class->mc_ops, + SM_FREE, smo, spa_meta_objset(vd->vdev_spa)) == 0); + mutex_exit(&msp->ms_lock); + } + /* * If we're opening an existing pool (txg == 0) or creating * a new one (txg == TXG_INITIAL), all space is available now. @@ -318,16 +719,8 @@ metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo, metaslab_sync_done(msp, 0); if (txg != 0) { - /* - * The vdev is dirty, but the metaslab isn't -- it just needs - * to have metaslab_sync_done() invoked from vdev_sync_done(). - * [We could just dirty the metaslab, but that would cause us - * to allocate a space map object for it, which is wasteful - * and would mess up the locality logic in metaslab_weight().] 
- */ - ASSERT(TXG_CLEAN(txg) == spa_last_synced_txg(vd->vdev_spa)); vdev_dirty(vd, 0, NULL, txg); - vdev_dirty(vd, VDD_METASLAB, msp, TXG_CLEAN(txg)); + vdev_dirty(vd, VDD_METASLAB, msp, txg); } return (msp); @@ -337,10 +730,9 @@ void metaslab_fini(metaslab_t *msp) { metaslab_group_t *mg = msp->ms_group; - int t; - vdev_space_update(mg->mg_vd, -msp->ms_map.sm_size, - -msp->ms_smo.smo_alloc, B_TRUE); + vdev_space_update(mg->mg_vd, + -msp->ms_smo.smo_alloc, 0, -msp->ms_map.sm_size); metaslab_group_remove(mg, msp); @@ -349,11 +741,16 @@ metaslab_fini(metaslab_t *msp) space_map_unload(&msp->ms_map); space_map_destroy(&msp->ms_map); - for (t = 0; t < TXG_SIZE; t++) { + for (int t = 0; t < TXG_SIZE; t++) { space_map_destroy(&msp->ms_allocmap[t]); space_map_destroy(&msp->ms_freemap[t]); } + for (int t = 0; t < TXG_DEFER_SIZE; t++) + space_map_destroy(&msp->ms_defermap[t]); + + ASSERT3S(msp->ms_deferspace, ==, 0); + mutex_exit(&msp->ms_lock); mutex_destroy(&msp->ms_lock); @@ -364,7 +761,6 @@ metaslab_fini(metaslab_t *msp) #define METASLAB_WEIGHT_SECONDARY (1ULL << 62) #define METASLAB_ACTIVE_MASK \ (METASLAB_WEIGHT_PRIMARY | METASLAB_WEIGHT_SECONDARY) -#define METASLAB_SMO_BONUS_MULTIPLIER 2 static uint64_t metaslab_weight(metaslab_t *msp) @@ -397,37 +793,97 @@ metaslab_weight(metaslab_t *msp) ASSERT(weight >= space && weight <= 2 * space); /* - * For locality, assign higher weight to metaslabs we've used before. + * For locality, assign higher weight to metaslabs which have + * a lower offset than what we've already activated. */ - if (smo->smo_object != 0) - weight *= METASLAB_SMO_BONUS_MULTIPLIER; + if (sm->sm_start <= mg->mg_bonus_area) + weight *= (metaslab_smo_bonus_pct / 100); ASSERT(weight >= space && - weight <= 2 * METASLAB_SMO_BONUS_MULTIPLIER * space); + weight <= 2 * (metaslab_smo_bonus_pct / 100) * space); + + if (sm->sm_loaded && !sm->sm_ops->smop_fragmented(sm)) { + /* + * If this metaslab is one we're actively using, adjust its + * weight to make it preferable to any inactive metaslab so + * we'll polish it off. + */ + weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); + } + return (weight); +} + +static void +metaslab_prefetch(metaslab_group_t *mg) +{ + spa_t *spa = mg->mg_vd->vdev_spa; + metaslab_t *msp; + avl_tree_t *t = &mg->mg_metaslab_tree; + int m; + + mutex_enter(&mg->mg_lock); /* - * If this metaslab is one we're actively using, adjust its weight to - * make it preferable to any inactive metaslab so we'll polish it off. 
+ * Prefetch the next potential metaslabs */ - weight |= (msp->ms_weight & METASLAB_ACTIVE_MASK); + for (msp = avl_first(t), m = 0; msp; msp = AVL_NEXT(t, msp), m++) { + space_map_t *sm = &msp->ms_map; + space_map_obj_t *smo = &msp->ms_smo; - return (weight); + /* If we have reached our prefetch limit then we're done */ + if (m >= metaslab_prefetch_limit) + break; + + if (!sm->sm_loaded && smo->smo_object != 0) { + mutex_exit(&mg->mg_lock); + dmu_prefetch(spa_meta_objset(spa), smo->smo_object, + 0ULL, smo->smo_objsize); + mutex_enter(&mg->mg_lock); + } + } + mutex_exit(&mg->mg_lock); } static int -metaslab_activate(metaslab_t *msp, uint64_t activation_weight) +metaslab_activate(metaslab_t *msp, uint64_t activation_weight, uint64_t size) { + metaslab_group_t *mg = msp->ms_group; space_map_t *sm = &msp->ms_map; + space_map_ops_t *sm_ops = msp->ms_group->mg_class->mc_ops; ASSERT(MUTEX_HELD(&msp->ms_lock)); if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { - int error = space_map_load(sm, &metaslab_ff_ops, - SM_FREE, &msp->ms_smo, - msp->ms_group->mg_vd->vdev_spa->spa_meta_objset); - if (error) { - metaslab_group_sort(msp->ms_group, msp, 0); - return (error); + space_map_load_wait(sm); + if (!sm->sm_loaded) { + int error = space_map_load(sm, sm_ops, SM_FREE, + &msp->ms_smo, + spa_meta_objset(msp->ms_group->mg_vd->vdev_spa)); + if (error) { + metaslab_group_sort(msp->ms_group, msp, 0); + return (error); + } + for (int t = 0; t < TXG_DEFER_SIZE; t++) + space_map_walk(&msp->ms_defermap[t], + space_map_claim, sm); + } + + /* + * Track the bonus area as we activate new metaslabs. + */ + if (sm->sm_start > mg->mg_bonus_area) { + mutex_enter(&mg->mg_lock); + mg->mg_bonus_area = sm->sm_start; + mutex_exit(&mg->mg_lock); + } + + /* + * If we were able to load the map then make sure + * that this map is still able to satisfy our request. + */ + if (msp->ms_weight < size) + return (ENOSPC); + metaslab_group_sort(msp->ms_group, msp, msp->ms_weight | activation_weight); } @@ -458,7 +914,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) { vdev_t *vd = msp->ms_group->mg_vd; spa_t *spa = vd->vdev_spa; - objset_t *mos = spa->spa_meta_objset; + objset_t *mos = spa_meta_objset(spa); space_map_t *allocmap = &msp->ms_allocmap[txg & TXG_MASK]; space_map_t *freemap = &msp->ms_freemap[txg & TXG_MASK]; space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; @@ -466,9 +922,11 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) space_map_obj_t *smo = &msp->ms_smo_syncing; dmu_buf_t *db; dmu_tx_t *tx; - int t; - tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); + ASSERT(!vd->vdev_ishole); + + if (allocmap->sm_space == 0 && freemap->sm_space == 0) + return; /* * The only state that can actually be changing concurrently with @@ -478,12 +936,12 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) * We drop it whenever we call into the DMU, because the DMU * can call down to us (e.g. via zio_free()) at any time. 
*/ - mutex_enter(&msp->ms_lock); + + tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); if (smo->smo_object == 0) { ASSERT(smo->smo_objsize == 0); ASSERT(smo->smo_alloc == 0); - mutex_exit(&msp->ms_lock); smo->smo_object = dmu_object_alloc(mos, DMU_OT_SPACE_MAP, 1 << SPACE_MAP_BLOCKSHIFT, DMU_OT_SPACE_MAP_HEADER, sizeof (*smo), tx); @@ -491,9 +949,10 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) dmu_write(mos, vd->vdev_ms_array, sizeof (uint64_t) * (sm->sm_start >> vd->vdev_ms_shift), sizeof (uint64_t), &smo->smo_object, tx); - mutex_enter(&msp->ms_lock); } + mutex_enter(&msp->ms_lock); + space_map_walk(freemap, space_map_add, freed_map); if (sm->sm_loaded && spa_sync_pass(spa) == 1 && smo->smo_objsize >= @@ -506,6 +965,7 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) * This metaslab is 100% allocated, * minus the content of the in-core map (sm), * minus what's been freed this txg (freed_map), + * minus deferred frees (ms_defermap[]), * minus allocations from txgs in the future * (because they haven't been committed yet). */ @@ -517,7 +977,11 @@ metaslab_sync(metaslab_t *msp, uint64_t txg) space_map_walk(sm, space_map_remove, allocmap); space_map_walk(freed_map, space_map_remove, allocmap); - for (t = 1; t < TXG_CONCURRENT_STATES; t++) + for (int t = 0; t < TXG_DEFER_SIZE; t++) + space_map_walk(&msp->ms_defermap[t], + space_map_remove, allocmap); + + for (int t = 1; t < TXG_CONCURRENT_STATES; t++) space_map_walk(&msp->ms_allocmap[(txg + t) & TXG_MASK], space_map_remove, allocmap); @@ -551,9 +1015,12 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) space_map_obj_t *smosync = &msp->ms_smo_syncing; space_map_t *sm = &msp->ms_map; space_map_t *freed_map = &msp->ms_freemap[TXG_CLEAN(txg) & TXG_MASK]; + space_map_t *defer_map = &msp->ms_defermap[txg % TXG_DEFER_SIZE]; metaslab_group_t *mg = msp->ms_group; vdev_t *vd = mg->mg_vd; - int t; + int64_t alloc_delta, defer_delta; + + ASSERT(!vd->vdev_ishole); mutex_enter(&msp->ms_lock); @@ -562,16 +1029,24 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) * allocmaps and freemaps and add its capacity to the vdev. */ if (freed_map->sm_size == 0) { - for (t = 0; t < TXG_SIZE; t++) { + for (int t = 0; t < TXG_SIZE; t++) { space_map_create(&msp->ms_allocmap[t], sm->sm_start, sm->sm_size, sm->sm_shift, sm->sm_lock); space_map_create(&msp->ms_freemap[t], sm->sm_start, sm->sm_size, sm->sm_shift, sm->sm_lock); } - vdev_space_update(vd, sm->sm_size, 0, B_TRUE); + + for (int t = 0; t < TXG_DEFER_SIZE; t++) + space_map_create(&msp->ms_defermap[t], sm->sm_start, + sm->sm_size, sm->sm_shift, sm->sm_lock); + + vdev_space_update(vd, 0, 0, sm->sm_size); } - vdev_space_update(vd, 0, smosync->smo_alloc - smo->smo_alloc, B_TRUE); + alloc_delta = smosync->smo_alloc - smo->smo_alloc; + defer_delta = freed_map->sm_space - defer_map->sm_space; + + vdev_space_update(vd, alloc_delta + defer_delta, defer_delta, 0); ASSERT(msp->ms_allocmap[txg & TXG_MASK].sm_space == 0); ASSERT(msp->ms_freemap[txg & TXG_MASK].sm_space == 0); @@ -579,13 +1054,26 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) /* * If there's a space_map_load() in progress, wait for it to complete * so that we have a consistent view of the in-core space map. - * Then, add everything we freed in this txg to the map. + * Then, add defer_map (oldest deferred frees) to this map and + * transfer freed_map (this txg's frees) to defer_map. */ space_map_load_wait(sm); - space_map_vacate(freed_map, sm->sm_loaded ? space_map_free : NULL, sm); + space_map_vacate(defer_map, sm->sm_loaded ? 
space_map_free : NULL, sm); + space_map_vacate(freed_map, space_map_add, defer_map); *smo = *smosync; + msp->ms_deferspace += defer_delta; + ASSERT3S(msp->ms_deferspace, >=, 0); + ASSERT3S(msp->ms_deferspace, <=, sm->sm_size); + if (msp->ms_deferspace != 0) { + /* + * Keep syncing this metaslab until all deferred frees + * are back in circulation. + */ + vdev_dirty(vd, VDD_METASLAB, msp, txg + 1); + } + /* * If the map is loaded but no longer active, evict it as soon as all * future allocations have synced. (If we unloaded it now and then @@ -594,11 +1082,11 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) if (sm->sm_loaded && (msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) { int evictable = 1; - for (t = 1; t < TXG_CONCURRENT_STATES; t++) + for (int t = 1; t < TXG_CONCURRENT_STATES; t++) if (msp->ms_allocmap[(txg + t) & TXG_MASK].sm_space) evictable = 0; - if (evictable) + if (evictable && !metaslab_debug) space_map_unload(sm); } @@ -607,6 +1095,32 @@ metaslab_sync_done(metaslab_t *msp, uint64_t txg) mutex_exit(&msp->ms_lock); } +void +metaslab_sync_reassess(metaslab_group_t *mg) +{ + vdev_t *vd = mg->mg_vd; + + /* + * Re-evaluate all metaslabs which have lower offsets than the + * bonus area. + */ + for (int m = 0; m < vd->vdev_ms_count; m++) { + metaslab_t *msp = vd->vdev_ms[m]; + + if (msp->ms_map.sm_start > mg->mg_bonus_area) + break; + + mutex_enter(&msp->ms_lock); + metaslab_group_sort(mg, msp, metaslab_weight(msp)); + mutex_exit(&msp->ms_lock); + } + + /* + * Prefetch the next potential metaslabs + */ + metaslab_prefetch(mg); +} + static uint64_t metaslab_distance(metaslab_t *msp, dva_t *dva) { @@ -636,11 +1150,16 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg, int i; activation_weight = METASLAB_WEIGHT_PRIMARY; - for (i = 0; i < d; i++) - if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) + for (i = 0; i < d; i++) { + if (DVA_GET_VDEV(&dva[i]) == mg->mg_vd->vdev_id) { activation_weight = METASLAB_WEIGHT_SECONDARY; + break; + } + } for (;;) { + boolean_t was_active; + mutex_enter(&mg->mg_lock); for (msp = avl_first(t); msp; msp = AVL_NEXT(t, msp)) { if (msp->ms_weight < size) { @@ -648,6 +1167,7 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg, return (-1ULL); } + was_active = msp->ms_weight & METASLAB_ACTIVE_MASK; if (activation_weight == METASLAB_WEIGHT_PRIMARY) break; @@ -673,7 +1193,9 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg, * another thread may have changed the weight while we * were blocked on the metaslab lock. 
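[Editor's aside; illustration only, not part of the patch. The metaslab_sync_done() hunk above hands this txg's frees to a deferred map, indexed by txg % TXG_DEFER_SIZE, and returns only the oldest deferred map to the allocatable space, so a freed block stays out of circulation for TXG_DEFER_SIZE transaction groups. A toy model of that rotation; TXG_DEFER_SIZE is assumed to be 2 here, and sync_done, defer_map and allocatable are made-up names:]

#include <stdint.h>
#include <stdio.h>

#define TXG_DEFER_SIZE	2	/* assumed for this sketch */

static uint64_t defer_map[TXG_DEFER_SIZE];	/* space still held back, per slot */
static uint64_t allocatable;			/* space returned to the free map */

/* Per-txg hand-off, modelled on the space_map_vacate() calls above. */
static void
sync_done(uint64_t txg, uint64_t freed_this_txg)
{
	uint64_t *slot = &defer_map[txg % TXG_DEFER_SIZE];

	allocatable += *slot;		/* oldest deferred frees come back */
	*slot = freed_this_txg;		/* this txg's frees start waiting */
}

int
main(void)
{
	for (uint64_t txg = 1; txg <= 5; txg++) {
		sync_done(txg, 100);	/* free 100 units every txg */
		printf("txg %llu: %llu units allocatable again\n",
		    (unsigned long long)txg, (unsigned long long)allocatable);
	}
	return (0);
}

[End of aside.]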
*/ - if (msp->ms_weight < size) { + if (msp->ms_weight < size || (was_active && + !(msp->ms_weight & METASLAB_ACTIVE_MASK) && + activation_weight == METASLAB_WEIGHT_PRIMARY)) { mutex_exit(&msp->ms_lock); continue; } @@ -686,7 +1208,7 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg, continue; } - if (metaslab_activate(msp, activation_weight) != 0) { + if (metaslab_activate(msp, activation_weight, size) != 0) { mutex_exit(&msp->ms_lock); continue; } @@ -694,7 +1216,7 @@ metaslab_group_alloc(metaslab_group_t *mg, uint64_t size, uint64_t txg, if ((offset = space_map_alloc(&msp->ms_map, size)) != -1ULL) break; - metaslab_passivate(msp, size - 1); + metaslab_passivate(msp, space_map_maxsize(&msp->ms_map)); mutex_exit(&msp->ms_lock); } @@ -720,6 +1242,8 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, vdev_t *vd; int dshift = 3; int all_zero; + int zio_lock = B_FALSE; + boolean_t allocatable; uint64_t offset = -1ULL; uint64_t asize; uint64_t distance; @@ -729,12 +1253,12 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, /* * For testing, make some blocks above a certain size be gang blocks. */ - if (psize >= metaslab_gang_bang && (lbolt & 3) == 0) + if (psize >= metaslab_gang_bang && (ddi_get_lbolt() & 3) == 0) return (ENOSPC); /* * Start at the rotor and loop through all mgs until we find something. - * Note that there's no locking on mc_rotor or mc_allocated because + * Note that there's no locking on mc_rotor or mc_aliquot because * nothing actually breaks if we miss a few updates -- we just won't * allocate quite as evenly. It all balances out over time. * @@ -756,10 +1280,21 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, */ if (hintdva) { vd = vdev_lookup_top(spa, DVA_GET_VDEV(&hintdva[d])); - if (flags & METASLAB_HINTBP_AVOID) - mg = vd->vdev_mg->mg_next; - else + + /* + * It's possible the vdev we're using as the hint no + * longer exists (i.e. removed). Consult the rotor when + * all else fails. + */ + if (vd != NULL) { mg = vd->vdev_mg; + + if (flags & METASLAB_HINTBP_AVOID && + mg->mg_next != NULL) + mg = mg->mg_next; + } else { + mg = mc->mc_rotor; + } } else if (d != 0) { vd = vdev_lookup_top(spa, DVA_GET_VDEV(&dva[d - 1])); mg = vd->vdev_mg->mg_next; @@ -768,21 +1303,33 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, } /* - * If the hint put us into the wrong class, just follow the rotor. + * If the hint put us into the wrong metaslab class, or into a + * metaslab group that has been passivated, just follow the rotor. */ - if (mg->mg_class != mc) + if (mg->mg_class != mc || mg->mg_activation_count <= 0) mg = mc->mc_rotor; rotor = mg; top: all_zero = B_TRUE; do { + ASSERT(mg->mg_activation_count == 1); + vd = mg->mg_vd; + /* * Don't allocate from faulted devices. */ - if (!vdev_allocatable(vd)) + if (zio_lock) { + spa_config_enter(spa, SCL_ZIO, FTAG, RW_READER); + allocatable = vdev_allocatable(vd); + spa_config_exit(spa, SCL_ZIO, FTAG); + } else { + allocatable = vdev_allocatable(vd); + } + if (!allocatable) goto next; + /* * Avoid writing single-copy data to a failing vdev */ @@ -812,32 +1359,28 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, * over- or under-used relative to the pool, * and set an allocation bias to even it out. 
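[Editor's aside; worked example, not part of the patch. In the hunk below, usage is scaled to 0..1024 to avoid floating point. For a vdev that is 75% full (vu = 768) inside a class that is 50% full (cu = 512), mg_bias = (512 - 768) * mg_aliquot / (1024 * 4) = -mg_aliquot / 16, so this vdev is handed a somewhat smaller share of writes; since |cu - vu| can be at most 1024, the bias is bounded by +/- mg_aliquot / 4, i.e. the 25% cap mentioned in the in-code comment.]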
*/ - if (mc->mc_allocated == 0) { + if (mc->mc_aliquot == 0) { vdev_stat_t *vs = &vd->vdev_stat; - uint64_t alloc, space; - int64_t vu, su; - - alloc = spa_get_alloc(spa); - space = spa_get_space(spa); + int64_t vu, cu; /* * Determine percent used in units of 0..1024. * (This is just to avoid floating point.) */ vu = (vs->vs_alloc << 10) / (vs->vs_space + 1); - su = (alloc << 10) / (space + 1); + cu = (mc->mc_alloc << 10) / (mc->mc_space + 1); /* * Bias by at most +/- 25% of the aliquot. */ - mg->mg_bias = ((su - vu) * + mg->mg_bias = ((cu - vu) * (int64_t)mg->mg_aliquot) / (1024 * 4); } - if (atomic_add_64_nv(&mc->mc_allocated, asize) >= + if (atomic_add_64_nv(&mc->mc_aliquot, asize) >= mg->mg_aliquot + mg->mg_bias) { mc->mc_rotor = mg->mg_next; - mc->mc_allocated = 0; + mc->mc_aliquot = 0; } DVA_SET_VDEV(&dva[d], vd->vdev_id); @@ -849,7 +1392,7 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, } next: mc->mc_rotor = mg->mg_next; - mc->mc_allocated = 0; + mc->mc_aliquot = 0; } while ((mg = mg->mg_next) != rotor); if (!all_zero) { @@ -858,6 +1401,12 @@ metaslab_alloc_dva(spa_t *spa, metaslab_class_t *mc, uint64_t psize, goto top; } + if (!allocatable && !zio_lock) { + dshift = 3; + zio_lock = B_TRUE; + goto top; + } + bzero(&dva[d], sizeof (dva_t)); return (ENOSPC); @@ -923,7 +1472,7 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) uint64_t size = DVA_GET_ASIZE(dva); vdev_t *vd; metaslab_t *msp; - int error; + int error = 0; ASSERT(DVA_IS_VALID(dva)); @@ -938,7 +1487,12 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) mutex_enter(&msp->ms_lock); - error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY); + if ((txg != 0 && spa_writeable(spa)) || !msp->ms_map.sm_loaded) + error = metaslab_activate(msp, METASLAB_WEIGHT_SECONDARY, 0); + + if (error == 0 && !space_map_contains(&msp->ms_map, offset, size)) + error = ENOENT; + if (error || txg == 0) { /* txg == 0 indicates dry run */ mutex_exit(&msp->ms_lock); return (error); @@ -946,7 +1500,7 @@ metaslab_claim_dva(spa_t *spa, const dva_t *dva, uint64_t txg) space_map_claim(&msp->ms_map, offset, size); - if (spa_mode & FWRITE) { /* don't dirty if we're zdb(1M) */ + if (spa_writeable(spa)) { /* don't dirty if we're zdb(1M) */ if (msp->ms_allocmap[txg & TXG_MASK].sm_space == 0) vdev_dirty(vd, VDD_METASLAB, msp, txg); space_map_add(&msp->ms_allocmap[txg & TXG_MASK], offset, size); @@ -966,6 +1520,7 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, int error = 0; ASSERT(bp->blk_birth == 0); + ASSERT(BP_PHYSICAL_BIRTH(bp) == 0); spa_config_enter(spa, SCL_ALLOC, FTAG, RW_READER); @@ -995,7 +1550,7 @@ metaslab_alloc(spa_t *spa, metaslab_class_t *mc, uint64_t psize, blkptr_t *bp, spa_config_exit(spa, SCL_ALLOC, FTAG); - bp->blk_birth = txg; + BP_SET_BIRTH(bp, txg, txg); return (0); } @@ -1007,7 +1562,7 @@ metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now) int ndvas = BP_GET_NDVAS(bp); ASSERT(!BP_IS_HOLE(bp)); - ASSERT(!now || bp->blk_birth >= spa->spa_syncing_txg); + ASSERT(!now || bp->blk_birth >= spa_syncing_txg(spa)); spa_config_enter(spa, SCL_FREE, FTAG, RW_READER); diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/rrwlock.c b/external/cddl/osnet/dist/uts/common/fs/zfs/rrwlock.c index 710685dbc71e2..4cef53f951327 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/rrwlock.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/rrwlock.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. 
All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include @@ -118,7 +116,7 @@ rrn_find_and_remove(rrwlock_t *rrl) rrw_node_t *prev = NULL; if (refcount_count(&rrl->rr_linked_rcount) == 0) - return (NULL); + return (B_FALSE); for (rn = tsd_get(rrw_tsd_key); rn != NULL; rn = rn->rn_next) { if (rn->rn_rrl == rrl) { @@ -159,6 +157,14 @@ static void rrw_enter_read(rrwlock_t *rrl, void *tag) { mutex_enter(&rrl->rr_lock); +#if !defined(DEBUG) && defined(_KERNEL) + if (!rrl->rr_writer && !rrl->rr_writer_wanted) { + rrl->rr_anon_rcount.rc_count++; + mutex_exit(&rrl->rr_lock); + return; + } + DTRACE_PROBE(zfs__rrwfastpath__rdmiss); +#endif ASSERT(rrl->rr_writer != curthread); ASSERT(refcount_count(&rrl->rr_anon_rcount) >= 0); @@ -208,19 +214,28 @@ void rrw_exit(rrwlock_t *rrl, void *tag) { mutex_enter(&rrl->rr_lock); +#if !defined(DEBUG) && defined(_KERNEL) + if (!rrl->rr_writer && rrl->rr_linked_rcount.rc_count == 0) { + rrl->rr_anon_rcount.rc_count--; + if (rrl->rr_anon_rcount.rc_count == 0) + cv_broadcast(&rrl->rr_cv); + mutex_exit(&rrl->rr_lock); + return; + } + DTRACE_PROBE(zfs__rrwfastpath__exitmiss); +#endif ASSERT(!refcount_is_zero(&rrl->rr_anon_rcount) || !refcount_is_zero(&rrl->rr_linked_rcount) || rrl->rr_writer != NULL); if (rrl->rr_writer == NULL) { - if (rrn_find_and_remove(rrl)) { - if (refcount_remove(&rrl->rr_linked_rcount, tag) == 0) - cv_broadcast(&rrl->rr_cv); - - } else { - if (refcount_remove(&rrl->rr_anon_rcount, tag) == 0) - cv_broadcast(&rrl->rr_cv); - } + int64_t count; + if (rrn_find_and_remove(rrl)) + count = refcount_remove(&rrl->rr_linked_rcount, tag); + else + count = refcount_remove(&rrl->rr_anon_rcount, tag); + if (count == 0) + cv_broadcast(&rrl->rr_cv); } else { ASSERT(rrl->rr_writer == curthread); ASSERT(refcount_is_zero(&rrl->rr_anon_rcount) && diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sha256.c b/external/cddl/osnet/dist/uts/common/fs/zfs/sha256.c index ca7076cb6fd99..f515be6bb3042 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sha256.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sha256.c @@ -19,111 +19,32 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ - -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include -#include - -/* - * SHA-256 checksum, as specified in FIPS 180-3, available at: - * http://csrc.nist.gov/publications/PubsFIPS.html - * - * This is a very compact implementation of SHA-256. - * It is designed to be simple and portable, not to be fast. - */ - -/* - * The literal definitions of Ch() and Maj() according to FIPS 180-3 are: - * - * Ch(x, y, z) (x & y) ^ (~x & z) - * Maj(x, y, z) (x & y) ^ (x & z) ^ (y & z) - * - * We use equivalent logical reductions here that require one less op. 
- */ -#define Ch(x, y, z) ((z) ^ ((x) & ((y) ^ (z)))) -#define Maj(x, y, z) (((x) & (y)) ^ ((z) & ((x) ^ (y)))) -#define Rot32(x, s) (((x) >> s) | ((x) << (32 - s))) -#define SIGMA0(x) (Rot32(x, 2) ^ Rot32(x, 13) ^ Rot32(x, 22)) -#define SIGMA1(x) (Rot32(x, 6) ^ Rot32(x, 11) ^ Rot32(x, 25)) -#define sigma0(x) (Rot32(x, 7) ^ Rot32(x, 18) ^ ((x) >> 3)) -#define sigma1(x) (Rot32(x, 17) ^ Rot32(x, 19) ^ ((x) >> 10)) - -static const uint32_t SHA256_K[64] = { - 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, - 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, - 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, - 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, - 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, - 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, - 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, - 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, - 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, - 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, - 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, - 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, - 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, - 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, - 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, - 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2 -}; - -static void -SHA256Transform(uint32_t *H, const uint8_t *cp) -{ - uint32_t a, b, c, d, e, f, g, h, t, T1, T2, W[64]; - - for (t = 0; t < 16; t++, cp += 4) - W[t] = (cp[0] << 24) | (cp[1] << 16) | (cp[2] << 8) | cp[3]; - - for (t = 16; t < 64; t++) - W[t] = sigma1(W[t - 2]) + W[t - 7] + - sigma0(W[t - 15]) + W[t - 16]; - - a = H[0]; b = H[1]; c = H[2]; d = H[3]; - e = H[4]; f = H[5]; g = H[6]; h = H[7]; - - for (t = 0; t < 64; t++) { - T1 = h + SIGMA1(e) + Ch(e, f, g) + SHA256_K[t] + W[t]; - T2 = SIGMA0(a) + Maj(a, b, c); - h = g; g = f; f = e; e = d + T1; - d = c; c = b; b = a; a = T1 + T2; - } - - H[0] += a; H[1] += b; H[2] += c; H[3] += d; - H[4] += e; H[5] += f; H[6] += g; H[7] += h; -} +#include void zio_checksum_SHA256(const void *buf, uint64_t size, zio_cksum_t *zcp) { - uint32_t H[8] = { 0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a, - 0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19 }; - uint8_t pad[128]; - int i, padsize; - - for (i = 0; i < (size & ~63ULL); i += 64) - SHA256Transform(H, (uint8_t *)buf + i); - - for (padsize = 0; i < size; i++) - pad[padsize++] = *((uint8_t *)buf + i); - - for (pad[padsize++] = 0x80; (padsize & 63) != 56; padsize++) - pad[padsize] = 0; - - for (i = 56; i >= 0; i -= 8) - pad[padsize++] = (size << 3) >> i; - - for (i = 0; i < padsize; i += 64) - SHA256Transform(H, pad + i); - - ZIO_SET_CHECKSUM(zcp, - (uint64_t)H[0] << 32 | H[1], - (uint64_t)H[2] << 32 | H[3], - (uint64_t)H[4] << 32 | H[5], - (uint64_t)H[6] << 32 | H[7]); + SHA2_CTX ctx; + zio_cksum_t tmp; + + SHA2Init(SHA256, &ctx); + SHA2Update(&ctx, buf, size); + SHA2Final(&tmp, &ctx); + + /* + * A prior implementation of this function had a + * private SHA256 implementation that always wrote things out in + * Big Endian and there wasn't a byteswap variant of it. + * To preserve on-disk compatibility we need to force that + * behaviour.
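[Editor's aside; illustration only, not part of the patch. The BE_64() calls below exist because the removed private implementation packed its hash words so that each checksum word equalled eight digest bytes read as a big-endian integer, whereas SHA2Final() stores the digest as a plain byte array, which a little-endian host would reinterpret byte-swapped. A standalone sketch of the mismatch; pack_be and the digest bytes are made up:]

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Pack 8 digest bytes the way the removed code effectively did: big-endian. */
static uint64_t
pack_be(const uint8_t *d)
{
	uint64_t w = 0;

	for (int i = 0; i < 8; i++)
		w = (w << 8) | d[i];
	return (w);
}

int
main(void)
{
	const uint8_t digest[8] = { 0x6a, 0x09, 0xe6, 0x67, 0xbb, 0x67, 0xae, 0x85 };
	uint64_t native, be;

	memcpy(&native, digest, sizeof (native));	/* bytes as left in memory */
	be = pack_be(digest);				/* old on-disk interpretation */

	/* On a little-endian host these differ; a byte swap (BE_64) reconciles them. */
	printf("native 0x%016llx vs big-endian 0x%016llx\n",
	    (unsigned long long)native, (unsigned long long)be);
	return (0);
}

[End of aside.]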
+ */ + zcp->zc_word[0] = BE_64(tmp.zc_word[0]); + zcp->zc_word[1] = BE_64(tmp.zc_word[1]); + zcp->zc_word[2] = BE_64(tmp.zc_word[2]); + zcp->zc_word[3] = BE_64(tmp.zc_word[3]); } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/spa.c b/external/cddl/osnet/dist/uts/common/fs/zfs/spa.c index fb1b96f8b8117..9f2876fcfd9f3 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/spa.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/spa.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -35,13 +35,14 @@ #include #include #include -#include #include #include #include #include +#include #include #include +#include #include #include #include @@ -57,24 +58,77 @@ #include #include #include -#include #include +#include + +#ifdef _KERNEL +#include +#include +#include +#include +#include +#include +#endif /* _KERNEL */ #include "zfs_prop.h" #include "zfs_comutil.h" -int zio_taskq_threads[ZIO_TYPES][ZIO_TASKQ_TYPES] = { - /* ISSUE INTR */ - { 1, 1 }, /* ZIO_TYPE_NULL */ - { 1, 8 }, /* ZIO_TYPE_READ */ - { 8, 1 }, /* ZIO_TYPE_WRITE */ - { 1, 1 }, /* ZIO_TYPE_FREE */ - { 1, 1 }, /* ZIO_TYPE_CLAIM */ - { 1, 1 }, /* ZIO_TYPE_IOCTL */ +typedef enum zti_modes { + zti_mode_fixed, /* value is # of threads (min 1) */ + zti_mode_online_percent, /* value is % of online CPUs */ + zti_mode_batch, /* cpu-intensive; value is ignored */ + zti_mode_null, /* don't create a taskq */ + zti_nmodes +} zti_modes_t; + +#define ZTI_FIX(n) { zti_mode_fixed, (n) } +#define ZTI_PCT(n) { zti_mode_online_percent, (n) } +#define ZTI_BATCH { zti_mode_batch, 0 } +#define ZTI_NULL { zti_mode_null, 0 } + +#define ZTI_ONE ZTI_FIX(1) + +typedef struct zio_taskq_info { + enum zti_modes zti_mode; + uint_t zti_value; +} zio_taskq_info_t; + +static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { + "issue", "issue_high", "intr", "intr_high" +}; + +/* + * Define the taskq threads for the following I/O types: + * NULL, READ, WRITE, FREE, CLAIM, and IOCTL + */ +const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { + /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ + { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, + { ZTI_FIX(8), ZTI_NULL, ZTI_BATCH, ZTI_NULL }, + { ZTI_BATCH, ZTI_FIX(5), ZTI_FIX(8), ZTI_FIX(5) }, + { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, + { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, + { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, }; static void spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx); static boolean_t spa_has_active_shared_spare(spa_t *spa); +static int spa_load_impl(spa_t *spa, uint64_t, nvlist_t *config, + spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, + char **ereport); + +uint_t zio_taskq_batch_pct = 100; /* 1 thread per cpu in pset */ +id_t zio_taskq_psrset_bind = PS_NONE; +boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */ +uint_t zio_taskq_basedc = 80; /* base duty cycle */ + +boolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */ + +/* + * This (illegal) pool name is used when temporarily importing a spa_t in order + * to get the vdev stats associated with the imported devices. 
+ */ +#define TRYIMPORT_NAME "$import" /* * ========================================================================== @@ -110,38 +164,41 @@ spa_prop_add_list(nvlist_t *nvl, zpool_prop_t prop, char *strval, static void spa_prop_get_config(spa_t *spa, nvlist_t **nvp) { - uint64_t size = spa_get_space(spa); - uint64_t used = spa_get_alloc(spa); + uint64_t size; + uint64_t alloc; uint64_t cap, version; zprop_source_t src = ZPROP_SRC_NONE; spa_config_dirent_t *dp; ASSERT(MUTEX_HELD(&spa->spa_props_lock)); - /* - * readonly properties - */ - spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); - spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); - spa_prop_add_list(*nvp, ZPOOL_PROP_USED, NULL, used, src); - spa_prop_add_list(*nvp, ZPOOL_PROP_AVAILABLE, NULL, size - used, src); - - cap = (size == 0) ? 0 : (used * 100 / size); - spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); + if (spa->spa_root_vdev != NULL) { + alloc = metaslab_class_get_alloc(spa_normal_class(spa)); + size = metaslab_class_get_space(spa_normal_class(spa)); + spa_prop_add_list(*nvp, ZPOOL_PROP_NAME, spa_name(spa), 0, src); + spa_prop_add_list(*nvp, ZPOOL_PROP_SIZE, NULL, size, src); + spa_prop_add_list(*nvp, ZPOOL_PROP_ALLOCATED, NULL, alloc, src); + spa_prop_add_list(*nvp, ZPOOL_PROP_FREE, NULL, + size - alloc, src); + + cap = (size == 0) ? 0 : (alloc * 100 / size); + spa_prop_add_list(*nvp, ZPOOL_PROP_CAPACITY, NULL, cap, src); + + spa_prop_add_list(*nvp, ZPOOL_PROP_DEDUPRATIO, NULL, + ddt_get_pool_dedup_ratio(spa), src); + + spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, + spa->spa_root_vdev->vdev_state, src); + + version = spa_version(spa); + if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) + src = ZPROP_SRC_DEFAULT; + else + src = ZPROP_SRC_LOCAL; + spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src); + } spa_prop_add_list(*nvp, ZPOOL_PROP_GUID, NULL, spa_guid(spa), src); - spa_prop_add_list(*nvp, ZPOOL_PROP_HEALTH, NULL, - spa->spa_root_vdev->vdev_state, src); - - /* - * settable properties that are not stored in the pool property object. - */ - version = spa_version(spa); - if (version == zpool_prop_default_numeric(ZPOOL_PROP_VERSION)) - src = ZPROP_SRC_DEFAULT; - else - src = ZPROP_SRC_LOCAL; - spa_prop_add_list(*nvp, ZPOOL_PROP_VERSION, NULL, version, src); if (spa->spa_root != NULL) spa_prop_add_list(*nvp, ZPOOL_PROP_ALTROOT, spa->spa_root, @@ -164,9 +221,9 @@ spa_prop_get_config(spa_t *spa, nvlist_t **nvp) int spa_prop_get(spa_t *spa, nvlist_t **nvp) { + objset_t *mos = spa->spa_meta_objset; zap_cursor_t zc; zap_attribute_t za; - objset_t *mos = spa->spa_meta_objset; int err; VERIFY(nvlist_alloc(nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); @@ -179,7 +236,7 @@ spa_prop_get(spa_t *spa, nvlist_t **nvp) spa_prop_get_config(spa, nvp); /* If no pool property object, no more prop to get. */ - if (spa->spa_pool_props_object == 0) { + if (mos == NULL || spa->spa_pool_props_object == 0) { mutex_exit(&spa->spa_props_lock); return (0); } @@ -300,12 +357,18 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) case ZPOOL_PROP_DELEGATION: case ZPOOL_PROP_AUTOREPLACE: case ZPOOL_PROP_LISTSNAPS: + case ZPOOL_PROP_AUTOEXPAND: error = nvpair_value_uint64(elem, &intval); if (!error && intval > 1) error = EINVAL; break; case ZPOOL_PROP_BOOTFS: + /* + * If the pool version is less than SPA_VERSION_BOOTFS, + * or the pool is still being created (version == 0), + * the bootfs property cannot be set. 
+ */ if (spa_version(spa) < SPA_VERSION_BOOTFS) { error = ENOTSUP; break; @@ -332,12 +395,14 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) break; } - if (error = dmu_objset_open(strval, DMU_OST_ZFS, - DS_MODE_USER | DS_MODE_READONLY, &os)) + if (error = dmu_objset_hold(strval, FTAG, &os)) break; - /* We don't support gzip bootable datasets */ - if ((error = dsl_prop_get_integer(strval, + /* Must be ZPL and not gzip compressed. */ + + if (dmu_objset_type(os) != DMU_OST_ZFS) { + error = ENOTSUP; + } else if ((error = dsl_prop_get_integer(strval, zfs_prop_to_name(ZFS_PROP_COMPRESSION), &compress, NULL)) == 0 && !BOOTFS_COMPRESS_VALID(compress)) { @@ -345,7 +410,7 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) } else { objnum = dmu_objset_id(os); } - dmu_objset_close(os); + dmu_objset_rele(os, FTAG); } break; @@ -393,6 +458,16 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) strcmp(slash, "/..") == 0) error = EINVAL; break; + + case ZPOOL_PROP_DEDUPDITTO: + if (spa_version(spa) < SPA_VERSION_DEDUP) + error = ENOTSUP; + else + error = nvpair_value_uint64(elem, &intval); + if (error == 0 && + intval != 0 && intval < ZIO_DEDUPDITTO_MIN) + error = EINVAL; + break; } if (error) @@ -412,16 +487,60 @@ spa_prop_validate(spa_t *spa, nvlist_t *props) return (error); } +void +spa_configfile_set(spa_t *spa, nvlist_t *nvp, boolean_t need_sync) +{ + char *cachefile; + spa_config_dirent_t *dp; + + if (nvlist_lookup_string(nvp, zpool_prop_to_name(ZPOOL_PROP_CACHEFILE), + &cachefile) != 0) + return; + + dp = kmem_alloc(sizeof (spa_config_dirent_t), + KM_SLEEP); + + if (cachefile[0] == '\0') + dp->scd_path = spa_strdup(spa_config_path); + else if (strcmp(cachefile, "none") == 0) + dp->scd_path = NULL; + else + dp->scd_path = spa_strdup(cachefile); + + list_insert_head(&spa->spa_config_list, dp); + if (need_sync) + spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); +} + int spa_prop_set(spa_t *spa, nvlist_t *nvp) { int error; + nvpair_t *elem; + boolean_t need_sync = B_FALSE; + zpool_prop_t prop; if ((error = spa_prop_validate(spa, nvp)) != 0) return (error); - return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props, - spa, nvp, 3)); + elem = NULL; + while ((elem = nvlist_next_nvpair(nvp, elem)) != NULL) { + if ((prop = zpool_name_to_prop( + nvpair_name(elem))) == ZPROP_INVAL) + return (EINVAL); + + if (prop == ZPOOL_PROP_CACHEFILE || prop == ZPOOL_PROP_ALTROOT) + continue; + + need_sync = B_TRUE; + break; + } + + if (need_sync) + return (dsl_sync_task_do(spa_get_dsl(spa), NULL, spa_sync_props, + spa, nvp, 3)); + else + return (0); } /* @@ -482,26 +601,185 @@ spa_get_errlists(spa_t *spa, avl_tree_t *last, avl_tree_t *scrub) offsetof(spa_error_entry_t, se_avl)); } +static taskq_t * +spa_taskq_create(spa_t *spa, const char *name, enum zti_modes mode, + uint_t value) +{ + uint_t flags = TASKQ_PREPOPULATE; + boolean_t batch = B_FALSE; + + switch (mode) { + case zti_mode_null: + return (NULL); /* no taskq needed */ + + case zti_mode_fixed: + ASSERT3U(value, >=, 1); + value = MAX(value, 1); + break; + + case zti_mode_batch: + batch = B_TRUE; + flags |= TASKQ_THREADS_CPU_PCT; + value = zio_taskq_batch_pct; + break; + + case zti_mode_online_percent: + flags |= TASKQ_THREADS_CPU_PCT; + break; + + default: + panic("unrecognized mode for %s taskq (%u:%u) in " + "spa_activate()", + name, mode, value); + break; + } + + if (zio_taskq_sysdc && spa->spa_proc != &p0) { + if (batch) + flags |= TASKQ_DC_BATCH; + + return (taskq_create_sysdc(name, value, 50, INT_MAX, + spa->spa_proc, zio_taskq_basedc, flags)); + } + 
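/* [Editor's annotation, not part of the patch: this fall-through covers the case where SDC scheduling is disabled or the pool has no dedicated process; the taskq is then created under spa_proc (possibly still &p0) at priority maxclsyspri.] */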
return (taskq_create_proc(name, value, maxclsyspri, 50, INT_MAX, + spa->spa_proc, flags)); +} + +static void +spa_create_zio_taskqs(spa_t *spa) +{ + for (int t = 0; t < ZIO_TYPES; t++) { + for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { + const zio_taskq_info_t *ztip = &zio_taskqs[t][q]; + enum zti_modes mode = ztip->zti_mode; + uint_t value = ztip->zti_value; + char name[32]; + + (void) snprintf(name, sizeof (name), + "%s_%s", zio_type_name[t], zio_taskq_types[q]); + + spa->spa_zio_taskq[t][q] = + spa_taskq_create(spa, name, mode, value); + } + } +} + +#ifdef _KERNEL +static void +spa_thread(void *arg) +{ + callb_cpr_t cprinfo; + + spa_t *spa = arg; + user_t *pu = PTOU(curproc); + + CALLB_CPR_INIT(&cprinfo, &spa->spa_proc_lock, callb_generic_cpr, + spa->spa_name); + + ASSERT(curproc != &p0); + (void) snprintf(pu->u_psargs, sizeof (pu->u_psargs), + "zpool-%s", spa->spa_name); + (void) strlcpy(pu->u_comm, pu->u_psargs, sizeof (pu->u_comm)); + + /* bind this thread to the requested psrset */ + if (zio_taskq_psrset_bind != PS_NONE) { + pool_lock(); + mutex_enter(&cpu_lock); + mutex_enter(&pidlock); + mutex_enter(&curproc->p_lock); + + if (cpupart_bind_thread(curthread, zio_taskq_psrset_bind, + 0, NULL, NULL) == 0) { + curthread->t_bind_pset = zio_taskq_psrset_bind; + } else { + cmn_err(CE_WARN, + "Couldn't bind process for zfs pool \"%s\" to " + "pset %d\n", spa->spa_name, zio_taskq_psrset_bind); + } + + mutex_exit(&curproc->p_lock); + mutex_exit(&pidlock); + mutex_exit(&cpu_lock); + pool_unlock(); + } + + if (zio_taskq_sysdc) { + sysdc_thread_enter(curthread, 100, 0); + } + + spa->spa_proc = curproc; + spa->spa_did = curthread->t_did; + + spa_create_zio_taskqs(spa); + + mutex_enter(&spa->spa_proc_lock); + ASSERT(spa->spa_proc_state == SPA_PROC_CREATED); + + spa->spa_proc_state = SPA_PROC_ACTIVE; + cv_broadcast(&spa->spa_proc_cv); + + CALLB_CPR_SAFE_BEGIN(&cprinfo); + while (spa->spa_proc_state == SPA_PROC_ACTIVE) + cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); + CALLB_CPR_SAFE_END(&cprinfo, &spa->spa_proc_lock); + + ASSERT(spa->spa_proc_state == SPA_PROC_DEACTIVATE); + spa->spa_proc_state = SPA_PROC_GONE; + spa->spa_proc = &p0; + cv_broadcast(&spa->spa_proc_cv); + CALLB_CPR_EXIT(&cprinfo); /* drops spa_proc_lock */ + + mutex_enter(&curproc->p_lock); + lwp_exit(); +} +#endif + /* * Activate an uninitialized pool. */ static void -spa_activate(spa_t *spa) +spa_activate(spa_t *spa, int mode) { ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); spa->spa_state = POOL_STATE_ACTIVE; - - spa->spa_normal_class = metaslab_class_create(); - spa->spa_log_class = metaslab_class_create(); - - for (int t = 0; t < ZIO_TYPES; t++) { - for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { - spa->spa_zio_taskq[t][q] = taskq_create("spa_zio", - zio_taskq_threads[t][q], maxclsyspri, 50, - INT_MAX, TASKQ_PREPOPULATE); + spa->spa_mode = mode; + + spa->spa_normal_class = metaslab_class_create(spa, zfs_metaslab_ops); + spa->spa_log_class = metaslab_class_create(spa, zfs_metaslab_ops); + + /* Try to create a covering process */ + mutex_enter(&spa->spa_proc_lock); + ASSERT(spa->spa_proc_state == SPA_PROC_NONE); + ASSERT(spa->spa_proc == &p0); + spa->spa_did = 0; + + /* Only create a process if we're going to be around a while. 
*/ + if (spa_create_process && strcmp(spa->spa_name, TRYIMPORT_NAME) != 0) { + if (newproc(spa_thread, (caddr_t)spa, syscid, maxclsyspri, + NULL, 0) == 0) { + spa->spa_proc_state = SPA_PROC_CREATED; + while (spa->spa_proc_state == SPA_PROC_CREATED) { + cv_wait(&spa->spa_proc_cv, + &spa->spa_proc_lock); + } + ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); + ASSERT(spa->spa_proc != &p0); + ASSERT(spa->spa_did != 0); + } else { +#ifdef _KERNEL + cmn_err(CE_WARN, + "Couldn't create process for zfs pool \"%s\"\n", + spa->spa_name); +#endif } } + mutex_exit(&spa->spa_proc_lock); + + /* If we didn't create a process, we need to create our taskqs. */ + if (spa->spa_proc == &p0) { + spa_create_zio_taskqs(spa); + } list_create(&spa->spa_config_dirty_list, sizeof (vdev_t), offsetof(vdev_t, vdev_config_dirty_node)); @@ -528,7 +806,7 @@ spa_deactivate(spa_t *spa) ASSERT(spa->spa_sync_on == B_FALSE); ASSERT(spa->spa_dsl_pool == NULL); ASSERT(spa->spa_root_vdev == NULL); - + ASSERT(spa->spa_async_zio_root == NULL); ASSERT(spa->spa_state != POOL_STATE_UNINITIALIZED); txg_list_destroy(&spa->spa_vdev_txg_list); @@ -538,7 +816,8 @@ spa_deactivate(spa_t *spa) for (int t = 0; t < ZIO_TYPES; t++) { for (int q = 0; q < ZIO_TASKQ_TYPES; q++) { - taskq_destroy(spa->spa_zio_taskq[t][q]); + if (spa->spa_zio_taskq[t][q] != NULL) + taskq_destroy(spa->spa_zio_taskq[t][q]); spa->spa_zio_taskq[t][q] = NULL; } } @@ -559,6 +838,31 @@ spa_deactivate(spa_t *spa) avl_destroy(&spa->spa_errlist_last); spa->spa_state = POOL_STATE_UNINITIALIZED; + + mutex_enter(&spa->spa_proc_lock); + if (spa->spa_proc_state != SPA_PROC_NONE) { + ASSERT(spa->spa_proc_state == SPA_PROC_ACTIVE); + spa->spa_proc_state = SPA_PROC_DEACTIVATE; + cv_broadcast(&spa->spa_proc_cv); + while (spa->spa_proc_state == SPA_PROC_DEACTIVATE) { + ASSERT(spa->spa_proc != &p0); + cv_wait(&spa->spa_proc_cv, &spa->spa_proc_lock); + } + ASSERT(spa->spa_proc_state == SPA_PROC_GONE); + spa->spa_proc_state = SPA_PROC_NONE; + } + ASSERT(spa->spa_proc == &p0); + mutex_exit(&spa->spa_proc_lock); + + /* + * We want to make sure spa_thread() has actually exited the ZFS + * module, so that the module can't be unloaded out from underneath + * it. + */ + if (spa->spa_did != 0) { + thread_join(spa->spa_did); + spa->spa_did = 0; + } } /* @@ -572,7 +876,7 @@ spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, int atype) { nvlist_t **child; - uint_t c, children; + uint_t children; int error; if ((error = vdev_alloc(spa, vdp, nv, parent, id, atype)) != 0) @@ -593,7 +897,7 @@ spa_config_parse(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, return (EINVAL); } - for (c = 0; c < children; c++) { + for (int c = 0; c < children; c++) { vdev_t *vd; if ((error = spa_config_parse(spa, &vd, child[c], *vdp, c, atype)) != 0) { @@ -634,15 +938,10 @@ spa_unload(spa_t *spa) /* * Wait for any outstanding async I/O to complete. */ - mutex_enter(&spa->spa_async_root_lock); - while (spa->spa_async_root_count != 0) - cv_wait(&spa->spa_async_root_cv, &spa->spa_async_root_lock); - mutex_exit(&spa->spa_async_root_lock); - - /* - * Drop and purge level 2 cache - */ - spa_l2cache_drop(spa); + if (spa->spa_async_zio_root != NULL) { + (void) zio_wait(spa->spa_async_zio_root); + spa->spa_async_zio_root = NULL; + } /* * Close the dsl pool. 
@@ -650,8 +949,18 @@ spa_unload(spa_t *spa) if (spa->spa_dsl_pool) { dsl_pool_close(spa->spa_dsl_pool); spa->spa_dsl_pool = NULL; + spa->spa_meta_objset = NULL; } + ddt_unload(spa); + + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + + /* + * Drop and purge level 2 cache + */ + spa_l2cache_drop(spa); + /* * Close all vdevs. */ @@ -686,6 +995,8 @@ spa_unload(spa_t *spa) spa->spa_l2cache.sav_count = 0; spa->spa_async_suspended = 0; + + spa_config_exit(spa, SCL_ALL, FTAG); } /* @@ -775,6 +1086,7 @@ spa_load_spares(spa_t *spa) } vd->vdev_top = vd; + vd->vdev_aux = &spa->spa_spares; if (vdev_open(vd) != 0) continue; @@ -816,7 +1128,7 @@ spa_load_l2cache(spa_t *spa) nvlist_t **l2cache; uint_t nl2cache; int i, j, oldnvdevs; - uint64_t guid, size; + uint64_t guid; vdev_t *vd, **oldvdevs, **newvdevs; spa_aux_vdev_t *sav = &spa->spa_l2cache; @@ -880,12 +1192,8 @@ spa_load_l2cache(spa_t *spa) (void) vdev_validate_aux(vd); - if (!vdev_is_dead(vd)) { - size = vdev_get_rsize(vd); - l2arc_add_vdev(spa, vd, - VDEV_LABEL_START_SIZE, - size - VDEV_LABEL_START_SIZE); - } + if (!vdev_is_dead(vd)) + l2arc_add_vdev(spa, vd); } } @@ -897,12 +1205,9 @@ spa_load_l2cache(spa_t *spa) vd = oldvdevs[i]; if (vd != NULL) { - if ((spa_mode & FWRITE) && - spa_l2cache_exists(vd->vdev_guid, &pool) && - pool != 0ULL && - l2arc_vdev_present(vd)) { + if (spa_l2cache_exists(vd->vdev_guid, &pool) && + pool != 0ULL && l2arc_vdev_present(vd)) l2arc_remove_vdev(vd); - } (void) vdev_close(vd); spa_l2cache_remove(vd); } @@ -951,7 +1256,8 @@ load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) dmu_buf_rele(db, FTAG); packed = kmem_alloc(nvsize, KM_SLEEP); - error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed); + error = dmu_read(spa->spa_meta_objset, obj, 0, nvsize, packed, + DMU_READ_PREFETCH); if (error == 0) error = nvlist_unpack(packed, nvsize, value, 0); kmem_free(packed, nvsize); @@ -966,9 +1272,7 @@ load_nvlist(spa_t *spa, uint64_t obj, nvlist_t **value) static void spa_check_removed(vdev_t *vd) { - int c; - - for (c = 0; c < vd->vdev_children; c++) + for (int c = 0; c < vd->vdev_children; c++) spa_check_removed(vd->vdev_child[c]); if (vd->vdev_ops->vdev_op_leaf && vdev_is_dead(vd)) { @@ -977,6 +1281,30 @@ spa_check_removed(vdev_t *vd) } } +/* + * Load the slog device state from the config object since it's possible + * that the label does not contain the most up-to-date information. + */ +void +spa_load_log_state(spa_t *spa, nvlist_t *nv) +{ + vdev_t *ovd, *rvd = spa->spa_root_vdev; + + /* + * Load the original root vdev tree from the passed config. + */ + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + VERIFY(spa_config_parse(spa, &ovd, nv, NULL, 0, VDEV_ALLOC_LOAD) == 0); + + for (int c = 0; c < rvd->vdev_children; c++) { + vdev_t *cvd = rvd->vdev_child[c]; + if (cvd->vdev_islog) + vdev_load_log_state(cvd, ovd->vdev_child[c]); + } + vdev_free(ovd); + spa_config_exit(spa, SCL_ALL, FTAG); +} + /* * Check for missing log devices */ @@ -989,140 +1317,458 @@ spa_check_logs(spa_t *spa) case SPA_LOG_UNKNOWN: if (dmu_objset_find(spa->spa_name, zil_check_log_chain, NULL, DS_FIND_CHILDREN)) { - spa->spa_log_state = SPA_LOG_MISSING; + spa_set_log_state(spa, SPA_LOG_MISSING); return (1); } break; - - case SPA_LOG_CLEAR: - (void) dmu_objset_find(spa->spa_name, zil_clear_log_chain, NULL, - DS_FIND_CHILDREN); - break; } - spa->spa_log_state = SPA_LOG_GOOD; return (0); } -/* - * Load an existing storage pool, using the pool's builtin spa_config as a - * source of configuration information. 
- */ -static int -spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) +static boolean_t +spa_passivate_log(spa_t *spa) { - int error = 0; - nvlist_t *nvroot = NULL; - vdev_t *rvd; - uberblock_t *ub = &spa->spa_uberblock; - uint64_t config_cache_txg = spa->spa_config_txg; - uint64_t pool_guid; - uint64_t version; - uint64_t autoreplace = 0; - char *ereport = FM_EREPORT_ZFS_POOL; + vdev_t *rvd = spa->spa_root_vdev; + boolean_t slog_found = B_FALSE; - ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); - spa->spa_load_state = state; + if (!spa_has_slogs(spa)) + return (B_FALSE); - if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) || - nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) { - error = EINVAL; - goto out; + for (int c = 0; c < rvd->vdev_children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + metaslab_group_t *mg = tvd->vdev_mg; + + if (tvd->vdev_islog) { + metaslab_group_passivate(mg); + slog_found = B_TRUE; + } } - /* - * Versioning wasn't explicitly added to the label until later, so if - * it's not present treat it as the initial version. - */ - if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, &version) != 0) - version = SPA_VERSION_INITIAL; + return (slog_found); +} - (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, - &spa->spa_config_txg); +static void +spa_activate_log(spa_t *spa) +{ + vdev_t *rvd = spa->spa_root_vdev; - if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && - spa_guid_exists(pool_guid, 0)) { - error = EEXIST; - goto out; + ASSERT(spa_config_held(spa, SCL_ALLOC, RW_WRITER)); + + for (int c = 0; c < rvd->vdev_children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + metaslab_group_t *mg = tvd->vdev_mg; + + if (tvd->vdev_islog) + metaslab_group_activate(mg); } +} - spa->spa_load_guid = pool_guid; +int +spa_offline_log(spa_t *spa) +{ + int error = 0; - /* - * Parse the configuration into a vdev tree. We explicitly set the - * value that will be returned by spa_version() since parsing the - * configuration requires knowing the version number. - */ - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - spa->spa_ubsync.ub_version = version; - error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, VDEV_ALLOC_LOAD); - spa_config_exit(spa, SCL_ALL, FTAG); + if ((error = dmu_objset_find(spa_name(spa), zil_vdev_offline, + NULL, DS_FIND_CHILDREN)) == 0) { - if (error != 0) - goto out; + /* + * We successfully offlined the log device, sync out the + * current txg so that the "stubby" block can be removed + * by zil_sync(). + */ + txg_wait_synced(spa->spa_dsl_pool, 0); + } + return (error); +} - ASSERT(spa->spa_root_vdev == rvd); - ASSERT(spa_guid(spa) == pool_guid); +static void +spa_aux_check_removed(spa_aux_vdev_t *sav) +{ + for (int i = 0; i < sav->sav_count; i++) + spa_check_removed(sav->sav_vdevs[i]); +} - /* - * Try to open all vdevs, loading each label in the process. - */ - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - error = vdev_open(rvd); - spa_config_exit(spa, SCL_ALL, FTAG); - if (error != 0) - goto out; +void +spa_claim_notify(zio_t *zio) +{ + spa_t *spa = zio->io_spa; - /* - * Validate the labels for all leaf vdevs. We need to grab the config - * lock because all label I/O is done with ZIO_FLAG_CONFIG_WRITER. 
- */ - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - error = vdev_validate(rvd); - spa_config_exit(spa, SCL_ALL, FTAG); + if (zio->io_error) + return; - if (error != 0) - goto out; + mutex_enter(&spa->spa_props_lock); /* any mutex will do */ + if (spa->spa_claim_max_txg < zio->io_bp->blk_birth) + spa->spa_claim_max_txg = zio->io_bp->blk_birth; + mutex_exit(&spa->spa_props_lock); +} - if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { - error = ENXIO; - goto out; - } +typedef struct spa_load_error { + uint64_t sle_meta_count; + uint64_t sle_data_count; +} spa_load_error_t; - /* - * Find the best uberblock. - */ - vdev_uberblock_load(NULL, rvd, ub); +static void +spa_load_verify_done(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + spa_load_error_t *sle = zio->io_private; + dmu_object_type_t type = BP_GET_TYPE(bp); + int error = zio->io_error; - /* - * If we weren't able to find a single valid uberblock, return failure. + if (error) { + if ((BP_GET_LEVEL(bp) != 0 || dmu_ot[type].ot_metadata) && + type != DMU_OT_INTENT_LOG) + atomic_add_64(&sle->sle_meta_count, 1); + else + atomic_add_64(&sle->sle_data_count, 1); + } + zio_data_buf_free(zio->io_data, zio->io_size); +} + +/*ARGSUSED*/ +static int +spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, + const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) +{ + if (bp != NULL) { + zio_t *rio = arg; + size_t size = BP_GET_PSIZE(bp); + void *data = zio_data_buf_alloc(size); + + zio_nowait(zio_read(rio, spa, bp, data, size, + spa_load_verify_done, rio->io_private, ZIO_PRIORITY_SCRUB, + ZIO_FLAG_SPECULATIVE | ZIO_FLAG_CANFAIL | + ZIO_FLAG_SCRUB | ZIO_FLAG_RAW, zb)); + } + return (0); +} + +static int +spa_load_verify(spa_t *spa) +{ + zio_t *rio; + spa_load_error_t sle = { 0 }; + zpool_rewind_policy_t policy; + boolean_t verify_ok = B_FALSE; + int error; + + zpool_get_rewind_policy(spa->spa_config, &policy); + + if (policy.zrp_request & ZPOOL_NEVER_REWIND) + return (0); + + rio = zio_root(spa, NULL, &sle, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE); + + error = traverse_pool(spa, spa->spa_verify_min_txg, + TRAVERSE_PRE | TRAVERSE_PREFETCH, spa_load_verify_cb, rio); + + (void) zio_wait(rio); + + spa->spa_load_meta_errors = sle.sle_meta_count; + spa->spa_load_data_errors = sle.sle_data_count; + + if (!error && sle.sle_meta_count <= policy.zrp_maxmeta && + sle.sle_data_count <= policy.zrp_maxdata) { + verify_ok = B_TRUE; + spa->spa_load_txg = spa->spa_uberblock.ub_txg; + spa->spa_load_txg_ts = spa->spa_uberblock.ub_timestamp; + } else { + spa->spa_load_max_txg = spa->spa_uberblock.ub_txg; + } + + if (error) { + if (error != ENXIO && error != EIO) + error = EIO; + return (error); + } + + return (verify_ok ? 0 : EIO); +} + +/* + * Find a value in the pool props object. + */ +static void +spa_prop_find(spa_t *spa, zpool_prop_t prop, uint64_t *val) +{ + (void) zap_lookup(spa->spa_meta_objset, spa->spa_pool_props_object, + zpool_prop_to_name(prop), sizeof (uint64_t), 1, val); +} + +/* + * Find a value in the pool directory object. + */ +static int +spa_dir_prop(spa_t *spa, const char *name, uint64_t *val) +{ + return (zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, + name, sizeof (uint64_t), 1, val)); +} + +static int +spa_vdev_err(vdev_t *vdev, vdev_aux_t aux, int err) +{ + vdev_set_state(vdev, B_TRUE, VDEV_STATE_CANT_OPEN, aux); + return (err); +} + +/* + * Fix up config after a partly-completed split. This is done with the + * ZPOOL_CONFIG_SPLIT nvlist. 
Both the splitting pool and the split-off + * pool have that entry in their config, but only the splitting one contains + * a list of all the guids of the vdevs that are being split off. + * + * This function determines what to do with that list: either rejoin + * all the disks to the pool, or complete the splitting process. To attempt + * the rejoin, each disk that is offlined is marked online again, and + * we do a reopen() call. If the vdev label for every disk that was + * marked online indicates it was successfully split off (VDEV_AUX_SPLIT_POOL) + * then we call vdev_split() on each disk, and complete the split. + * + * Otherwise we leave the config alone, with all the vdevs in place in + * the original pool. + */ +static void +spa_try_repair(spa_t *spa, nvlist_t *config) +{ + uint_t extracted; + uint64_t *glist; + uint_t i, gcount; + nvlist_t *nvl; + vdev_t **vd; + boolean_t attempt_reopen; + + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, &nvl) != 0) + return; + + /* check that the config is complete */ + if (nvlist_lookup_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, + &glist, &gcount) != 0) + return; + + vd = kmem_zalloc(gcount * sizeof (vdev_t *), KM_SLEEP); + + /* attempt to online all the vdevs & validate */ + attempt_reopen = B_TRUE; + for (i = 0; i < gcount; i++) { + if (glist[i] == 0) /* vdev is hole */ + continue; + + vd[i] = spa_lookup_by_guid(spa, glist[i], B_FALSE); + if (vd[i] == NULL) { + /* + * Don't bother attempting to reopen the disks; + * just do the split. + */ + attempt_reopen = B_FALSE; + } else { + /* attempt to re-online it */ + vd[i]->vdev_offline = B_FALSE; + } + } + + if (attempt_reopen) { + vdev_reopen(spa->spa_root_vdev); + + /* check each device to see what state it's in */ + for (extracted = 0, i = 0; i < gcount; i++) { + if (vd[i] != NULL && + vd[i]->vdev_stat.vs_aux != VDEV_AUX_SPLIT_POOL) + break; + ++extracted; + } + } + + /* + * If every disk has been moved to the new pool, or if we never + * even attempted to look at them, then we split them off for + * good. */ - if (ub->ub_txg == 0) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = ENXIO; - goto out; + if (!attempt_reopen || gcount == extracted) { + for (i = 0; i < gcount; i++) + if (vd[i] != NULL) + vdev_split(vd[i]); + vdev_reopen(spa->spa_root_vdev); } + kmem_free(vd, gcount * sizeof (vdev_t *)); +} + +static int +spa_load(spa_t *spa, spa_load_state_t state, spa_import_type_t type, + boolean_t mosconfig) +{ + nvlist_t *config = spa->spa_config; + char *ereport = FM_EREPORT_ZFS_POOL; + int error; + uint64_t pool_guid; + nvlist_t *nvl; + + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) + return (EINVAL); + /* - * If the pool is newer than the code, we can't open it. + * Versioning wasn't explicitly added to the label until later, so if + * it's not present treat it as the initial version. 
*/ - if (ub->ub_version > SPA_VERSION) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_VERSION_NEWER); - error = ENOTSUP; - goto out; + if (nvlist_lookup_uint64(config, ZPOOL_CONFIG_VERSION, + &spa->spa_ubsync.ub_version) != 0) + spa->spa_ubsync.ub_version = SPA_VERSION_INITIAL; + + (void) nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, + &spa->spa_config_txg); + + if ((state == SPA_LOAD_IMPORT || state == SPA_LOAD_TRYIMPORT) && + spa_guid_exists(pool_guid, 0)) { + error = EEXIST; + } else { + spa->spa_load_guid = pool_guid; + + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_SPLIT, + &nvl) == 0) { + VERIFY(nvlist_dup(nvl, &spa->spa_config_splitting, + KM_SLEEP) == 0); + } + + error = spa_load_impl(spa, pool_guid, config, state, type, + mosconfig, &ereport); + } + + spa->spa_minref = refcount_count(&spa->spa_refcount); + if (error && error != EBADF) + zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); + spa->spa_load_state = error ? SPA_LOAD_ERROR : SPA_LOAD_NONE; + spa->spa_ena = 0; + + return (error); +} + +/* + * Load an existing storage pool, using the pool's builtin spa_config as a + * source of configuration information. + */ +static int +spa_load_impl(spa_t *spa, uint64_t pool_guid, nvlist_t *config, + spa_load_state_t state, spa_import_type_t type, boolean_t mosconfig, + char **ereport) +{ + int error = 0; + nvlist_t *nvconfig, *nvroot = NULL; + vdev_t *rvd; + uberblock_t *ub = &spa->spa_uberblock; + uint64_t config_cache_txg = spa->spa_config_txg; + int orig_mode = spa->spa_mode; + int parse; + + /* + * If this is an untrusted config, access the pool in read-only mode. + * This prevents things like resilvering recently removed devices. + */ + if (!mosconfig) + spa->spa_mode = FREAD; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + + spa->spa_load_state = state; + + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot)) + return (EINVAL); + + parse = (type == SPA_IMPORT_EXISTING ? + VDEV_ALLOC_LOAD : VDEV_ALLOC_SPLIT); + + /* + * Create "The Godfather" zio to hold all async IOs + */ + spa->spa_async_zio_root = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); + + /* + * Parse the configuration into a vdev tree. We explicitly set the + * value that will be returned by spa_version() since parsing the + * configuration requires knowing the version number. + */ + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + error = spa_config_parse(spa, &rvd, nvroot, NULL, 0, parse); + spa_config_exit(spa, SCL_ALL, FTAG); + + if (error != 0) + return (error); + + ASSERT(spa->spa_root_vdev == rvd); + + if (type != SPA_IMPORT_ASSEMBLE) { + ASSERT(spa_guid(spa) == pool_guid); + } + + /* + * Try to open all vdevs, loading each label in the process. + */ + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + error = vdev_open(rvd); + spa_config_exit(spa, SCL_ALL, FTAG); + if (error != 0) + return (error); + + /* + * We need to validate the vdev labels against the configuration that + * we have in hand, which is dependent on the setting of mosconfig. If + * mosconfig is true then we're validating the vdev labels based on + * that config. Otherwise, we're validating against the cached config + * (zpool.cache) that was read when we loaded the zfs module, and then + * later we will recursively call spa_load() and validate against + * the vdev config. + * + * If we're assembling a new pool that's been split off from an + * existing pool, the labels haven't yet been updated so we skip + * validation for now. 
+ */ + if (type != SPA_IMPORT_ASSEMBLE) { + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + error = vdev_validate(rvd); + spa_config_exit(spa, SCL_ALL, FTAG); + + if (error != 0) + return (error); + + if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) + return (ENXIO); } + /* + * Find the best uberblock. + */ + vdev_uberblock_load(NULL, rvd, ub); + + /* + * If we weren't able to find a single valid uberblock, return failure. + */ + if (ub->ub_txg == 0) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, ENXIO)); + + /* + * If the pool is newer than the code, we can't open it. + */ + if (ub->ub_version > SPA_VERSION) + return (spa_vdev_err(rvd, VDEV_AUX_VERSION_NEWER, ENOTSUP)); + /* * If the vdev guid sum doesn't match the uberblock, we have an * incomplete configuration. */ - if (rvd->vdev_guid_sum != ub->ub_guid_sum && mosconfig) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_BAD_GUID_SUM); - error = ENXIO; - goto out; + if (mosconfig && type != SPA_IMPORT_ASSEMBLE && + rvd->vdev_guid_sum != ub->ub_guid_sum) + return (spa_vdev_err(rvd, VDEV_AUX_BAD_GUID_SUM, ENXIO)); + + if (type != SPA_IMPORT_ASSEMBLE && spa->spa_config_splitting) { + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + spa_try_repair(spa, config); + spa_config_exit(spa, SCL_ALL, FTAG); + nvlist_free(spa->spa_config_splitting); + spa->spa_config_splitting = NULL; } /* @@ -1130,219 +1776,165 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) */ spa->spa_state = POOL_STATE_ACTIVE; spa->spa_ubsync = spa->spa_uberblock; - spa->spa_first_txg = spa_last_synced_txg(spa) + 1; + spa->spa_verify_min_txg = spa->spa_extreme_rewind ? + TXG_INITIAL - 1 : spa_last_synced_txg(spa) - TXG_DEFER_SIZE - 1; + spa->spa_first_txg = spa->spa_last_ubsync_txg ? + spa->spa_last_ubsync_txg : spa_last_synced_txg(spa) + 1; + spa->spa_claim_max_txg = spa->spa_first_txg; + error = dsl_pool_open(spa, spa->spa_first_txg, &spa->spa_dsl_pool); - if (error) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - goto out; - } + if (error) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); spa->spa_meta_objset = spa->spa_dsl_pool->dp_meta_objset; - if (zap_lookup(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_CONFIG, - sizeof (uint64_t), 1, &spa->spa_config_object) != 0) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = EIO; - goto out; - } + if (spa_dir_prop(spa, DMU_POOL_CONFIG, &spa->spa_config_object) != 0) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + + if (load_nvlist(spa, spa->spa_config_object, &nvconfig) != 0) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); if (!mosconfig) { - nvlist_t *newconfig; uint64_t hostid; + nvlist_t *policy = NULL; - if (load_nvlist(spa, spa->spa_config_object, &newconfig) != 0) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = EIO; - goto out; - } - - if (!spa_is_root(spa) && nvlist_lookup_uint64(newconfig, + if (!spa_is_root(spa) && nvlist_lookup_uint64(nvconfig, ZPOOL_CONFIG_HOSTID, &hostid) == 0) { char *hostname; unsigned long myhostid = 0; - VERIFY(nvlist_lookup_string(newconfig, + VERIFY(nvlist_lookup_string(nvconfig, ZPOOL_CONFIG_HOSTNAME, &hostname) == 0); +#ifdef _KERNEL + myhostid = zone_get_hostid(NULL); +#else /* _KERNEL */ + /* + * We're emulating the system's hostid in userland, so + * we can't use zone_get_hostid(). 
+ */ (void) ddi_strtoul(hw_serial, NULL, 10, &myhostid); +#endif /* _KERNEL */ if (hostid != 0 && myhostid != 0 && - (unsigned long)hostid != myhostid) { + hostid != myhostid) { cmn_err(CE_WARN, "pool '%s' could not be " "loaded as it was last accessed by " "another system (host: %s hostid: 0x%lx). " "See: http://www.sun.com/msg/ZFS-8000-EY", spa_name(spa), hostname, (unsigned long)hostid); - error = EBADF; - goto out; + return (EBADF); } } + if (nvlist_lookup_nvlist(spa->spa_config, + ZPOOL_REWIND_POLICY, &policy) == 0) + VERIFY(nvlist_add_nvlist(nvconfig, + ZPOOL_REWIND_POLICY, policy) == 0); - spa_config_set(spa, newconfig); + spa_config_set(spa, nvconfig); spa_unload(spa); spa_deactivate(spa); - spa_activate(spa); + spa_activate(spa, orig_mode); - return (spa_load(spa, newconfig, state, B_TRUE)); + return (spa_load(spa, state, SPA_IMPORT_EXISTING, B_TRUE)); } - if (zap_lookup(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, - sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj) != 0) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = EIO; - goto out; - } + if (spa_dir_prop(spa, DMU_POOL_SYNC_BPLIST, + &spa->spa_deferred_bplist_obj) != 0) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* * Load the bit that tells us to use the new accounting function * (raid-z deflation). If we have an older pool, this will not * be present. */ - error = zap_lookup(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_DEFLATE, - sizeof (uint64_t), 1, &spa->spa_deflate); - if (error != 0 && error != ENOENT) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = EIO; - goto out; - } + error = spa_dir_prop(spa, DMU_POOL_DEFLATE, &spa->spa_deflate); + if (error != 0 && error != ENOENT) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* * Load the persistent error log. If we have an older pool, this will * not be present. */ - error = zap_lookup(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST, - sizeof (uint64_t), 1, &spa->spa_errlog_last); - if (error != 0 && error != ENOENT) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = EIO; - goto out; - } + error = spa_dir_prop(spa, DMU_POOL_ERRLOG_LAST, &spa->spa_errlog_last); + if (error != 0 && error != ENOENT) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - error = zap_lookup(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB, - sizeof (uint64_t), 1, &spa->spa_errlog_scrub); - if (error != 0 && error != ENOENT) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = EIO; - goto out; - } + error = spa_dir_prop(spa, DMU_POOL_ERRLOG_SCRUB, + &spa->spa_errlog_scrub); + if (error != 0 && error != ENOENT) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); /* * Load the history object. If we have an older pool, this * will not be present. 
*/ - error = zap_lookup(spa->spa_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_HISTORY, - sizeof (uint64_t), 1, &spa->spa_history); - if (error != 0 && error != ENOENT) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = EIO; - goto out; - } + error = spa_dir_prop(spa, DMU_POOL_HISTORY, &spa->spa_history); + if (error != 0 && error != ENOENT) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + + /* + * If we're assembling the pool from the split-off vdevs of + * an existing pool, we don't want to attach the spares & cache + * devices. + */ /* * Load any hot spares for this pool. */ - error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SPARES, sizeof (uint64_t), 1, &spa->spa_spares.sav_object); - if (error != 0 && error != ENOENT) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = EIO; - goto out; - } - if (error == 0) { + error = spa_dir_prop(spa, DMU_POOL_SPARES, &spa->spa_spares.sav_object); + if (error != 0 && error != ENOENT) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { ASSERT(spa_version(spa) >= SPA_VERSION_SPARES); if (load_nvlist(spa, spa->spa_spares.sav_object, - &spa->spa_spares.sav_config) != 0) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = EIO; - goto out; - } + &spa->spa_spares.sav_config) != 0) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_spares(spa); spa_config_exit(spa, SCL_ALL, FTAG); + } else if (error == 0) { + spa->spa_spares.sav_sync = B_TRUE; } /* * Load any level 2 ARC devices for this pool. */ - error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_L2CACHE, sizeof (uint64_t), 1, + error = spa_dir_prop(spa, DMU_POOL_L2CACHE, &spa->spa_l2cache.sav_object); - if (error != 0 && error != ENOENT) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = EIO; - goto out; - } - if (error == 0) { + if (error != 0 && error != ENOENT) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); + if (error == 0 && type != SPA_IMPORT_ASSEMBLE) { ASSERT(spa_version(spa) >= SPA_VERSION_L2CACHE); if (load_nvlist(spa, spa->spa_l2cache.sav_object, - &spa->spa_l2cache.sav_config) != 0) { - vdev_set_state(rvd, B_TRUE, - VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = EIO; - goto out; - } + &spa->spa_l2cache.sav_config) != 0) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa_load_l2cache(spa); spa_config_exit(spa, SCL_ALL, FTAG); + } else if (error == 0) { + spa->spa_l2cache.sav_sync = B_TRUE; } - if (spa_check_logs(spa)) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_BAD_LOG); - error = ENXIO; - ereport = FM_EREPORT_ZFS_LOG_REPLAY; - goto out; - } - - spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); - error = zap_lookup(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_PROPS, sizeof (uint64_t), 1, &spa->spa_pool_props_object); - - if (error && error != ENOENT) { - vdev_set_state(rvd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_CORRUPT_DATA); - error = EIO; - goto out; - } + error = spa_dir_prop(spa, DMU_POOL_PROPS, &spa->spa_pool_props_object); + if (error && error != ENOENT) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); if (error == 0) { - (void) zap_lookup(spa->spa_meta_objset, - spa->spa_pool_props_object, - 
zpool_prop_to_name(ZPOOL_PROP_BOOTFS), - sizeof (uint64_t), 1, &spa->spa_bootfs); - (void) zap_lookup(spa->spa_meta_objset, - spa->spa_pool_props_object, - zpool_prop_to_name(ZPOOL_PROP_AUTOREPLACE), - sizeof (uint64_t), 1, &autoreplace); - (void) zap_lookup(spa->spa_meta_objset, - spa->spa_pool_props_object, - zpool_prop_to_name(ZPOOL_PROP_DELEGATION), - sizeof (uint64_t), 1, &spa->spa_delegation); - (void) zap_lookup(spa->spa_meta_objset, - spa->spa_pool_props_object, - zpool_prop_to_name(ZPOOL_PROP_FAILUREMODE), - sizeof (uint64_t), 1, &spa->spa_failmode); + uint64_t autoreplace; + + spa_prop_find(spa, ZPOOL_PROP_BOOTFS, &spa->spa_bootfs); + spa_prop_find(spa, ZPOOL_PROP_AUTOREPLACE, &autoreplace); + spa_prop_find(spa, ZPOOL_PROP_DELEGATION, &spa->spa_delegation); + spa_prop_find(spa, ZPOOL_PROP_FAILUREMODE, &spa->spa_failmode); + spa_prop_find(spa, ZPOOL_PROP_AUTOEXPAND, &spa->spa_autoexpand); + spa_prop_find(spa, ZPOOL_PROP_DEDUPDITTO, + &spa->spa_dedup_ditto); + + spa->spa_autoreplace = (autoreplace != 0); } /* @@ -1352,8 +1944,18 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) * unopenable vdevs so that the normal autoreplace handler can take * over. */ - if (autoreplace && state != SPA_LOAD_TRYIMPORT) + if (spa->spa_autoreplace && state != SPA_LOAD_TRYIMPORT) { spa_check_removed(spa->spa_root_vdev); + /* + * For the import case, this is done in spa_import(), because + * at this point we're using the spare definitions from + * the MOS config, not necessarily from the userland config. + */ + if (state != SPA_LOAD_IMPORT) { + spa_aux_check_removed(&spa->spa_spares); + spa_aux_check_removed(&spa->spa_l2cache); + } + } /* * Load the vdev state for all toplevel vdevs. @@ -1371,43 +1973,91 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) * Check the state of the root vdev. If it can't be opened, it * indicates one or more toplevel vdevs are faulted. */ - if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) { - error = ENXIO; - goto out; - } + if (rvd->vdev_state <= VDEV_STATE_CANT_OPEN) + return (ENXIO); - if ((spa_mode & FWRITE) && state != SPA_LOAD_TRYIMPORT) { - dmu_tx_t *tx; - int need_update = B_FALSE; - int c; + /* + * Load the DDTs (dedup tables). + */ + error = ddt_load(spa); + if (error != 0) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, EIO)); - /* - * Claim log blocks that haven't been committed yet. - * This must all happen in a single txg. - */ - tx = dmu_tx_create_assigned(spa_get_dsl(spa), + spa_update_dspace(spa); + + if (state != SPA_LOAD_TRYIMPORT) { + error = spa_load_verify(spa); + if (error) + return (spa_vdev_err(rvd, VDEV_AUX_CORRUPT_DATA, + error)); + } + + /* + * Load the intent log state and check log integrity. If we're + * assembling a pool from a split, the log is not transferred over. + */ + if (type != SPA_IMPORT_ASSEMBLE) { + VERIFY(nvlist_lookup_nvlist(nvconfig, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0); + spa_load_log_state(spa, nvroot); + nvlist_free(nvconfig); + + if (spa_check_logs(spa)) { + *ereport = FM_EREPORT_ZFS_LOG_REPLAY; + return (spa_vdev_err(rvd, VDEV_AUX_BAD_LOG, ENXIO)); + } + } + + if (spa_writeable(spa) && (state == SPA_LOAD_RECOVER || + spa->spa_load_max_txg == UINT64_MAX)) { + dmu_tx_t *tx; + int need_update = B_FALSE; + + ASSERT(state != SPA_LOAD_TRYIMPORT); + + /* + * Claim log blocks that haven't been committed yet. + * This must all happen in a single txg. 
+ * Note: spa_claim_max_txg is updated by spa_claim_notify(), + * invoked from zil_claim_log_block()'s i/o done callback. + * Price of rollback is that we abandon the log. + */ + spa->spa_claiming = B_TRUE; + + tx = dmu_tx_create_assigned(spa_get_dsl(spa), spa_first_txg(spa)); (void) dmu_objset_find(spa_name(spa), zil_claim, tx, DS_FIND_CHILDREN); dmu_tx_commit(tx); + spa->spa_claiming = B_FALSE; + + spa_set_log_state(spa, SPA_LOG_GOOD); spa->spa_sync_on = B_TRUE; txg_sync_start(spa->spa_dsl_pool); /* - * Wait for all claims to sync. + * Wait for all claims to sync. We sync up to the highest + * claimed log block birth time so that claimed log blocks + * don't appear to be from the future. spa_claim_max_txg + * will have been set for us by either zil_check_log_chain() + * (invoked from spa_check_logs()) or zil_claim() above. */ - txg_wait_synced(spa->spa_dsl_pool, 0); + txg_wait_synced(spa->spa_dsl_pool, spa->spa_claim_max_txg); /* * If the config cache is stale, or we have uninitialized * metaslabs (see spa_vdev_add()), then update the config. + * + * If spa_load_verbatim is true, trust the current + * in-core spa_config and update the disk labels. */ if (config_cache_txg != spa->spa_config_txg || - state == SPA_LOAD_IMPORT) + state == SPA_LOAD_IMPORT || spa->spa_load_verbatim || + state == SPA_LOAD_RECOVER) need_update = B_TRUE; - for (c = 0; c < rvd->vdev_children; c++) + for (int c = 0; c < rvd->vdev_children; c++) if (rvd->vdev_child[c]->vdev_ms_array == 0) need_update = B_TRUE; @@ -1417,17 +2067,104 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) */ if (need_update) spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); + + /* + * Check all DTLs to see if anything needs resilvering. + */ + if (vdev_resilver_needed(rvd, NULL, NULL)) + spa_async_request(spa, SPA_ASYNC_RESILVER); + + /* + * Delete any inconsistent datasets. + */ + (void) dmu_objset_find(spa_name(spa), + dsl_destroy_inconsistent, NULL, DS_FIND_CHILDREN); + + /* + * Clean up any stale temporary dataset userrefs. 
+ */ + dsl_pool_clean_tmp_userrefs(spa->spa_dsl_pool); } - error = 0; -out: - spa->spa_minref = refcount_count(&spa->spa_refcount); - if (error && error != EBADF) - zfs_ereport_post(ereport, spa, NULL, NULL, 0, 0); - spa->spa_load_state = SPA_LOAD_NONE; - spa->spa_ena = 0; + return (0); +} - return (error); +static int +spa_load_retry(spa_t *spa, spa_load_state_t state, int mosconfig) +{ + spa_unload(spa); + spa_deactivate(spa); + + spa->spa_load_max_txg--; + + spa_activate(spa, spa_mode_global); + spa_async_suspend(spa); + + return (spa_load(spa, state, SPA_IMPORT_EXISTING, mosconfig)); +} + +static int +spa_load_best(spa_t *spa, spa_load_state_t state, int mosconfig, + uint64_t max_request, int rewind_flags) +{ + nvlist_t *config = NULL; + int load_error, rewind_error; + uint64_t safe_rewind_txg; + uint64_t min_txg; + + if (spa->spa_load_txg && state == SPA_LOAD_RECOVER) { + spa->spa_load_max_txg = spa->spa_load_txg; + spa_set_log_state(spa, SPA_LOG_CLEAR); + } else { + spa->spa_load_max_txg = max_request; + } + + load_error = rewind_error = spa_load(spa, state, SPA_IMPORT_EXISTING, + mosconfig); + if (load_error == 0) + return (0); + + if (spa->spa_root_vdev != NULL) + config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); + + spa->spa_last_ubsync_txg = spa->spa_uberblock.ub_txg; + spa->spa_last_ubsync_txg_ts = spa->spa_uberblock.ub_timestamp; + + if (rewind_flags & ZPOOL_NEVER_REWIND) { + nvlist_free(config); + return (load_error); + } + + /* Price of rolling back is discarding txgs, including log */ + if (state == SPA_LOAD_RECOVER) + spa_set_log_state(spa, SPA_LOG_CLEAR); + + spa->spa_load_max_txg = spa->spa_last_ubsync_txg; + safe_rewind_txg = spa->spa_last_ubsync_txg - TXG_DEFER_SIZE; + min_txg = (rewind_flags & ZPOOL_EXTREME_REWIND) ? + TXG_INITIAL : safe_rewind_txg; + + /* + * Continue as long as we're finding errors, we're still within + * the acceptable rewind range, and we're still finding uberblocks + */ + while (rewind_error && spa->spa_uberblock.ub_txg >= min_txg && + spa->spa_uberblock.ub_txg <= spa->spa_load_max_txg) { + if (spa->spa_load_max_txg < safe_rewind_txg) + spa->spa_extreme_rewind = B_TRUE; + rewind_error = spa_load_retry(spa, state, mosconfig); + } + + if (config) + spa_rewind_data_to_nvlist(spa, config); + + spa->spa_extreme_rewind = B_FALSE; + spa->spa_load_max_txg = UINT64_MAX; + + if (config && (rewind_error || state != SPA_LOAD_RECOVER)) + spa_config_set(spa, config); + + return (state == SPA_LOAD_RECOVER ? rewind_error : load_error); } /* @@ -1443,9 +2180,12 @@ spa_load(spa_t *spa, nvlist_t *config, spa_load_state_t state, int mosconfig) * ambiguous state. */ static int -spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config) +spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t *nvpolicy, + nvlist_t **config) { spa_t *spa; + zpool_rewind_policy_t policy; + spa_load_state_t state = SPA_LOAD_OPEN; int error; int locked = B_FALSE; @@ -1467,11 +2207,31 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config) mutex_exit(&spa_namespace_lock); return (ENOENT); } + + zpool_get_rewind_policy(nvpolicy ? 
nvpolicy : spa->spa_config, &policy); + if (policy.zrp_request & ZPOOL_DO_REWIND) + state = SPA_LOAD_RECOVER; + if (spa->spa_state == POOL_STATE_UNINITIALIZED) { - spa_activate(spa); + spa_activate(spa, spa_mode_global); + + if (spa->spa_last_open_failed && (policy.zrp_request & + (ZPOOL_NO_REWIND | ZPOOL_NEVER_REWIND))) { + if (config != NULL && spa->spa_config) + VERIFY(nvlist_dup(spa->spa_config, + config, KM_SLEEP) == 0); + spa_deactivate(spa); + if (locked) + mutex_exit(&spa_namespace_lock); + return (spa->spa_last_open_failed); + } + + if (state != SPA_LOAD_RECOVER) + spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; - error = spa_load(spa, spa->spa_config, SPA_LOAD_OPEN, B_FALSE); + error = spa_load_best(spa, state, B_FALSE, policy.zrp_txg, + policy.zrp_request); if (error == EBADF) { /* @@ -1496,38 +2256,49 @@ spa_open_common(const char *pool, spa_t **spapp, void *tag, nvlist_t **config) * information: the state of each vdev after the * attempted vdev_open(). Return this to the user. */ - if (config != NULL && spa->spa_root_vdev != NULL) - *config = spa_config_generate(spa, NULL, -1ULL, - B_TRUE); + if (config != NULL && spa->spa_config) + VERIFY(nvlist_dup(spa->spa_config, config, + KM_SLEEP) == 0); spa_unload(spa); spa_deactivate(spa); - spa->spa_last_open_failed = B_TRUE; + spa->spa_last_open_failed = error; if (locked) mutex_exit(&spa_namespace_lock); *spapp = NULL; return (error); - } else { - spa->spa_last_open_failed = B_FALSE; } + } spa_open_ref(spa, tag); - if (locked) - mutex_exit(&spa_namespace_lock); - - *spapp = spa; if (config != NULL) *config = spa_config_generate(spa, NULL, -1ULL, B_TRUE); + if (locked) { + spa->spa_last_open_failed = 0; + spa->spa_last_ubsync_txg = 0; + spa->spa_load_txg = 0; + mutex_exit(&spa_namespace_lock); + } + + *spapp = spa; + return (0); } +int +spa_open_rewind(const char *name, spa_t **spapp, void *tag, nvlist_t *policy, + nvlist_t **config) +{ + return (spa_open_common(name, spapp, tag, policy, config)); +} + int spa_open(const char *name, spa_t **spapp, void *tag) { - return (spa_open_common(name, spapp, tag, NULL)); + return (spa_open_common(name, spapp, tag, NULL, NULL)); } /* @@ -1572,6 +2343,8 @@ spa_add_spares(spa_t *spa, nvlist_t *config) uint_t vsc; uint64_t pool; + ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); + if (spa->spa_spares.sav_count == 0) return; @@ -1619,11 +2392,11 @@ spa_add_l2cache(spa_t *spa, nvlist_t *config) vdev_stat_t *vs; uint_t vsc; + ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); + if (spa->spa_l2cache.sav_count == 0) return; - spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); - VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) == 0); VERIFY(nvlist_lookup_nvlist_array(spa->spa_l2cache.sav_config, @@ -1657,8 +2430,6 @@ spa_add_l2cache(spa_t *spa, nvlist_t *config) vdev_get_stats(vd, vs); } } - - spa_config_exit(spa, SCL_CONFIG, FTAG); } int @@ -1668,18 +2439,29 @@ spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) spa_t *spa; *config = NULL; - error = spa_open_common(name, &spa, FTAG, config); + error = spa_open_common(name, &spa, FTAG, NULL, config); - if (spa && *config != NULL) { - VERIFY(nvlist_add_uint64(*config, ZPOOL_CONFIG_ERRCOUNT, - spa_get_errlog_size(spa)) == 0); + if (spa != NULL) { + /* + * This still leaves a window of inconsistency where the spares + * or l2cache devices could change and the config would be + * self-inconsistent. 
+ */ + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); - if (spa_suspended(spa)) + if (*config != NULL) { VERIFY(nvlist_add_uint64(*config, - ZPOOL_CONFIG_SUSPENDED, spa->spa_failmode) == 0); + ZPOOL_CONFIG_ERRCOUNT, + spa_get_errlog_size(spa)) == 0); + + if (spa_suspended(spa)) + VERIFY(nvlist_add_uint64(*config, + ZPOOL_CONFIG_SUSPENDED, + spa->spa_failmode) == 0); - spa_add_spares(spa, *config); - spa_add_l2cache(spa, *config); + spa_add_spares(spa, *config); + spa_add_l2cache(spa, *config); + } } /* @@ -1701,8 +2483,10 @@ spa_get_stats(const char *name, nvlist_t **config, char *altroot, size_t buflen) } } - if (spa != NULL) + if (spa != NULL) { + spa_config_exit(spa, SCL_CONFIG, FTAG); spa_close(spa, FTAG); + } return (error); } @@ -1873,11 +2657,9 @@ spa_l2cache_drop(spa_t *spa) vd = sav->sav_vdevs[i]; ASSERT(vd != NULL); - if ((spa_mode & FWRITE) && - spa_l2cache_exists(vd->vdev_guid, &pool) && pool != 0ULL && - l2arc_vdev_present(vd)) { + if (spa_l2cache_exists(vd->vdev_guid, &pool) && + pool != 0ULL && l2arc_vdev_present(vd)) l2arc_remove_vdev(vd); - } if (vd->vdev_isl2cache) spa_l2cache_remove(vd); vdev_clear_stats(vd); @@ -1897,7 +2679,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, vdev_t *rvd; dsl_pool_t *dp; dmu_tx_t *tx; - int c, error = 0; + int error = 0; uint64_t txg = TXG_INITIAL; nvlist_t **spares, **l2cache; uint_t nspares, nl2cache; @@ -1917,13 +2699,10 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, */ (void) nvlist_lookup_string(props, zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); - spa = spa_add(pool, altroot); - spa_activate(spa); - - spa->spa_uberblock.ub_txg = txg - 1; + spa = spa_add(pool, NULL, altroot); + spa_activate(spa, spa_mode_global); if (props && (error = spa_prop_validate(spa, props))) { - spa_unload(spa); spa_deactivate(spa); spa_remove(spa); mutex_exit(&spa_namespace_lock); @@ -1934,9 +2713,18 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, &version) != 0) version = SPA_VERSION; ASSERT(version <= SPA_VERSION); + + spa->spa_first_txg = txg; + spa->spa_uberblock.ub_txg = txg - 1; spa->spa_uberblock.ub_version = version; spa->spa_ubsync = spa->spa_uberblock; + /* + * Create "The Godfather" zio to hold all async IOs + */ + spa->spa_async_zio_root = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_GODFATHER); + /* * Create the root vdev. */ @@ -1954,9 +2742,10 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, (error = vdev_create(rvd, txg, B_FALSE)) == 0 && (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) { - for (c = 0; c < rvd->vdev_children; c++) - vdev_init(rvd->vdev_child[c], txg); - vdev_config_dirty(rvd); + for (int c = 0; c < rvd->vdev_children; c++) { + vdev_metaslab_set_size(rvd->vdev_child[c]); + vdev_expand(rvd->vdev_child[c], txg); + } } spa_config_exit(spa, SCL_ALL, FTAG); @@ -2002,6 +2791,13 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, spa->spa_dsl_pool = dp = dsl_pool_create(spa, zplprops, txg); spa->spa_meta_objset = dp->dp_meta_objset; + /* + * Create DDTs (dedup tables). + */ + ddt_create(spa); + + spa_update_dspace(spa); + tx = dmu_tx_create_assigned(dp, txg); /* @@ -2032,14 +2828,14 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, * because sync-to-convergence takes longer if the blocksize * keeps changing. 
*/ - spa->spa_sync_bplist_obj = bplist_create(spa->spa_meta_objset, + spa->spa_deferred_bplist_obj = bplist_create(spa->spa_meta_objset, 1 << 14, tx); - dmu_object_set_compress(spa->spa_meta_objset, spa->spa_sync_bplist_obj, - ZIO_COMPRESS_OFF, tx); + dmu_object_set_compress(spa->spa_meta_objset, + spa->spa_deferred_bplist_obj, ZIO_COMPRESS_OFF, tx); if (zap_add(spa->spa_meta_objset, DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_SYNC_BPLIST, - sizeof (uint64_t), 1, &spa->spa_sync_bplist_obj, tx) != 0) { + sizeof (uint64_t), 1, &spa->spa_deferred_bplist_obj, tx) != 0) { cmn_err(CE_PANIC, "failed to add bplist"); } @@ -2055,8 +2851,12 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, spa->spa_bootfs = zpool_prop_default_numeric(ZPOOL_PROP_BOOTFS); spa->spa_delegation = zpool_prop_default_numeric(ZPOOL_PROP_DELEGATION); spa->spa_failmode = zpool_prop_default_numeric(ZPOOL_PROP_FAILUREMODE); - if (props) + spa->spa_autoexpand = zpool_prop_default_numeric(ZPOOL_PROP_AUTOEXPAND); + + if (props != NULL) { + spa_configfile_set(spa, props, B_FALSE); spa_sync_props(spa, props, CRED(), tx); + } dmu_tx_commit(tx); @@ -2073,403 +2873,410 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, if (version >= SPA_VERSION_ZPOOL_HISTORY && history_str != NULL) (void) spa_history_log(spa, history_str, LOG_CMD_POOL_CREATE); - - mutex_exit(&spa_namespace_lock); + spa_history_log_version(spa, LOG_POOL_CREATE); spa->spa_minref = refcount_count(&spa->spa_refcount); + mutex_exit(&spa_namespace_lock); + return (0); } +#ifdef _KERNEL /* - * Import the given pool into the system. We set up the necessary spa_t and - * then call spa_load() to do the dirty work. + * Get the root pool information from the root disk, then import the root pool + * during the system boot up time. */ -static int -spa_import_common(const char *pool, nvlist_t *config, nvlist_t *props, - boolean_t isroot, boolean_t allowfaulted) +extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); + +static nvlist_t * +spa_generate_rootconf(char *devpath, char *devid, uint64_t *guid) { - spa_t *spa; - char *altroot = NULL; - int error, loaderr; - nvlist_t *nvroot; - nvlist_t **spares, **l2cache; - uint_t nspares, nl2cache; + nvlist_t *config; + nvlist_t *nvtop, *nvroot; + uint64_t pgid; - /* - * If a pool with this name exists, return failure. - */ - mutex_enter(&spa_namespace_lock); - if ((spa = spa_lookup(pool)) != NULL) { - if (isroot) { - /* - * Remove the existing root pool from the - * namespace so that we can replace it with - * the correct config we just read in. - */ - ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); - spa_remove(spa); - } else { - mutex_exit(&spa_namespace_lock); - return (EEXIST); - } - } + if (vdev_disk_read_rootlabel(devpath, devid, &config) != 0) + return (NULL); /* - * Create and initialize the spa structure. + * Add this top-level vdev to the child array. */ - (void) nvlist_lookup_string(props, - zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); - spa = spa_add(pool, altroot); - spa_activate(spa); - - if (allowfaulted) - spa->spa_import_faulted = B_TRUE; - spa->spa_is_root = isroot; + VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvtop) == 0); + VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, + &pgid) == 0); + VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_GUID, guid) == 0); /* - * Pass off the heavy lifting to spa_load(). 
- * Pass TRUE for mosconfig (unless this is a root pool) because - * the user-supplied config is actually the one to trust when - * doing an import. + * Put this pool's top-level vdevs into a root vdev. */ - loaderr = error = spa_load(spa, config, SPA_LOAD_IMPORT, !isroot); + VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, + VDEV_TYPE_ROOT) == 0); + VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); + VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); + VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, + &nvtop, 1) == 0); - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); /* - * Toss any existing sparelist, as it doesn't have any validity anymore, - * and conflicts with spa_has_spare(). + * Replace the existing vdev_tree with the new root vdev in + * this pool's configuration (remove the old, add the new). */ - if (!isroot && spa->spa_spares.sav_config) { - nvlist_free(spa->spa_spares.sav_config); - spa->spa_spares.sav_config = NULL; - spa_load_spares(spa); - } - if (!isroot && spa->spa_l2cache.sav_config) { - nvlist_free(spa->spa_l2cache.sav_config); - spa->spa_l2cache.sav_config = NULL; - spa_load_l2cache(spa); - } + VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); + nvlist_free(nvroot); + return (config); +} - VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, - &nvroot) == 0); - if (error == 0) - error = spa_validate_aux(spa, nvroot, -1ULL, VDEV_ALLOC_SPARE); - if (error == 0) - error = spa_validate_aux(spa, nvroot, -1ULL, - VDEV_ALLOC_L2CACHE); - spa_config_exit(spa, SCL_ALL, FTAG); +/* + * Walk the vdev tree and see if we can find a device with "better" + * configuration. A configuration is "better" if the label on that + * device has a more recent txg. + */ +static void +spa_alt_rootvdev(vdev_t *vd, vdev_t **avd, uint64_t *txg) +{ + for (int c = 0; c < vd->vdev_children; c++) + spa_alt_rootvdev(vd->vdev_child[c], avd, txg); - if (error != 0 || (props && (error = spa_prop_set(spa, props)))) { - if (loaderr != 0 && loaderr != EINVAL && allowfaulted) { - /* - * If we failed to load the pool, but 'allowfaulted' is - * set, then manually set the config as if the config - * passed in was specified in the cache file. - */ - error = 0; - spa->spa_import_faulted = B_FALSE; - if (spa->spa_config == NULL) - spa->spa_config = spa_config_generate(spa, - NULL, -1ULL, B_TRUE); - spa_unload(spa); - spa_deactivate(spa); - spa_config_sync(spa, B_FALSE, B_TRUE); - } else { - spa_unload(spa); - spa_deactivate(spa); - spa_remove(spa); + if (vd->vdev_ops->vdev_op_leaf) { + nvlist_t *label; + uint64_t label_txg; + + if (vdev_disk_read_rootlabel(vd->vdev_physpath, vd->vdev_devid, + &label) != 0) + return; + + VERIFY(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, + &label_txg) == 0); + + /* + * Do we have a better boot device? + */ + if (label_txg > *txg) { + *txg = label_txg; + *avd = vd; } - mutex_exit(&spa_namespace_lock); - return (error); + nvlist_free(label); } +} + +/* + * Import a root pool. + * + * For x86. devpath_list will consist of devid and/or physpath name of + * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a"). + * The GRUB "findroot" command will return the vdev we should boot. + * + * For Sparc, devpath_list consists the physpath name of the booting device + * no matter the rootpool is a single device pool or a mirrored pool. + * e.g. 
+ * "/pci@1f,0/ide@d/disk@0,0:a" + */ +int +spa_import_rootpool(char *devpath, char *devid) +{ + spa_t *spa; + vdev_t *rvd, *bvd, *avd = NULL; + nvlist_t *config, *nvtop; + uint64_t guid, txg; + char *pname; + int error; /* - * Override any spares and level 2 cache devices as specified by - * the user, as these may have correct device names/devids, etc. + * Read the label from the boot device and generate a configuration. */ - if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, - &spares, &nspares) == 0) { - if (spa->spa_spares.sav_config) - VERIFY(nvlist_remove(spa->spa_spares.sav_config, - ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); - else - VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, - NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, - ZPOOL_CONFIG_SPARES, spares, nspares) == 0); - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - spa_load_spares(spa); - spa_config_exit(spa, SCL_ALL, FTAG); - spa->spa_spares.sav_sync = B_TRUE; + config = spa_generate_rootconf(devpath, devid, &guid); +#if defined(_OBP) && defined(_KERNEL) + if (config == NULL) { + if (strstr(devpath, "/iscsi/ssd") != NULL) { + /* iscsi boot */ + get_iscsi_bootpath_phy(devpath); + config = spa_generate_rootconf(devpath, devid, &guid); + } } - if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, - &l2cache, &nl2cache) == 0) { - if (spa->spa_l2cache.sav_config) - VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, - ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); - else - VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, - NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, - ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); - spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); - spa_load_l2cache(spa); - spa_config_exit(spa, SCL_ALL, FTAG); - spa->spa_l2cache.sav_sync = B_TRUE; +#endif + if (config == NULL) { + cmn_err(CE_NOTE, "Can not read the pool label from '%s'", + devpath); + return (EIO); } - if (spa_mode & FWRITE) { + VERIFY(nvlist_lookup_string(config, ZPOOL_CONFIG_POOL_NAME, + &pname) == 0); + VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); + + mutex_enter(&spa_namespace_lock); + if ((spa = spa_lookup(pname)) != NULL) { /* - * Update the config cache to include the newly-imported pool. + * Remove the existing root pool from the namespace so that we + * can replace it with the correct config we just read in. */ - spa_config_update_common(spa, SPA_CONFIG_UPDATE_POOL, isroot); + spa_remove(spa); } - spa->spa_import_faulted = B_FALSE; - mutex_exit(&spa_namespace_lock); - - return (0); -} + spa = spa_add(pname, config, NULL); + spa->spa_is_root = B_TRUE; + spa->spa_load_verbatim = B_TRUE; -#ifdef _KERNEL -/* - * Build a "root" vdev for a top level vdev read in from a rootpool - * device label. - */ -static void -spa_build_rootpool_config(nvlist_t *config) -{ - nvlist_t *nvtop, *nvroot; - uint64_t pgid; + /* + * Build up a vdev tree based on the boot device's label config. + */ + VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvtop) == 0); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + error = spa_config_parse(spa, &rvd, nvtop, NULL, 0, + VDEV_ALLOC_ROOTPOOL); + spa_config_exit(spa, SCL_ALL, FTAG); + if (error) { + mutex_exit(&spa_namespace_lock); + nvlist_free(config); + cmn_err(CE_NOTE, "Can not parse the config for pool '%s'", + pname); + return (error); + } /* - * Add this top-level vdev to the child array. + * Get the boot vdev. 
*/ - VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvtop) - == 0); - VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_GUID, &pgid) - == 0); + if ((bvd = vdev_lookup_by_guid(rvd, guid)) == NULL) { + cmn_err(CE_NOTE, "Can not find the boot vdev for guid %llu", + (u_longlong_t)guid); + error = ENOENT; + goto out; + } /* - * Put this pool's top-level vdevs into a root vdev. + * Determine if there is a better boot device. */ - VERIFY(nvlist_alloc(&nvroot, NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_string(nvroot, ZPOOL_CONFIG_TYPE, VDEV_TYPE_ROOT) - == 0); - VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_ID, 0ULL) == 0); - VERIFY(nvlist_add_uint64(nvroot, ZPOOL_CONFIG_GUID, pgid) == 0); - VERIFY(nvlist_add_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN, - &nvtop, 1) == 0); + avd = bvd; + spa_alt_rootvdev(rvd, &avd, &txg); + if (avd != bvd) { + cmn_err(CE_NOTE, "The boot device is 'degraded'. Please " + "try booting from '%s'", avd->vdev_path); + error = EINVAL; + goto out; + } /* - * Replace the existing vdev_tree with the new root vdev in - * this pool's configuration (remove the old, add the new). + * If the boot device is part of a spare vdev then ensure that + * we're booting off the active spare. */ - VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); - nvlist_free(nvroot); + if (bvd->vdev_parent->vdev_ops == &vdev_spare_ops && + !bvd->vdev_isspare) { + cmn_err(CE_NOTE, "The boot device is currently spared. Please " + "try booting from '%s'", + bvd->vdev_parent->vdev_child[1]->vdev_path); + error = EINVAL; + goto out; + } + + error = 0; + spa_history_log_version(spa, LOG_POOL_IMPORT); +out: + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + vdev_free(rvd); + spa_config_exit(spa, SCL_ALL, FTAG); + mutex_exit(&spa_namespace_lock); + + nvlist_free(config); + return (error); } +#endif + /* - * Get the root pool information from the root disk, then import the root pool - * during the system boot up time. + * Take a pool and insert it into the namespace as if it had been loaded at + * boot. 
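/*
 * Illustrative sketch only, not part of the patch: the "better boot
 * device" selection above boils down to comparing ZPOOL_CONFIG_POOL_TXG
 * across the labels read from each leaf vdev.  label_newer() is a
 * hypothetical helper shown purely to make that comparison explicit.
 */
static boolean_t
label_newer(nvlist_t *label, uint64_t best_txg)
{
	uint64_t label_txg = 0;

	/* A label that was never written (or was detached) reports txg 0. */
	(void) nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_TXG, &label_txg);

	return (label_txg > best_txg);
}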
*/ -extern int vdev_disk_read_rootlabel(char *, char *, nvlist_t **); - int -spa_check_rootconf(char *devpath, char *devid, nvlist_t **bestconf, - uint64_t *besttxg) +spa_import_verbatim(const char *pool, nvlist_t *config, nvlist_t *props) { - nvlist_t *config; - uint64_t txg; - int error; + spa_t *spa; + char *altroot = NULL; - if (error = vdev_disk_read_rootlabel(devpath, devid, &config)) - return (error); + mutex_enter(&spa_namespace_lock); + if (spa_lookup(pool) != NULL) { + mutex_exit(&spa_namespace_lock); + return (EEXIST); + } - VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_POOL_TXG, &txg) == 0); + (void) nvlist_lookup_string(props, + zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); + spa = spa_add(pool, config, altroot); - if (bestconf != NULL) - *bestconf = config; - else - nvlist_free(config); - *besttxg = txg; - return (0); -} + spa->spa_load_verbatim = B_TRUE; -boolean_t -spa_rootdev_validate(nvlist_t *nv) -{ - uint64_t ival; + if (props != NULL) + spa_configfile_set(spa, props, B_FALSE); - if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &ival) == 0 || - nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, &ival) == 0 || - nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, &ival) == 0) - return (B_FALSE); + spa_config_sync(spa, B_FALSE, B_TRUE); - return (B_TRUE); -} + mutex_exit(&spa_namespace_lock); + spa_history_log_version(spa, LOG_POOL_IMPORT); + return (0); +} /* - * Given the boot device's physical path or devid, check if the device - * is in a valid state. If so, return the configuration from the vdev - * label. + * Import a non-root pool into the system. */ int -spa_get_rootconf(char *devpath, char *devid, nvlist_t **bestconf) +spa_import(const char *pool, nvlist_t *config, nvlist_t *props) { - nvlist_t *conf = NULL; - uint64_t txg = 0; - nvlist_t *nvtop, **child; - char *type; - char *bootpath = NULL; - uint_t children, c; - char *tmp; + spa_t *spa; + char *altroot = NULL; + spa_load_state_t state = SPA_LOAD_IMPORT; + zpool_rewind_policy_t policy; int error; + nvlist_t *nvroot; + nvlist_t **spares, **l2cache; + uint_t nspares, nl2cache; - if (devpath && ((tmp = strchr(devpath, ' ')) != NULL)) - *tmp = '\0'; - if (error = spa_check_rootconf(devpath, devid, &conf, &txg)) { - cmn_err(CE_NOTE, "error reading device label"); - return (error); - } - if (txg == 0) { - cmn_err(CE_NOTE, "this device is detached"); - nvlist_free(conf); - return (EINVAL); + /* + * If a pool with this name exists, return failure. + */ + mutex_enter(&spa_namespace_lock); + if (spa_lookup(pool) != NULL) { + mutex_exit(&spa_namespace_lock); + return (EEXIST); } - VERIFY(nvlist_lookup_nvlist(conf, ZPOOL_CONFIG_VDEV_TREE, - &nvtop) == 0); - VERIFY(nvlist_lookup_string(nvtop, ZPOOL_CONFIG_TYPE, &type) == 0); + zpool_get_rewind_policy(config, &policy); + if (policy.zrp_request & ZPOOL_DO_REWIND) + state = SPA_LOAD_RECOVER; - if (strcmp(type, VDEV_TYPE_DISK) == 0) { - if (spa_rootdev_validate(nvtop)) { - goto out; - } else { - nvlist_free(conf); - return (EINVAL); - } - } + /* + * Create and initialize the spa structure. + */ + (void) nvlist_lookup_string(props, + zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); + spa = spa_add(pool, config, altroot); + spa_activate(spa, spa_mode_global); - ASSERT(strcmp(type, VDEV_TYPE_MIRROR) == 0); + /* + * Don't start async tasks until we know everything is healthy. + */ + spa_async_suspend(spa); - VERIFY(nvlist_lookup_nvlist_array(nvtop, ZPOOL_CONFIG_CHILDREN, - &child, &children) == 0); + /* + * Pass off the heavy lifting to spa_load(). 
Pass TRUE for mosconfig + * because the user-supplied config is actually the one to trust when + * doing an import. + */ + if (state != SPA_LOAD_RECOVER) + spa->spa_last_ubsync_txg = spa->spa_load_txg = 0; + error = spa_load_best(spa, state, B_TRUE, policy.zrp_txg, + policy.zrp_request); /* - * Go thru vdevs in the mirror to see if the given device - * has the most recent txg. Only the device with the most - * recent txg has valid information and should be booted. + * Propagate anything learned about failing or best txgs + * back to caller */ - for (c = 0; c < children; c++) { - char *cdevid, *cpath; - uint64_t tmptxg; + spa_rewind_data_to_nvlist(spa, config); - if (nvlist_lookup_string(child[c], ZPOOL_CONFIG_PHYS_PATH, - &cpath) != 0) - return (EINVAL); - if (nvlist_lookup_string(child[c], ZPOOL_CONFIG_DEVID, - &cdevid) != 0) - return (EINVAL); - if ((spa_check_rootconf(cpath, cdevid, NULL, - &tmptxg) == 0) && (tmptxg > txg)) { - txg = tmptxg; - VERIFY(nvlist_lookup_string(child[c], - ZPOOL_CONFIG_PATH, &bootpath) == 0); - } + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + /* + * Toss any existing sparelist, as it doesn't have any validity + * anymore, and conflicts with spa_has_spare(). + */ + if (spa->spa_spares.sav_config) { + nvlist_free(spa->spa_spares.sav_config); + spa->spa_spares.sav_config = NULL; + spa_load_spares(spa); + } + if (spa->spa_l2cache.sav_config) { + nvlist_free(spa->spa_l2cache.sav_config); + spa->spa_l2cache.sav_config = NULL; + spa_load_l2cache(spa); } - /* Does the best device match the one we've booted from? */ - if (bootpath) { - cmn_err(CE_NOTE, "try booting from '%s'", bootpath); - return (EINVAL); + VERIFY(nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, + &nvroot) == 0); + if (error == 0) + error = spa_validate_aux(spa, nvroot, -1ULL, + VDEV_ALLOC_SPARE); + if (error == 0) + error = spa_validate_aux(spa, nvroot, -1ULL, + VDEV_ALLOC_L2CACHE); + spa_config_exit(spa, SCL_ALL, FTAG); + + if (props != NULL) + spa_configfile_set(spa, props, B_FALSE); + + if (error != 0 || (props && spa_writeable(spa) && + (error = spa_prop_set(spa, props)))) { + spa_unload(spa); + spa_deactivate(spa); + spa_remove(spa); + mutex_exit(&spa_namespace_lock); + return (error); } -out: - *bestconf = conf; - return (0); -} -/* - * Import a root pool. - * - * For x86. devpath_list will consist of devid and/or physpath name of - * the vdev (e.g. "id1,sd@SSEAGATE..." or "/pci@1f,0/ide@d/disk@0,0:a"). - * The GRUB "findroot" command will return the vdev we should boot. - * - * For Sparc, devpath_list consists the physpath name of the booting device - * no matter the rootpool is a single device pool or a mirrored pool. - * e.g. - * "/pci@1f,0/ide@d/disk@0,0:a" - */ -int -spa_import_rootpool(char *devpath, char *devid) -{ - nvlist_t *conf = NULL; - char *pname; - int error; + spa_async_resume(spa); /* - * Get the vdev pathname and configuation from the most - * recently updated vdev (highest txg). + * Override any spares and level 2 cache devices as specified by + * the user, as these may have correct device names/devids, etc. 
*/ - if (error = spa_get_rootconf(devpath, devid, &conf)) - goto msg_out; + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_SPARES, + &spares, &nspares) == 0) { + if (spa->spa_spares.sav_config) + VERIFY(nvlist_remove(spa->spa_spares.sav_config, + ZPOOL_CONFIG_SPARES, DATA_TYPE_NVLIST_ARRAY) == 0); + else + VERIFY(nvlist_alloc(&spa->spa_spares.sav_config, + NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_nvlist_array(spa->spa_spares.sav_config, + ZPOOL_CONFIG_SPARES, spares, nspares) == 0); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + spa_load_spares(spa); + spa_config_exit(spa, SCL_ALL, FTAG); + spa->spa_spares.sav_sync = B_TRUE; + } + if (nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_L2CACHE, + &l2cache, &nl2cache) == 0) { + if (spa->spa_l2cache.sav_config) + VERIFY(nvlist_remove(spa->spa_l2cache.sav_config, + ZPOOL_CONFIG_L2CACHE, DATA_TYPE_NVLIST_ARRAY) == 0); + else + VERIFY(nvlist_alloc(&spa->spa_l2cache.sav_config, + NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_nvlist_array(spa->spa_l2cache.sav_config, + ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache) == 0); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); + spa_load_l2cache(spa); + spa_config_exit(spa, SCL_ALL, FTAG); + spa->spa_l2cache.sav_sync = B_TRUE; + } /* - * Add type "root" vdev to the config. + * Check for any removed devices. */ - spa_build_rootpool_config(conf); + if (spa->spa_autoreplace) { + spa_aux_check_removed(&spa->spa_spares); + spa_aux_check_removed(&spa->spa_l2cache); + } - VERIFY(nvlist_lookup_string(conf, ZPOOL_CONFIG_POOL_NAME, &pname) == 0); + if (spa_writeable(spa)) { + /* + * Update the config cache to include the newly-imported pool. + */ + spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); + } /* - * We specify 'allowfaulted' for this to be treated like spa_open() - * instead of spa_import(). This prevents us from marking vdevs as - * persistently unavailable, and generates FMA ereports as if it were a - * pool open, not import. + * It's possible that the pool was expanded while it was exported. + * We kick off an async task to handle this for us. */ - error = spa_import_common(pname, conf, NULL, B_TRUE, B_TRUE); - ASSERT(error != EEXIST); + spa_async_request(spa, SPA_ASYNC_AUTOEXPAND); - nvlist_free(conf); - return (error); - -msg_out: - cmn_err(CE_NOTE, "\n" - " *************************************************** \n" - " * This device is not bootable! * \n" - " * It is either offlined or detached or faulted. * \n" - " * Please try to boot from a different device. * \n" - " *************************************************** "); - - return (error); -} -#endif - -/* - * Import a non-root pool into the system. - */ -int -spa_import(const char *pool, nvlist_t *config, nvlist_t *props) -{ - return (spa_import_common(pool, config, props, B_FALSE, B_FALSE)); -} + mutex_exit(&spa_namespace_lock); + spa_history_log_version(spa, LOG_POOL_IMPORT); -int -spa_import_faulted(const char *pool, nvlist_t *config, nvlist_t *props) -{ - return (spa_import_common(pool, config, props, B_FALSE, B_TRUE)); + return (0); } - -/* - * This (illegal) pool name is used when temporarily importing a spa_t in order - * to get the vdev stats associated with the imported devices. 
- */ -#define TRYIMPORT_NAME "$import" - nvlist_t * spa_tryimport(nvlist_t *tryconfig) { @@ -2477,6 +3284,7 @@ spa_tryimport(nvlist_t *tryconfig) char *poolname; spa_t *spa; uint64_t state; + int error; if (nvlist_lookup_string(tryconfig, ZPOOL_CONFIG_POOL_NAME, &poolname)) return (NULL); @@ -2488,15 +3296,15 @@ spa_tryimport(nvlist_t *tryconfig) * Create and initialize the spa structure. */ mutex_enter(&spa_namespace_lock); - spa = spa_add(TRYIMPORT_NAME, NULL); - spa_activate(spa); + spa = spa_add(TRYIMPORT_NAME, tryconfig, NULL); + spa_activate(spa, FREAD); /* * Pass off the heavy lifting to spa_load(). * Pass TRUE for mosconfig because the user-supplied config * is actually the one to trust when doing an import. */ - (void) spa_load(spa, tryconfig, SPA_LOAD_TRYIMPORT, B_TRUE); + error = spa_load(spa, SPA_LOAD_TRYIMPORT, SPA_IMPORT_EXISTING, B_TRUE); /* * If 'tryconfig' was at least parsable, return the current config. @@ -2515,7 +3323,7 @@ spa_tryimport(nvlist_t *tryconfig) * copy it out so that external consumers can tell which * pools are bootable. */ - if (spa->spa_bootfs) { + if ((!error || error == EEXIST) && spa->spa_bootfs) { char *tmpname = kmem_alloc(MAXPATHLEN, KM_SLEEP); /* @@ -2545,8 +3353,10 @@ spa_tryimport(nvlist_t *tryconfig) /* * Add the list of hot spares and level 2 cache devices. */ + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); spa_add_spares(spa, config); spa_add_l2cache(spa, config); + spa_config_exit(spa, SCL_CONFIG, FTAG); } spa_unload(spa); @@ -2563,18 +3373,19 @@ spa_tryimport(nvlist_t *tryconfig) * The act of destroying or exporting a pool is very simple. We make sure there * is no more pending I/O and any references to the pool are gone. Then, we * update the pool state and sync all the labels to disk, removing the - * configuration from the cache afterwards. + * configuration from the cache afterwards. If the 'hardforce' flag is set, then + * we don't sync the labels or remove the configuration cache. */ static int spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, - boolean_t force) + boolean_t force, boolean_t hardforce) { spa_t *spa; if (oldconfig) *oldconfig = NULL; - if (!(spa_mode & FWRITE)) + if (!(spa_mode_global & FWRITE)) return (EROFS); mutex_enter(&spa_namespace_lock); @@ -2635,7 +3446,7 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, * so mark them all dirty. spa_unload() will do the * final sync that pushes these changes out. */ - if (new_state != POOL_STATE_UNINITIALIZED) { + if (new_state != POOL_STATE_UNINITIALIZED && !hardforce) { spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); spa->spa_state = new_state; spa->spa_final_txg = spa_last_synced_txg(spa) + 1; @@ -2655,7 +3466,8 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, VERIFY(nvlist_dup(spa->spa_config, oldconfig, 0) == 0); if (new_state != POOL_STATE_UNINITIALIZED) { - spa_config_sync(spa, B_TRUE, B_TRUE); + if (!hardforce) + spa_config_sync(spa, B_TRUE, B_TRUE); spa_remove(spa); } mutex_exit(&spa_namespace_lock); @@ -2669,16 +3481,19 @@ spa_export_common(char *pool, int new_state, nvlist_t **oldconfig, int spa_destroy(char *pool) { - return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, B_FALSE)); + return (spa_export_common(pool, POOL_STATE_DESTROYED, NULL, + B_FALSE, B_FALSE)); } /* * Export a storage pool. 
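/*
 * Illustrative sketch only, not part of the patch: the new 'hardforce'
 * argument threaded through spa_export_common() lets a caller drop a
 * pool from the namespace without rewriting its labels or touching the
 * cachefile.  The wrapper below is hypothetical; it only shows how the
 * two force flags map onto spa_export().
 */
static int
example_hard_export(char *pool)
{
	nvlist_t *oldconfig = NULL;
	int error;

	/* force = B_TRUE, hardforce = B_TRUE: skip label sync and cache update. */
	error = spa_export(pool, &oldconfig, B_TRUE, B_TRUE);

	if (oldconfig != NULL)
		nvlist_free(oldconfig);
	return (error);
}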
*/ int -spa_export(char *pool, nvlist_t **oldconfig, boolean_t force) +spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, + boolean_t hardforce) { - return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, force)); + return (spa_export_common(pool, POOL_STATE_EXPORTED, oldconfig, + force, hardforce)); } /* @@ -2689,7 +3504,7 @@ int spa_reset(char *pool) { return (spa_export_common(pool, POOL_STATE_UNINITIALIZED, NULL, - B_FALSE)); + B_FALSE, B_FALSE)); } /* @@ -2704,8 +3519,8 @@ spa_reset(char *pool) int spa_vdev_add(spa_t *spa, nvlist_t *nvroot) { - uint64_t txg; - int c, error; + uint64_t txg, id; + int error; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd, *tvd; nvlist_t **spares, **l2cache; @@ -2744,10 +3559,20 @@ spa_vdev_add(spa_t *spa, nvlist_t *nvroot) /* * Transfer each new top-level vdev from vd to rvd. */ - for (c = 0; c < vd->vdev_children; c++) { + for (int c = 0; c < vd->vdev_children; c++) { + + /* + * Set the vdev id to the first hole, if one exists. + */ + for (id = 0; id < rvd->vdev_children; id++) { + if (rvd->vdev_child[id]->vdev_ishole) { + vdev_free(rvd->vdev_child[id]); + break; + } + } tvd = vd->vdev_child[c]; vdev_remove_child(vd, tvd); - tvd->vdev_id = rvd->vdev_children; + tvd->vdev_id = id; vdev_add_child(rvd, tvd); vdev_config_dirty(tvd); } @@ -2808,7 +3633,6 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) vdev_t *rvd = spa->spa_root_vdev; vdev_t *oldvd, *newvd, *newrootvd, *pvd, *tvd; vdev_ops_t *pvops; - dmu_tx_t *tx; char *oldvdpath, *newvdpath; int newvd_isspare; int error; @@ -2887,10 +3711,9 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) } /* - * Compare the new device size with the replaceable/attachable - * device size. + * Make sure the new device is big enough. */ - if (newvd->vdev_psize < vdev_get_rsize(oldvd)) + if (newvd->vdev_asize < vdev_get_min_asize(oldvd)) return (spa_vdev_exit(spa, newrootvd, txg, EOVERFLOW)); /* @@ -2932,14 +3755,9 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) */ vdev_remove_child(newrootvd, newvd); newvd->vdev_id = pvd->vdev_children; + newvd->vdev_crtxg = oldvd->vdev_crtxg; vdev_add_child(pvd, newvd); - /* - * If newvd is smaller than oldvd, but larger than its rsize, - * the addition of newvd may have decreased our parent's asize. - */ - pvd->vdev_asize = MIN(pvd->vdev_asize, newvd->vdev_asize); - tvd = newvd->vdev_top; ASSERT(pvd->vdev_top == tvd); ASSERT(tvd->vdev_parent == rvd); @@ -2952,13 +3770,14 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) */ open_txg = txg + TXG_CONCURRENT_STATES - 1; - mutex_enter(&newvd->vdev_dtl_lock); - space_map_add(&newvd->vdev_dtl_map, TXG_INITIAL, - open_txg - TXG_INITIAL + 1); - mutex_exit(&newvd->vdev_dtl_lock); + vdev_dtl_dirty(newvd, DTL_MISSING, + TXG_INITIAL, open_txg - TXG_INITIAL + 1); - if (newvd->vdev_isspare) + if (newvd->vdev_isspare) { spa_spare_activate(newvd); + spa_event_notify(spa, newvd, ESC_ZFS_VDEV_SPARE); + } + oldvdpath = spa_strdup(oldvd->vdev_path); newvdpath = spa_strdup(newvd->vdev_path); newvd_isspare = newvd->vdev_isspare; @@ -2970,17 +3789,11 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) (void) spa_vdev_exit(spa, newrootvd, open_txg, 0); - tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); - if (dmu_tx_assign(tx, TXG_WAIT) == 0) { - spa_history_internal_log(LOG_POOL_VDEV_ATTACH, spa, tx, - CRED(), "%s vdev=%s %s vdev=%s", - replacing && newvd_isspare ? "spare in" : - replacing ? 
"replace" : "attach", newvdpath, - replacing ? "for" : "to", oldvdpath); - dmu_tx_commit(tx); - } else { - dmu_tx_abort(tx); - } + spa_history_internal_log(LOG_POOL_VDEV_ATTACH, spa, NULL, + CRED(), "%s vdev=%s %s vdev=%s", + replacing && newvd_isspare ? "spare in" : + replacing ? "replace" : "attach", newvdpath, + replacing ? "for" : "to", oldvdpath); spa_strfree(oldvdpath); spa_strfree(newvdpath); @@ -2999,15 +3812,16 @@ spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing) * is a replacing vdev. */ int -spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) +spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, int replace_done) { uint64_t txg; - int c, t, error; + int error; vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd, *pvd, *cvd, *tvd; boolean_t unspare = B_FALSE; uint64_t unspare_guid; size_t len; + char *vdpath; txg = spa_vdev_enter(spa); @@ -3021,6 +3835,22 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) pvd = vd->vdev_parent; + /* + * If the parent/child relationship is not as expected, don't do it. + * Consider M(A,R(B,C)) -- that is, a mirror of A with a replacing + * vdev that's replacing B with C. The user's intent in replacing + * is to go from M(A,B) to M(A,C). If the user decides to cancel + * the replace by detaching C, the expected behavior is to end up + * M(A,B). But suppose that right after deciding to detach C, + * the replacement of B completes. We would have M(A,C), and then + * ask to detach C, which would leave us with just A -- not what + * the user wanted. To prevent this, we make sure that the + * parent/child relationship hasn't changed -- in this example, + * that C's parent is still the replacing vdev R. + */ + if (pvd->vdev_guid != pguid && pguid != 0) + return (spa_vdev_exit(spa, NULL, txg, EBUSY)); + /* * If replace_done is specified, only remove this device if it's * the first child of a replacing vdev. For the 'spare' vdev, either @@ -3047,36 +3877,13 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); /* - * If there's only one replica, you can't detach it. + * If this device has the only valid copy of some data, + * we cannot safely detach it. */ - if (pvd->vdev_children <= 1) + if (vdev_dtl_required(vd)) return (spa_vdev_exit(spa, NULL, txg, EBUSY)); - /* - * If all siblings have non-empty DTLs, this device may have the only - * valid copy of the data, which means we cannot safely detach it. - * - * XXX -- as in the vdev_offline() case, we really want a more - * precise DTL check. - */ - for (c = 0; c < pvd->vdev_children; c++) { - uint64_t dirty; - - cvd = pvd->vdev_child[c]; - if (cvd == vd) - continue; - if (vdev_is_dead(cvd)) - continue; - mutex_enter(&cvd->vdev_dtl_lock); - dirty = cvd->vdev_dtl_map.sm_space | - cvd->vdev_dtl_scrub.sm_space; - mutex_exit(&cvd->vdev_dtl_lock); - if (!dirty) - break; - } - - if (c == pvd->vdev_children) - return (spa_vdev_exit(spa, NULL, txg, EBUSY)); + ASSERT(pvd->vdev_children >= 2); /* * If we are detaching the second disk from a replacing vdev, then @@ -3102,7 +3909,7 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) * active spare list for the pool. 
*/ if (pvd->vdev_ops == &vdev_spare_ops && - vd->vdev_id == 0) + vd->vdev_id == 0 && pvd->vdev_child[1]->vdev_isspare) unspare = B_TRUE; /* @@ -3128,80 +3935,369 @@ spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done) /* * If we need to remove the remaining child from the list of hot spares, - * do it now, marking the vdev as no longer a spare in the process. We - * must do this before vdev_remove_parent(), because that can change the - * GUID if it creates a new toplevel GUID. + * do it now, marking the vdev as no longer a spare in the process. + * We must do this before vdev_remove_parent(), because that can + * change the GUID if it creates a new toplevel GUID. For a similar + * reason, we must remove the spare now, in the same txg as the detach; + * otherwise someone could attach a new sibling, change the GUID, and + * the subsequent attempt to spa_vdev_remove(unspare_guid) would fail. */ if (unspare) { ASSERT(cvd->vdev_isspare); spa_spare_remove(cvd); unspare_guid = cvd->vdev_guid; + (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); + } + + /* + * If the parent mirror/replacing vdev only has one child, + * the parent is no longer needed. Remove it from the tree. + */ + if (pvd->vdev_children == 1) + vdev_remove_parent(cvd); + + /* + * We don't set tvd until now because the parent we just removed + * may have been the previous top-level vdev. + */ + tvd = cvd->vdev_top; + ASSERT(tvd->vdev_parent == rvd); + + /* + * Reevaluate the parent vdev state. + */ + vdev_propagate_state(cvd); + + /* + * If the 'autoexpand' property is set on the pool then automatically + * try to expand the size of the pool. For example if the device we + * just detached was smaller than the others, it may be possible to + * add metaslabs (i.e. grow the pool). We need to reopen the vdev + * first so that we can obtain the updated sizes of the leaf vdevs. + */ + if (spa->spa_autoexpand) { + vdev_reopen(tvd); + vdev_expand(tvd, txg); + } + + vdev_config_dirty(tvd); + + /* + * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that + * vd->vdev_detached is set and free vd's DTL object in syncing context. + * But first make sure we're not on any *other* txg's DTL list, to + * prevent vd from being accessed after it's freed. + */ + vdpath = spa_strdup(vd->vdev_path); + for (int t = 0; t < TXG_SIZE; t++) + (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); + vd->vdev_detached = B_TRUE; + vdev_dirty(tvd, VDD_DTL, vd, txg); + + spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); + + error = spa_vdev_exit(spa, vd, txg, 0); + + spa_history_internal_log(LOG_POOL_VDEV_DETACH, spa, NULL, CRED(), + "vdev=%s", vdpath); + spa_strfree(vdpath); + + /* + * If this was the removal of the original device in a hot spare vdev, + * then we want to go through and remove the device from the hot spare + * list of every other pool. + */ + if (unspare) { + spa_t *myspa = spa; + spa = NULL; + mutex_enter(&spa_namespace_lock); + while ((spa = spa_next(spa)) != NULL) { + if (spa->spa_state != POOL_STATE_ACTIVE) + continue; + if (spa == myspa) + continue; + spa_open_ref(spa, FTAG); + mutex_exit(&spa_namespace_lock); + (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); + mutex_enter(&spa_namespace_lock); + spa_close(spa, FTAG); + } + mutex_exit(&spa_namespace_lock); + } + + return (error); +} + +/* + * Split a set of devices from their mirrors, and create a new pool from them. 
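/*
 * Illustrative sketch only, not part of the patch: spa_vdev_split_mirror()
 * below expects the caller-supplied 'config' to carry a vdev tree whose
 * ZPOOL_CONFIG_CHILDREN entries name, by guid, the one leaf to take from
 * each top-level mirror.  example_split_config() is a hypothetical helper
 * showing that nvlist shape; the guids[] array would come from the
 * existing pool's configuration.
 */
static nvlist_t *
example_split_config(uint64_t *guids, uint_t nchildren)
{
	nvlist_t *config, *tree, **child;
	uint_t c;

	child = kmem_zalloc(nchildren * sizeof (nvlist_t *), KM_SLEEP);
	for (c = 0; c < nchildren; c++) {
		VERIFY(nvlist_alloc(&child[c], NV_UNIQUE_NAME, KM_SLEEP) == 0);
		VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_GUID,
		    guids[c]) == 0);
	}

	VERIFY(nvlist_alloc(&tree, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_nvlist_array(tree, ZPOOL_CONFIG_CHILDREN,
	    child, nchildren) == 0);

	VERIFY(nvlist_alloc(&config, NV_UNIQUE_NAME, KM_SLEEP) == 0);
	VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, tree) == 0);

	/* nvlist_add_* copies its inputs, so the temporaries can be freed. */
	for (c = 0; c < nchildren; c++)
		nvlist_free(child[c]);
	nvlist_free(tree);
	kmem_free(child, nchildren * sizeof (nvlist_t *));

	return (config);
}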
+ */ +int +spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, + nvlist_t *props, boolean_t exp) +{ + int error = 0; + uint64_t txg, *glist; + spa_t *newspa; + uint_t c, children, lastlog; + nvlist_t **child, *nvl, *tmp; + dmu_tx_t *tx; + char *altroot = NULL; + vdev_t *rvd, **vml = NULL; /* vdev modify list */ + boolean_t activate_slog; + + if (!spa_writeable(spa)) + return (EROFS); + + txg = spa_vdev_enter(spa); + + /* clear the log and flush everything up to now */ + activate_slog = spa_passivate_log(spa); + (void) spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); + error = spa_offline_log(spa); + txg = spa_vdev_config_enter(spa); + + if (activate_slog) + spa_activate_log(spa); + + if (error != 0) + return (spa_vdev_exit(spa, NULL, txg, error)); + + /* check new spa name before going any further */ + if (spa_lookup(newname) != NULL) + return (spa_vdev_exit(spa, NULL, txg, EEXIST)); + + /* + * scan through all the children to ensure they're all mirrors + */ + if (nvlist_lookup_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, &nvl) != 0 || + nvlist_lookup_nvlist_array(nvl, ZPOOL_CONFIG_CHILDREN, &child, + &children) != 0) + return (spa_vdev_exit(spa, NULL, txg, EINVAL)); + + /* first, check to ensure we've got the right child count */ + rvd = spa->spa_root_vdev; + lastlog = 0; + for (c = 0; c < rvd->vdev_children; c++) { + vdev_t *vd = rvd->vdev_child[c]; + + /* don't count the holes & logs as children */ + if (vd->vdev_islog || vd->vdev_ishole) { + if (lastlog == 0) + lastlog = c; + continue; + } + + lastlog = 0; + } + if (children != (lastlog != 0 ? lastlog : rvd->vdev_children)) + return (spa_vdev_exit(spa, NULL, txg, EINVAL)); + + /* next, ensure no spare or cache devices are part of the split */ + if (nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_SPARES, &tmp) == 0 || + nvlist_lookup_nvlist(nvl, ZPOOL_CONFIG_L2CACHE, &tmp) == 0) + return (spa_vdev_exit(spa, NULL, txg, EINVAL)); + + vml = kmem_zalloc(children * sizeof (vdev_t *), KM_SLEEP); + glist = kmem_zalloc(children * sizeof (uint64_t), KM_SLEEP); + + /* then, loop over each vdev and validate it */ + for (c = 0; c < children; c++) { + uint64_t is_hole = 0; + + (void) nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_IS_HOLE, + &is_hole); + + if (is_hole != 0) { + if (spa->spa_root_vdev->vdev_child[c]->vdev_ishole || + spa->spa_root_vdev->vdev_child[c]->vdev_islog) { + continue; + } else { + error = EINVAL; + break; + } + } + + /* which disk is going to be split? 
*/ + if (nvlist_lookup_uint64(child[c], ZPOOL_CONFIG_GUID, + &glist[c]) != 0) { + error = EINVAL; + break; + } + + /* look it up in the spa */ + vml[c] = spa_lookup_by_guid(spa, glist[c], B_FALSE); + if (vml[c] == NULL) { + error = ENODEV; + break; + } + + /* make sure there's nothing stopping the split */ + if (vml[c]->vdev_parent->vdev_ops != &vdev_mirror_ops || + vml[c]->vdev_islog || + vml[c]->vdev_ishole || + vml[c]->vdev_isspare || + vml[c]->vdev_isl2cache || + !vdev_writeable(vml[c]) || + vml[c]->vdev_children != 0 || + vml[c]->vdev_state != VDEV_STATE_HEALTHY || + c != spa->spa_root_vdev->vdev_child[c]->vdev_id) { + error = EINVAL; + break; + } + + if (vdev_dtl_required(vml[c])) { + error = EBUSY; + break; + } + + /* we need certain info from the top level */ + VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_ARRAY, + vml[c]->vdev_top->vdev_ms_array) == 0); + VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_METASLAB_SHIFT, + vml[c]->vdev_top->vdev_ms_shift) == 0); + VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASIZE, + vml[c]->vdev_top->vdev_asize) == 0); + VERIFY(nvlist_add_uint64(child[c], ZPOOL_CONFIG_ASHIFT, + vml[c]->vdev_top->vdev_ashift) == 0); + } + + if (error != 0) { + kmem_free(vml, children * sizeof (vdev_t *)); + kmem_free(glist, children * sizeof (uint64_t)); + return (spa_vdev_exit(spa, NULL, txg, error)); + } + + /* stop writers from using the disks */ + for (c = 0; c < children; c++) { + if (vml[c] != NULL) + vml[c]->vdev_offline = B_TRUE; } + vdev_reopen(spa->spa_root_vdev); /* - * If the parent mirror/replacing vdev only has one child, - * the parent is no longer needed. Remove it from the tree. + * Temporarily record the splitting vdevs in the spa config. This + * will disappear once the config is regenerated. */ - if (pvd->vdev_children == 1) - vdev_remove_parent(cvd); + VERIFY(nvlist_alloc(&nvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_uint64_array(nvl, ZPOOL_CONFIG_SPLIT_LIST, + glist, children) == 0); + kmem_free(glist, children * sizeof (uint64_t)); + + VERIFY(nvlist_add_nvlist(spa->spa_config, ZPOOL_CONFIG_SPLIT, + nvl) == 0); + spa->spa_config_splitting = nvl; + vdev_config_dirty(spa->spa_root_vdev); - /* - * We don't set tvd until now because the parent we just removed - * may have been the previous top-level vdev. - */ - tvd = cvd->vdev_top; - ASSERT(tvd->vdev_parent == rvd); + /* configure and create the new pool */ + VERIFY(nvlist_add_string(config, ZPOOL_CONFIG_POOL_NAME, newname) == 0); + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_STATE, + exp ? POOL_STATE_EXPORTED : POOL_STATE_ACTIVE) == 0); + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VERSION, + spa_version(spa)) == 0); + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_TXG, + spa->spa_config_txg) == 0); + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, + spa_generate_guid(NULL)) == 0); + (void) nvlist_lookup_string(props, + zpool_prop_to_name(ZPOOL_PROP_ALTROOT), &altroot); - /* - * Reevaluate the parent vdev state. - */ - vdev_propagate_state(cvd); + /* add the new pool to the namespace */ + newspa = spa_add(newname, config, altroot); + newspa->spa_config_txg = spa->spa_config_txg; + spa_set_log_state(newspa, SPA_LOG_CLEAR); - /* - * If the device we just detached was smaller than the others, it may be - * possible to add metaslabs (i.e. grow the pool). vdev_metaslab_init() - * can't fail because the existing metaslabs are already in core, so - * there's nothing to read from disk. 
- */ - VERIFY(vdev_metaslab_init(tvd, txg) == 0); + /* release the spa config lock, retaining the namespace lock */ + spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); - vdev_config_dirty(tvd); + if (zio_injection_enabled) + zio_handle_panic_injection(spa, FTAG, 1); - /* - * Mark vd's DTL as dirty in this txg. vdev_dtl_sync() will see that - * vd->vdev_detached is set and free vd's DTL object in syncing context. - * But first make sure we're not on any *other* txg's DTL list, to - * prevent vd from being accessed after it's freed. - */ - for (t = 0; t < TXG_SIZE; t++) - (void) txg_list_remove_this(&tvd->vdev_dtl_list, vd, t); - vd->vdev_detached = B_TRUE; - vdev_dirty(tvd, VDD_DTL, vd, txg); + spa_activate(newspa, spa_mode_global); + spa_async_suspend(newspa); - spa_event_notify(spa, vd, ESC_ZFS_VDEV_REMOVE); + /* create the new pool from the disks of the original pool */ + error = spa_load(newspa, SPA_LOAD_IMPORT, SPA_IMPORT_ASSEMBLE, B_TRUE); + if (error) + goto out; - error = spa_vdev_exit(spa, vd, txg, 0); + /* if that worked, generate a real config for the new pool */ + if (newspa->spa_root_vdev != NULL) { + VERIFY(nvlist_alloc(&newspa->spa_config_splitting, + NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_uint64(newspa->spa_config_splitting, + ZPOOL_CONFIG_SPLIT_GUID, spa_guid(spa)) == 0); + spa_config_set(newspa, spa_config_generate(newspa, NULL, -1ULL, + B_TRUE)); + } - /* - * If this was the removal of the original device in a hot spare vdev, - * then we want to go through and remove the device from the hot spare - * list of every other pool. - */ - if (unspare) { - spa = NULL; - mutex_enter(&spa_namespace_lock); - while ((spa = spa_next(spa)) != NULL) { - if (spa->spa_state != POOL_STATE_ACTIVE) - continue; - spa_open_ref(spa, FTAG); - mutex_exit(&spa_namespace_lock); - (void) spa_vdev_remove(spa, unspare_guid, B_TRUE); - mutex_enter(&spa_namespace_lock); - spa_close(spa, FTAG); + /* set the props */ + if (props != NULL) { + spa_configfile_set(newspa, props, B_FALSE); + error = spa_prop_set(newspa, props); + if (error) + goto out; + } + + /* flush everything */ + txg = spa_vdev_config_enter(newspa); + vdev_config_dirty(newspa->spa_root_vdev); + (void) spa_vdev_config_exit(newspa, NULL, txg, 0, FTAG); + + if (zio_injection_enabled) + zio_handle_panic_injection(spa, FTAG, 2); + + spa_async_resume(newspa); + + /* finally, update the original pool's config */ + txg = spa_vdev_config_enter(spa); + tx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error != 0) + dmu_tx_abort(tx); + for (c = 0; c < children; c++) { + if (vml[c] != NULL) { + vdev_split(vml[c]); + if (error == 0) + spa_history_internal_log(LOG_POOL_VDEV_DETACH, + spa, tx, CRED(), "vdev=%s", + vml[c]->vdev_path); + vdev_free(vml[c]); } - mutex_exit(&spa_namespace_lock); } + vdev_config_dirty(spa->spa_root_vdev); + spa->spa_config_splitting = NULL; + nvlist_free(nvl); + if (error == 0) + dmu_tx_commit(tx); + (void) spa_vdev_exit(spa, NULL, txg, 0); + + if (zio_injection_enabled) + zio_handle_panic_injection(spa, FTAG, 3); + + /* split is complete; log a history record */ + spa_history_internal_log(LOG_POOL_SPLIT, newspa, NULL, CRED(), + "split new pool %s from pool %s", newname, spa_name(spa)); + + kmem_free(vml, children * sizeof (vdev_t *)); + + /* if we're not going to mount the filesystems in userland, export */ + if (exp) + error = spa_export_common(newname, POOL_STATE_EXPORTED, NULL, + B_FALSE, B_FALSE); + + return (error); +out: + spa_unload(newspa); + 
spa_deactivate(newspa); + spa_remove(newspa); + + txg = spa_vdev_config_enter(spa); + nvlist_free(spa->spa_config_splitting); + spa->spa_config_splitting = NULL; + (void) spa_vdev_exit(spa, NULL, txg, error); + + kmem_free(vml, children * sizeof (vdev_t *)); return (error); } @@ -3246,20 +4342,112 @@ spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count, kmem_free(newdev, (count - 1) * sizeof (void *)); } +/* + * Removing a device from the vdev namespace requires several steps + * and can take a significant amount of time. As a result we use + * the spa_vdev_config_[enter/exit] functions which allow us to + * grab and release the spa_config_lock while still holding the namespace + * lock. During each step the configuration is synced out. + */ + +/* + * Evacuate the device. + */ +int +spa_vdev_remove_evacuate(spa_t *spa, vdev_t *vd) +{ + int error = 0; + uint64_t txg; + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == 0); + ASSERT(vd == vd->vdev_top); + + /* + * Evacuate the device. We don't hold the config lock as writer + * since we need to do I/O but we do keep the + * spa_namespace_lock held. Once this completes the device + * should no longer have any blocks allocated on it. + */ + if (vd->vdev_islog) { + error = dmu_objset_find(spa_name(spa), zil_vdev_offline, + NULL, DS_FIND_CHILDREN); + } else { + error = ENOTSUP; /* until we have bp rewrite */ + } + + txg_wait_synced(spa_get_dsl(spa), 0); + + if (error) + return (error); + + /* + * The evacuation succeeded. Remove any remaining MOS metadata + * associated with this vdev, and wait for these changes to sync. + */ + txg = spa_vdev_config_enter(spa); + vd->vdev_removing = B_TRUE; + vdev_dirty(vd, 0, NULL, txg); + vdev_config_dirty(vd); + spa_vdev_config_exit(spa, NULL, txg, 0, FTAG); + + return (0); +} + +/* + * Complete the removal by cleaning up the namespace. + */ +void +spa_vdev_remove_from_namespace(spa_t *spa, vdev_t *vd) +{ + vdev_t *rvd = spa->spa_root_vdev; + uint64_t id = vd->vdev_id; + boolean_t last_vdev = (id == (rvd->vdev_children - 1)); + + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); + ASSERT(vd == vd->vdev_top); + + (void) vdev_label_init(vd, 0, VDEV_LABEL_REMOVE); + + if (list_link_active(&vd->vdev_state_dirty_node)) + vdev_state_clean(vd); + if (list_link_active(&vd->vdev_config_dirty_node)) + vdev_config_clean(vd); + + vdev_free(vd); + + if (last_vdev) { + vdev_compact_children(rvd); + } else { + vd = vdev_alloc_common(spa, id, 0, &vdev_hole_ops); + vdev_add_child(rvd, vd); + } + vdev_config_dirty(rvd); + + /* + * Reassess the health of our root vdev. + */ + vdev_reopen(rvd); +} + /* * Remove a device from the pool. Currently, this supports removing only hot - * spares and level 2 ARC devices. + * spares, slogs, and level 2 ARC devices. 
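/*
 * Illustrative sketch only, not part of the patch: spa_vdev_remove() below
 * can now be reached both from the ioctl path (no locks held) and from
 * internal callers, such as the spare-detach path, that already hold
 * spa_namespace_lock.  This hypothetical helper isolates the conditional
 * locking idiom it uses to support both callers.
 */
static int
example_conditional_vdev_op(spa_t *spa)
{
	boolean_t locked = MUTEX_HELD(&spa_namespace_lock);
	uint64_t txg = 0;
	int error = 0;

	if (!locked)
		txg = spa_vdev_enter(spa);

	/* ... work that requires the namespace lock goes here ... */

	if (!locked)
		return (spa_vdev_exit(spa, NULL, txg, error));
	return (error);
}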
*/ int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) { vdev_t *vd; + metaslab_group_t *mg; nvlist_t **spares, **l2cache, *nv; + uint64_t txg = 0; uint_t nspares, nl2cache; - uint64_t txg; int error = 0; + boolean_t locked = MUTEX_HELD(&spa_namespace_lock); - txg = spa_vdev_enter(spa); + if (!locked) + txg = spa_vdev_enter(spa); vd = spa_lookup_by_guid(spa, guid, B_FALSE); @@ -3290,6 +4478,49 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) ZPOOL_CONFIG_L2CACHE, l2cache, nl2cache, nv); spa_load_l2cache(spa); spa->spa_l2cache.sav_sync = B_TRUE; + } else if (vd != NULL && vd->vdev_islog) { + ASSERT(!locked); + ASSERT(vd == vd->vdev_top); + + /* + * XXX - Once we have bp-rewrite this should + * become the common case. + */ + + mg = vd->vdev_mg; + + /* + * Stop allocating from this vdev. + */ + metaslab_group_passivate(mg); + + /* + * Wait for the youngest allocations and frees to sync, + * and then wait for the deferral of those frees to finish. + */ + spa_vdev_config_exit(spa, NULL, + txg + TXG_CONCURRENT_STATES + TXG_DEFER_SIZE, 0, FTAG); + + /* + * Attempt to evacuate the vdev. + */ + error = spa_vdev_remove_evacuate(spa, vd); + + txg = spa_vdev_config_enter(spa); + + /* + * If we couldn't evacuate the vdev, unwind. + */ + if (error) { + metaslab_group_activate(mg); + return (spa_vdev_exit(spa, NULL, txg, error)); + } + + /* + * Clean up the vdev namespace. + */ + spa_vdev_remove_from_namespace(spa, vd); + } else if (vd != NULL) { /* * Normal vdevs cannot be removed (yet). @@ -3302,7 +4533,10 @@ spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare) error = ENOENT; } - return (spa_vdev_exit(spa, NULL, txg, error)); + if (!locked) + return (spa_vdev_exit(spa, NULL, txg, error)); + + return (error); } /* @@ -3313,9 +4547,8 @@ static vdev_t * spa_vdev_resilver_done_hunt(vdev_t *vd) { vdev_t *newvd, *oldvd; - int c; - for (c = 0; c < vd->vdev_children; c++) { + for (int c = 0; c < vd->vdev_children; c++) { oldvd = spa_vdev_resilver_done_hunt(vd->vdev_child[c]); if (oldvd != NULL) return (oldvd); @@ -3328,13 +4561,9 @@ spa_vdev_resilver_done_hunt(vdev_t *vd) oldvd = vd->vdev_child[0]; newvd = vd->vdev_child[1]; - mutex_enter(&newvd->vdev_dtl_lock); - if (newvd->vdev_dtl_map.sm_space == 0 && - newvd->vdev_dtl_scrub.sm_space == 0) { - mutex_exit(&newvd->vdev_dtl_lock); + if (vdev_dtl_empty(newvd, DTL_MISSING) && + !vdev_dtl_required(oldvd)) return (oldvd); - } - mutex_exit(&newvd->vdev_dtl_lock); } /* @@ -3344,15 +4573,12 @@ spa_vdev_resilver_done_hunt(vdev_t *vd) newvd = vd->vdev_child[0]; oldvd = vd->vdev_child[1]; - mutex_enter(&newvd->vdev_dtl_lock); if (newvd->vdev_unspare && - newvd->vdev_dtl_map.sm_space == 0 && - newvd->vdev_dtl_scrub.sm_space == 0) { + vdev_dtl_empty(newvd, DTL_MISSING) && + !vdev_dtl_required(oldvd)) { newvd->vdev_unspare = 0; - mutex_exit(&newvd->vdev_dtl_lock); return (oldvd); } - mutex_exit(&newvd->vdev_dtl_lock); } return (NULL); @@ -3361,90 +4587,78 @@ spa_vdev_resilver_done_hunt(vdev_t *vd) static void spa_vdev_resilver_done(spa_t *spa) { - vdev_t *vd; - vdev_t *pvd; - uint64_t guid; - uint64_t pguid = 0; + vdev_t *vd, *pvd, *ppvd; + uint64_t guid, sguid, pguid, ppguid; - spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); while ((vd = spa_vdev_resilver_done_hunt(spa->spa_root_vdev)) != NULL) { + pvd = vd->vdev_parent; + ppvd = pvd->vdev_parent; guid = vd->vdev_guid; + pguid = pvd->vdev_guid; + ppguid = ppvd->vdev_guid; + sguid = 0; /* * If we have just finished replacing a 
hot spared device, then * we need to detach the parent's first child (the original hot * spare) as well. */ - pvd = vd->vdev_parent; - if (pvd->vdev_parent->vdev_ops == &vdev_spare_ops && - pvd->vdev_id == 0) { + if (ppvd->vdev_ops == &vdev_spare_ops && pvd->vdev_id == 0) { ASSERT(pvd->vdev_ops == &vdev_replacing_ops); - ASSERT(pvd->vdev_parent->vdev_children == 2); - pguid = pvd->vdev_parent->vdev_child[1]->vdev_guid; + ASSERT(ppvd->vdev_children == 2); + sguid = ppvd->vdev_child[1]->vdev_guid; } - spa_config_exit(spa, SCL_CONFIG, FTAG); - if (spa_vdev_detach(spa, guid, B_TRUE) != 0) + spa_config_exit(spa, SCL_ALL, FTAG); + if (spa_vdev_detach(spa, guid, pguid, B_TRUE) != 0) return; - if (pguid != 0 && spa_vdev_detach(spa, pguid, B_TRUE) != 0) + if (sguid && spa_vdev_detach(spa, sguid, ppguid, B_TRUE) != 0) return; - spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + spa_config_enter(spa, SCL_ALL, FTAG, RW_WRITER); } - spa_config_exit(spa, SCL_CONFIG, FTAG); + spa_config_exit(spa, SCL_ALL, FTAG); } /* - * Update the stored path for this vdev. Dirty the vdev configuration, relying - * on spa_vdev_enter/exit() to synchronize the labels and cache. + * Update the stored path or FRU for this vdev. */ int -spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) +spa_vdev_set_common(spa_t *spa, uint64_t guid, const char *value, + boolean_t ispath) { vdev_t *vd; - uint64_t txg; - txg = spa_vdev_enter(spa); - - if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) { - /* - * Determine if this is a reference to a hot spare device. If - * it is, update the path manually as there is no associated - * vdev_t that can be synced to disk. - */ - nvlist_t **spares; - uint_t i, nspares; - - if (spa->spa_spares.sav_config != NULL) { - VERIFY(nvlist_lookup_nvlist_array( - spa->spa_spares.sav_config, ZPOOL_CONFIG_SPARES, - &spares, &nspares) == 0); - for (i = 0; i < nspares; i++) { - uint64_t theguid; - VERIFY(nvlist_lookup_uint64(spares[i], - ZPOOL_CONFIG_GUID, &theguid) == 0); - if (theguid == guid) { - VERIFY(nvlist_add_string(spares[i], - ZPOOL_CONFIG_PATH, newpath) == 0); - spa_load_spares(spa); - spa->spa_spares.sav_sync = B_TRUE; - return (spa_vdev_exit(spa, NULL, txg, - 0)); - } - } - } + spa_vdev_state_enter(spa, SCL_ALL); - return (spa_vdev_exit(spa, NULL, txg, ENOENT)); - } + if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) + return (spa_vdev_state_exit(spa, NULL, ENOENT)); if (!vd->vdev_ops->vdev_op_leaf) - return (spa_vdev_exit(spa, NULL, txg, ENOTSUP)); + return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); + + if (ispath) { + spa_strfree(vd->vdev_path); + vd->vdev_path = spa_strdup(value); + } else { + if (vd->vdev_fru != NULL) + spa_strfree(vd->vdev_fru); + vd->vdev_fru = spa_strdup(value); + } - spa_strfree(vd->vdev_path); - vd->vdev_path = spa_strdup(newpath); + return (spa_vdev_state_exit(spa, vd, 0)); +} - vdev_config_dirty(vd->vdev_top); +int +spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath) +{ + return (spa_vdev_set_common(spa, guid, newpath, B_TRUE)); +} - return (spa_vdev_exit(spa, NULL, txg, 0)); +int +spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru) +{ + return (spa_vdev_set_common(spa, guid, newfru, B_FALSE)); } /* @@ -3497,7 +4711,17 @@ spa_async_remove(spa_t *spa, vdev_t *vd) if (vd->vdev_remove_wanted) { vd->vdev_remove_wanted = 0; vdev_set_state(vd, B_FALSE, VDEV_STATE_REMOVED, VDEV_AUX_NONE); - vdev_clear(spa, vd); + + /* + * We want to clear the stats, but we don't want to do a full + * vdev_clear() as that will cause us to 
throw away + * degraded/faulted state as well as attempt to reopen the + * device, all of which is a waste. + */ + vd->vdev_stat.vs_read_errors = 0; + vd->vdev_stat.vs_write_errors = 0; + vd->vdev_stat.vs_checksum_errors = 0; + vdev_state_dirty(vd->vdev_top); } @@ -3517,6 +4741,37 @@ spa_async_probe(spa_t *spa, vdev_t *vd) spa_async_probe(spa, vd->vdev_child[c]); } +static void +spa_async_autoexpand(spa_t *spa, vdev_t *vd) +{ + sysevent_id_t eid; + nvlist_t *attr; + char *physpath; + + if (!spa->spa_autoexpand) + return; + + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + spa_async_autoexpand(spa, cvd); + } + + if (!vd->vdev_ops->vdev_op_leaf || vd->vdev_physpath == NULL) + return; + + physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP); + (void) snprintf(physpath, MAXPATHLEN, "/devices%s", vd->vdev_physpath); + + VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0); + + (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS, + ESC_DEV_DLE, attr, &eid, DDI_SLEEP); + + nvlist_free(attr); + kmem_free(physpath, MAXPATHLEN); +} + static void spa_async_thread(spa_t *spa) { @@ -3533,16 +4788,31 @@ spa_async_thread(spa_t *spa) * See if the config needs to be updated. */ if (tasks & SPA_ASYNC_CONFIG_UPDATE) { + uint64_t old_space, new_space; + mutex_enter(&spa_namespace_lock); + old_space = metaslab_class_get_space(spa_normal_class(spa)); spa_config_update(spa, SPA_CONFIG_UPDATE_POOL); + new_space = metaslab_class_get_space(spa_normal_class(spa)); mutex_exit(&spa_namespace_lock); + + /* + * If the pool grew as a result of the config update, + * then log an internal history event. + */ + if (new_space != old_space) { + spa_history_internal_log(LOG_POOL_VDEV_ONLINE, + spa, NULL, CRED(), + "pool '%s' size: %llu(+%llu)", + spa_name(spa), new_space, new_space - old_space); + } } /* * See if any devices need to be marked REMOVED. */ if (tasks & SPA_ASYNC_REMOVE) { - spa_vdev_state_enter(spa); + spa_vdev_state_enter(spa, SCL_NONE); spa_async_remove(spa, spa->spa_root_vdev); for (int i = 0; i < spa->spa_l2cache.sav_count; i++) spa_async_remove(spa, spa->spa_l2cache.sav_vdevs[i]); @@ -3551,11 +4821,17 @@ spa_async_thread(spa_t *spa) (void) spa_vdev_state_exit(spa, NULL, 0); } + if ((tasks & SPA_ASYNC_AUTOEXPAND) && !spa_suspended(spa)) { + spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); + spa_async_autoexpand(spa, spa->spa_root_vdev); + spa_config_exit(spa, SCL_CONFIG, FTAG); + } + /* * See if any devices need to be probed. 
 	 */
 	if (tasks & SPA_ASYNC_PROBE) {
-		spa_vdev_state_enter(spa);
+		spa_vdev_state_enter(spa, SCL_NONE);
 		spa_async_probe(spa, spa->spa_root_vdev);
 		(void) spa_vdev_state_exit(spa, NULL, 0);
 	}
@@ -3626,38 +4902,34 @@ spa_async_request(spa_t *spa, int task)
  * SPA syncing routines
  * ==========================================================================
  */
-
 static void
-spa_sync_deferred_frees(spa_t *spa, uint64_t txg)
+spa_sync_deferred_bplist(spa_t *spa, bplist_t *bpl, dmu_tx_t *tx, uint64_t txg)
 {
-	bplist_t *bpl = &spa->spa_sync_bplist;
-	dmu_tx_t *tx;
 	blkptr_t blk;
 	uint64_t itor = 0;
-	zio_t *zio;
-	int error;
 	uint8_t c = 1;
 
-	zio = zio_root(spa, NULL, NULL, ZIO_FLAG_CANFAIL);
-
 	while (bplist_iterate(bpl, &itor, &blk) == 0) {
 		ASSERT(blk.blk_birth < txg);
-		zio_nowait(zio_free(zio, spa, txg, &blk, NULL, NULL,
-		    ZIO_FLAG_MUSTSUCCEED));
+		zio_free(spa, txg, &blk);
 	}
 
-	error = zio_wait(zio);
-	ASSERT3U(error, ==, 0);
-
-	tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg);
 	bplist_vacate(bpl, tx);
 
 	/*
 	 * Pre-dirty the first block so we sync to convergence faster.
 	 * (Usually only the first block is needed.)
 	 */
-	dmu_write(spa->spa_meta_objset, spa->spa_sync_bplist_obj, 0, 1, &c, tx);
-	dmu_tx_commit(tx);
+	dmu_write(bpl->bpl_mos, spa->spa_deferred_bplist_obj, 0, 1, &c, tx);
+}
+
+static void
+spa_sync_free(void *arg, const blkptr_t *bp, dmu_tx_t *tx)
+{
+	zio_t *zio = arg;
+
+	zio_nowait(zio_free_sync(zio, zio->io_spa, dmu_tx_get_txg(tx), bp,
+	    zio->io_flags));
 }
 
 static void
@@ -3775,7 +5047,6 @@ spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 	zpool_prop_t prop;
 	const char *propname;
 	zprop_type_t proptype;
-	spa_config_dirent_t *dp;
 
 	mutex_enter(&spa->spa_props_lock);
 
@@ -3808,31 +5079,14 @@ spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 		case ZPOOL_PROP_CACHEFILE:
 			/*
-			 * 'cachefile' is a non-persistent property, but note
-			 * an async request that the config cache needs to be
-			 * udpated.
+			 * 'cachefile' is also a non-persistent property.
 			 */
-			VERIFY(nvpair_value_string(elem, &strval) == 0);
-
-			dp = kmem_alloc(sizeof (spa_config_dirent_t), KM_SLEEP);
-
-			if (strval[0] == '\0')
-				dp->scd_path = spa_strdup(spa_config_path);
-			else if (strcmp(strval, "none") == 0)
-				dp->scd_path = NULL;
-			else
-				dp->scd_path = spa_strdup(strval);
-
-			list_insert_head(&spa->spa_config_list, dp);
-			spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
 			break;
 		default:
 			/*
 			 * Set pool property values in the poolprops
 			 * mos object.
 			 */
 			if (spa->spa_pool_props_object == 0) {
-				objset_t *mos = spa->spa_meta_objset;
-
 				VERIFY((spa->spa_pool_props_object =
 				    zap_create(mos, DMU_OT_POOL_PROPS,
 				    DMU_OT_NONE, 0, tx)) > 0);
@@ -3879,6 +5133,13 @@ spa_sync_props(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx)
 			case ZPOOL_PROP_FAILUREMODE:
 				spa->spa_failmode = intval;
 				break;
+			case ZPOOL_PROP_AUTOEXPAND:
+				spa->spa_autoexpand = intval;
+				spa_async_request(spa, SPA_ASYNC_AUTOEXPAND);
+				break;
+			case ZPOOL_PROP_DEDUPDITTO:
+				spa->spa_dedup_ditto = intval;
+				break;
 			default:
 				break;
 			}
@@ -3905,11 +5166,11 @@ spa_sync(spa_t *spa, uint64_t txg)
 {
 	dsl_pool_t *dp = spa->spa_dsl_pool;
 	objset_t *mos = spa->spa_meta_objset;
-	bplist_t *bpl = &spa->spa_sync_bplist;
+	bplist_t *defer_bpl = &spa->spa_deferred_bplist;
+	bplist_t *free_bpl = &spa->spa_free_bplist[txg & TXG_MASK];
 	vdev_t *rvd = spa->spa_root_vdev;
 	vdev_t *vd;
 	dmu_tx_t *tx;
-	int dirty_vdevs;
 	int error;
 
 	/*
@@ -3925,13 +5186,26 @@ spa_sync(spa_t *spa, uint64_t txg)
 	 * into config changes that go out with this transaction group.
*/ spa_config_enter(spa, SCL_STATE, FTAG, RW_READER); - while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { - vdev_state_clean(vd); - vdev_config_dirty(vd); + while (list_head(&spa->spa_state_dirty_list) != NULL) { + /* + * We need the write lock here because, for aux vdevs, + * calling vdev_config_dirty() modifies sav_config. + * This is ugly and will become unnecessary when we + * eliminate the aux vdev wart by integrating all vdevs + * into the root vdev tree. + */ + spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); + spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_WRITER); + while ((vd = list_head(&spa->spa_state_dirty_list)) != NULL) { + vdev_state_clean(vd); + vdev_config_dirty(vd); + } + spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); + spa_config_enter(spa, SCL_CONFIG | SCL_STATE, FTAG, RW_READER); } spa_config_exit(spa, SCL_STATE, FTAG); - VERIFY(0 == bplist_open(bpl, mos, spa->spa_sync_bplist_obj)); + VERIFY(0 == bplist_open(defer_bpl, mos, spa->spa_deferred_bplist_obj)); tx = dmu_tx_create_assigned(dp, txg); @@ -3977,13 +5251,13 @@ spa_sync(spa_t *spa, uint64_t txg) if (!txg_list_empty(&dp->dp_dirty_datasets, txg) || !txg_list_empty(&dp->dp_dirty_dirs, txg) || !txg_list_empty(&dp->dp_sync_tasks, txg)) - spa_sync_deferred_frees(spa, txg); + spa_sync_deferred_bplist(spa, defer_bpl, tx, txg); /* * Iterate to convergence. */ do { - spa->spa_sync_pass++; + int pass = ++spa->spa_sync_pass; spa_sync_config_object(spa, tx); spa_sync_aux_dev(spa, &spa->spa_spares, tx, @@ -3993,18 +5267,29 @@ spa_sync(spa_t *spa, uint64_t txg) spa_errlog_sync(spa, txg); dsl_pool_sync(dp, txg); - dirty_vdevs = 0; - while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) { - vdev_sync(vd, txg); - dirty_vdevs++; + if (pass <= SYNC_PASS_DEFERRED_FREE) { + zio_t *zio = zio_root(spa, NULL, NULL, 0); + bplist_sync(free_bpl, spa_sync_free, zio, tx); + VERIFY(zio_wait(zio) == 0); + } else { + bplist_sync(free_bpl, bplist_enqueue_cb, defer_bpl, tx); } - bplist_sync(bpl, tx); - } while (dirty_vdevs); + ddt_sync(spa, txg); + + mutex_enter(&spa->spa_scrub_lock); + while (spa->spa_scrub_inflight > 0) + cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); + mutex_exit(&spa->spa_scrub_lock); + + while (vd = txg_list_remove(&spa->spa_vdev_txg_list, txg)) + vdev_sync(vd, txg); + + } while (dmu_objset_is_dirty(mos, txg)); - bplist_close(bpl); + ASSERT(free_bpl->bpl_queue == NULL); - dprintf("txg %llu passes %d\n", txg, spa->spa_sync_pass); + bplist_close(defer_bpl); /* * Rewrite the vdev configuration (which includes the uberblock) @@ -4027,9 +5312,8 @@ spa_sync(spa_t *spa, uint64_t txg) int svdcount = 0; int children = rvd->vdev_children; int c0 = spa_get_random(children); - int c; - for (c = 0; c < children; c++) { + for (int c = 0; c < children; c++) { vd = rvd->vdev_child[(c0 + c) % children]; if (vd->vdev_ms_array == 0 || vd->vdev_islog) continue; @@ -4037,10 +5321,16 @@ spa_sync(spa_t *spa, uint64_t txg) if (svdcount == SPA_DVAS_PER_BP) break; } - error = vdev_config_sync(svd, svdcount, txg); + error = vdev_config_sync(svd, svdcount, txg, B_FALSE); + if (error != 0) + error = vdev_config_sync(svd, svdcount, txg, + B_TRUE); } else { error = vdev_config_sync(rvd->vdev_child, - rvd->vdev_children, txg); + rvd->vdev_children, txg, B_FALSE); + if (error != 0) + error = vdev_config_sync(rvd->vdev_child, + rvd->vdev_children, txg, B_TRUE); } spa_config_exit(spa, SCL_STATE, FTAG); @@ -4070,10 +5360,7 @@ spa_sync(spa_t *spa, uint64_t txg) spa->spa_ubsync = spa->spa_uberblock; - /* - * Clean up the 
ZIL records for the synced txg. - */ - dsl_pool_zil_clean(dp); + dsl_pool_sync_done(dp, txg); /* * Update usable space statistics. @@ -4081,6 +5368,8 @@ spa_sync(spa_t *spa, uint64_t txg) while (vd = txg_list_remove(&spa->spa_vdev_txg_list, TXG_CLEAN(txg))) vdev_sync_done(vd, txg); + spa_update_dspace(spa); + /* * It had better be the case that we didn't dirty anything * since vdev_config_sync(). @@ -4088,10 +5377,15 @@ spa_sync(spa_t *spa, uint64_t txg) ASSERT(txg_list_empty(&dp->dp_dirty_datasets, txg)); ASSERT(txg_list_empty(&dp->dp_dirty_dirs, txg)); ASSERT(txg_list_empty(&spa->spa_vdev_txg_list, txg)); - ASSERT(bpl->bpl_queue == NULL); + ASSERT(defer_bpl->bpl_queue == NULL); + ASSERT(free_bpl->bpl_queue == NULL); + + spa->spa_sync_pass = 0; spa_config_exit(spa, SCL_CONFIG, FTAG); + spa_handle_ignored_writes(spa); + /* * If any async tasks have been requested, kick them off. */ @@ -4161,7 +5455,7 @@ spa_evict_all(void) } vdev_t * -spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t l2cache) +spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t aux) { vdev_t *vd; int i; @@ -4169,12 +5463,18 @@ spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t l2cache) if ((vd = vdev_lookup_by_guid(spa->spa_root_vdev, guid)) != NULL) return (vd); - if (l2cache) { + if (aux) { for (i = 0; i < spa->spa_l2cache.sav_count; i++) { vd = spa->spa_l2cache.sav_vdevs[i]; if (vd->vdev_guid == guid) return (vd); } + + for (i = 0; i < spa->spa_spares.sav_count; i++) { + vd = spa->spa_spares.sav_vdevs[i]; + if (vd->vdev_guid == guid) + return (vd); + } } return (NULL); diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/spa_config.c b/external/cddl/osnet/dist/uts/common/fs/zfs/spa_config.c index ee425a91694f2..68a40bec89bec 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/spa_config.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/spa_config.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -36,6 +36,7 @@ #include #ifdef _KERNEL #include +#include #endif /* @@ -74,7 +75,6 @@ spa_config_load(void) void *buf = NULL; nvlist_t *nvlist, *child; nvpair_t *nvpair; - spa_t *spa; char *pathname; struct _buf *file; uint64_t fsize; @@ -118,7 +118,6 @@ spa_config_load(void) mutex_enter(&spa_namespace_lock); nvpair = NULL; while ((nvpair = nvlist_next_nvpair(nvlist, nvpair)) != NULL) { - if (nvpair_type(nvpair) != DATA_TYPE_NVLIST) continue; @@ -126,13 +125,7 @@ spa_config_load(void) if (spa_lookup(nvpair_name(nvpair)) != NULL) continue; - spa = spa_add(nvpair_name(nvpair), NULL); - - /* - * We blindly duplicate the configuration here. If it's - * invalid, we will catch it when the pool is first opened. - */ - VERIFY(nvlist_dup(child, &spa->spa_config, 0) == 0); + (void) spa_add(nvpair_name(nvpair), child, NULL); } mutex_exit(&spa_namespace_lock); @@ -208,6 +201,9 @@ spa_config_sync(spa_t *target, boolean_t removing, boolean_t postsysevent) ASSERT(MUTEX_HELD(&spa_namespace_lock)); + if (rootdir == NULL || !(spa_mode_global & FWRITE)) + return; + /* * Iterate over all cachefiles for the pool, past or present. 
When the * cachefile is changed, the new one is pushed onto this list, allowing @@ -309,6 +305,24 @@ spa_config_set(spa_t *spa, nvlist_t *config) mutex_exit(&spa->spa_props_lock); } +/* Add discovered rewind info, if any to the provided nvlist */ +void +spa_rewind_data_to_nvlist(spa_t *spa, nvlist_t *tonvl) +{ + int64_t loss = 0; + + if (tonvl == NULL || spa->spa_load_txg == 0) + return; + + VERIFY(nvlist_add_uint64(tonvl, ZPOOL_CONFIG_LOAD_TIME, + spa->spa_load_txg_ts) == 0); + if (spa->spa_last_ubsync_txg) + loss = spa->spa_last_ubsync_txg_ts - spa->spa_load_txg_ts; + VERIFY(nvlist_add_int64(tonvl, ZPOOL_CONFIG_REWIND_TIME, loss) == 0); + VERIFY(nvlist_add_uint64(tonvl, ZPOOL_CONFIG_LOAD_DATA_ERRORS, + spa->spa_load_data_errors) == 0); +} + /* * Generate the pool's configuration based on the current in-core state. * We infer whether to generate a complete config or just one top-level config @@ -321,6 +335,7 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) vdev_t *rvd = spa->spa_root_vdev; unsigned long hostid = 0; boolean_t locked = B_FALSE; + uint64_t split_guid; if (vd == NULL) { vd = rvd; @@ -349,7 +364,15 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) txg) == 0); VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_POOL_GUID, spa_guid(spa)) == 0); +#ifdef _KERNEL + hostid = zone_get_hostid(NULL); +#else /* _KERNEL */ + /* + * We're emulating the system's hostid in userland, so we can't use + * zone_get_hostid(). + */ (void) ddi_strtoul(hw_serial, NULL, 10, &hostid); +#endif /* _KERNEL */ if (hostid != 0) { VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_HOSTID, hostid) == 0); @@ -369,36 +392,79 @@ spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats) VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_IS_LOG, 1ULL) == 0); vd = vd->vdev_top; /* label contains top config */ + } else { + /* + * Only add the (potentially large) split information + * in the mos config, and not in the vdev labels + */ + if (spa->spa_config_splitting != NULL) + VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_SPLIT, + spa->spa_config_splitting) == 0); + } + + /* + * Add the top-level config. We even add this on pools which + * don't support holes in the namespace as older pools will + * just ignore it. + */ + vdev_top_config_generate(spa, config); + + /* + * If we're splitting, record the original pool's guid. 
+ */ + if (spa->spa_config_splitting != NULL && + nvlist_lookup_uint64(spa->spa_config_splitting, + ZPOOL_CONFIG_SPLIT_GUID, &split_guid) == 0) { + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_SPLIT_GUID, + split_guid) == 0); } nvroot = vdev_config_generate(spa, vd, getstats, B_FALSE, B_FALSE); VERIFY(nvlist_add_nvlist(config, ZPOOL_CONFIG_VDEV_TREE, nvroot) == 0); nvlist_free(nvroot); + if (getstats && spa_load_state(spa) == SPA_LOAD_NONE) { + ddt_histogram_t *ddh; + ddt_stat_t *dds; + ddt_object_t *ddo; + + ddh = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP); + ddt_get_dedup_histogram(spa, ddh); + VERIFY(nvlist_add_uint64_array(config, + ZPOOL_CONFIG_DDT_HISTOGRAM, + (uint64_t *)ddh, sizeof (*ddh) / sizeof (uint64_t)) == 0); + kmem_free(ddh, sizeof (ddt_histogram_t)); + + ddo = kmem_zalloc(sizeof (ddt_object_t), KM_SLEEP); + ddt_get_dedup_object_stats(spa, ddo); + VERIFY(nvlist_add_uint64_array(config, + ZPOOL_CONFIG_DDT_OBJ_STATS, + (uint64_t *)ddo, sizeof (*ddo) / sizeof (uint64_t)) == 0); + kmem_free(ddo, sizeof (ddt_object_t)); + + dds = kmem_zalloc(sizeof (ddt_stat_t), KM_SLEEP); + ddt_get_dedup_stats(spa, dds); + VERIFY(nvlist_add_uint64_array(config, + ZPOOL_CONFIG_DDT_STATS, + (uint64_t *)dds, sizeof (*dds) / sizeof (uint64_t)) == 0); + kmem_free(dds, sizeof (ddt_stat_t)); + } + + spa_rewind_data_to_nvlist(spa, config); + if (locked) spa_config_exit(spa, SCL_CONFIG | SCL_STATE, FTAG); return (config); } -/* - * For a pool that's not currently a booting rootpool, update all disk labels, - * generate a fresh config based on the current in-core state, and sync the - * global config cache. - */ -void -spa_config_update(spa_t *spa, int what) -{ - spa_config_update_common(spa, what, FALSE); -} - /* * Update all disk labels, generate a fresh config based on the current * in-core state, and sync the global config cache (do not sync the config * cache if this is a booting rootpool). */ void -spa_config_update_common(spa_t *spa, int what, boolean_t isroot) +spa_config_update(spa_t *spa, int what) { vdev_t *rvd = spa->spa_root_vdev; uint64_t txg; @@ -420,10 +486,9 @@ spa_config_update_common(spa_t *spa, int what, boolean_t isroot) */ for (c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; - if (tvd->vdev_ms_array == 0) { - vdev_init(tvd, txg); - vdev_config_dirty(tvd); - } + if (tvd->vdev_ms_array == 0) + vdev_metaslab_set_size(tvd); + vdev_expand(tvd, txg); } } spa_config_exit(spa, SCL_ALL, FTAG); @@ -436,9 +501,9 @@ spa_config_update_common(spa_t *spa, int what, boolean_t isroot) /* * Update the global config cache to reflect the new mosconfig. */ - if (!isroot) + if (!spa->spa_is_root) spa_config_sync(spa, B_FALSE, what != SPA_CONFIG_UPDATE_POOL); if (what == SPA_CONFIG_UPDATE_POOL) - spa_config_update_common(spa, SPA_CONFIG_UPDATE_VDEVS, isroot); + spa_config_update(spa, SPA_CONFIG_UPDATE_VDEVS); } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/spa_errlog.c b/external/cddl/osnet/dist/uts/common/fs/zfs/spa_errlog.c index c642bd768b497..4c834e2d4e0a0 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/spa_errlog.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/spa_errlog.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * Routines to manage the on-disk persistent error log. 
* @@ -60,9 +58,8 @@ * This is a stripped-down version of strtoull, suitable only for converting * lowercase hexidecimal numbers that don't overflow. */ -#ifdef _KERNEL -static uint64_t -strtonum(char *str, char **nptr) +uint64_t +strtonum(const char *str, char **nptr) { uint64_t val = 0; char c; @@ -82,11 +79,11 @@ strtonum(char *str, char **nptr) str++; } - *nptr = str; + if (nptr) + *nptr = (char *)str; return (val); } -#endif /* * Convert a bookmark to a string. @@ -135,7 +132,7 @@ spa_log_error(spa_t *spa, zio_t *zio) * If we are trying to import a pool, ignore any errors, as we won't be * writing to the pool any time soon. */ - if (spa->spa_load_state == SPA_LOAD_TRYIMPORT) + if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT) return; mutex_enter(&spa->spa_errlist_lock); diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/spa_history.c b/external/cddl/osnet/dist/uts/common/fs/zfs/spa_history.c index c997240c148f2..b713d66ee9040 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/spa_history.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/spa_history.c @@ -20,12 +20,10 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -105,7 +103,8 @@ spa_history_create_obj(spa_t *spa, dmu_tx_t *tx) * Figure out maximum size of history log. We set it at * 1% of pool size, with a max of 32MB and min of 128KB. */ - shpp->sh_phys_max_off = spa_get_dspace(spa) / 100; + shpp->sh_phys_max_off = + metaslab_class_get_dspace(spa_normal_class(spa)) / 100; shpp->sh_phys_max_off = MIN(shpp->sh_phys_max_off, 32<<20); shpp->sh_phys_max_off = MAX(shpp->sh_phys_max_off, 128<<10); @@ -127,12 +126,12 @@ spa_history_advance_bof(spa_t *spa, spa_history_phys_t *shpp) firstread = MIN(sizeof (reclen), shpp->sh_phys_max_off - phys_bof); if ((err = dmu_read(mos, spa->spa_history, phys_bof, firstread, - buf)) != 0) + buf, DMU_READ_PREFETCH)) != 0) return (err); if (firstread != sizeof (reclen)) { if ((err = dmu_read(mos, spa->spa_history, shpp->sh_pool_create_len, sizeof (reclen) - firstread, - buf + firstread)) != 0) + buf + firstread, DMU_READ_PREFETCH)) != 0) return (err); } @@ -380,10 +379,11 @@ spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf) return (0); } - err = dmu_read(mos, spa->spa_history, phys_read_off, read_len, buf); + err = dmu_read(mos, spa->spa_history, phys_read_off, read_len, buf, + DMU_READ_PREFETCH); if (leftover && err == 0) { err = dmu_read(mos, spa->spa_history, shpp->sh_pool_create_len, - leftover, buf + read_len); + leftover, buf + read_len, DMU_READ_PREFETCH); } mutex_exit(&spa->spa_history_lock); @@ -391,13 +391,12 @@ spa_history_get(spa_t *spa, uint64_t *offp, uint64_t *len, char *buf) return (err); } -void -spa_history_internal_log(history_internal_events_t event, spa_t *spa, - dmu_tx_t *tx, cred_t *cr, const char *fmt, ...) 
+static void +log_internal(history_internal_events_t event, spa_t *spa, + dmu_tx_t *tx, cred_t *cr, const char *fmt, va_list adx) { history_arg_t *hap; char *str; - va_list adx; /* * If this is part of creating a pool, not everything is @@ -409,9 +408,7 @@ spa_history_internal_log(history_internal_events_t event, spa_t *spa, hap = kmem_alloc(sizeof (history_arg_t), KM_SLEEP); str = kmem_alloc(HIS_MAX_RECORD_LEN, KM_SLEEP); - va_start(adx, fmt); (void) vsnprintf(str, HIS_MAX_RECORD_LEN, fmt, adx); - va_end(adx); hap->ha_log_type = LOG_INTERNAL; hap->ha_history_str = str; @@ -426,3 +423,48 @@ spa_history_internal_log(history_internal_events_t event, spa_t *spa, } /* spa_history_log_sync() will free hap and str */ } + +void +spa_history_internal_log(history_internal_events_t event, spa_t *spa, + dmu_tx_t *tx, cred_t *cr, const char *fmt, ...) +{ + dmu_tx_t *htx = tx; + va_list adx; + + /* create a tx if we didn't get one */ + if (tx == NULL) { + htx = dmu_tx_create_dd(spa_get_dsl(spa)->dp_mos_dir); + if (dmu_tx_assign(htx, TXG_WAIT) != 0) { + dmu_tx_abort(htx); + return; + } + } + + va_start(adx, fmt); + log_internal(event, spa, htx, cr, fmt, adx); + va_end(adx); + + /* if we didn't get a tx from the caller, commit the one we made */ + if (tx == NULL) + dmu_tx_commit(htx); +} + +void +spa_history_log_version(spa_t *spa, history_internal_events_t event) +{ +#ifdef _KERNEL + uint64_t current_vers = spa_version(spa); + + if (current_vers >= SPA_VERSION_ZPOOL_HISTORY) { + spa_history_internal_log(event, spa, NULL, CRED(), + "pool spa %llu; zfs spa %llu; zpl %d; uts %s %s %s %s", + (u_longlong_t)current_vers, SPA_VERSION, ZPL_VERSION, + utsname.nodename, utsname.release, utsname.version, + utsname.machine); + } + cmn_err(CE_CONT, "!%s version %llu pool %s using %llu", + event == LOG_POOL_IMPORT ? "imported" : + event == LOG_POOL_CREATE ? "created" : "accessed", + (u_longlong_t)current_vers, spa_name(spa), SPA_VERSION); +#endif +} diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/spa_misc.c b/external/cddl/osnet/dist/uts/common/fs/zfs/spa_misc.c index 36046e6df1c04..5a48dc6093a7a 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/spa_misc.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/spa_misc.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -43,8 +43,8 @@ #include #include #include -#include #include +#include #include "zfs_prop.h" /* @@ -186,7 +186,7 @@ * * SCL_VDEV * Held as reader to prevent changes to the vdev tree during trivial - * inquiries such as bp_get_dasize(). SCL_VDEV is distinct from the + * inquiries such as bp_get_dsize(). SCL_VDEV is distinct from the * other locks, and lower than all of them, to ensure that it's safe * to acquire regardless of caller context. 
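As a concrete illustration of the SCL_VDEV rule described above, a minimal sketch of a trivial vdev-tree inquiry (the helper name is hypothetical; bp_get_dsize(), added later in this patch, follows the same pattern):

static uint64_t
example_dva_top_ashift(spa_t *spa, const dva_t *dva)
{
	uint64_t ashift;

	/* Reader is sufficient, and safe to take in any caller context. */
	spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER);
	ashift = vdev_lookup_top(spa, DVA_GET_VDEV(dva))->vdev_ashift;
	spa_config_exit(spa, SCL_VDEV, FTAG);

	return (ashift);
}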
* @@ -230,7 +230,7 @@ static kmutex_t spa_l2cache_lock; static avl_tree_t spa_l2cache_avl; kmem_cache_t *spa_buffer_pool; -int spa_mode; +int spa_mode_global; #ifdef ZFS_DEBUG /* Everything except dprintf is on by default in debug builds */ @@ -310,8 +310,12 @@ spa_config_tryenter(spa_t *spa, int locks, void *tag, krw_t rw) void spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw) { + int wlocks_held = 0; + for (int i = 0; i < SCL_LOCKS; i++) { spa_config_lock_t *scl = &spa->spa_config_lock[i]; + if (scl->scl_writer == curthread) + wlocks_held |= (1 << i); if (!(locks & (1 << i))) continue; mutex_enter(&scl->scl_lock); @@ -331,6 +335,7 @@ spa_config_enter(spa_t *spa, int locks, void *tag, krw_t rw) (void) refcount_add(&scl->scl_count, tag); mutex_exit(&scl->scl_lock); } + ASSERT(wlocks_held <= locks); } void @@ -415,7 +420,7 @@ spa_lookup(const char *name) * exist by calling spa_lookup() first. */ spa_t * -spa_add(const char *name, const char *altroot) +spa_add(const char *name, nvlist_t *config, const char *altroot) { spa_t *spa; spa_config_dirent_t *dp; @@ -425,31 +430,37 @@ spa_add(const char *name, const char *altroot) spa = kmem_zalloc(sizeof (spa_t), KM_SLEEP); mutex_init(&spa->spa_async_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa->spa_async_root_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_errlist_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&spa->spa_sync_bplist.bpl_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_errlog_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_history_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_proc_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&spa->spa_props_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_scrub_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL); - cv_init(&spa->spa_async_root_cv, NULL, CV_DEFAULT, NULL); + cv_init(&spa->spa_proc_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_scrub_io_cv, NULL, CV_DEFAULT, NULL); cv_init(&spa->spa_suspend_cv, NULL, CV_DEFAULT, NULL); + for (int t = 0; t < TXG_SIZE; t++) + bplist_init(&spa->spa_free_bplist[t]); + bplist_init(&spa->spa_deferred_bplist); + (void) strlcpy(spa->spa_name, name, sizeof (spa->spa_name)); spa->spa_state = POOL_STATE_UNINITIALIZED; spa->spa_freeze_txg = UINT64_MAX; spa->spa_final_txg = UINT64_MAX; + spa->spa_load_max_txg = UINT64_MAX; + spa->spa_proc = &p0; + spa->spa_proc_state = SPA_PROC_NONE; refcount_create(&spa->spa_refcount); spa_config_lock_init(spa); avl_add(&spa_namespace_avl, spa); - mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL); - /* * Set the alternate root, if there is one. 
*/ @@ -468,6 +479,9 @@ spa_add(const char *name, const char *altroot) dp->scd_path = spa_strdup(spa_config_path); list_insert_head(&spa->spa_config_list, dp); + if (config != NULL) + VERIFY(nvlist_dup(config, &spa->spa_config, 0) == 0); + return (spa); } @@ -484,6 +498,8 @@ spa_remove(spa_t *spa) ASSERT(MUTEX_HELD(&spa_namespace_lock)); ASSERT(spa->spa_state == POOL_STATE_UNINITIALIZED); + nvlist_free(spa->spa_config_splitting); + avl_remove(&spa_namespace_avl, spa); cv_broadcast(&spa_namespace_cv); @@ -507,20 +523,24 @@ spa_remove(spa_t *spa) spa_config_lock_destroy(spa); + for (int t = 0; t < TXG_SIZE; t++) + bplist_fini(&spa->spa_free_bplist[t]); + bplist_fini(&spa->spa_deferred_bplist); + cv_destroy(&spa->spa_async_cv); - cv_destroy(&spa->spa_async_root_cv); + cv_destroy(&spa->spa_proc_cv); cv_destroy(&spa->spa_scrub_io_cv); cv_destroy(&spa->spa_suspend_cv); mutex_destroy(&spa->spa_async_lock); - mutex_destroy(&spa->spa_async_root_lock); - mutex_destroy(&spa->spa_scrub_lock); - mutex_destroy(&spa->spa_errlog_lock); mutex_destroy(&spa->spa_errlist_lock); - mutex_destroy(&spa->spa_sync_bplist.bpl_lock); + mutex_destroy(&spa->spa_errlog_lock); mutex_destroy(&spa->spa_history_lock); + mutex_destroy(&spa->spa_proc_lock); mutex_destroy(&spa->spa_props_lock); + mutex_destroy(&spa->spa_scrub_lock); mutex_destroy(&spa->spa_suspend_lock); + mutex_destroy(&spa->spa_vdev_top_lock); kmem_free(spa, sizeof (spa_t)); } @@ -814,12 +834,6 @@ spa_l2cache_activate(vdev_t *vd) mutex_exit(&spa_l2cache_lock); } -void -spa_l2cache_space_update(vdev_t *vd, int64_t space, int64_t alloc) -{ - vdev_space_update(vd, space, alloc, B_FALSE); -} - /* * ========================================================================== * SPA vdev locking @@ -834,7 +848,20 @@ spa_l2cache_space_update(vdev_t *vd, int64_t space, int64_t alloc) uint64_t spa_vdev_enter(spa_t *spa) { + mutex_enter(&spa->spa_vdev_top_lock); mutex_enter(&spa_namespace_lock); + return (spa_vdev_config_enter(spa)); +} + +/* + * Internal implementation for spa_vdev_enter(). Used when a vdev + * operation requires multiple syncs (i.e. removing a device) while + * keeping the spa_namespace_lock held. + */ +uint64_t +spa_vdev_config_enter(spa_t *spa) +{ + ASSERT(MUTEX_HELD(&spa_namespace_lock)); spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); @@ -842,14 +869,14 @@ spa_vdev_enter(spa_t *spa) } /* - * Unlock the spa_t after adding or removing a vdev. Besides undoing the - * locking of spa_vdev_enter(), we also want make sure the transactions have - * synced to disk, and then update the global configuration cache with the new - * information. + * Used in combination with spa_vdev_config_enter() to allow the syncing + * of multiple transactions without releasing the spa_namespace_lock. */ -int -spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error) +void +spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error, char *tag) { + ASSERT(MUTEX_HELD(&spa_namespace_lock)); + int config_changed = B_FALSE; ASSERT(txg > spa_last_synced_txg(spa)); @@ -867,10 +894,25 @@ spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error) if (error == 0 && !list_is_empty(&spa->spa_config_dirty_list)) { dsl_pool_scrub_restart(spa->spa_dsl_pool); config_changed = B_TRUE; + spa->spa_config_generation++; } + /* + * Verify the metaslab classes. 
+ */ + ASSERT(metaslab_class_validate(spa_normal_class(spa)) == 0); + ASSERT(metaslab_class_validate(spa_log_class(spa)) == 0); + spa_config_exit(spa, SCL_ALL, spa); + /* + * Panic the system if the specified tag requires it. This + * is useful for ensuring that configurations are updated + * transactionally. + */ + if (zio_injection_enabled) + zio_handle_panic_injection(spa, tag, 0); + /* * Note: this txg_wait_synced() is important because it ensures * that there won't be more than one config change per txg. @@ -880,8 +922,10 @@ spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error) txg_wait_synced(spa->spa_dsl_pool, txg); if (vd != NULL) { - ASSERT(!vd->vdev_detached || vd->vdev_dtl.smo_object == 0); + ASSERT(!vd->vdev_detached || vd->vdev_dtl_smo.smo_object == 0); + spa_config_enter(spa, SCL_ALL, spa, RW_WRITER); vdev_free(vd); + spa_config_exit(spa, SCL_ALL, spa); } /* @@ -889,8 +933,20 @@ spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error) */ if (config_changed) spa_config_sync(spa, B_FALSE, B_TRUE); +} +/* + * Unlock the spa_t after adding or removing a vdev. Besides undoing the + * locking of spa_vdev_enter(), we also want make sure the transactions have + * synced to disk, and then update the global configuration cache with the new + * information. + */ +int +spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error) +{ + spa_vdev_config_exit(spa, vd, txg, error, FTAG); mutex_exit(&spa_namespace_lock); + mutex_exit(&spa->spa_vdev_top_lock); return (error); } @@ -899,18 +955,37 @@ spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error) * Lock the given spa_t for the purpose of changing vdev state. */ void -spa_vdev_state_enter(spa_t *spa) +spa_vdev_state_enter(spa_t *spa, int oplocks) { - spa_config_enter(spa, SCL_STATE_ALL, spa, RW_WRITER); + int locks = SCL_STATE_ALL | oplocks; + + spa_config_enter(spa, locks, spa, RW_WRITER); + spa->spa_vdev_locks = locks; } int spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error) { - if (vd != NULL) + if (vd != NULL || error == 0) + vdev_dtl_reassess(vd ? vd->vdev_top : spa->spa_root_vdev, + 0, 0, B_FALSE); + + if (vd != NULL) { vdev_state_dirty(vd->vdev_top); + spa->spa_config_generation++; + } + + ASSERT3U(spa->spa_vdev_locks, >=, SCL_STATE_ALL); + spa_config_exit(spa, spa->spa_vdev_locks, spa); - spa_config_exit(spa, SCL_STATE_ALL, spa); + /* + * If anything changed, wait for it to sync. This ensures that, + * from the system administrator's perspective, zpool(1M) commands + * are synchronous. This is important for things like zpool offline: + * when the command completes, you expect no further I/O from ZFS. 
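A sketch of the enter/exit pairing this comment refers to, as a hypothetical administrative operation might use it (SCL_NONE requests no locks beyond SCL_STATE_ALL; the actual state change is elided):

static int
example_vdev_state_op(spa_t *spa, uint64_t guid)
{
	vdev_t *vd;

	spa_vdev_state_enter(spa, SCL_NONE);

	if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL)
		return (spa_vdev_state_exit(spa, NULL, ENODEV));

	/* ... change the per-vdev state here ... */

	/* Dirties the config and waits for the txg, so the op is synchronous. */
	return (spa_vdev_state_exit(spa, vd, 0));
}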
+ */ + if (vd != NULL) + txg_wait_synced(spa->spa_dsl_pool, 0); return (error); } @@ -1044,48 +1119,30 @@ spa_get_random(uint64_t range) return (r % range); } -void -sprintf_blkptr(char *buf, int len, const blkptr_t *bp) +uint64_t +spa_generate_guid(spa_t *spa) { - int d; + uint64_t guid = spa_get_random(-1ULL); - if (bp == NULL) { - (void) snprintf(buf, len, ""); - return; + if (spa != NULL) { + while (guid == 0 || spa_guid_exists(spa_guid(spa), guid)) + guid = spa_get_random(-1ULL); + } else { + while (guid == 0 || spa_guid_exists(guid, 0)) + guid = spa_get_random(-1ULL); } - if (BP_IS_HOLE(bp)) { - (void) snprintf(buf, len, ""); - return; - } + return (guid); +} - (void) snprintf(buf, len, "[L%llu %s] %llxL/%llxP ", - (u_longlong_t)BP_GET_LEVEL(bp), - dmu_ot[BP_GET_TYPE(bp)].ot_name, - (u_longlong_t)BP_GET_LSIZE(bp), - (u_longlong_t)BP_GET_PSIZE(bp)); - - for (d = 0; d < BP_GET_NDVAS(bp); d++) { - const dva_t *dva = &bp->blk_dva[d]; - (void) snprintf(buf + strlen(buf), len - strlen(buf), - "DVA[%d]=<%llu:%llx:%llx> ", d, - (u_longlong_t)DVA_GET_VDEV(dva), - (u_longlong_t)DVA_GET_OFFSET(dva), - (u_longlong_t)DVA_GET_ASIZE(dva)); - } +void +sprintf_blkptr(char *buf, const blkptr_t *bp) +{ + char *type = dmu_ot[BP_GET_TYPE(bp)].ot_name; + char *checksum = zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name; + char *compress = zio_compress_table[BP_GET_COMPRESS(bp)].ci_name; - (void) snprintf(buf + strlen(buf), len - strlen(buf), - "%s %s %s %s birth=%llu fill=%llu cksum=%llx:%llx:%llx:%llx", - zio_checksum_table[BP_GET_CHECKSUM(bp)].ci_name, - zio_compress_table[BP_GET_COMPRESS(bp)].ci_name, - BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE", - BP_IS_GANG(bp) ? "gang" : "contiguous", - (u_longlong_t)bp->blk_birth, - (u_longlong_t)bp->blk_fill, - (u_longlong_t)bp->blk_cksum.zc_word[0], - (u_longlong_t)bp->blk_cksum.zc_word[1], - (u_longlong_t)bp->blk_cksum.zc_word[2], - (u_longlong_t)bp->blk_cksum.zc_word[3]); + SPRINTF_BLKPTR(snprintf, ' ', buf, bp, type, checksum, compress); } void @@ -1191,59 +1248,55 @@ spa_first_txg(spa_t *spa) return (spa->spa_first_txg); } +uint64_t +spa_syncing_txg(spa_t *spa) +{ + return (spa->spa_syncing_txg); +} + pool_state_t spa_state(spa_t *spa) { return (spa->spa_state); } -uint64_t -spa_freeze_txg(spa_t *spa) +spa_load_state_t +spa_load_state(spa_t *spa) { - return (spa->spa_freeze_txg); + return (spa->spa_load_state); } -/* - * Return how much space is allocated in the pool (ie. sum of all asize) - */ uint64_t -spa_get_alloc(spa_t *spa) +spa_freeze_txg(spa_t *spa) { - return (spa->spa_root_vdev->vdev_stat.vs_alloc); + return (spa->spa_freeze_txg); } -/* - * Return how much (raid-z inflated) space there is in the pool. - */ +/* ARGSUSED */ uint64_t -spa_get_space(spa_t *spa) +spa_get_asize(spa_t *spa, uint64_t lsize) { - return (spa->spa_root_vdev->vdev_stat.vs_space); + /* + * The worst case is single-sector max-parity RAID-Z blocks, in which + * case the space requirement is exactly (VDEV_RAIDZ_MAXPARITY + 1) + * times the size; so just assume that. Add to this the fact that + * we can have up to 3 DVAs per bp, and one more factor of 2 because + * the block may be dittoed with up to 3 DVAs by ddt_sync(). + */ + return (lsize * (VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2); } -/* - * Return the amount of raid-z-deflated space in the pool. 
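For the spa_get_asize() comment above, the worst-case factor works out as follows (assuming VDEV_RAIDZ_MAXPARITY == 3 and SPA_DVAS_PER_BP == 3, as defined elsewhere in this tree):

/*
 *	(VDEV_RAIDZ_MAXPARITY + 1) * SPA_DVAS_PER_BP * 2
 *	    = 4 * 3 * 2
 *	    = 24
 *
 * so a logically 128K write may reserve up to 24 * 128K = 3M of asize.
 */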
- */ uint64_t spa_get_dspace(spa_t *spa) { - if (spa->spa_deflate) - return (spa->spa_root_vdev->vdev_stat.vs_dspace); - else - return (spa->spa_root_vdev->vdev_stat.vs_space); + return (spa->spa_dspace); } -/* ARGSUSED */ -uint64_t -spa_get_asize(spa_t *spa, uint64_t lsize) +void +spa_update_dspace(spa_t *spa) { - /* - * For now, the worst case is 512-byte RAID-Z blocks, in which - * case the space requirement is exactly 2x; so just assume that. - * Add to this the fact that we can have up to 3 DVAs per bp, and - * we have to multiply by a total of 6x. - */ - return (lsize * 6); + spa->spa_dspace = metaslab_class_get_dspace(spa_normal_class(spa)) + + ddt_get_dedup_dspace(spa); } /* @@ -1268,6 +1321,24 @@ spa_version(spa_t *spa) return (spa->spa_ubsync.ub_version); } +boolean_t +spa_deflate(spa_t *spa) +{ + return (spa->spa_deflate); +} + +metaslab_class_t * +spa_normal_class(spa_t *spa) +{ + return (spa->spa_normal_class); +} + +metaslab_class_t * +spa_log_class(spa_t *spa) +{ + return (spa->spa_log_class); +} + int spa_max_replication(spa_t *spa) { @@ -1282,23 +1353,45 @@ spa_max_replication(spa_t *spa) } uint64_t -bp_get_dasize(spa_t *spa, const blkptr_t *bp) +dva_get_dsize_sync(spa_t *spa, const dva_t *dva) { - int sz = 0, i; + uint64_t asize = DVA_GET_ASIZE(dva); + uint64_t dsize = asize; - if (!spa->spa_deflate) - return (BP_GET_ASIZE(bp)); + ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); - spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); - for (i = 0; i < SPA_DVAS_PER_BP; i++) { - vdev_t *vd = - vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[i])); - if (vd) - sz += (DVA_GET_ASIZE(&bp->blk_dva[i]) >> - SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio; + if (asize != 0 && spa->spa_deflate) { + vdev_t *vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); + dsize = (asize >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio; } + + return (dsize); +} + +uint64_t +bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp) +{ + uint64_t dsize = 0; + + for (int d = 0; d < SPA_DVAS_PER_BP; d++) + dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]); + + return (dsize); +} + +uint64_t +bp_get_dsize(spa_t *spa, const blkptr_t *bp) +{ + uint64_t dsize = 0; + + spa_config_enter(spa, SCL_VDEV, FTAG, RW_READER); + + for (int d = 0; d < SPA_DVAS_PER_BP; d++) + dsize += dva_get_dsize_sync(spa, &bp->blk_dva[d]); + spa_config_exit(spa, SCL_VDEV, FTAG); - return (sz); + + return (dsize); } /* @@ -1351,7 +1444,7 @@ spa_init(int mode) avl_create(&spa_l2cache_avl, spa_l2cache_compare, sizeof (spa_aux_t), offsetof(spa_aux_t, aux_avl)); - spa_mode = mode; + spa_mode_global = mode; refcount_init(); unique_init(); @@ -1400,11 +1493,56 @@ spa_has_slogs(spa_t *spa) return (spa->spa_log_class->mc_rotor != NULL); } -/* - * Return whether this pool is the root pool. 
- */ +spa_log_state_t +spa_get_log_state(spa_t *spa) +{ + return (spa->spa_log_state); +} + +void +spa_set_log_state(spa_t *spa, spa_log_state_t state) +{ + spa->spa_log_state = state; +} + boolean_t spa_is_root(spa_t *spa) { return (spa->spa_is_root); } + +boolean_t +spa_writeable(spa_t *spa) +{ + return (!!(spa->spa_mode & FWRITE)); +} + +int +spa_mode(spa_t *spa) +{ + return (spa->spa_mode); +} + +uint64_t +spa_bootfs(spa_t *spa) +{ + return (spa->spa_bootfs); +} + +uint64_t +spa_delegation(spa_t *spa) +{ + return (spa->spa_delegation); +} + +objset_t * +spa_meta_objset(spa_t *spa) +{ + return (spa->spa_meta_objset); +} + +enum zio_checksum +spa_dedup_checksum(spa_t *spa) +{ + return (spa->spa_dedup_checksum); +} diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/space_map.c b/external/cddl/osnet/dist/uts/common/fs/zfs/space_map.c index 0a1fd59eaba67..1ce7b2a3d4660 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/space_map.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/space_map.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -60,6 +58,8 @@ space_map_create(space_map_t *sm, uint64_t start, uint64_t size, uint8_t shift, { bzero(sm, sizeof (*sm)); + cv_init(&sm->sm_load_cv, NULL, CV_DEFAULT, NULL); + avl_create(&sm->sm_root, space_map_seg_compare, sizeof (space_seg_t), offsetof(struct space_seg, ss_node)); @@ -75,6 +75,7 @@ space_map_destroy(space_map_t *sm) ASSERT(!sm->sm_loaded && !sm->sm_loading); VERIFY3U(sm->sm_space, ==, 0); avl_destroy(&sm->sm_root); + cv_destroy(&sm->sm_load_cv); } void @@ -115,12 +116,23 @@ space_map_add(space_map_t *sm, uint64_t start, uint64_t size) if (merge_before && merge_after) { avl_remove(&sm->sm_root, ss_before); + if (sm->sm_pp_root) { + avl_remove(sm->sm_pp_root, ss_before); + avl_remove(sm->sm_pp_root, ss_after); + } ss_after->ss_start = ss_before->ss_start; kmem_free(ss_before, sizeof (*ss_before)); + ss = ss_after; } else if (merge_before) { ss_before->ss_end = end; + if (sm->sm_pp_root) + avl_remove(sm->sm_pp_root, ss_before); + ss = ss_before; } else if (merge_after) { ss_after->ss_start = start; + if (sm->sm_pp_root) + avl_remove(sm->sm_pp_root, ss_after); + ss = ss_after; } else { ss = kmem_alloc(sizeof (*ss), KM_SLEEP); ss->ss_start = start; @@ -128,6 +140,9 @@ space_map_add(space_map_t *sm, uint64_t start, uint64_t size) avl_insert(&sm->sm_root, ss, where); } + if (sm->sm_pp_root) + avl_add(sm->sm_pp_root, ss); + sm->sm_space += size; } @@ -162,12 +177,17 @@ space_map_remove(space_map_t *sm, uint64_t start, uint64_t size) left_over = (ss->ss_start != start); right_over = (ss->ss_end != end); + if (sm->sm_pp_root) + avl_remove(sm->sm_pp_root, ss); + if (left_over && right_over) { newseg = kmem_alloc(sizeof (*newseg), KM_SLEEP); newseg->ss_start = end; newseg->ss_end = ss->ss_end; ss->ss_end = start; avl_insert_here(&sm->sm_root, newseg, ss, AVL_AFTER); + if (sm->sm_pp_root) + avl_add(sm->sm_pp_root, newseg); } else if (left_over) { ss->ss_end = start; } else if (right_over) { @@ -175,12 +195,16 @@ space_map_remove(space_map_t *sm, uint64_t start, uint64_t size) } else { avl_remove(&sm->sm_root, ss); kmem_free(ss, sizeof (*ss)); + ss = NULL; } + if (sm->sm_pp_root && ss != NULL) + avl_add(sm->sm_pp_root, ss); + sm->sm_space -= size; } -int +boolean_t space_map_contains(space_map_t *sm, uint64_t start, 
uint64_t size) { avl_index_t where; @@ -220,59 +244,10 @@ space_map_walk(space_map_t *sm, space_map_func_t *func, space_map_t *mdest) { space_seg_t *ss; - for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss)) - func(mdest, ss->ss_start, ss->ss_end - ss->ss_start); -} - -void -space_map_excise(space_map_t *sm, uint64_t start, uint64_t size) -{ - avl_tree_t *t = &sm->sm_root; - avl_index_t where; - space_seg_t *ss, search; - uint64_t end = start + size; - uint64_t rm_start, rm_end; - ASSERT(MUTEX_HELD(sm->sm_lock)); - search.ss_start = start; - search.ss_end = start; - - for (;;) { - ss = avl_find(t, &search, &where); - - if (ss == NULL) - ss = avl_nearest(t, where, AVL_AFTER); - - if (ss == NULL || ss->ss_start >= end) - break; - - rm_start = MAX(ss->ss_start, start); - rm_end = MIN(ss->ss_end, end); - - space_map_remove(sm, rm_start, rm_end - rm_start); - } -} - -/* - * Replace smd with the union of smd and sms. - */ -void -space_map_union(space_map_t *smd, space_map_t *sms) -{ - avl_tree_t *t = &sms->sm_root; - space_seg_t *ss; - - ASSERT(MUTEX_HELD(smd->sm_lock)); - - /* - * For each source segment, remove any intersections with the - * destination, then add the source segment to the destination. - */ - for (ss = avl_first(t); ss != NULL; ss = AVL_NEXT(t, ss)) { - space_map_excise(smd, ss->ss_start, ss->ss_end - ss->ss_start); - space_map_add(smd, ss->ss_start, ss->ss_end - ss->ss_start); - } + for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss)) + func(mdest, ss->ss_start, ss->ss_end - ss->ss_start); } /* @@ -283,8 +258,10 @@ space_map_load_wait(space_map_t *sm) { ASSERT(MUTEX_HELD(sm->sm_lock)); - while (sm->sm_loading) + while (sm->sm_loading) { + ASSERT(!sm->sm_loaded); cv_wait(&sm->sm_load_cv, sm->sm_lock); + } } /* @@ -301,11 +278,8 @@ space_map_load(space_map_t *sm, space_map_ops_t *ops, uint8_t maptype, int error = 0; ASSERT(MUTEX_HELD(sm->sm_lock)); - - space_map_load_wait(sm); - - if (sm->sm_loaded) - return (0); + ASSERT(!sm->sm_loaded); + ASSERT(!sm->sm_loading); sm->sm_loading = B_TRUE; end = smo->smo_objsize; @@ -336,7 +310,8 @@ space_map_load(space_map_t *sm, space_map_ops_t *ops, uint8_t maptype, smo->smo_object, offset, size); mutex_exit(sm->sm_lock); - error = dmu_read(os, smo->smo_object, offset, size, entry_map); + error = dmu_read(os, smo->smo_object, offset, size, entry_map, + DMU_READ_PREFETCH); mutex_enter(sm->sm_lock); if (error != 0) break; @@ -389,6 +364,13 @@ space_map_unload(space_map_t *sm) space_map_vacate(sm, NULL, NULL); } +uint64_t +space_map_maxsize(space_map_t *sm) +{ + ASSERT(sm->sm_ops != NULL); + return (sm->sm_ops->smop_max(sm)); +} + uint64_t space_map_alloc(space_map_t *sm, uint64_t size) { @@ -504,3 +486,131 @@ space_map_truncate(space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx) smo->smo_objsize = 0; smo->smo_alloc = 0; } + +/* + * Space map reference trees. + * + * A space map is a collection of integers. Every integer is either + * in the map, or it's not. A space map reference tree generalizes + * the idea: it allows its members to have arbitrary reference counts, + * as opposed to the implicit reference count of 0 or 1 in a space map. + * This representation comes in handy when computing the union or + * intersection of multiple space maps. For example, the union of + * N space maps is the subset of the reference tree with refcnt >= 1. + * The intersection of N space maps is the subset with refcnt >= N. + * + * [It's very much like a Fourier transform. 
Unions and intersections + * are hard to perform in the 'space map domain', so we convert the maps + * into the 'reference count domain', where it's trivial, then invert.] + * + * vdev_dtl_reassess() uses computations of this form to determine + * DTL_MISSING and DTL_OUTAGE for interior vdevs -- e.g. a RAID-Z vdev + * has an outage wherever refcnt >= vdev_nparity + 1, and a mirror vdev + * has an outage wherever refcnt >= vdev_children. + */ +static int +space_map_ref_compare(const void *x1, const void *x2) +{ + const space_ref_t *sr1 = x1; + const space_ref_t *sr2 = x2; + + if (sr1->sr_offset < sr2->sr_offset) + return (-1); + if (sr1->sr_offset > sr2->sr_offset) + return (1); + + if (sr1 < sr2) + return (-1); + if (sr1 > sr2) + return (1); + + return (0); +} + +void +space_map_ref_create(avl_tree_t *t) +{ + avl_create(t, space_map_ref_compare, + sizeof (space_ref_t), offsetof(space_ref_t, sr_node)); +} + +void +space_map_ref_destroy(avl_tree_t *t) +{ + space_ref_t *sr; + void *cookie = NULL; + + while ((sr = avl_destroy_nodes(t, &cookie)) != NULL) + kmem_free(sr, sizeof (*sr)); + + avl_destroy(t); +} + +static void +space_map_ref_add_node(avl_tree_t *t, uint64_t offset, int64_t refcnt) +{ + space_ref_t *sr; + + sr = kmem_alloc(sizeof (*sr), KM_SLEEP); + sr->sr_offset = offset; + sr->sr_refcnt = refcnt; + + avl_add(t, sr); +} + +void +space_map_ref_add_seg(avl_tree_t *t, uint64_t start, uint64_t end, + int64_t refcnt) +{ + space_map_ref_add_node(t, start, refcnt); + space_map_ref_add_node(t, end, -refcnt); +} + +/* + * Convert (or add) a space map into a reference tree. + */ +void +space_map_ref_add_map(avl_tree_t *t, space_map_t *sm, int64_t refcnt) +{ + space_seg_t *ss; + + ASSERT(MUTEX_HELD(sm->sm_lock)); + + for (ss = avl_first(&sm->sm_root); ss; ss = AVL_NEXT(&sm->sm_root, ss)) + space_map_ref_add_seg(t, ss->ss_start, ss->ss_end, refcnt); +} + +/* + * Convert a reference tree into a space map. The space map will contain + * all members of the reference tree for which refcnt >= minref. + */ +void +space_map_ref_generate_map(avl_tree_t *t, space_map_t *sm, int64_t minref) +{ + uint64_t start = -1ULL; + int64_t refcnt = 0; + space_ref_t *sr; + + ASSERT(MUTEX_HELD(sm->sm_lock)); + + space_map_vacate(sm, NULL, NULL); + + for (sr = avl_first(t); sr != NULL; sr = AVL_NEXT(t, sr)) { + refcnt += sr->sr_refcnt; + if (refcnt >= minref) { + if (start == -1ULL) { + start = sr->sr_offset; + } + } else { + if (start != -1ULL) { + uint64_t end = sr->sr_offset; + ASSERT(start <= end); + if (end > start) + space_map_add(sm, start, end - start); + start = -1ULL; + } + } + } + ASSERT(refcnt == 0); + ASSERT(start == -1ULL); +} diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/arc.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/arc.h index 749bf53e5b5e8..c528fac1a6466 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/arc.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/arc.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -68,12 +68,26 @@ typedef enum arc_buf_contents { #define ARC_CACHED (1 << 4) /* I/O was already in cache */ #define ARC_L2CACHE (1 << 5) /* cache in L2ARC */ -void arc_space_consume(uint64_t space); -void arc_space_return(uint64_t space); +/* + * The following breakdows of arc_size exist for kstat only. 
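Returning to the space-map reference trees added in space_map.c above, a minimal sketch of the union computation the comment describes (hypothetical helper; the caller must hold the maps' sm_lock):

static void
example_space_map_union(space_map_t *sm1, space_map_t *sm2, space_map_t *smu)
{
	avl_tree_t reftree;

	space_map_ref_create(&reftree);
	space_map_ref_add_map(&reftree, sm1, 1);
	space_map_ref_add_map(&reftree, sm2, 1);
	/* Union: keep every offset with refcnt >= 1. */
	space_map_ref_generate_map(&reftree, smu, 1);
	space_map_ref_destroy(&reftree);
}

The intersection of N maps is the same walk with minref set to N, which is the form vdev_dtl_reassess() uses as described above.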
+ */ +typedef enum arc_space_type { + ARC_SPACE_DATA, + ARC_SPACE_HDRS, + ARC_SPACE_L2HDRS, + ARC_SPACE_OTHER, + ARC_SPACE_NUMTYPES +} arc_space_type_t; + +void arc_space_consume(uint64_t space, arc_space_type_t type); +void arc_space_return(uint64_t space, arc_space_type_t type); void *arc_data_buf_alloc(uint64_t space); void arc_data_buf_free(void *buf, uint64_t space); arc_buf_t *arc_buf_alloc(spa_t *spa, int size, void *tag, arc_buf_contents_t type); +arc_buf_t *arc_loan_buf(spa_t *spa, int size); +void arc_return_buf(arc_buf_t *buf, void *tag); +void arc_loan_inuse_buf(arc_buf_t *buf, void *tag); void arc_buf_add_ref(arc_buf_t *buf, void *tag); int arc_buf_remove_ref(arc_buf_t *buf, void *tag); int arc_buf_size(arc_buf_t *buf); @@ -86,28 +100,17 @@ void arc_buf_thaw(arc_buf_t *buf); int arc_referenced(arc_buf_t *buf); #endif -typedef struct writeprops { - dmu_object_type_t wp_type; - uint8_t wp_level; - uint8_t wp_copies; - uint8_t wp_dncompress, wp_oscompress; - uint8_t wp_dnchecksum, wp_oschecksum; -} writeprops_t; - -void write_policy(spa_t *spa, const writeprops_t *wp, zio_prop_t *zp); -int arc_read(zio_t *pio, spa_t *spa, blkptr_t *bp, arc_buf_t *pbuf, +int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_buf_t *pbuf, arc_done_func_t *done, void *private, int priority, int zio_flags, uint32_t *arc_flags, const zbookmark_t *zb); -int arc_read_nolock(zio_t *pio, spa_t *spa, blkptr_t *bp, +int arc_read_nolock(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, void *private, int priority, int flags, uint32_t *arc_flags, const zbookmark_t *zb); -zio_t *arc_write(zio_t *pio, spa_t *spa, const writeprops_t *wp, - boolean_t l2arc, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, - arc_done_func_t *ready, arc_done_func_t *done, void *private, int priority, - int zio_flags, const zbookmark_t *zb); -int arc_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - zio_done_func_t *done, void *private, uint32_t arc_flags); -int arc_tryread(spa_t *spa, blkptr_t *bp, void *data); +zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg, + blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp, + arc_done_func_t *ready, arc_done_func_t *done, void *private, + int priority, int zio_flags, const zbookmark_t *zb); +void arc_free(spa_t *spa, const blkptr_t *bp); void arc_set_callback(arc_buf_t *buf, arc_evict_func_t *func, void *private); int arc_buf_evict(arc_buf_t *buf); @@ -123,7 +126,7 @@ void arc_fini(void); * Level 2 ARC */ -void l2arc_add_vdev(spa_t *spa, vdev_t *vd, uint64_t start, uint64_t end); +void l2arc_add_vdev(spa_t *spa, vdev_t *vd); void l2arc_remove_vdev(vdev_t *vd); boolean_t l2arc_vdev_present(vdev_t *vd); void l2arc_init(void); diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/bplist.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/bplist.h index cdb93a6c35a31..94143bccbc56a 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/bplist.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/bplist.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -29,6 +29,7 @@ #include #include #include +#include #include #ifdef __cplusplus @@ -67,6 +68,10 @@ typedef struct bplist { dmu_buf_t *bpl_cached_dbuf; } bplist_t; +typedef void bplist_sync_cb_t(void *arg, const blkptr_t *bp, dmu_tx_t *tx); + +extern void bplist_init(bplist_t *bpl); +extern void bplist_fini(bplist_t *bpl); extern uint64_t bplist_create(objset_t *mos, int blocksize, dmu_tx_t *tx); extern void bplist_destroy(objset_t *mos, uint64_t object, dmu_tx_t *tx); extern int bplist_open(bplist_t *bpl, objset_t *mos, uint64_t object); @@ -74,13 +79,15 @@ extern void bplist_close(bplist_t *bpl); extern boolean_t bplist_empty(bplist_t *bpl); extern int bplist_iterate(bplist_t *bpl, uint64_t *itorp, blkptr_t *bp); extern int bplist_enqueue(bplist_t *bpl, const blkptr_t *bp, dmu_tx_t *tx); +extern void bplist_enqueue_cb(void *bpl, const blkptr_t *bp, dmu_tx_t *tx); extern void bplist_enqueue_deferred(bplist_t *bpl, const blkptr_t *bp); -extern void bplist_sync(bplist_t *bpl, dmu_tx_t *tx); +extern void bplist_sync(bplist_t *bpl, bplist_sync_cb_t *func, + void *arg, dmu_tx_t *tx); extern void bplist_vacate(bplist_t *bpl, dmu_tx_t *tx); extern int bplist_space(bplist_t *bpl, uint64_t *usedp, uint64_t *compp, uint64_t *uncompp); extern int bplist_space_birthrange(bplist_t *bpl, - uint64_t mintxg, uint64_t maxtxg, uint64_t *dasizep); + uint64_t mintxg, uint64_t maxtxg, uint64_t *dsizep); #ifdef __cplusplus } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dbuf.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dbuf.h index 75ce27264e3ce..d99ade07f8c67 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dbuf.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dbuf.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -75,7 +75,6 @@ typedef enum dbuf_states { DB_EVICTING } dbuf_states_t; -struct objset_impl; struct dnode; struct dmu_tx; @@ -134,6 +133,7 @@ typedef struct dbuf_dirty_record { arc_buf_t *dr_data; blkptr_t dr_overridden_by; override_states_t dr_override_state; + uint8_t dr_copies; } dl; } dt; } dbuf_dirty_record_t; @@ -148,7 +148,7 @@ typedef struct dmu_buf_impl { dmu_buf_t db; /* the objset we belong to */ - struct objset_impl *db_objset; + struct objset *db_objset; /* * the dnode we belong to (NULL when evicted) @@ -255,6 +255,7 @@ void dbuf_add_ref(dmu_buf_impl_t *db, void *tag); uint64_t dbuf_refcount(dmu_buf_impl_t *db); void dbuf_rele(dmu_buf_impl_t *db, void *tag); +void dbuf_rele_and_unlock(dmu_buf_impl_t *db, void *tag); dmu_buf_impl_t *dbuf_find(struct dnode *dn, uint8_t level, uint64_t blkid); @@ -264,7 +265,9 @@ void dbuf_fill_done(dmu_buf_impl_t *db, dmu_tx_t *tx); void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx); +void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx); dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx); +arc_buf_t *dbuf_loan_arcbuf(dmu_buf_impl_t *db); void dbuf_clear(dmu_buf_impl_t *db); void dbuf_evict(dmu_buf_impl_t *db); @@ -323,7 +326,7 @@ _NOTE(CONSTCOND) } while (0) #define dprintf_dbuf_bp(db, bp, fmt, ...) 
do { \ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \ - sprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, bp); \ + sprintf_blkptr(__blkbuf, bp); \ dprintf_dbuf(db, fmt " %s\n", __VA_ARGS__, __blkbuf); \ kmem_free(__blkbuf, BP_SPRINTF_LEN); \ } \ diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/ddt.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/ddt.h new file mode 100644 index 0000000000000..26bcbea5039bb --- /dev/null +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/ddt.h @@ -0,0 +1,240 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_DDT_H +#define _SYS_DDT_H + +#include +#include +#include +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * On-disk DDT formats, in the desired search order (newest version first). + */ +enum ddt_type { + DDT_TYPE_ZAP = 0, + DDT_TYPES +}; + +/* + * DDT classes, in the desired search order (highest replication level first). + */ +enum ddt_class { + DDT_CLASS_DITTO = 0, + DDT_CLASS_DUPLICATE, + DDT_CLASS_UNIQUE, + DDT_CLASSES +}; + +#define DDT_TYPE_CURRENT 0 + +#define DDT_COMPRESS_BYTEORDER_MASK 0x80 +#define DDT_COMPRESS_FUNCTION_MASK 0x7f + +/* + * On-disk ddt entry: key (name) and physical storage (value). 
+ */ +typedef struct ddt_key { + zio_cksum_t ddk_cksum; /* 256-bit block checksum */ + uint64_t ddk_prop; /* LSIZE, PSIZE, compression */ +} ddt_key_t; + +/* + * ddk_prop layout: + * + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * | 0 | 0 | 0 | comp | PSIZE | LSIZE | + * +-------+-------+-------+-------+-------+-------+-------+-------+ + */ +#define DDK_GET_LSIZE(ddk) \ + BF64_GET_SB((ddk)->ddk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1) +#define DDK_SET_LSIZE(ddk, x) \ + BF64_SET_SB((ddk)->ddk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1, x) + +#define DDK_GET_PSIZE(ddk) \ + BF64_GET_SB((ddk)->ddk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1) +#define DDK_SET_PSIZE(ddk, x) \ + BF64_SET_SB((ddk)->ddk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1, x) + +#define DDK_GET_COMPRESS(ddk) BF64_GET((ddk)->ddk_prop, 32, 8) +#define DDK_SET_COMPRESS(ddk, x) BF64_SET((ddk)->ddk_prop, 32, 8, x) + +#define DDT_KEY_WORDS (sizeof (ddt_key_t) / sizeof (uint64_t)) + +typedef struct ddt_phys { + dva_t ddp_dva[SPA_DVAS_PER_BP]; + uint64_t ddp_refcnt; + uint64_t ddp_phys_birth; +} ddt_phys_t; + +enum ddt_phys_type { + DDT_PHYS_DITTO = 0, + DDT_PHYS_SINGLE = 1, + DDT_PHYS_DOUBLE = 2, + DDT_PHYS_TRIPLE = 3, + DDT_PHYS_TYPES +} ddt_phys_type_t; + +/* + * In-core ddt entry + */ +struct ddt_entry { + ddt_key_t dde_key; + ddt_phys_t dde_phys[DDT_PHYS_TYPES]; + zio_t *dde_lead_zio[DDT_PHYS_TYPES]; + void *dde_repair_data; + enum ddt_type dde_type; + enum ddt_class dde_class; + uint8_t dde_loading; + uint8_t dde_loaded; + kcondvar_t dde_cv; + avl_node_t dde_node; +}; + +/* + * In-core ddt + */ +struct ddt { + kmutex_t ddt_lock; + avl_tree_t ddt_tree; + avl_tree_t ddt_repair_tree; + enum zio_checksum ddt_checksum; + spa_t *ddt_spa; + objset_t *ddt_os; + uint64_t ddt_stat_object; + uint64_t ddt_object[DDT_TYPES][DDT_CLASSES]; + ddt_histogram_t ddt_histogram[DDT_TYPES][DDT_CLASSES]; + avl_node_t ddt_node; +}; + +/* + * In-core and on-disk bookmark for DDT walks + */ +typedef struct ddt_bookmark { + uint64_t ddb_class; + uint64_t ddb_type; + uint64_t ddb_checksum; + uint64_t ddb_cursor; +} ddt_bookmark_t; + +/* + * Ops vector to access a specific DDT object type. 
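To make the ddk_prop layout near the top of this header concrete, a small sketch of packing a 128K/4K LZJB block key (assumes the BF64_*_SB helpers store (size >> SPA_MINBLOCKSHIFT) - 1, as in sys/spa.h):

static void
example_ddk_prop_pack(void)
{
	ddt_key_t ddk;

	bzero(&ddk, sizeof (ddt_key_t));

	DDK_SET_LSIZE(&ddk, 128 << 10);			/* LSIZE field <- 255 */
	DDK_SET_PSIZE(&ddk, 4 << 10);			/* PSIZE field <- 7 */
	DDK_SET_COMPRESS(&ddk, ZIO_COMPRESS_LZJB);	/* comp byte */

	/* DDK_GET_LSIZE(&ddk) yields (255 + 1) << SPA_MINBLOCKSHIFT == 128K. */
}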
+ */ +typedef struct ddt_ops { + char ddt_op_name[32]; + int (*ddt_op_create)(objset_t *os, uint64_t *object, dmu_tx_t *tx, + boolean_t prehash); + int (*ddt_op_destroy)(objset_t *os, uint64_t object, dmu_tx_t *tx); + int (*ddt_op_lookup)(objset_t *os, uint64_t object, ddt_entry_t *dde); + int (*ddt_op_update)(objset_t *os, uint64_t object, ddt_entry_t *dde, + dmu_tx_t *tx); + int (*ddt_op_remove)(objset_t *os, uint64_t object, ddt_entry_t *dde, + dmu_tx_t *tx); + int (*ddt_op_walk)(objset_t *os, uint64_t object, ddt_entry_t *dde, + uint64_t *walk); + uint64_t (*ddt_op_count)(objset_t *os, uint64_t object); +} ddt_ops_t; + +#define DDT_NAMELEN 80 + +extern void ddt_object_name(ddt_t *ddt, enum ddt_type type, + enum ddt_class class, char *name); +extern int ddt_object_walk(ddt_t *ddt, enum ddt_type type, + enum ddt_class class, uint64_t *walk, ddt_entry_t *dde); +extern uint64_t ddt_object_count(ddt_t *ddt, enum ddt_type type, + enum ddt_class class); +extern int ddt_object_info(ddt_t *ddt, enum ddt_type type, + enum ddt_class class, dmu_object_info_t *); +extern boolean_t ddt_object_exists(ddt_t *ddt, enum ddt_type type, + enum ddt_class class); + +extern void ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, + uint64_t txg); +extern void ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk, + const ddt_phys_t *ddp, blkptr_t *bp); + +extern void ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp); + +extern void ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp); +extern void ddt_phys_clear(ddt_phys_t *ddp); +extern void ddt_phys_addref(ddt_phys_t *ddp); +extern void ddt_phys_decref(ddt_phys_t *ddp); +extern void ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, + uint64_t txg); +extern ddt_phys_t *ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp); +extern uint64_t ddt_phys_total_refcnt(const ddt_entry_t *dde); + +extern void ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg); + +extern void ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src); +extern void ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh); +extern boolean_t ddt_histogram_empty(const ddt_histogram_t *ddh); +extern void ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo); +extern void ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh); +extern void ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total); + +extern uint64_t ddt_get_dedup_dspace(spa_t *spa); +extern uint64_t ddt_get_pool_dedup_ratio(spa_t *spa); + +extern int ddt_ditto_copies_needed(ddt_t *ddt, ddt_entry_t *dde, + ddt_phys_t *ddp_willref); +extern int ddt_ditto_copies_present(ddt_entry_t *dde); + +extern size_t ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len); +extern void ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len); + +extern ddt_t *ddt_select(spa_t *spa, const blkptr_t *bp); +extern void ddt_enter(ddt_t *ddt); +extern void ddt_exit(ddt_t *ddt); +extern ddt_entry_t *ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add); +extern void ddt_remove(ddt_t *ddt, ddt_entry_t *dde); + +extern boolean_t ddt_class_contains(spa_t *spa, enum ddt_class max_class, + const blkptr_t *bp); + +extern ddt_entry_t *ddt_repair_start(ddt_t *ddt, const blkptr_t *bp); +extern void ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde); + +extern int ddt_entry_compare(const void *x1, const void *x2); + +extern void ddt_create(spa_t *spa); +extern int ddt_load(spa_t *spa); +extern void ddt_unload(spa_t *spa); +extern void ddt_sync(spa_t *spa, uint64_t txg); +extern int 
ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde); + +extern const ddt_ops_t ddt_zap_ops; + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DDT_H */ diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu.h index 3b1e5c8fbc1fd..b41bc96c38f1b 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -38,12 +38,14 @@ #include #include #include +#include #ifdef __cplusplus extern "C" { #endif struct uio; +struct xuio; struct page; struct vnode; struct spa; @@ -59,7 +61,8 @@ struct drr_end; struct zbookmark; struct spa; struct nvlist; -struct objset_impl; +struct arc_buf; +struct zio_prop; typedef struct objset objset_t; typedef struct dmu_tx dmu_tx_t; @@ -114,6 +117,11 @@ typedef enum dmu_object_type { DMU_OT_FUID_SIZE, /* FUID table size UINT64 */ DMU_OT_NEXT_CLONES, /* ZAP */ DMU_OT_SCRUB_QUEUE, /* ZAP */ + DMU_OT_USERGROUP_USED, /* ZAP */ + DMU_OT_USERGROUP_QUOTA, /* ZAP */ + DMU_OT_USERREFS, /* ZAP */ + DMU_OT_DDT_ZAP, /* ZAP */ + DMU_OT_DDT_STATS, /* ZAP */ DMU_OT_NUMTYPES } dmu_object_type_t; @@ -136,16 +144,6 @@ void zfs_oldacl_byteswap(void *buf, size_t size); void zfs_acl_byteswap(void *buf, size_t size); void zfs_znode_byteswap(void *buf, size_t size); -#define DS_MODE_NOHOLD 0 /* internal use only */ -#define DS_MODE_USER 1 /* simple access, no special needs */ -#define DS_MODE_OWNER 2 /* the "main" access, e.g. a mount */ -#define DS_MODE_TYPE_MASK 0x3 -#define DS_MODE_TYPE(x) ((x) & DS_MODE_TYPE_MASK) -#define DS_MODE_READONLY 0x8 -#define DS_MODE_IS_READONLY(x) ((x) & DS_MODE_READONLY) -#define DS_MODE_INCONSISTENT 0x10 -#define DS_MODE_IS_INCONSISTENT(x) ((x) & DS_MODE_INCONSISTENT) - #define DS_FIND_SNAPSHOTS (1<<0) #define DS_FIND_CHILDREN (1<<1) @@ -156,25 +154,32 @@ void zfs_znode_byteswap(void *buf, size_t size); #define DMU_MAX_ACCESS (10<<20) /* 10MB */ #define DMU_MAX_DELETEBLKCNT (20480) /* ~5MB of indirect blocks */ +#define DMU_USERUSED_OBJECT (-1ULL) +#define DMU_GROUPUSED_OBJECT (-2ULL) +#define DMU_DEADLIST_OBJECT (-3ULL) + /* * Public routines to create, destroy, open, and close objsets. 
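A usage sketch for the hold/rele pairing declared just below (the dataset name and helper are hypothetical); long-lived consumers such as a mount would use dmu_objset_own()/dmu_objset_disown() instead:

static int
example_objset_peek(const char *name)
{
	objset_t *os;
	int error;

	if ((error = dmu_objset_hold(name, FTAG, &os)) != 0)
		return (error);

	/* ... short-lived, read-only inquiries against os ... */

	dmu_objset_rele(os, FTAG);
	return (0);
}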
*/ -int dmu_objset_open(const char *name, dmu_objset_type_t type, int mode, - objset_t **osp); -int dmu_objset_open_ds(struct dsl_dataset *ds, dmu_objset_type_t type, - objset_t **osp); -void dmu_objset_close(objset_t *os); +int dmu_objset_hold(const char *name, void *tag, objset_t **osp); +int dmu_objset_own(const char *name, dmu_objset_type_t type, + boolean_t readonly, void *tag, objset_t **osp); +void dmu_objset_rele(objset_t *os, void *tag); +void dmu_objset_disown(objset_t *os, void *tag); +int dmu_objset_open_ds(struct dsl_dataset *ds, objset_t **osp); + int dmu_objset_evict_dbufs(objset_t *os); -int dmu_objset_create(const char *name, dmu_objset_type_t type, - objset_t *clone_parent, uint64_t flags, +int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg); -int dmu_objset_destroy(const char *name); -int dmu_snapshots_destroy(char *fsname, char *snapname); -int dmu_objset_rollback(objset_t *os); -int dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive); +int dmu_objset_clone(const char *name, struct dsl_dataset *clone_origin, + uint64_t flags); +int dmu_objset_destroy(const char *name, boolean_t defer); +int dmu_snapshots_destroy(char *fsname, char *snapname, boolean_t defer); +int dmu_objset_snapshot(char *fsname, char *snapname, struct nvlist *props, + boolean_t recursive); int dmu_objset_rename(const char *name, const char *newname, boolean_t recursive); -int dmu_objset_find(char *name, int func(char *, void *), void *arg, +int dmu_objset_find(char *name, int func(const char *, void *), void *arg, int flags); void dmu_objset_byteswap(void *buf, size_t size); @@ -201,9 +206,16 @@ typedef void dmu_buf_evict_func_t(struct dmu_buf *db, void *user_ptr); #define DMU_POOL_HISTORY "history" #define DMU_POOL_PROPS "pool_props" #define DMU_POOL_L2CACHE "l2cache" +#define DMU_POOL_TMP_USERREFS "tmp_userrefs" +#define DMU_POOL_DDT "DDT-%s-%s-%s" +#define DMU_POOL_DDT_STATS "DDT-statistics" /* 4x8 zbookmark_t */ #define DMU_POOL_SCRUB_BOOKMARK "scrub_bookmark" +/* 4x8 ddt_bookmark_t */ +#define DMU_POOL_SCRUB_DDT_BOOKMARK "scrub_ddt_bookmark" +/* 1x8 max_class */ +#define DMU_POOL_SCRUB_DDT_CLASS_MAX "scrub_ddt_class_max" /* 1x8 zap obj DMU_OT_SCRUB_QUEUE */ #define DMU_POOL_SCRUB_QUEUE "scrub_queue" /* 1x8 txg */ @@ -235,7 +247,7 @@ uint64_t dmu_object_alloc(objset_t *os, dmu_object_type_t ot, int dmu_object_claim(objset_t *os, uint64_t object, dmu_object_type_t ot, int blocksize, dmu_object_type_t bonus_type, int bonus_len, dmu_tx_t *tx); int dmu_object_reclaim(objset_t *os, uint64_t object, dmu_object_type_t ot, - int blocksize, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); + int blocksize, dmu_object_type_t bonustype, int bonuslen); /* * Free an object from this objset. @@ -298,11 +310,13 @@ void dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress, dmu_tx_t *tx); /* - * Decide how many copies of a given block we should make. Can be from - * 1 to SPA_DVAS_PER_BP. + * Decide how to write a block: checksum, compression, number of copies, etc. */ -int dmu_get_replication_level(struct objset_impl *, struct zbookmark *zb, - dmu_object_type_t ot); +#define WP_NOFILL 0x1 +#define WP_DMU_SYNC 0x2 + +void dmu_write_policy(objset_t *os, struct dnode *dn, int level, int wp, + struct zio_prop *zp); /* * The bonus data is accessed more or less like a regular buffer. 
* You must dmu_bonus_hold() to get the buffer, which will give you a @@ -397,6 +411,11 @@ void *dmu_buf_get_user(dmu_buf_t *db); */ void dmu_buf_will_dirty(dmu_buf_t *db, dmu_tx_t *tx); +/* + * Tells if the given dbuf is freeable. + */ +boolean_t dmu_buf_freeable(dmu_buf_t *); + /* * You must create a transaction, then hold the objects which you will * (or might) modify as part of this transaction. Then you must assign @@ -422,13 +441,33 @@ dmu_tx_t *dmu_tx_create(objset_t *os); void dmu_tx_hold_write(dmu_tx_t *tx, uint64_t object, uint64_t off, int len); void dmu_tx_hold_free(dmu_tx_t *tx, uint64_t object, uint64_t off, uint64_t len); -void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, char *name); +void dmu_tx_hold_zap(dmu_tx_t *tx, uint64_t object, int add, const char *name); void dmu_tx_hold_bonus(dmu_tx_t *tx, uint64_t object); void dmu_tx_abort(dmu_tx_t *tx); int dmu_tx_assign(dmu_tx_t *tx, uint64_t txg_how); void dmu_tx_wait(dmu_tx_t *tx); void dmu_tx_commit(dmu_tx_t *tx); +/* + * To register a commit callback, dmu_tx_callback_register() must be called. + * + * dcb_data is a pointer to caller private data that is passed on as a + * callback parameter. The caller is responsible for properly allocating and + * freeing it. + * + * When registering a callback, the transaction must be already created, but + * it cannot be committed or aborted. It can be assigned to a txg or not. + * + * The callback will be called after the transaction has been safely written + * to stable storage and will also be called if the dmu_tx is aborted. + * If there is any error which prevents the transaction from being committed to + * disk, the callback will be called with a value of error != 0. + */ +typedef void dmu_tx_callback_func_t(void *dcb_data, int error); + +void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func, + void *dcb_data); + /* * Free up the data blocks for a defined range of a file. If size is * zero, the range from offset to end-of-file is freed. @@ -445,8 +484,10 @@ int dmu_free_object(objset_t *os, uint64_t object); * Canfail routines will return 0 on success, or an errno if there is a * nonrecoverable I/O error. 
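/*
 * Illustrative use of the commit-callback interface described above; the
 * callback name, helper name, and argument handling are hypothetical.
 */
static void
example_commit_cb(void *dcb_data, int error)
{
	/* runs once the txg is on stable storage, or with error != 0 on abort */
	kmem_free(dcb_data, sizeof (uint64_t));
}

static void
example_register(objset_t *os, uint64_t object, void *dcb_data)
{
	dmu_tx_t *tx = dmu_tx_create(os);

	dmu_tx_hold_bonus(tx, object);
	if (dmu_tx_assign(tx, TXG_WAIT) != 0) {
		dmu_tx_abort(tx);	/* no callback registered yet */
		return;
	}
	dmu_tx_callback_register(tx, example_commit_cb, dcb_data);
	dmu_tx_commit(tx);
}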
*/ +#define DMU_READ_PREFETCH 0 /* prefetch */ +#define DMU_READ_NO_PREFETCH 1 /* don't prefetch */ int dmu_read(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, - void *buf); + void *buf, uint32_t flags); void dmu_write(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, const void *buf, dmu_tx_t *tx); void dmu_prealloc(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, @@ -456,6 +497,19 @@ int dmu_write_uio(objset_t *os, uint64_t object, struct uio *uio, uint64_t size, dmu_tx_t *tx); int dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, struct page *pp, dmu_tx_t *tx); +struct arc_buf *dmu_request_arcbuf(dmu_buf_t *handle, int size); +void dmu_return_arcbuf(struct arc_buf *buf); +void dmu_assign_arcbuf(dmu_buf_t *handle, uint64_t offset, struct arc_buf *buf, + dmu_tx_t *tx); +int dmu_xuio_init(struct xuio *uio, int niov); +void dmu_xuio_fini(struct xuio *uio); +int dmu_xuio_add(struct xuio *uio, struct arc_buf *abuf, offset_t off, + size_t n); +int dmu_xuio_cnt(struct xuio *uio); +struct arc_buf *dmu_xuio_arcbuf(struct xuio *uio, int i); +void dmu_xuio_clear(struct xuio *uio, int i); +void xuio_stat_wbuf_copied(); +void xuio_stat_wbuf_nocopy(); extern int zfs_prefetch_disable; @@ -466,19 +520,19 @@ void dmu_prefetch(objset_t *os, uint64_t object, uint64_t offset, uint64_t len); typedef struct dmu_object_info { - /* All sizes are in bytes. */ + /* All sizes are in bytes unless otherwise indicated. */ uint32_t doi_data_block_size; uint32_t doi_metadata_block_size; - uint64_t doi_bonus_size; dmu_object_type_t doi_type; dmu_object_type_t doi_bonus_type; + uint64_t doi_bonus_size; uint8_t doi_indirection; /* 2 = dnode->indirect->data */ uint8_t doi_checksum; uint8_t doi_compress; uint8_t doi_pad[5]; - /* Values below are number of 512-byte blocks. 
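/*
 * Sketch of the updated dmu_read() signature with its new flags argument;
 * the wrapper and its sequential/random distinction are only an example.
 */
static int
example_read(objset_t *os, uint64_t object, uint64_t offset,
    uint64_t size, void *buf, boolean_t sequential)
{
	return (dmu_read(os, object, offset, size, buf,
	    sequential ? DMU_READ_PREFETCH : DMU_READ_NO_PREFETCH));
}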
*/ - uint64_t doi_physical_blks; /* data + metadata */ - uint64_t doi_max_block_offset; + uint64_t doi_physical_blocks_512; /* data + metadata, 512b blks */ + uint64_t doi_max_offset; + uint64_t doi_fill_count; /* number of non-empty blocks */ } dmu_object_info_t; typedef void arc_byteswap_func_t(void *buf, size_t size); @@ -547,6 +601,11 @@ void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, */ uint64_t dmu_objset_fsid_guid(objset_t *os); +/* + * Get the [cm]time for an objset's snapshot dir + */ +timestruc_t dmu_objset_snap_cmtime(objset_t *os); + int dmu_objset_is_snapshot(objset_t *os); extern struct spa *dmu_objset_spa(objset_t *os); @@ -556,12 +615,18 @@ extern struct dsl_dataset *dmu_objset_ds(objset_t *os); extern void dmu_objset_name(objset_t *os, char *buf); extern dmu_objset_type_t dmu_objset_type(objset_t *os); extern uint64_t dmu_objset_id(objset_t *os); +extern uint64_t dmu_objset_logbias(objset_t *os); extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name, uint64_t *id, uint64_t *offp, boolean_t *case_conflict); extern int dmu_snapshot_realname(objset_t *os, char *name, char *real, int maxlen, boolean_t *conflict); extern int dmu_dir_list_next(objset_t *os, int namelen, char *name, uint64_t *idp, uint64_t *offp); + +typedef int objset_used_cb_t(dmu_object_type_t bonustype, + void *bonus, uint64_t *userp, uint64_t *groupp); +extern void dmu_objset_register_type(dmu_objset_type_t ost, + objset_used_cb_t *cb); extern void dmu_objset_set_user(objset_t *os, void *user_ptr); extern void *dmu_objset_get_user(objset_t *os); @@ -580,9 +645,20 @@ uint64_t dmu_tx_get_txg(dmu_tx_t *tx); * storage when the write completes this new data does not become a * permanent part of the file until the associated transaction commits. */ -typedef void dmu_sync_cb_t(dmu_buf_t *db, void *arg); -int dmu_sync(struct zio *zio, dmu_buf_t *db, - struct blkptr *bp, uint64_t txg, dmu_sync_cb_t *done, void *arg); + +/* + * {zfs,zvol,ztest}_get_done() args + */ +typedef struct zgd { + struct zilog *zgd_zilog; + struct blkptr *zgd_bp; + dmu_buf_t *zgd_db; + struct rl *zgd_rl; + void *zgd_private; +} zgd_t; + +typedef void dmu_sync_cb_t(zgd_t *arg, int error); +int dmu_sync(struct zio *zio, uint64_t txg, dmu_sync_cb_t *done, zgd_t *zgd); /* * Find the next hole or data block in file starting at *off @@ -617,15 +693,15 @@ typedef struct dmu_recv_cookie { struct dsl_dataset *drc_real_ds; struct drr_begin *drc_drrb; char *drc_tosnap; + char *drc_top_ds; boolean_t drc_newfs; boolean_t drc_force; } dmu_recv_cookie_t; -int dmu_recv_begin(char *tofs, char *tosnap, struct drr_begin *, - boolean_t force, objset_t *origin, boolean_t online, dmu_recv_cookie_t *); +int dmu_recv_begin(char *tofs, char *tosnap, char *topds, struct drr_begin *, + boolean_t force, objset_t *origin, dmu_recv_cookie_t *); int dmu_recv_stream(dmu_recv_cookie_t *drc, struct vnode *vp, offset_t *voffp); int dmu_recv_end(dmu_recv_cookie_t *drc); -void dmu_recv_abort_cleanup(dmu_recv_cookie_t *drc); /* CRC64 table */ #define ZFS_CRC64_POLY 0xC96C5795D7870F42ULL /* ECMA-182, reflected form */ diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_impl.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_impl.h index 96ce688e1551a..22f9f5f8c88c4 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_impl.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_impl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -210,11 +210,11 @@ extern "C" { * * ds_lock * protects: - * ds_user_ptr - * ds_user_evice_func + * ds_objset * ds_open_refcount * ds_snapname * ds_phys accounting + * ds_phys userrefs zapobj * ds_reserved * held from: * dsl_dataset_* @@ -232,6 +232,39 @@ extern "C" { struct objset; struct dmu_pool; +typedef struct dmu_xuio { + int next; + int cnt; + struct arc_buf **bufs; + iovec_t *iovp; +} dmu_xuio_t; + +typedef struct xuio_stats { + /* loaned yet not returned arc_buf */ + kstat_named_t xuiostat_onloan_rbuf; + kstat_named_t xuiostat_onloan_wbuf; + /* whether a copy is made when loaning out a read buffer */ + kstat_named_t xuiostat_rbuf_copied; + kstat_named_t xuiostat_rbuf_nocopy; + /* whether a copy is made when assigning a write buffer */ + kstat_named_t xuiostat_wbuf_copied; + kstat_named_t xuiostat_wbuf_nocopy; +} xuio_stats_t; + +static xuio_stats_t xuio_stats = { + { "onloan_read_buf", KSTAT_DATA_UINT64 }, + { "onloan_write_buf", KSTAT_DATA_UINT64 }, + { "read_buf_copied", KSTAT_DATA_UINT64 }, + { "read_buf_nocopy", KSTAT_DATA_UINT64 }, + { "write_buf_copied", KSTAT_DATA_UINT64 }, + { "write_buf_nocopy", KSTAT_DATA_UINT64 } +}; + +#define XUIOSTAT_INCR(stat, val) \ + atomic_add_64(&xuio_stats.stat.value.ui64, (val)) +#define XUIOSTAT_BUMP(stat) XUIOSTAT_INCR(stat, 1) + + #ifdef __cplusplus } #endif diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_objset.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_objset.h index 1d65727808c32..a153602021595 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_objset.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_objset.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -40,40 +40,50 @@ extern "C" { struct dsl_dataset; struct dmu_tx; -struct objset_impl; + +#define OBJSET_PHYS_SIZE 2048 +#define OBJSET_OLD_PHYS_SIZE 1024 + +#define OBJSET_FLAG_USERACCOUNTING_COMPLETE (1ULL<<0) typedef struct objset_phys { dnode_phys_t os_meta_dnode; zil_header_t os_zil_header; uint64_t os_type; - char os_pad[1024 - sizeof (dnode_phys_t) - sizeof (zil_header_t) - - sizeof (uint64_t)]; + uint64_t os_flags; + char os_pad[OBJSET_PHYS_SIZE - sizeof (dnode_phys_t)*3 - + sizeof (zil_header_t) - sizeof (uint64_t)*2]; + dnode_phys_t os_userused_dnode; + dnode_phys_t os_groupused_dnode; } objset_phys_t; struct objset { - struct objset_impl *os; - int os_mode; -}; - -typedef struct objset_impl { /* Immutable: */ struct dsl_dataset *os_dsl_dataset; spa_t *os_spa; arc_buf_t *os_phys_buf; objset_phys_t *os_phys; dnode_t *os_meta_dnode; + dnode_t *os_userused_dnode; + dnode_t *os_groupused_dnode; zilog_t *os_zil; - objset_t os; - uint8_t os_checksum; /* can change, under dsl_dir's locks */ - uint8_t os_compress; /* can change, under dsl_dir's locks */ - uint8_t os_copies; /* can change, under dsl_dir's locks */ - uint8_t os_primary_cache; /* can change, under dsl_dir's locks */ - uint8_t os_secondary_cache; /* can change, under dsl_dir's locks */ + + /* can change, under dsl_dir's locks: */ + uint8_t os_checksum; + uint8_t os_compress; + uint8_t os_copies; + uint8_t os_dedup_checksum; + uint8_t os_dedup_verify; + uint8_t os_logbias; + uint8_t os_primary_cache; + uint8_t os_secondary_cache; /* no lock needed: */ struct dmu_tx *os_synctx; /* XXX sketchy */ blkptr_t *os_rootbp; zil_header_t os_zil_header; + list_t os_synced_dnodes; + uint64_t os_flags; /* Protected by os_obj_lock */ kmutex_t os_obj_lock; @@ -89,44 +99,57 @@ typedef struct objset_impl { /* stuff we store for the user */ kmutex_t os_user_ptr_lock; void *os_user_ptr; -} objset_impl_t; +}; +#define DMU_META_OBJSET 0 #define DMU_META_DNODE_OBJECT 0 +#define DMU_OBJECT_IS_SPECIAL(obj) ((int64_t)(obj) <= 0) #define DMU_OS_IS_L2CACHEABLE(os) \ ((os)->os_secondary_cache == ZFS_CACHE_ALL || \ (os)->os_secondary_cache == ZFS_CACHE_METADATA) /* called from zpl */ -int dmu_objset_open(const char *name, dmu_objset_type_t type, int mode, - objset_t **osp); -void dmu_objset_close(objset_t *os); -int dmu_objset_create(const char *name, dmu_objset_type_t type, - objset_t *clone_parent, uint64_t flags, +int dmu_objset_hold(const char *name, void *tag, objset_t **osp); +int dmu_objset_own(const char *name, dmu_objset_type_t type, + boolean_t readonly, void *tag, objset_t **osp); +void dmu_objset_rele(objset_t *os, void *tag); +void dmu_objset_disown(objset_t *os, void *tag); +int dmu_objset_from_ds(struct dsl_dataset *ds, objset_t **osp); + +int dmu_objset_create(const char *name, dmu_objset_type_t type, uint64_t flags, void (*func)(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx), void *arg); -int dmu_objset_destroy(const char *name); -int dmu_objset_rollback(objset_t *os); -int dmu_objset_snapshot(char *fsname, char *snapname, boolean_t recursive); +int dmu_objset_clone(const char *name, struct dsl_dataset *clone_origin, + uint64_t flags); +int dmu_objset_destroy(const char *name, boolean_t defer); +int dmu_objset_snapshot(char *fsname, char *snapname, nvlist_t *props, + boolean_t recursive); void dmu_objset_stats(objset_t *os, nvlist_t *nv); void dmu_objset_fast_stat(objset_t *os, dmu_objset_stats_t *stat); void dmu_objset_space(objset_t *os, uint64_t *refdbytesp, uint64_t *availbytesp, uint64_t *usedobjsp, uint64_t *availobjsp); 
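/*
 * The user/group accounting objects use "negative" object numbers; a small
 * illustration of how DMU_OBJECT_IS_SPECIAL() separates them from ordinary
 * objects (this helper exists only as an example).
 */
static boolean_t
example_is_accounting_object(uint64_t object)
{
	if (!DMU_OBJECT_IS_SPECIAL(object))
		return (B_FALSE);	/* ordinary objects are > 0 */
	return (object == DMU_USERUSED_OBJECT ||
	    object == DMU_GROUPUSED_OBJECT);
}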
uint64_t dmu_objset_fsid_guid(objset_t *os); -int dmu_objset_find(char *name, int func(char *, void *), void *arg, +int dmu_objset_find(char *name, int func(const char *, void *), void *arg, int flags); int dmu_objset_find_spa(spa_t *spa, const char *name, int func(spa_t *, uint64_t, const char *, void *), void *arg, int flags); -int dmu_objset_prefetch(char *name, void *arg); +int dmu_objset_prefetch(const char *name, void *arg); void dmu_objset_byteswap(void *buf, size_t size); int dmu_objset_evict_dbufs(objset_t *os); +timestruc_t dmu_objset_snap_cmtime(objset_t *os); /* called from dsl */ -void dmu_objset_sync(objset_impl_t *os, zio_t *zio, dmu_tx_t *tx); -objset_impl_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds, +void dmu_objset_sync(objset_t *os, zio_t *zio, dmu_tx_t *tx); +boolean_t dmu_objset_is_dirty(objset_t *os, uint64_t txg); +objset_t *dmu_objset_create_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp, dmu_objset_type_t type, dmu_tx_t *tx); int dmu_objset_open_impl(spa_t *spa, struct dsl_dataset *ds, blkptr_t *bp, - objset_impl_t **osip); -void dmu_objset_evict(struct dsl_dataset *ds, void *arg); + objset_t **osp); +void dmu_objset_evict(objset_t *os); +void dmu_objset_do_userquota_callbacks(objset_t *os, dmu_tx_t *tx); +boolean_t dmu_objset_userused_enabled(objset_t *os); +int dmu_objset_userspace_upgrade(objset_t *os); +boolean_t dmu_objset_userspace_present(objset_t *os); #ifdef __cplusplus } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_traverse.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_traverse.h index 3e026891153c3..5b0821253dd78 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_traverse.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_traverse.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -36,8 +36,9 @@ extern "C" { struct dnode_phys; struct dsl_dataset; +struct zilog; -typedef int (blkptr_cb_t)(spa_t *spa, blkptr_t *bp, +typedef int (blkptr_cb_t)(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_t *zb, const struct dnode_phys *dnp, void *arg); #define TRAVERSE_PRE (1<<0) @@ -45,10 +46,12 @@ typedef int (blkptr_cb_t)(spa_t *spa, blkptr_t *bp, #define TRAVERSE_PREFETCH_METADATA (1<<2) #define TRAVERSE_PREFETCH_DATA (1<<3) #define TRAVERSE_PREFETCH (TRAVERSE_PREFETCH_METADATA | TRAVERSE_PREFETCH_DATA) +#define TRAVERSE_HARD (1<<4) -int traverse_dataset(struct dsl_dataset *ds, uint64_t txg_start, - int flags, blkptr_cb_t func, void *arg); -int traverse_pool(spa_t *spa, blkptr_cb_t func, void *arg); +int traverse_dataset(struct dsl_dataset *ds, + uint64_t txg_start, int flags, blkptr_cb_t func, void *arg); +int traverse_pool(spa_t *spa, + uint64_t txg_start, int flags, blkptr_cb_t func, void *arg); #ifdef __cplusplus } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_tx.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_tx.h index 2727daaaa76b1..ed01cdf38210f 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_tx.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_tx.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
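/*
 * Sketch of a callback matching the new blkptr_cb_t signature (which now
 * receives the zilog and a const blkptr_t); counting allocated blocks is
 * only an illustration, and the callback name is made up.
 */
/* ARGSUSED */
static int
example_count_blocks_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp,
    const zbookmark_t *zb, const struct dnode_phys *dnp, void *arg)
{
	uint64_t *countp = arg;

	if (bp != NULL && !BP_IS_HOLE(bp))
		(*countp)++;
	return (0);
}
/* e.g. traverse_dataset(ds, 0, TRAVERSE_PRE, example_count_blocks_cb, &count) */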
*/ #ifndef _SYS_DMU_TX_H #define _SYS_DMU_TX_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -59,6 +57,7 @@ struct dmu_tx { txg_handle_t tx_txgh; void *tx_tempreserve_cookie; struct dmu_tx_hold *tx_needassign_txh; + list_t tx_callbacks; /* list of dmu_tx_callback_t on this dmu_tx */ uint8_t tx_anyobj; int tx_err; #ifdef ZFS_DEBUG @@ -98,6 +97,11 @@ typedef struct dmu_tx_hold { #endif } dmu_tx_hold_t; +typedef struct dmu_tx_callback { + list_node_t dcb_node; /* linked to tx_callbacks list */ + dmu_tx_callback_func_t *dcb_func; /* caller function pointer */ + void *dcb_data; /* caller private data */ +} dmu_tx_callback_t; /* * These routines are defined in dmu.h, and are called by the user. @@ -109,6 +113,10 @@ void dmu_tx_abort(dmu_tx_t *tx); uint64_t dmu_tx_get_txg(dmu_tx_t *tx); void dmu_tx_wait(dmu_tx_t *tx); +void dmu_tx_callback_register(dmu_tx_t *tx, dmu_tx_callback_func_t *dcb_func, + void *dcb_data); +void dmu_tx_do_callbacks(list_t *cb_list, int error); + /* * These routines are defined in dmu_spa.h, and are called by the SPA. */ diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_zfetch.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_zfetch.h index c94bced933aff..78cadd2b1ee1b 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_zfetch.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dmu_zfetch.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _DFETCH_H #define _DFETCH_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #ifdef __cplusplus @@ -63,6 +61,9 @@ typedef struct zfetch { uint64_t zf_alloc_fail; /* # of failed attempts to alloc strm */ } zfetch_t; +void zfetch_init(void); +void zfetch_fini(void); + void dmu_zfetch_init(zfetch_t *, struct dnode *); void dmu_zfetch_rele(zfetch_t *); void dmu_zfetch(zfetch_t *, uint64_t, uint64_t, int); diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dnode.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dnode.h index c79ff48a60c56..58e62d93c1460 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dnode.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dnode.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -74,6 +74,7 @@ extern "C" { #define DNODES_PER_BLOCK_SHIFT (DNODE_BLOCK_SHIFT - DNODE_SHIFT) #define DNODES_PER_BLOCK (1ULL << DNODES_PER_BLOCK_SHIFT) #define DNODES_PER_LEVEL_SHIFT (DN_MAX_INDBLKSHIFT - SPA_BLKPTRSHIFT) +#define DNODES_PER_LEVEL (1ULL << DNODES_PER_LEVEL_SHIFT) /* The +2 here is a cheesy way to round up */ #define DN_MAX_LEVELS (2 + ((DN_MAX_OFFSET_SHIFT - SPA_MINBLOCKSHIFT) / \ @@ -88,7 +89,7 @@ extern "C" { #define EPB(blkshift, typeshift) (1 << (blkshift - typeshift)) struct dmu_buf_impl; -struct objset_impl; +struct objset; struct zio; enum dnode_dirtycontext { @@ -98,7 +99,8 @@ enum dnode_dirtycontext { }; /* Is dn_used in bytes? if not, it's in multiples of SPA_MINBLOCKSIZE */ -#define DNODE_FLAG_USED_BYTES (1<<0) +#define DNODE_FLAG_USED_BYTES (1<<0) +#define DNODE_FLAG_USERUSED_ACCOUNTED (1<<1) typedef struct dnode_phys { uint8_t dn_type; /* dmu_object_type_t */ @@ -131,14 +133,11 @@ typedef struct dnode { */ krwlock_t dn_struct_rwlock; - /* - * Our link on dataset's dd_dnodes list. 
- * Protected by dd_accounting_mtx. - */ + /* Our link on dn_objset->os_dnodes list; protected by os_lock. */ list_node_t dn_link; /* immutable: */ - struct objset_impl *dn_objset; + struct objset *dn_objset; uint64_t dn_object; struct dmu_buf_impl *dn_dbuf; dnode_phys_t *dn_phys; /* pointer into dn->dn_dbuf->db.db_data */ @@ -160,6 +159,7 @@ typedef struct dnode { uint16_t dn_datablkszsec; /* in 512b sectors */ uint32_t dn_datablksz; /* in bytes */ uint64_t dn_maxblkid; + uint8_t dn_next_nblkptr[TXG_SIZE]; uint8_t dn_next_nlevels[TXG_SIZE]; uint8_t dn_next_indblkshift[TXG_SIZE]; uint16_t dn_next_bonuslen[TXG_SIZE]; @@ -190,6 +190,9 @@ typedef struct dnode { /* parent IO for current sync write */ zio_t *dn_zio; + /* used in syncing context */ + dnode_phys_t *dn_oldphys; + /* holds prefetch structure */ struct zfetch dn_zfetch; } dnode_t; @@ -200,14 +203,14 @@ typedef struct free_range { uint64_t fr_nblks; } free_range_t; -dnode_t *dnode_special_open(struct objset_impl *dd, dnode_phys_t *dnp, +dnode_t *dnode_special_open(struct objset *dd, dnode_phys_t *dnp, uint64_t object); void dnode_special_close(dnode_t *dn); void dnode_setbonuslen(dnode_t *dn, int newsize, dmu_tx_t *tx); -int dnode_hold(struct objset_impl *dd, uint64_t object, +int dnode_hold(struct objset *dd, uint64_t object, void *ref, dnode_t **dnp); -int dnode_hold_impl(struct objset_impl *dd, uint64_t object, int flag, +int dnode_hold_impl(struct objset *dd, uint64_t object, int flag, void *ref, dnode_t **dnp); boolean_t dnode_add_ref(dnode_t *dn, void *ref); void dnode_rele(dnode_t *dn, void *ref); diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_dataset.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_dataset.h index 8665aec2dda87..6eb7505ea53ff 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_dataset.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_dataset.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -42,8 +42,6 @@ struct dsl_dataset; struct dsl_dir; struct dsl_pool; -typedef void dsl_dataset_evict_func_t(struct dsl_dataset *, void *); - #define DS_FLAG_INCONSISTENT (1ULL<<0) #define DS_IS_INCONSISTENT(ds) \ ((ds)->ds_phys->ds_flags & DS_FLAG_INCONSISTENT) @@ -62,6 +60,14 @@ typedef void dsl_dataset_evict_func_t(struct dsl_dataset *, void *); */ #define DS_FLAG_UNIQUE_ACCURATE (1ULL<<2) +/* + * DS_FLAG_DEFER_DESTROY is set after 'zfs destroy -d' has been called + * on a dataset. This allows the dataset to be destroyed using 'zfs release'. + */ +#define DS_FLAG_DEFER_DESTROY (1ULL<<3) +#define DS_IS_DEFER_DESTROY(ds) \ + ((ds)->ds_phys->ds_flags & DS_FLAG_DEFER_DESTROY) + /* * DS_FLAG_CI_DATASET is set if the dataset contains a file system whose * name lookups should be performed case-insensitively. 
@@ -93,7 +99,8 @@ typedef struct dsl_dataset_phys { blkptr_t ds_bp; uint64_t ds_next_clones_obj; /* DMU_OT_DSL_CLONES */ uint64_t ds_props_obj; /* DMU_OT_DSL_PROPS for snaps */ - uint64_t ds_pad[6]; /* pad out to 320 bytes for good measure */ + uint64_t ds_userrefs_obj; /* DMU_OT_USERREFS */ + uint64_t ds_pad[5]; /* pad out to 320 bytes for good measure */ } dsl_dataset_phys_t; typedef struct dsl_dataset { @@ -111,6 +118,9 @@ typedef struct dsl_dataset { /* has internal locking: */ bplist_t ds_deadlist; + /* to protect against multiple concurrent incremental recv */ + kmutex_t ds_recvlock; + /* protected by lock on pool's dp_dirty_datasets list */ txg_node_t ds_dirty_link; list_node_t ds_synced_link; @@ -120,8 +130,8 @@ typedef struct dsl_dataset { * Protected by ds_lock: */ kmutex_t ds_lock; - void *ds_user_ptr; - dsl_dataset_evict_func_t *ds_user_evict_func; + objset_t *ds_objset; + uint64_t ds_userrefs; /* * ds_owner is protected by the ds_rwlock and the ds_lock @@ -143,6 +153,15 @@ typedef struct dsl_dataset { char ds_snapname[MAXNAMELEN]; } dsl_dataset_t; +struct dsl_ds_destroyarg { + dsl_dataset_t *ds; /* ds to destroy */ + dsl_dataset_t *rm_origin; /* also remove our origin? */ + boolean_t is_origin_rm; /* set if removing origin snap */ + boolean_t defer; /* destroy -d requested? */ + boolean_t releasing; /* destroying due to release? */ + boolean_t need_prep; /* do we need to retry due to EBUSY? */ +}; + #define dsl_dataset_is_snapshot(ds) \ ((ds)->ds_phys->ds_num_children != 0) @@ -152,36 +171,38 @@ typedef struct dsl_dataset { int dsl_dataset_hold(const char *name, void *tag, dsl_dataset_t **dsp); int dsl_dataset_hold_obj(struct dsl_pool *dp, uint64_t dsobj, void *tag, dsl_dataset_t **); -int dsl_dataset_own(const char *name, int flags, void *owner, - dsl_dataset_t **dsp); +int dsl_dataset_own(const char *name, boolean_t inconsistentok, + void *tag, dsl_dataset_t **dsp); int dsl_dataset_own_obj(struct dsl_pool *dp, uint64_t dsobj, - int flags, void *owner, dsl_dataset_t **); + boolean_t inconsistentok, void *tag, dsl_dataset_t **dsp); void dsl_dataset_name(dsl_dataset_t *ds, char *name); void dsl_dataset_rele(dsl_dataset_t *ds, void *tag); -void dsl_dataset_disown(dsl_dataset_t *ds, void *owner); +void dsl_dataset_disown(dsl_dataset_t *ds, void *tag); void dsl_dataset_drop_ref(dsl_dataset_t *ds, void *tag); boolean_t dsl_dataset_tryown(dsl_dataset_t *ds, boolean_t inconsistentok, - void *owner); -void dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *owner); + void *tag); +void dsl_dataset_make_exclusive(dsl_dataset_t *ds, void *tag); uint64_t dsl_dataset_create_sync(dsl_dir_t *pds, const char *lastname, dsl_dataset_t *origin, uint64_t flags, cred_t *, dmu_tx_t *); uint64_t dsl_dataset_create_sync_dd(dsl_dir_t *dd, dsl_dataset_t *origin, uint64_t flags, dmu_tx_t *tx); -int dsl_dataset_destroy(dsl_dataset_t *ds, void *tag); -int dsl_snapshots_destroy(char *fsname, char *snapname); +int dsl_dataset_destroy(dsl_dataset_t *ds, void *tag, boolean_t defer); +int dsl_snapshots_destroy(char *fsname, char *snapname, boolean_t defer); dsl_checkfunc_t dsl_dataset_destroy_check; dsl_syncfunc_t dsl_dataset_destroy_sync; dsl_checkfunc_t dsl_dataset_snapshot_check; dsl_syncfunc_t dsl_dataset_snapshot_sync; -int dsl_dataset_rollback(dsl_dataset_t *ds, dmu_objset_type_t ost); int dsl_dataset_rename(char *name, const char *newname, boolean_t recursive); -int dsl_dataset_promote(const char *name); +int dsl_dataset_promote(const char *name, char *conflsnap); int 
dsl_dataset_clone_swap(dsl_dataset_t *clone, dsl_dataset_t *origin_head, boolean_t force); - -void *dsl_dataset_set_user_ptr(dsl_dataset_t *ds, - void *p, dsl_dataset_evict_func_t func); -void *dsl_dataset_get_user_ptr(dsl_dataset_t *ds); +int dsl_dataset_user_hold(char *dsname, char *snapname, char *htag, + boolean_t recursive, boolean_t temphold); +int dsl_dataset_user_release(char *dsname, char *snapname, char *htag, + boolean_t recursive); +int dsl_dataset_user_release_tmp(struct dsl_pool *dp, uint64_t dsobj, + char *htag); +int dsl_dataset_get_holds(const char *dsname, nvlist_t **nvp); blkptr_t *dsl_dataset_get_blkptr(dsl_dataset_t *ds); void dsl_dataset_set_blkptr(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx); @@ -192,10 +213,11 @@ boolean_t dsl_dataset_modified_since_lastsnap(dsl_dataset_t *ds); void dsl_dataset_sync(dsl_dataset_t *os, zio_t *zio, dmu_tx_t *tx); -void dsl_dataset_block_born(dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx); -int dsl_dataset_block_kill(dsl_dataset_t *ds, blkptr_t *bp, zio_t *pio, +void dsl_dataset_block_born(dsl_dataset_t *ds, const blkptr_t *bp, dmu_tx_t *tx); -int dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth); +int dsl_dataset_block_kill(dsl_dataset_t *ds, const blkptr_t *bp, + dmu_tx_t *tx, boolean_t async); +boolean_t dsl_dataset_block_freeable(dsl_dataset_t *ds, uint64_t blk_birth); uint64_t dsl_dataset_prev_snap_txg(dsl_dataset_t *ds); void dsl_dataset_dirty(dsl_dataset_t *ds, dmu_tx_t *tx); @@ -211,13 +233,14 @@ int dsl_dsobj_to_dsname(char *pname, uint64_t obj, char *buf); int dsl_dataset_check_quota(dsl_dataset_t *ds, boolean_t check_quota, uint64_t asize, uint64_t inflight, uint64_t *used, uint64_t *ref_rsrv); -int dsl_dataset_set_quota(const char *dsname, uint64_t quota); +int dsl_dataset_set_quota(const char *dsname, zprop_source_t source, + uint64_t quota); void dsl_dataset_set_quota_sync(void *arg1, void *arg2, cred_t *cr, dmu_tx_t *tx); -int dsl_dataset_set_reservation(const char *dsname, uint64_t reservation); -void dsl_dataset_set_flags(dsl_dataset_t *ds, uint64_t flags); -int64_t dsl_dataset_new_refreservation(dsl_dataset_t *ds, uint64_t reservation, - dmu_tx_t *tx); +int dsl_dataset_set_reservation(const char *dsname, zprop_source_t source, + uint64_t reservation); + +int dsl_destroy_inconsistent(const char *dsname, void *arg); #ifdef ZFS_DEBUG #define dprintf_ds(ds, fmt, ...) do { \ diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_deleg.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_deleg.h index a29e44e67d0c5..a26a3f7058a19 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_deleg.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_deleg.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
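/*
 * Illustrative call sequence for the new user-hold interface (the machinery
 * behind 'zfs hold'/'zfs release'); the helper and its arguments are made up.
 */
static int
example_hold_and_release(char *dsname, char *snapname, char *htag)
{
	int error;

	error = dsl_dataset_user_hold(dsname, snapname, htag,
	    B_FALSE /* recursive */, B_FALSE /* temphold */);
	if (error != 0)
		return (error);
	return (dsl_dataset_user_release(dsname, snapname, htag,
	    B_FALSE /* recursive */));
}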
*/ #ifndef _SYS_DSL_DELEG_H #define _SYS_DSL_DELEG_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -51,6 +49,12 @@ extern "C" { #define ZFS_DELEG_PERM_ALLOW "allow" #define ZFS_DELEG_PERM_USERPROP "userprop" #define ZFS_DELEG_PERM_VSCAN "vscan" +#define ZFS_DELEG_PERM_USERQUOTA "userquota" +#define ZFS_DELEG_PERM_GROUPQUOTA "groupquota" +#define ZFS_DELEG_PERM_USERUSED "userused" +#define ZFS_DELEG_PERM_GROUPUSED "groupused" +#define ZFS_DELEG_PERM_HOLD "hold" +#define ZFS_DELEG_PERM_RELEASE "release" /* * Note: the names of properties that are marked delegatable are also diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_dir.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_dir.h index 86b9636ceaabb..14a64e019e0fa 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_dir.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_dir.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -89,6 +89,7 @@ struct dsl_dir { /* Protected by dd_lock */ kmutex_t dd_lock; list_t dd_prop_cbs; /* list of dsl_prop_cb_record_t's */ + timestruc_t dd_snap_cmtime; /* last time snapshot namespace changed */ /* gross estimate of space used by in-flight tx's */ uint64_t dd_tempreserved[TXG_SIZE]; @@ -107,7 +108,6 @@ int dsl_dir_open_obj(dsl_pool_t *dp, uint64_t ddobj, const char *tail, void *tag, dsl_dir_t **); void dsl_dir_name(dsl_dir_t *dd, char *buf); int dsl_dir_namelen(dsl_dir_t *dd); -int dsl_dir_is_private(dsl_dir_t *dd); uint64_t dsl_dir_create_sync(dsl_pool_t *dp, dsl_dir_t *pds, const char *name, dmu_tx_t *tx); dsl_checkfunc_t dsl_dir_destroy_check; @@ -126,14 +126,18 @@ void dsl_dir_diduse_space(dsl_dir_t *dd, dd_used_t type, int64_t used, int64_t compressed, int64_t uncompressed, dmu_tx_t *tx); void dsl_dir_transfer_space(dsl_dir_t *dd, int64_t delta, dd_used_t oldtype, dd_used_t newtype, dmu_tx_t *tx); -int dsl_dir_set_quota(const char *ddname, uint64_t quota); -int dsl_dir_set_reservation(const char *ddname, uint64_t reservation); +int dsl_dir_set_quota(const char *ddname, zprop_source_t source, + uint64_t quota); +int dsl_dir_set_reservation(const char *ddname, zprop_source_t source, + uint64_t reservation); int dsl_dir_rename(dsl_dir_t *dd, const char *newname); int dsl_dir_transfer_possible(dsl_dir_t *sdd, dsl_dir_t *tdd, uint64_t space); int dsl_dir_set_reservation_check(void *arg1, void *arg2, dmu_tx_t *tx); boolean_t dsl_dir_is_clone(dsl_dir_t *dd); void dsl_dir_new_refreservation(dsl_dir_t *dd, struct dsl_dataset *ds, uint64_t reservation, cred_t *cr, dmu_tx_t *tx); +void dsl_dir_snap_cmtime_update(dsl_dir_t *dd); +timestruc_t dsl_dir_snap_cmtime(dsl_dir_t *dd); /* internal reserved dir name */ #define MOS_DIR_NAME "$MOS" diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_pool.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_pool.h index 3bb4ad4efe55f..4e49d212a3052 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_pool.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_pool.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -32,6 +32,7 @@ #include #include #include +#include #ifdef __cplusplus extern "C" { @@ -77,13 +78,15 @@ typedef struct dsl_pool { struct dsl_dir *dp_mos_dir; struct dsl_dataset *dp_origin_snap; uint64_t dp_root_dir_obj; + struct taskq *dp_vnrele_taskq; /* No lock needed - sync context only */ blkptr_t dp_meta_rootbp; list_t dp_synced_datasets; hrtime_t dp_read_overhead; - uint64_t dp_throughput; + uint64_t dp_throughput; /* bytes per millisec */ uint64_t dp_write_limit; + uint64_t dp_tmp_userrefs_obj; /* Uses dp_lock */ kmutex_t dp_lock; @@ -94,12 +97,15 @@ typedef struct dsl_pool { uint64_t dp_scrub_queue_obj; uint64_t dp_scrub_min_txg; uint64_t dp_scrub_max_txg; + uint64_t dp_scrub_start_time; + uint64_t dp_scrub_ddt_class_max; zbookmark_t dp_scrub_bookmark; + ddt_bookmark_t dp_scrub_ddt_bookmark; boolean_t dp_scrub_pausing; boolean_t dp_scrub_isresilver; - uint64_t dp_scrub_start_time; - kmutex_t dp_scrub_cancel_lock; /* protects dp_scrub_restart */ boolean_t dp_scrub_restart; + kmutex_t dp_scrub_cancel_lock; /* protects dp_scrub_restart */ + zio_t *dp_scrub_prefetch_zio_root; /* Has its own locking */ tx_state_t dp_tx; @@ -122,15 +128,15 @@ int dsl_pool_open(spa_t *spa, uint64_t txg, dsl_pool_t **dpp); void dsl_pool_close(dsl_pool_t *dp); dsl_pool_t *dsl_pool_create(spa_t *spa, nvlist_t *zplprops, uint64_t txg); void dsl_pool_sync(dsl_pool_t *dp, uint64_t txg); -void dsl_pool_zil_clean(dsl_pool_t *dp); +void dsl_pool_sync_done(dsl_pool_t *dp, uint64_t txg); int dsl_pool_sync_context(dsl_pool_t *dp); uint64_t dsl_pool_adjustedsize(dsl_pool_t *dp, boolean_t netfree); +uint64_t dsl_pool_adjustedfree(dsl_pool_t *dp, boolean_t netfree); int dsl_pool_tempreserve_space(dsl_pool_t *dp, uint64_t space, dmu_tx_t *tx); void dsl_pool_tempreserve_clear(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx); void dsl_pool_memory_pressure(dsl_pool_t *dp); void dsl_pool_willuse_space(dsl_pool_t *dp, int64_t space, dmu_tx_t *tx); -int dsl_free(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp, - zio_done_func_t *done, void *private, uint32_t arc_flags); +void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp); void dsl_pool_ds_destroyed(struct dsl_dataset *ds, struct dmu_tx *tx); void dsl_pool_ds_snapshotted(struct dsl_dataset *ds, struct dmu_tx *tx); void dsl_pool_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2, @@ -142,6 +148,16 @@ int dsl_pool_scrub_cancel(dsl_pool_t *dp); int dsl_pool_scrub_clean(dsl_pool_t *dp); void dsl_pool_scrub_sync(dsl_pool_t *dp, dmu_tx_t *tx); void dsl_pool_scrub_restart(dsl_pool_t *dp); +void dsl_pool_scrub_ddt_entry(dsl_pool_t *dp, enum zio_checksum checksum, + const ddt_entry_t *dde); + +taskq_t *dsl_pool_vnrele_taskq(dsl_pool_t *dp); + +extern int dsl_pool_user_hold(dsl_pool_t *dp, uint64_t dsobj, + const char *tag, uint64_t *now, dmu_tx_t *tx); +extern int dsl_pool_user_release(dsl_pool_t *dp, uint64_t dsobj, + const char *tag, dmu_tx_t *tx); +extern void dsl_pool_clean_tmp_userrefs(dsl_pool_t *dp); #ifdef __cplusplus } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_prop.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_prop.h index d66caa86cff61..d8a8ab2d64e4a 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_prop.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/dsl_prop.h @@ -19,18 +19,17 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _SYS_DSL_PROP_H #define _SYS_DSL_PROP_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include +#include #ifdef __cplusplus extern "C" { @@ -50,6 +49,25 @@ typedef struct dsl_prop_cb_record { void *cbr_arg; } dsl_prop_cb_record_t; +typedef struct dsl_props_arg { + nvlist_t *pa_props; + zprop_source_t pa_source; +} dsl_props_arg_t; + +typedef struct dsl_prop_set_arg { + const char *psa_name; + zprop_source_t psa_source; + int psa_intsz; + int psa_numints; + const void *psa_value; + + /* + * Used to handle the special requirements of the quota and reservation + * properties. + */ + uint64_t psa_effective_value; +} dsl_prop_setarg_t; + int dsl_prop_register(struct dsl_dataset *ds, const char *propname, dsl_prop_changed_cb_t *callback, void *cbarg); int dsl_prop_unregister(struct dsl_dataset *ds, const char *propname, @@ -60,17 +78,37 @@ int dsl_prop_get(const char *ddname, const char *propname, int intsz, int numints, void *buf, char *setpoint); int dsl_prop_get_integer(const char *ddname, const char *propname, uint64_t *valuep, char *setpoint); -int dsl_prop_get_all(objset_t *os, nvlist_t **nvp, boolean_t local); +int dsl_prop_get_all(objset_t *os, nvlist_t **nvp); +int dsl_prop_get_received(objset_t *os, nvlist_t **nvp); int dsl_prop_get_ds(struct dsl_dataset *ds, const char *propname, int intsz, int numints, void *buf, char *setpoint); int dsl_prop_get_dd(struct dsl_dir *dd, const char *propname, - int intsz, int numints, void *buf, char *setpoint); + int intsz, int numints, void *buf, char *setpoint, + boolean_t snapshot); +dsl_syncfunc_t dsl_props_set_sync; int dsl_prop_set(const char *ddname, const char *propname, - int intsz, int numints, const void *buf); -void dsl_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val, + zprop_source_t source, int intsz, int numints, const void *buf); +int dsl_props_set(const char *dsname, zprop_source_t source, nvlist_t *nvl); +void dsl_dir_prop_set_uint64_sync(dsl_dir_t *dd, const char *name, uint64_t val, cred_t *cr, dmu_tx_t *tx); +void dsl_prop_setarg_init_uint64(dsl_prop_setarg_t *psa, const char *propname, + zprop_source_t source, uint64_t *value); +int dsl_prop_predict_sync(dsl_dir_t *dd, dsl_prop_setarg_t *psa); +#ifdef ZFS_DEBUG +void dsl_prop_check_prediction(dsl_dir_t *dd, dsl_prop_setarg_t *psa); +#define DSL_PROP_CHECK_PREDICTION(dd, psa) \ + dsl_prop_check_prediction((dd), (psa)) +#else +#define DSL_PROP_CHECK_PREDICTION(dd, psa) /* nothing */ +#endif + +/* flag first receive on or after SPA_VERSION_RECVD_PROPS */ +boolean_t dsl_prop_get_hasrecvd(objset_t *os); +void dsl_prop_set_hasrecvd(objset_t *os); +void dsl_prop_unset_hasrecvd(objset_t *os); + void dsl_prop_nvlist_add_uint64(nvlist_t *nv, zfs_prop_t prop, uint64_t value); void dsl_prop_nvlist_add_string(nvlist_t *nv, zfs_prop_t prop, const char *value); diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/metaslab.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/metaslab.h index 1c9d89e8fd69e..5ce6251ddbd3f 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/metaslab.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/metaslab.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
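/*
 * Sketch of setting a numeric property through the source-aware dsl_prop_set()
 * interface; the dataset name, property, and value are examples only.
 */
static int
example_set_recordsize(const char *dsname, uint64_t recordsize)
{
	return (dsl_prop_set(dsname, "recordsize", ZPROP_SRC_LOCAL,
	    sizeof (recordsize), 1, &recordsize));
}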
*/ @@ -36,14 +36,14 @@ extern "C" { #endif -typedef struct metaslab_class metaslab_class_t; -typedef struct metaslab_group metaslab_group_t; +extern space_map_ops_t *zfs_metaslab_ops; extern metaslab_t *metaslab_init(metaslab_group_t *mg, space_map_obj_t *smo, uint64_t start, uint64_t size, uint64_t txg); extern void metaslab_fini(metaslab_t *msp); extern void metaslab_sync(metaslab_t *msp, uint64_t txg); extern void metaslab_sync_done(metaslab_t *msp, uint64_t txg); +extern void metaslab_sync_reassess(metaslab_group_t *mg); #define METASLAB_HINTBP_FAVOR 0x0 #define METASLAB_HINTBP_AVOID 0x1 @@ -55,14 +55,24 @@ extern void metaslab_free(spa_t *spa, const blkptr_t *bp, uint64_t txg, boolean_t now); extern int metaslab_claim(spa_t *spa, const blkptr_t *bp, uint64_t txg); -extern metaslab_class_t *metaslab_class_create(void); +extern metaslab_class_t *metaslab_class_create(spa_t *spa, + space_map_ops_t *ops); extern void metaslab_class_destroy(metaslab_class_t *mc); -extern void metaslab_class_add(metaslab_class_t *mc, metaslab_group_t *mg); -extern void metaslab_class_remove(metaslab_class_t *mc, metaslab_group_t *mg); +extern int metaslab_class_validate(metaslab_class_t *mc); + +extern void metaslab_class_space_update(metaslab_class_t *mc, + int64_t alloc_delta, int64_t defer_delta, + int64_t space_delta, int64_t dspace_delta); +extern uint64_t metaslab_class_get_alloc(metaslab_class_t *mc); +extern uint64_t metaslab_class_get_space(metaslab_class_t *mc); +extern uint64_t metaslab_class_get_dspace(metaslab_class_t *mc); +extern uint64_t metaslab_class_get_deferred(metaslab_class_t *mc); extern metaslab_group_t *metaslab_group_create(metaslab_class_t *mc, vdev_t *vd); extern void metaslab_group_destroy(metaslab_group_t *mg); +extern void metaslab_group_activate(metaslab_group_t *mg); +extern void metaslab_group_passivate(metaslab_group_t *mg); #ifdef __cplusplus } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/metaslab_impl.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/metaslab_impl.h index 5980cbc843aca..07988dd51a738 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/metaslab_impl.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/metaslab_impl.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
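/*
 * Sketch of the class/group lifecycle around the new ops-vector based
 * constructor; the spa and vdev are assumed to be set up by the caller,
 * and the helper exists only for illustration.
 */
static metaslab_class_t *
example_make_class(spa_t *spa, vdev_t *vd)
{
	metaslab_class_t *mc = metaslab_class_create(spa, zfs_metaslab_ops);
	metaslab_group_t *mg = metaslab_group_create(mc, vd);

	metaslab_group_activate(mg);	/* make it eligible for allocations */
	return (mc);
}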
*/ #ifndef _SYS_METASLAB_IMPL_H #define _SYS_METASLAB_IMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -39,15 +37,23 @@ extern "C" { #endif struct metaslab_class { + spa_t *mc_spa; metaslab_group_t *mc_rotor; - uint64_t mc_allocated; + space_map_ops_t *mc_ops; + uint64_t mc_aliquot; + uint64_t mc_alloc; /* total allocated space */ + uint64_t mc_deferred; /* total deferred frees */ + uint64_t mc_space; /* total space (alloc + free) */ + uint64_t mc_dspace; /* total deflated space */ }; struct metaslab_group { kmutex_t mg_lock; avl_tree_t mg_metaslab_tree; uint64_t mg_aliquot; + uint64_t mg_bonus_area; int64_t mg_bias; + int64_t mg_activation_count; metaslab_class_t *mg_class; vdev_t *mg_vd; metaslab_group_t *mg_prev; @@ -67,7 +73,9 @@ struct metaslab { space_map_obj_t ms_smo_syncing; /* syncing space map object */ space_map_t ms_allocmap[TXG_SIZE]; /* allocated this txg */ space_map_t ms_freemap[TXG_SIZE]; /* freed this txg */ + space_map_t ms_defermap[TXG_DEFER_SIZE]; /* deferred frees */ space_map_t ms_map; /* in-core free space map */ + int64_t ms_deferspace; /* sum of ms_defermap[] space */ uint64_t ms_weight; /* weight vs. others in group */ metaslab_group_t *ms_group; /* metaslab group */ avl_node_t ms_group_node; /* node in metaslab group tree */ diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/spa.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/spa.h index 24b3ca4476795..868d4fc1d7a29 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/spa.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/spa.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -43,8 +43,13 @@ extern "C" { typedef struct spa spa_t; typedef struct vdev vdev_t; typedef struct metaslab metaslab_t; +typedef struct metaslab_group metaslab_group_t; +typedef struct metaslab_class metaslab_class_t; +typedef struct zio zio_t; typedef struct zilog zilog_t; typedef struct spa_aux_vdev spa_aux_vdev_t; +typedef struct ddt ddt_t; +typedef struct ddt_entry ddt_entry_t; struct dsl_pool; /* @@ -134,15 +139,15 @@ typedef struct zio_cksum { * +-------+-------+-------+-------+-------+-------+-------+-------+ * 5 |G| offset3 | * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 6 |E| lvl | type | cksum | comp | PSIZE | LSIZE | + * 6 |BDX|lvl| type | cksum | comp | PSIZE | LSIZE | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 7 | padding | * +-------+-------+-------+-------+-------+-------+-------+-------+ * 8 | padding | * +-------+-------+-------+-------+-------+-------+-------+-------+ - * 9 | padding | + * 9 | physical birth txg | * +-------+-------+-------+-------+-------+-------+-------+-------+ - * a | birth txg | + * a | logical birth txg | * +-------+-------+-------+-------+-------+-------+-------+-------+ * b | fill count | * +-------+-------+-------+-------+-------+-------+-------+-------+ @@ -166,25 +171,29 @@ typedef struct zio_cksum { * cksum checksum function * comp compression function * G gang block indicator - * E endianness - * type DMU object type + * B byteorder (endianness) + * D dedup + * X unused * lvl level of indirection - * birth txg transaction group in which the block was born + * type DMU object type + * phys birth txg of block allocation; zero if same as logical birth txg + * log. 
birth transaction group in which the block was logically born * fill count number of non-zero blocks under this bp * checksum[4] 256-bit checksum of the data this bp describes */ -typedef struct blkptr { - dva_t blk_dva[3]; /* 128-bit Data Virtual Address */ - uint64_t blk_prop; /* size, compression, type, etc */ - uint64_t blk_pad[3]; /* Extra space for the future */ - uint64_t blk_birth; /* transaction group at birth */ - uint64_t blk_fill; /* fill count */ - zio_cksum_t blk_cksum; /* 256-bit checksum */ -} blkptr_t; - #define SPA_BLKPTRSHIFT 7 /* blkptr_t is 128 bytes */ #define SPA_DVAS_PER_BP 3 /* Number of DVAs in a bp */ +typedef struct blkptr { + dva_t blk_dva[SPA_DVAS_PER_BP]; /* Data Virtual Addresses */ + uint64_t blk_prop; /* size, compression, type, etc */ + uint64_t blk_pad[2]; /* Extra space for the future */ + uint64_t blk_phys_birth; /* txg when block was allocated */ + uint64_t blk_birth; /* transaction group at birth */ + uint64_t blk_fill; /* fill count */ + zio_cksum_t blk_cksum; /* 256-bit checksum */ +} blkptr_t; + /* * Macros to get and set fields in a bp or DVA. */ @@ -208,8 +217,7 @@ typedef struct blkptr { #define DVA_SET_GANG(dva, x) BF64_SET((dva)->dva_word[1], 63, 1, x) #define BP_GET_LSIZE(bp) \ - (BP_IS_HOLE(bp) ? 0 : \ - BF64_GET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1)) + BF64_GET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1) #define BP_SET_LSIZE(bp, x) \ BF64_SET_SB((bp)->blk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1, x) @@ -218,20 +226,35 @@ typedef struct blkptr { #define BP_SET_PSIZE(bp, x) \ BF64_SET_SB((bp)->blk_prop, 16, 16, SPA_MINBLOCKSHIFT, 1, x) -#define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 8) -#define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 8, x) +#define BP_GET_COMPRESS(bp) BF64_GET((bp)->blk_prop, 32, 8) +#define BP_SET_COMPRESS(bp, x) BF64_SET((bp)->blk_prop, 32, 8, x) -#define BP_GET_CHECKSUM(bp) BF64_GET((bp)->blk_prop, 40, 8) -#define BP_SET_CHECKSUM(bp, x) BF64_SET((bp)->blk_prop, 40, 8, x) +#define BP_GET_CHECKSUM(bp) BF64_GET((bp)->blk_prop, 40, 8) +#define BP_SET_CHECKSUM(bp, x) BF64_SET((bp)->blk_prop, 40, 8, x) -#define BP_GET_TYPE(bp) BF64_GET((bp)->blk_prop, 48, 8) -#define BP_SET_TYPE(bp, x) BF64_SET((bp)->blk_prop, 48, 8, x) +#define BP_GET_TYPE(bp) BF64_GET((bp)->blk_prop, 48, 8) +#define BP_SET_TYPE(bp, x) BF64_SET((bp)->blk_prop, 48, 8, x) -#define BP_GET_LEVEL(bp) BF64_GET((bp)->blk_prop, 56, 5) -#define BP_SET_LEVEL(bp, x) BF64_SET((bp)->blk_prop, 56, 5, x) +#define BP_GET_LEVEL(bp) BF64_GET((bp)->blk_prop, 56, 5) +#define BP_SET_LEVEL(bp, x) BF64_SET((bp)->blk_prop, 56, 5, x) -#define BP_GET_BYTEORDER(bp) (0 - BF64_GET((bp)->blk_prop, 63, 1)) -#define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x) +#define BP_GET_PROP_BIT_61(bp) BF64_GET((bp)->blk_prop, 61, 1) +#define BP_SET_PROP_BIT_61(bp, x) BF64_SET((bp)->blk_prop, 61, 1, x) + +#define BP_GET_DEDUP(bp) BF64_GET((bp)->blk_prop, 62, 1) +#define BP_SET_DEDUP(bp, x) BF64_SET((bp)->blk_prop, 62, 1, x) + +#define BP_GET_BYTEORDER(bp) (0 - BF64_GET((bp)->blk_prop, 63, 1)) +#define BP_SET_BYTEORDER(bp, x) BF64_SET((bp)->blk_prop, 63, 1, x) + +#define BP_PHYSICAL_BIRTH(bp) \ + ((bp)->blk_phys_birth ? (bp)->blk_phys_birth : (bp)->blk_birth) + +#define BP_SET_BIRTH(bp, logical, physical) \ +{ \ + (bp)->blk_birth = (logical); \ + (bp)->blk_phys_birth = ((logical) == (physical) ? 
0 : (physical)); \ +} #define BP_GET_ASIZE(bp) \ (DVA_GET_ASIZE(&(bp)->blk_dva[0]) + DVA_GET_ASIZE(&(bp)->blk_dva[1]) + \ @@ -255,6 +278,12 @@ typedef struct blkptr { ((dva1)->dva_word[1] == (dva2)->dva_word[1] && \ (dva1)->dva_word[0] == (dva2)->dva_word[0]) +#define BP_EQUAL(bp1, bp2) \ + (BP_PHYSICAL_BIRTH(bp1) == BP_PHYSICAL_BIRTH(bp2) && \ + DVA_EQUAL(&(bp1)->blk_dva[0], &(bp2)->blk_dva[0]) && \ + DVA_EQUAL(&(bp1)->blk_dva[1], &(bp2)->blk_dva[1]) && \ + DVA_EQUAL(&(bp1)->blk_dva[2], &(bp2)->blk_dva[2])) + #define ZIO_CHECKSUM_EQUAL(zc1, zc2) \ (0 == (((zc1).zc_word[0] - (zc2).zc_word[0]) | \ ((zc1).zc_word[1] - (zc2).zc_word[1]) | \ @@ -274,7 +303,10 @@ typedef struct blkptr { #define BP_IDENTITY(bp) (&(bp)->blk_dva[0]) #define BP_IS_GANG(bp) DVA_GET_GANG(BP_IDENTITY(bp)) #define BP_IS_HOLE(bp) ((bp)->blk_birth == 0) -#define BP_IS_OLDER(bp, txg) (!BP_IS_HOLE(bp) && (bp)->blk_birth < (txg)) + +/* BP_IS_RAIDZ(bp) assumes no block compression */ +#define BP_IS_RAIDZ(bp) (DVA_GET_ASIZE(&(bp)->blk_dva[0]) > \ + BP_GET_PSIZE(bp)) #define BP_ZERO(bp) \ { \ @@ -287,14 +319,12 @@ typedef struct blkptr { (bp)->blk_prop = 0; \ (bp)->blk_pad[0] = 0; \ (bp)->blk_pad[1] = 0; \ - (bp)->blk_pad[2] = 0; \ + (bp)->blk_phys_birth = 0; \ (bp)->blk_birth = 0; \ (bp)->blk_fill = 0; \ ZIO_SET_CHECKSUM(&(bp)->blk_cksum, 0, 0, 0, 0); \ } -#define BLK_FILL_ALREADY_FREED (-1ULL) - /* * Note: the byteorder is either 0 or -1, both of which are palindromes. * This simplifies the endianness handling a bit. @@ -309,30 +339,92 @@ typedef struct blkptr { #define BP_SPRINTF_LEN 320 +/* + * This macro allows code sharing between zfs, libzpool, and mdb. + * 'func' is either snprintf() or mdb_snprintf(). + * 'ws' (whitespace) can be ' ' for single-line format, '\n' for multi-line. + */ +#define SPRINTF_BLKPTR(func, ws, buf, bp, type, checksum, compress) \ +{ \ + static const char *copyname[] = \ + { "zero", "single", "double", "triple" }; \ + int size = BP_SPRINTF_LEN; \ + int len = 0; \ + int copies = 0; \ + \ + if (bp == NULL) { \ + len = func(buf + len, size - len, ""); \ + } else if (BP_IS_HOLE(bp)) { \ + len = func(buf + len, size - len, ""); \ + } else { \ + for (int d = 0; d < BP_GET_NDVAS(bp); d++) { \ + const dva_t *dva = &bp->blk_dva[d]; \ + if (DVA_IS_VALID(dva)) \ + copies++; \ + len += func(buf + len, size - len, \ + "DVA[%d]=<%llu:%llx:%llx>%c", d, \ + (u_longlong_t)DVA_GET_VDEV(dva), \ + (u_longlong_t)DVA_GET_OFFSET(dva), \ + (u_longlong_t)DVA_GET_ASIZE(dva), \ + ws); \ + } \ + if (BP_IS_GANG(bp) && \ + DVA_GET_ASIZE(&bp->blk_dva[2]) <= \ + DVA_GET_ASIZE(&bp->blk_dva[1]) / 2) \ + copies--; \ + len += func(buf + len, size - len, \ + "[L%llu %s] %s %s %s %s %s %s%c" \ + "size=%llxL/%llxP birth=%lluL/%lluP fill=%llu%c" \ + "cksum=%llx:%llx:%llx:%llx", \ + (u_longlong_t)BP_GET_LEVEL(bp), \ + type, \ + checksum, \ + compress, \ + BP_GET_BYTEORDER(bp) == 0 ? "BE" : "LE", \ + BP_IS_GANG(bp) ? "gang" : "contiguous", \ + BP_GET_DEDUP(bp) ? "dedup" : "unique", \ + copyname[copies], \ + ws, \ + (u_longlong_t)BP_GET_LSIZE(bp), \ + (u_longlong_t)BP_GET_PSIZE(bp), \ + (u_longlong_t)bp->blk_birth, \ + (u_longlong_t)BP_PHYSICAL_BIRTH(bp), \ + (u_longlong_t)bp->blk_fill, \ + ws, \ + (u_longlong_t)bp->blk_cksum.zc_word[0], \ + (u_longlong_t)bp->blk_cksum.zc_word[1], \ + (u_longlong_t)bp->blk_cksum.zc_word[2], \ + (u_longlong_t)bp->blk_cksum.zc_word[3]); \ + } \ + ASSERT(len < size); \ +} + #include #define BP_GET_BUFC_TYPE(bp) \ (((BP_GET_LEVEL(bp) > 0) || (dmu_ot[BP_GET_TYPE(bp)].ot_metadata)) ? 
\ ARC_BUFC_METADATA : ARC_BUFC_DATA); -/* - * Routines found in spa.c - */ + +typedef enum spa_import_type { + SPA_IMPORT_EXISTING, + SPA_IMPORT_ASSEMBLE +} spa_import_type_t; /* state manipulation functions */ extern int spa_open(const char *pool, spa_t **, void *tag); +extern int spa_open_rewind(const char *pool, spa_t **, void *tag, + nvlist_t *policy, nvlist_t **config); extern int spa_get_stats(const char *pool, nvlist_t **config, char *altroot, size_t buflen); extern int spa_create(const char *pool, nvlist_t *config, nvlist_t *props, const char *history_str, nvlist_t *zplprops); -extern int spa_check_rootconf(char *devpath, char *devid, - nvlist_t **bestconf, uint64_t *besttxg); -extern boolean_t spa_rootdev_validate(nvlist_t *nv); extern int spa_import_rootpool(char *devpath, char *devid); extern int spa_import(const char *pool, nvlist_t *config, nvlist_t *props); -extern int spa_import_faulted(const char *, nvlist_t *, nvlist_t *); +extern int spa_import_verbatim(const char *, nvlist_t *, nvlist_t *); extern nvlist_t *spa_tryimport(nvlist_t *tryconfig); extern int spa_destroy(char *pool); -extern int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force); +extern int spa_export(char *pool, nvlist_t **oldconfig, boolean_t force, + boolean_t hardforce); extern int spa_reset(char *pool); extern void spa_async_request(spa_t *spa, int flag); extern void spa_async_unrequest(spa_t *spa, int flag); @@ -346,14 +438,19 @@ extern void spa_inject_delref(spa_t *spa); #define SPA_ASYNC_PROBE 0x04 #define SPA_ASYNC_RESILVER_DONE 0x08 #define SPA_ASYNC_RESILVER 0x10 +#define SPA_ASYNC_AUTOEXPAND 0x20 /* device manipulation */ extern int spa_vdev_add(spa_t *spa, nvlist_t *nvroot); extern int spa_vdev_attach(spa_t *spa, uint64_t guid, nvlist_t *nvroot, int replacing); -extern int spa_vdev_detach(spa_t *spa, uint64_t guid, int replace_done); +extern int spa_vdev_detach(spa_t *spa, uint64_t guid, uint64_t pguid, + int replace_done); extern int spa_vdev_remove(spa_t *spa, uint64_t guid, boolean_t unspare); extern int spa_vdev_setpath(spa_t *spa, uint64_t guid, const char *newpath); +extern int spa_vdev_setfru(spa_t *spa, uint64_t guid, const char *newfru); +extern int spa_vdev_split_mirror(spa_t *spa, char *newname, nvlist_t *config, + nvlist_t *props, boolean_t exp); /* spare state (which is global across all pools) */ extern void spa_spare_add(vdev_t *vd); @@ -367,7 +464,6 @@ extern void spa_l2cache_remove(vdev_t *vd); extern boolean_t spa_l2cache_exists(uint64_t guid, uint64_t *pool); extern void spa_l2cache_activate(vdev_t *vd); extern void spa_l2cache_drop(spa_t *spa); -extern void spa_l2cache_space_update(vdev_t *vd, int64_t space, int64_t alloc); /* scrubbing */ extern int spa_scrub(spa_t *spa, pool_scrub_type_t type); @@ -376,6 +472,10 @@ extern int spa_scrub(spa_t *spa, pool_scrub_type_t type); extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */ extern void spa_sync_allpools(void); +#define SYNC_PASS_DEFERRED_FREE 1 /* defer frees after this pass */ +#define SYNC_PASS_DONT_COMPRESS 4 /* don't compress after this pass */ +#define SYNC_PASS_REWRITE 1 /* rewrite new bps after this pass */ + /* spa namespace global mutex */ extern kmutex_t spa_namespace_lock; @@ -393,7 +493,6 @@ extern void spa_config_set(spa_t *spa, nvlist_t *config); extern nvlist_t *spa_config_generate(spa_t *spa, vdev_t *vd, uint64_t txg, int getstats); extern void spa_config_update(spa_t *spa, int what); -extern void spa_config_update_common(spa_t *spa, int what, boolean_t isroot); /* * Miscellaneous 
SPA routines in spa_misc.c @@ -401,7 +500,7 @@ extern void spa_config_update_common(spa_t *spa, int what, boolean_t isroot); /* Namespace manipulation */ extern spa_t *spa_lookup(const char *name); -extern spa_t *spa_add(const char *name, const char *altroot); +extern spa_t *spa_add(const char *name, nvlist_t *config, const char *altroot); extern void spa_remove(spa_t *spa); extern spa_t *spa_next(spa_t *prev); @@ -410,6 +509,7 @@ extern void spa_open_ref(spa_t *spa, void *tag); extern void spa_close(spa_t *spa, void *tag); extern boolean_t spa_refcount_zero(spa_t *spa); +#define SCL_NONE 0x00 #define SCL_CONFIG 0x01 #define SCL_STATE 0x02 #define SCL_L2ARC 0x04 /* hack until L2ARC 2.0 */ @@ -429,12 +529,30 @@ extern int spa_config_held(spa_t *spa, int locks, krw_t rw); /* Pool vdev add/remove lock */ extern uint64_t spa_vdev_enter(spa_t *spa); +extern uint64_t spa_vdev_config_enter(spa_t *spa); +extern void spa_vdev_config_exit(spa_t *spa, vdev_t *vd, uint64_t txg, + int error, char *tag); extern int spa_vdev_exit(spa_t *spa, vdev_t *vd, uint64_t txg, int error); /* Pool vdev state change lock */ -extern void spa_vdev_state_enter(spa_t *spa); +extern void spa_vdev_state_enter(spa_t *spa, int oplock); extern int spa_vdev_state_exit(spa_t *spa, vdev_t *vd, int error); +/* Log state */ +typedef enum spa_log_state { + SPA_LOG_UNKNOWN = 0, /* unknown log state */ + SPA_LOG_MISSING, /* missing log(s) */ + SPA_LOG_CLEAR, /* clear the log(s) */ + SPA_LOG_GOOD, /* log(s) are good */ +} spa_log_state_t; + +extern spa_log_state_t spa_get_log_state(spa_t *spa); +extern void spa_set_log_state(spa_t *spa, spa_log_state_t state); +extern int spa_offline_log(spa_t *spa); + +/* Log claim callback */ +extern void spa_claim_notify(zio_t *zio); + /* Accessor functions */ extern boolean_t spa_shutting_down(spa_t *spa); extern struct dsl_pool *spa_get_dsl(spa_t *spa); @@ -446,18 +564,26 @@ extern char *spa_name(spa_t *spa); extern uint64_t spa_guid(spa_t *spa); extern uint64_t spa_last_synced_txg(spa_t *spa); extern uint64_t spa_first_txg(spa_t *spa); +extern uint64_t spa_syncing_txg(spa_t *spa); extern uint64_t spa_version(spa_t *spa); extern pool_state_t spa_state(spa_t *spa); +extern spa_load_state_t spa_load_state(spa_t *spa); extern uint64_t spa_freeze_txg(spa_t *spa); -extern uint64_t spa_get_alloc(spa_t *spa); -extern uint64_t spa_get_space(spa_t *spa); -extern uint64_t spa_get_dspace(spa_t *spa); extern uint64_t spa_get_asize(spa_t *spa, uint64_t lsize); +extern uint64_t spa_get_dspace(spa_t *spa); +extern void spa_update_dspace(spa_t *spa); extern uint64_t spa_version(spa_t *spa); +extern boolean_t spa_deflate(spa_t *spa); +extern metaslab_class_t *spa_normal_class(spa_t *spa); +extern metaslab_class_t *spa_log_class(spa_t *spa); extern int spa_max_replication(spa_t *spa); extern int spa_busy(void); extern uint8_t spa_get_failmode(spa_t *spa); extern boolean_t spa_suspended(spa_t *spa); +extern uint64_t spa_bootfs(spa_t *spa); +extern uint64_t spa_delegation(spa_t *spa); +extern objset_t *spa_meta_objset(spa_t *spa); +extern enum zio_checksum spa_dedup_checksum(spa_t *spa); /* Miscellaneous support routines */ extern int spa_rename(const char *oldname, const char *newname); @@ -465,16 +591,24 @@ extern boolean_t spa_guid_exists(uint64_t pool_guid, uint64_t device_guid); extern char *spa_strdup(const char *); extern void spa_strfree(char *); extern uint64_t spa_get_random(uint64_t range); -extern void sprintf_blkptr(char *buf, int len, const blkptr_t *bp); +extern uint64_t spa_generate_guid(spa_t 
*spa); +extern void sprintf_blkptr(char *buf, const blkptr_t *bp); extern void spa_freeze(spa_t *spa); extern void spa_upgrade(spa_t *spa, uint64_t version); extern void spa_evict_all(void); extern vdev_t *spa_lookup_by_guid(spa_t *spa, uint64_t guid, boolean_t l2cache); extern boolean_t spa_has_spare(spa_t *, uint64_t guid); -extern uint64_t bp_get_dasize(spa_t *spa, const blkptr_t *bp); +extern uint64_t dva_get_dsize_sync(spa_t *spa, const dva_t *dva); +extern uint64_t bp_get_dsize_sync(spa_t *spa, const blkptr_t *bp); +extern uint64_t bp_get_dsize(spa_t *spa, const blkptr_t *bp); extern boolean_t spa_has_slogs(spa_t *spa); extern boolean_t spa_is_root(spa_t *spa); +extern boolean_t spa_writeable(spa_t *spa); +extern void spa_rewind_data_to_nvlist(spa_t *spa, nvlist_t *to); + +extern int spa_mode(spa_t *spa); +extern uint64_t strtonum(const char *str, char **nptr); /* history logging */ typedef enum history_log_type { @@ -497,16 +631,17 @@ extern int spa_history_get(spa_t *spa, uint64_t *offset, uint64_t *len_read, char *his_buf); extern int spa_history_log(spa_t *spa, const char *his_buf, history_log_type_t what); -void spa_history_internal_log(history_internal_events_t event, spa_t *spa, - dmu_tx_t *tx, cred_t *cr, const char *fmt, ...); +extern void spa_history_internal_log(history_internal_events_t event, + spa_t *spa, dmu_tx_t *tx, cred_t *cr, const char *fmt, ...); +extern void spa_history_log_version(spa_t *spa, history_internal_events_t evt); /* error handling */ struct zbookmark; -struct zio; -extern void spa_log_error(spa_t *spa, struct zio *zio); +extern void spa_log_error(spa_t *spa, zio_t *zio); extern void zfs_ereport_post(const char *class, spa_t *spa, vdev_t *vd, - struct zio *zio, uint64_t stateoroffset, uint64_t length); + zio_t *zio, uint64_t stateoroffset, uint64_t length); extern void zfs_post_remove(spa_t *spa, vdev_t *vd); +extern void zfs_post_state_change(spa_t *spa, vdev_t *vd); extern void zfs_post_autoreplace(spa_t *spa, vdev_t *vd); extern uint64_t spa_get_errlog_size(spa_t *spa); extern int spa_get_errlog(spa_t *spa, void *uaddr, size_t *count); @@ -528,6 +663,7 @@ extern void spa_boot_init(); extern int spa_prop_set(spa_t *spa, nvlist_t *nvp); extern int spa_prop_get(spa_t *spa, nvlist_t **nvp); extern void spa_prop_clear_bootfs(spa_t *spa, uint64_t obj, dmu_tx_t *tx); +extern void spa_configfile_set(spa_t *, nvlist_t *, boolean_t); /* asynchronous event notification */ extern void spa_event_notify(spa_t *spa, vdev_t *vdev, const char *name); @@ -536,7 +672,7 @@ extern void spa_event_notify(spa_t *spa, vdev_t *vdev, const char *name); #define dprintf_bp(bp, fmt, ...) do { \ if (zfs_flags & ZFS_DEBUG_DPRINTF) { \ char *__blkbuf = kmem_alloc(BP_SPRINTF_LEN, KM_SLEEP); \ - sprintf_blkptr(__blkbuf, BP_SPRINTF_LEN, (bp)); \ + sprintf_blkptr(__blkbuf, (bp)); \ dprintf(fmt " %s\n", __VA_ARGS__, __blkbuf); \ kmem_free(__blkbuf, BP_SPRINTF_LEN); \ } \ @@ -545,7 +681,7 @@ _NOTE(CONSTCOND) } while (0) #define dprintf_bp(bp, fmt, ...) #endif -extern int spa_mode; /* mode, e.g. FREAD | FWRITE */ +extern int spa_mode_global; /* mode, e.g. FREAD | FWRITE */ #ifdef __cplusplus } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/spa_boot.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/spa_boot.h index b56073b97516b..1d3622f5a108b 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/spa_boot.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/spa_boot.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. 
All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_SPA_BOOT_H #define _SYS_SPA_BOOT_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #ifdef __cplusplus @@ -36,7 +34,6 @@ extern "C" { extern char *spa_get_bootprop(char *prop); extern void spa_free_bootprop(char *prop); -extern int spa_get_rootconf(char *devpath, char *devid, nvlist_t **bestconf_p); #ifdef __cplusplus } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/spa_impl.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/spa_impl.h index 8aeb414fe9de3..9daec092b4aad 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/spa_impl.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/spa_impl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -78,19 +78,33 @@ typedef struct spa_config_dirent { char *scd_path; } spa_config_dirent_t; -typedef enum spa_log_state { - SPA_LOG_UNKNOWN = 0, /* unknown log state */ - SPA_LOG_MISSING, /* missing log(s) */ - SPA_LOG_CLEAR, /* clear the log(s) */ - SPA_LOG_GOOD, /* log(s) are good */ -} spa_log_state_t; - enum zio_taskq_type { ZIO_TASKQ_ISSUE = 0, + ZIO_TASKQ_ISSUE_HIGH, ZIO_TASKQ_INTERRUPT, + ZIO_TASKQ_INTERRUPT_HIGH, ZIO_TASKQ_TYPES }; +/* + * State machine for the zpool-poolname process. The state transitions + * are done as follows: + * + * From To Routine + * PROC_NONE -> PROC_CREATED spa_activate() + * PROC_CREATED -> PROC_ACTIVE spa_thread() + * PROC_ACTIVE -> PROC_DEACTIVATE spa_deactivate() + * PROC_DEACTIVATE -> PROC_GONE spa_thread() + * PROC_GONE -> PROC_NONE spa_deactivate() + */ +typedef enum spa_proc_state { + SPA_PROC_NONE, /* spa_proc = &p0, no process created */ + SPA_PROC_CREATED, /* spa_activate() has proc, is waiting */ + SPA_PROC_ACTIVE, /* taskqs created, spa_proc set */ + SPA_PROC_DEACTIVATE, /* spa_deactivate() requests process exit */ + SPA_PROC_GONE /* spa_thread() is exiting, spa_proc = &p0 */ +} spa_proc_state_t; + struct spa { /* * Fields protected by spa_namespace_lock. @@ -99,12 +113,14 @@ struct spa { avl_node_t spa_avl; /* node in spa_namespace_avl */ nvlist_t *spa_config; /* last synced config */ nvlist_t *spa_config_syncing; /* currently syncing config */ + nvlist_t *spa_config_splitting; /* config for splitting */ uint64_t spa_config_txg; /* txg of last config change */ int spa_sync_pass; /* iterate-to-convergence */ pool_state_t spa_state; /* pool state */ int spa_inject_ref; /* injection references */ uint8_t spa_sync_on; /* sync threads are running */ spa_load_state_t spa_load_state; /* current load operation */ + boolean_t spa_load_verbatim; /* load the given config? 
*/ taskq_t *spa_zio_taskq[ZIO_TYPES][ZIO_TASKQ_TYPES]; dsl_pool_t *spa_dsl_pool; metaslab_class_t *spa_normal_class; /* normal data class */ @@ -112,6 +128,8 @@ struct spa { uint64_t spa_first_txg; /* first txg after spa_open() */ uint64_t spa_final_txg; /* txg of export/destroy */ uint64_t spa_freeze_txg; /* freeze pool at this txg */ + uint64_t spa_load_max_txg; /* best initial ub_txg */ + uint64_t spa_claim_max_txg; /* highest claimed birth txg */ objset_t *spa_meta_objset; /* copy of dp->dp_meta_objset */ txg_list_t spa_vdev_txg_list; /* per-txg dirty vdev list */ vdev_t *spa_root_vdev; /* top-level vdev container */ @@ -121,11 +139,14 @@ struct spa { spa_aux_vdev_t spa_spares; /* hot spares */ spa_aux_vdev_t spa_l2cache; /* L2ARC cache devices */ uint64_t spa_config_object; /* MOS object for pool config */ + uint64_t spa_config_generation; /* config generation number */ uint64_t spa_syncing_txg; /* txg currently syncing */ - uint64_t spa_sync_bplist_obj; /* object for deferred frees */ - bplist_t spa_sync_bplist; /* deferred-free bplist */ + uint64_t spa_deferred_bplist_obj; /* object for deferred frees */ + bplist_t spa_deferred_bplist; /* deferred-free bplist */ + bplist_t spa_free_bplist[TXG_SIZE]; /* bplist of stuff to free */ uberblock_t spa_ubsync; /* last synced uberblock */ uberblock_t spa_uberblock; /* current uberblock */ + boolean_t spa_extreme_rewind; /* rewind past deferred frees */ kmutex_t spa_scrub_lock; /* resilver/scrub lock */ uint64_t spa_scrub_inflight; /* in-flight scrub I/Os */ uint64_t spa_scrub_maxinflight; /* max in-flight scrub I/Os */ @@ -141,12 +162,16 @@ struct spa { int spa_async_suspended; /* async tasks suspended */ kcondvar_t spa_async_cv; /* wait for thread_exit() */ uint16_t spa_async_tasks; /* async task mask */ - kmutex_t spa_async_root_lock; /* protects async root count */ - uint64_t spa_async_root_count; /* number of async root zios */ - kcondvar_t spa_async_root_cv; /* notify when count == 0 */ char *spa_root; /* alternate root directory */ uint64_t spa_ena; /* spa-wide ereport ENA */ - boolean_t spa_last_open_failed; /* true if last open faled */ + int spa_last_open_failed; /* error if last open failed */ + uint64_t spa_last_ubsync_txg; /* "best" uberblock txg */ + uint64_t spa_last_ubsync_txg_ts; /* timestamp from that ub */ + uint64_t spa_load_txg; /* ub txg that loaded */ + uint64_t spa_load_txg_ts; /* timestamp from that ub */ + uint64_t spa_load_meta_errors; /* verify metadata err count */ + uint64_t spa_load_data_errors; /* verify data err count */ + uint64_t spa_verify_min_txg; /* start txg of verify scrub */ kmutex_t spa_errlog_lock; /* error log lock */ uint64_t spa_errlog_last; /* last error log object */ uint64_t spa_errlog_scrub; /* scrub error log object */ @@ -163,14 +188,31 @@ struct spa { uint64_t spa_failmode; /* failure mode for the pool */ uint64_t spa_delegation; /* delegation on/off */ list_t spa_config_list; /* previous cache file(s) */ + zio_t *spa_async_zio_root; /* root of all async I/O */ zio_t *spa_suspend_zio_root; /* root of all suspended I/O */ kmutex_t spa_suspend_lock; /* protects suspend_zio_root */ kcondvar_t spa_suspend_cv; /* notification of resume */ uint8_t spa_suspended; /* pool is suspended */ - boolean_t spa_import_faulted; /* allow faulted vdevs */ + uint8_t spa_claiming; /* pool is doing zil_claim() */ boolean_t spa_is_root; /* pool is root */ int spa_minref; /* num refs when first opened */ + int spa_mode; /* FREAD | FWRITE */ spa_log_state_t spa_log_state; /* log state */ + uint64_t spa_autoexpand; 
/* lun expansion on/off */ + ddt_t *spa_ddt[ZIO_CHECKSUM_FUNCTIONS]; /* in-core DDTs */ + uint64_t spa_ddt_stat_object; /* DDT statistics */ + uint64_t spa_dedup_ditto; /* dedup ditto threshold */ + uint64_t spa_dedup_checksum; /* default dedup checksum */ + uint64_t spa_dspace; /* dspace in normal class */ + kmutex_t spa_vdev_top_lock; /* dueling offline/remove */ + kmutex_t spa_proc_lock; /* protects spa_proc* */ + kcondvar_t spa_proc_cv; /* spa_proc_state transitions */ + spa_proc_state_t spa_proc_state; /* see definition */ + struct proc *spa_proc; /* "zpool-poolname" process */ + uint64_t spa_did; /* if procp != p0, did of t1 */ + boolean_t spa_autoreplace; /* autoreplace set in open */ + int spa_vdev_locks; /* locks grabbed */ + /* * spa_refcnt & spa_config_lock must be the last elements * because refcount_t changes size based on compilation options. @@ -183,12 +225,6 @@ struct spa { extern const char *spa_config_path; -#define BOOTFS_COMPRESS_VALID(compress) \ - ((compress) == ZIO_COMPRESS_LZJB || \ - ((compress) == ZIO_COMPRESS_ON && \ - ZIO_COMPRESS_ON_VALUE == ZIO_COMPRESS_LZJB) || \ - (compress) == ZIO_COMPRESS_OFF) - #ifdef __cplusplus } #endif diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/space_map.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/space_map.h index db9daef1f156f..6f935c9db27e5 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/space_map.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/space_map.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2006 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_SPACE_MAP_H #define _SYS_SPACE_MAP_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include @@ -48,16 +46,24 @@ typedef struct space_map { uint8_t sm_loading; /* map loading? 
*/ kcondvar_t sm_load_cv; /* map load completion */ space_map_ops_t *sm_ops; /* space map block picker ops vector */ + avl_tree_t *sm_pp_root; /* picker-private AVL tree */ void *sm_ppd; /* picker-private data */ kmutex_t *sm_lock; /* pointer to lock that protects map */ } space_map_t; typedef struct space_seg { avl_node_t ss_node; /* AVL node */ + avl_node_t ss_pp_node; /* AVL picker-private node */ uint64_t ss_start; /* starting offset of this segment */ uint64_t ss_end; /* ending offset (non-inclusive) */ } space_seg_t; +typedef struct space_ref { + avl_node_t sr_node; /* AVL node */ + uint64_t sr_offset; /* offset (start or end) */ + int64_t sr_refcnt; /* associated reference count */ +} space_ref_t; + typedef struct space_map_obj { uint64_t smo_object; /* on-disk space map object */ uint64_t smo_objsize; /* size of the object */ @@ -70,6 +76,8 @@ struct space_map_ops { uint64_t (*smop_alloc)(space_map_t *sm, uint64_t size); void (*smop_claim)(space_map_t *sm, uint64_t start, uint64_t size); void (*smop_free)(space_map_t *sm, uint64_t start, uint64_t size); + uint64_t (*smop_max)(space_map_t *sm); + boolean_t (*smop_fragmented)(space_map_t *sm); }; /* @@ -133,13 +141,12 @@ extern void space_map_create(space_map_t *sm, uint64_t start, uint64_t size, extern void space_map_destroy(space_map_t *sm); extern void space_map_add(space_map_t *sm, uint64_t start, uint64_t size); extern void space_map_remove(space_map_t *sm, uint64_t start, uint64_t size); -extern int space_map_contains(space_map_t *sm, uint64_t start, uint64_t size); +extern boolean_t space_map_contains(space_map_t *sm, + uint64_t start, uint64_t size); extern void space_map_vacate(space_map_t *sm, space_map_func_t *func, space_map_t *mdest); extern void space_map_walk(space_map_t *sm, space_map_func_t *func, space_map_t *mdest); -extern void space_map_excise(space_map_t *sm, uint64_t start, uint64_t size); -extern void space_map_union(space_map_t *smd, space_map_t *sms); extern void space_map_load_wait(space_map_t *sm); extern int space_map_load(space_map_t *sm, space_map_ops_t *ops, @@ -149,12 +156,22 @@ extern void space_map_unload(space_map_t *sm); extern uint64_t space_map_alloc(space_map_t *sm, uint64_t size); extern void space_map_claim(space_map_t *sm, uint64_t start, uint64_t size); extern void space_map_free(space_map_t *sm, uint64_t start, uint64_t size); +extern uint64_t space_map_maxsize(space_map_t *sm); extern void space_map_sync(space_map_t *sm, uint8_t maptype, space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx); extern void space_map_truncate(space_map_obj_t *smo, objset_t *os, dmu_tx_t *tx); +extern void space_map_ref_create(avl_tree_t *t); +extern void space_map_ref_destroy(avl_tree_t *t); +extern void space_map_ref_add_seg(avl_tree_t *t, + uint64_t start, uint64_t end, int64_t refcnt); +extern void space_map_ref_add_map(avl_tree_t *t, + space_map_t *sm, int64_t refcnt); +extern void space_map_ref_generate_map(avl_tree_t *t, + space_map_t *sm, int64_t minref); + #ifdef __cplusplus } #endif diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/txg.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/txg.h index 23bdff211b4a4..6429a6bd8a499 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/txg.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/txg.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _SYS_TXG_H #define _SYS_TXG_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include @@ -41,6 +39,9 @@ extern "C" { #define TXG_INITIAL TXG_SIZE /* initial txg */ #define TXG_IDX (txg & TXG_MASK) +/* Number of txgs worth of frees we defer adding to in-core spacemaps */ +#define TXG_DEFER_SIZE 2 + #define TXG_WAIT 1ULL #define TXG_NOWAIT 2ULL @@ -71,8 +72,7 @@ extern void txg_sync_stop(struct dsl_pool *dp); extern uint64_t txg_hold_open(struct dsl_pool *dp, txg_handle_t *txghp); extern void txg_rele_to_quiesce(txg_handle_t *txghp); extern void txg_rele_to_sync(txg_handle_t *txghp); -extern void txg_suspend(struct dsl_pool *dp); -extern void txg_resume(struct dsl_pool *dp); +extern void txg_register_callbacks(txg_handle_t *txghp, list_t *tx_callbacks); /* * Delay the caller by the specified number of ticks or until diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/txg_impl.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/txg_impl.h index 7413c662b3555..7b356eac1293b 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/txg_impl.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/txg_impl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -37,13 +37,13 @@ struct tx_cpu { kmutex_t tc_lock; kcondvar_t tc_cv[TXG_SIZE]; uint64_t tc_count[TXG_SIZE]; + list_t tc_callbacks[TXG_SIZE]; /* commit cb list */ char tc_pad[16]; }; typedef struct tx_state { tx_cpu_t *tx_cpu; /* protects right to enter txg */ kmutex_t tx_sync_lock; /* protects tx_state_t */ - krwlock_t tx_suspend; uint64_t tx_open_txg; /* currently open txg id */ uint64_t tx_quiesced_txg; /* quiesced txg waiting for sync */ uint64_t tx_syncing_txg; /* currently syncing txg id */ @@ -64,6 +64,8 @@ typedef struct tx_state { kthread_t *tx_sync_thread; kthread_t *tx_quiesce_thread; + + taskq_t *tx_commit_cb_taskq; /* commit callback taskq */ } tx_state_t; #ifdef __cplusplus diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/uberblock.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/uberblock.h index 93d936ae4b18d..b5bb915731452 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/uberblock.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/uberblock.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,19 +19,16 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _SYS_UBERBLOCK_H #define _SYS_UBERBLOCK_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include -#include #ifdef __cplusplus extern "C" { diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/uberblock_impl.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/uberblock_impl.h index 55a0dd5aec0d0..c135df9b106b3 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/uberblock_impl.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/uberblock_impl.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_UBERBLOCK_IMPL_H #define _SYS_UBERBLOCK_IMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #ifdef __cplusplus diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/vdev.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/vdev.h index c070d6f3d623d..3bf5ba8042e3d 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/vdev.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/vdev.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -36,13 +36,22 @@ extern "C" { #endif +typedef enum vdev_dtl_type { + DTL_MISSING, /* 0% replication: no copies of the data */ + DTL_PARTIAL, /* less than 100% replication: some copies missing */ + DTL_SCRUB, /* unable to fully repair during scrub/resilver */ + DTL_OUTAGE, /* temporarily missing (used to attempt detach) */ + DTL_TYPES +} vdev_dtl_type_t; + extern boolean_t zfs_nocacheflush; extern int vdev_open(vdev_t *); +extern void vdev_open_children(vdev_t *); +extern boolean_t vdev_uses_zvols(vdev_t *); extern int vdev_validate(vdev_t *); extern void vdev_close(vdev_t *); extern int vdev_create(vdev_t *, uint64_t txg, boolean_t isreplace); -extern void vdev_init(vdev_t *, uint64_t txg); extern void vdev_reopen(vdev_t *); extern int vdev_validate_aux(vdev_t *vd); extern zio_t *vdev_probe(vdev_t *vd, zio_t *pio); @@ -50,33 +59,40 @@ extern zio_t *vdev_probe(vdev_t *vd, zio_t *pio); extern boolean_t vdev_is_bootable(vdev_t *vd); extern vdev_t *vdev_lookup_top(spa_t *spa, uint64_t vdev); extern vdev_t *vdev_lookup_by_guid(vdev_t *vd, uint64_t guid); -extern void vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size); -extern int vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size); +extern void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t d, + uint64_t txg, uint64_t size); +extern boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t d, + uint64_t txg, uint64_t size); +extern boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t d); extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done); +extern boolean_t vdev_dtl_required(vdev_t *vd); extern boolean_t vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp); extern int vdev_metaslab_init(vdev_t *vd, uint64_t txg); extern void vdev_metaslab_fini(vdev_t *vd); +extern void vdev_metaslab_set_size(vdev_t *); +extern void vdev_expand(vdev_t *vd, uint64_t txg); +extern void vdev_split(vdev_t *vd); + extern void vdev_get_stats(vdev_t *vd, vdev_stat_t *vs); extern void vdev_clear_stats(vdev_t *vd); extern void vdev_stat_update(zio_t *zio, uint64_t psize); extern void vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete); -extern int vdev_getspec(spa_t *spa, uint64_t vdev, 
char **vdev_spec); extern void vdev_propagate_state(vdev_t *vd); extern void vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux); -extern void vdev_space_update(vdev_t *vd, int64_t space_delta, - int64_t alloc_delta, boolean_t update_root); +extern void vdev_space_update(vdev_t *vd, + int64_t alloc_delta, int64_t defer_delta, int64_t space_delta); extern uint64_t vdev_psize_to_asize(vdev_t *vd, uint64_t psize); -extern int vdev_fault(spa_t *spa, uint64_t guid); -extern int vdev_degrade(spa_t *spa, uint64_t guid); +extern int vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux); +extern int vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux); extern int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *); extern int vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags); @@ -101,11 +117,13 @@ extern void vdev_queue_io_done(zio_t *zio); extern void vdev_config_dirty(vdev_t *vd); extern void vdev_config_clean(vdev_t *vd); -extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg); +extern int vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg, + boolean_t); extern void vdev_state_dirty(vdev_t *vd); extern void vdev_state_clean(vdev_t *vd); +extern void vdev_top_config_generate(spa_t *spa, nvlist_t *config); extern nvlist_t *vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, boolean_t isspare, boolean_t isl2cache); @@ -123,7 +141,8 @@ typedef enum { VDEV_LABEL_REPLACE, /* replace an existing device */ VDEV_LABEL_SPARE, /* add a new hot spare */ VDEV_LABEL_REMOVE, /* remove an existing device */ - VDEV_LABEL_L2CACHE /* add an L2ARC cache device */ + VDEV_LABEL_L2CACHE, /* add an L2ARC cache device */ + VDEV_LABEL_SPLIT /* generating new label for split-off dev */ } vdev_labeltype_t; extern int vdev_label_init(vdev_t *vd, uint64_t txg, vdev_labeltype_t reason); diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/vdev_impl.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/vdev_impl.h index 26904d089a3be..238b9610f5859 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/vdev_impl.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/vdev_impl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -112,7 +112,9 @@ struct vdev { uint64_t vdev_id; /* child number in vdev parent */ uint64_t vdev_guid; /* unique ID for this vdev */ uint64_t vdev_guid_sum; /* self guid + all child guids */ + uint64_t vdev_orig_guid; /* orig. guid prior to remove */ uint64_t vdev_asize; /* allocatable device capacity */ + uint64_t vdev_min_asize; /* min acceptable asize */ uint64_t vdev_ashift; /* block alignment shift */ uint64_t vdev_state; /* see VDEV_STATE_* #defines */ uint64_t vdev_prevstate; /* used when reopening a vdev */ @@ -123,9 +125,13 @@ struct vdev { vdev_t *vdev_parent; /* parent vdev */ vdev_t **vdev_child; /* array of children */ uint64_t vdev_children; /* number of children */ - space_map_t vdev_dtl_map; /* dirty time log in-core state */ - space_map_t vdev_dtl_scrub; /* DTL for scrub repair writes */ + space_map_t vdev_dtl[DTL_TYPES]; /* in-core dirty time logs */ vdev_stat_t vdev_stat; /* virtual device statistics */ + boolean_t vdev_expanding; /* expand the vdev? */ + boolean_t vdev_reopening; /* reopen in progress? 
*/ + int vdev_open_error; /* error on last open */ + kthread_t *vdev_open_thread; /* thread opening children */ + uint64_t vdev_crtxg; /* txg when top-level was added */ /* * Top-level vdev state. @@ -140,16 +146,18 @@ struct vdev { txg_node_t vdev_txg_node; /* per-txg dirty vdev linkage */ boolean_t vdev_remove_wanted; /* async remove wanted? */ boolean_t vdev_probe_wanted; /* async probe wanted? */ + boolean_t vdev_removing; /* device is being removed? */ list_node_t vdev_config_dirty_node; /* config dirty list */ list_node_t vdev_state_dirty_node; /* state dirty list */ uint64_t vdev_deflate_ratio; /* deflation ratio (x512) */ uint64_t vdev_islog; /* is an intent log device */ + uint64_t vdev_ishole; /* is a hole in the namespace */ /* * Leaf vdev state. */ uint64_t vdev_psize; /* physical device capacity */ - space_map_obj_t vdev_dtl; /* dirty time log on-disk state */ + space_map_obj_t vdev_dtl_smo; /* dirty time log space map obj */ txg_node_t vdev_dtl_node; /* per-txg dirty DTL linkage */ uint64_t vdev_wholedisk; /* true if this is a whole disk */ uint64_t vdev_offline; /* persistent offline state */ @@ -160,12 +168,14 @@ struct vdev { char *vdev_path; /* vdev path (if any) */ char *vdev_devid; /* vdev devid (if any) */ char *vdev_physpath; /* vdev device path (if any) */ + char *vdev_fru; /* physical FRU location */ uint64_t vdev_not_present; /* not present during import */ uint64_t vdev_unspare; /* unspare when resilvering done */ hrtime_t vdev_last_try; /* last reopen time */ boolean_t vdev_nowritecache; /* true if flushwritecache failed */ boolean_t vdev_checkremove; /* temporary online test */ boolean_t vdev_forcefault; /* force online fault */ + boolean_t vdev_splitting; /* split or repair in progress */ uint8_t vdev_tmpoffline; /* device taken offline temporarily? */ uint8_t vdev_detached; /* device detached? 
*/ uint8_t vdev_cant_read; /* vdev is failing all reads */ @@ -176,6 +186,7 @@ struct vdev { vdev_cache_t vdev_cache; /* physical block cache */ spa_aux_vdev_t *vdev_aux; /* for l2cache vdevs */ zio_t *vdev_probe_zio; /* root of current probe */ + vdev_aux_t vdev_label_aux; /* on-disk aux state */ /* * For DTrace to work in userland (libzpool) context, these fields must @@ -189,8 +200,11 @@ struct vdev { kmutex_t vdev_probe_lock; /* protects vdev_probe_zio */ }; -#define VDEV_SKIP_SIZE (8 << 10) -#define VDEV_BOOT_HEADER_SIZE (8 << 10) +#define VDEV_RAIDZ_MAXPARITY 3 + +#define VDEV_PAD_SIZE (8 << 10) +/* 2 padding areas (vl_pad1 and vl_pad2) to skip */ +#define VDEV_SKIP_SIZE VDEV_PAD_SIZE * 2 #define VDEV_PHYS_SIZE (112 << 10) #define VDEV_UBERBLOCK_RING (128 << 10) @@ -202,26 +216,14 @@ struct vdev { offsetof(vdev_label_t, vl_uberblock[(n) << VDEV_UBERBLOCK_SHIFT(vd)]) #define VDEV_UBERBLOCK_SIZE(vd) (1ULL << VDEV_UBERBLOCK_SHIFT(vd)) -/* ZFS boot block */ -#define VDEV_BOOT_MAGIC 0x2f5b007b10cULL -#define VDEV_BOOT_VERSION 1 /* version number */ - -typedef struct vdev_boot_header { - uint64_t vb_magic; /* VDEV_BOOT_MAGIC */ - uint64_t vb_version; /* VDEV_BOOT_VERSION */ - uint64_t vb_offset; /* start offset (bytes) */ - uint64_t vb_size; /* size (bytes) */ - char vb_pad[VDEV_BOOT_HEADER_SIZE - 4 * sizeof (uint64_t)]; -} vdev_boot_header_t; - typedef struct vdev_phys { - char vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_block_tail_t)]; - zio_block_tail_t vp_zbt; + char vp_nvlist[VDEV_PHYS_SIZE - sizeof (zio_eck_t)]; + zio_eck_t vp_zbt; } vdev_phys_t; typedef struct vdev_label { - char vl_pad[VDEV_SKIP_SIZE]; /* 8K */ - vdev_boot_header_t vl_boot_header; /* 8K */ + char vl_pad1[VDEV_PAD_SIZE]; /* 8K */ + char vl_pad2[VDEV_PAD_SIZE]; /* 8K */ vdev_phys_t vl_vdev_phys; /* 112K */ char vl_uberblock[VDEV_UBERBLOCK_RING]; /* 128K */ } vdev_label_t; /* 256K total */ @@ -250,10 +252,14 @@ typedef struct vdev_label { #define VDEV_ALLOC_ADD 1 #define VDEV_ALLOC_SPARE 2 #define VDEV_ALLOC_L2CACHE 3 +#define VDEV_ALLOC_ROOTPOOL 4 +#define VDEV_ALLOC_SPLIT 5 /* * Allocate or free a vdev */ +extern vdev_t *vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, + vdev_ops_t *ops); extern int vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *config, vdev_t *parent, uint_t id, int alloctype); extern void vdev_free(vdev_t *vd); @@ -270,6 +276,7 @@ extern void vdev_remove_parent(vdev_t *cvd); /* * vdev sync load and sync */ +extern void vdev_load_log_state(vdev_t *nvd, vdev_t *ovd); extern void vdev_load(vdev_t *vd); extern void vdev_sync(vdev_t *vd, uint64_t txg); extern void vdev_sync_done(vdev_t *vd, uint64_t txg); @@ -285,13 +292,15 @@ extern vdev_ops_t vdev_raidz_ops; extern vdev_ops_t vdev_disk_ops; extern vdev_ops_t vdev_file_ops; extern vdev_ops_t vdev_missing_ops; +extern vdev_ops_t vdev_hole_ops; extern vdev_ops_t vdev_spare_ops; /* * Common size functions */ extern uint64_t vdev_default_asize(vdev_t *vd, uint64_t psize); -extern uint64_t vdev_get_rsize(vdev_t *vd); +extern uint64_t vdev_get_min_asize(vdev_t *vd); +extern void vdev_set_min_asize(vdev_t *vd); /* * zdb uses this tunable, so it must be declared here to make lint happy. diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zap.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zap.h index f88cc068bd579..3b9de2a2f93a0 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zap.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zap.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. 
All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_ZAP_H #define _SYS_ZAP_H -#pragma ident "%Z%%M% %I% %E% SMI" - /* * ZAP - ZFS Attribute Processor * @@ -87,9 +85,6 @@ extern "C" { #endif -#define ZAP_MAXNAMELEN 256 -#define ZAP_MAXVALUELEN 1024 - /* * The matchtype specifies which entry will be accessed. * MT_EXACT: only find an exact match (non-normalized) @@ -106,6 +101,18 @@ typedef enum matchtype MT_FIRST } matchtype_t; +typedef enum zap_flags { + /* Use 64-bit hash value (serialized cursors will always use 64-bits) */ + ZAP_FLAG_HASH64 = 1 << 0, + /* Key is binary, not string (zap_add_uint64() can be used) */ + ZAP_FLAG_UINT64_KEY = 1 << 1, + /* + * First word of key (which must be an array of uint64) is + * already randomly distributed. + */ + ZAP_FLAG_PRE_HASHED_KEY = 1 << 2, +} zap_flags_t; + /* * Create a new zapobj with no attributes and return its object number. * MT_EXACT will cause the zap object to only support MT_EXACT lookups, @@ -123,6 +130,9 @@ uint64_t zap_create(objset_t *ds, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); uint64_t zap_create_norm(objset_t *ds, int normflags, dmu_object_type_t ot, dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); +uint64_t zap_create_flags(objset_t *os, int normflags, zap_flags_t flags, + dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx); /* * Create a new zapobj with no attributes from the given (unallocated) @@ -185,6 +195,12 @@ int zap_lookup_norm(objset_t *ds, uint64_t zapobj, const char *name, uint64_t integer_size, uint64_t num_integers, void *buf, matchtype_t mt, char *realname, int rn_len, boolean_t *normalization_conflictp); +int zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf); +int zap_contains(objset_t *ds, uint64_t zapobj, const char *name); + +int zap_count_write(objset_t *os, uint64_t zapobj, const char *name, + int add, uint64_t *towrite, uint64_t *tooverwrite); /* * Create an attribute with the given name and value. @@ -192,9 +208,12 @@ int zap_lookup_norm(objset_t *ds, uint64_t zapobj, const char *name, * If an attribute with the given name already exists, the call will * fail and return EEXIST. */ -int zap_add(objset_t *ds, uint64_t zapobj, const char *name, +int zap_add(objset_t *ds, uint64_t zapobj, const char *key, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); +int zap_add_uint64(objset_t *ds, uint64_t zapobj, const uint64_t *key, + int key_numints, int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx); /* * Set the attribute with the given name to the given value. 
If an @@ -206,6 +225,9 @@ int zap_add(objset_t *ds, uint64_t zapobj, const char *name, */ int zap_update(objset_t *ds, uint64_t zapobj, const char *name, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); +int zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, + int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); /* * Get the length (in integers) and the integer size of the specified @@ -216,6 +238,8 @@ int zap_update(objset_t *ds, uint64_t zapobj, const char *name, */ int zap_length(objset_t *ds, uint64_t zapobj, const char *name, uint64_t *integer_size, uint64_t *num_integers); +int zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, uint64_t *integer_size, uint64_t *num_integers); /* * Remove the specified attribute. @@ -226,6 +250,8 @@ int zap_length(objset_t *ds, uint64_t zapobj, const char *name, int zap_remove(objset_t *ds, uint64_t zapobj, const char *name, dmu_tx_t *tx); int zap_remove_norm(objset_t *ds, uint64_t zapobj, const char *name, matchtype_t mt, dmu_tx_t *tx); +int zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, dmu_tx_t *tx); /* * Returns (in *count) the number of attributes in the specified zap @@ -257,6 +283,8 @@ int zap_join(objset_t *os, uint64_t fromobj, uint64_t intoobj, dmu_tx_t *tx); int zap_add_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx); int zap_remove_int(objset_t *os, uint64_t obj, uint64_t value, dmu_tx_t *tx); int zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value); +int zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta, + dmu_tx_t *tx); struct zap; struct zap_leaf; @@ -266,6 +294,7 @@ typedef struct zap_cursor { struct zap *zc_zap; struct zap_leaf *zc_leaf; uint64_t zc_zapobj; + uint64_t zc_serialized; uint64_t zc_hash; uint32_t zc_cd; } zap_cursor_t; @@ -316,6 +345,11 @@ void zap_cursor_advance(zap_cursor_t *zc); */ uint64_t zap_cursor_serialize(zap_cursor_t *zc); +/* + * Advance the cursor to the attribute having the given key. + */ +int zap_cursor_move_to_key(zap_cursor_t *zc, const char *name, matchtype_t mt); + /* * Initialize a zap cursor pointing to the position recorded by * zap_cursor_serialize (in the "serialized" argument). You can also diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zap_impl.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zap_impl.h index 0dc02ab6b0ac0..5aa0efc98d4f9 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zap_impl.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zap_impl.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_ZAP_IMPL_H #define _SYS_ZAP_IMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -42,13 +40,13 @@ extern int fzap_default_block_shift; #define FZAP_BLOCK_SHIFT(zap) ((zap)->zap_f.zap_block_shift) -#define ZAP_MAXCD (uint32_t)(-1) -#define ZAP_HASHBITS 28 #define MZAP_ENT_LEN 64 #define MZAP_NAME_LEN (MZAP_ENT_LEN - 8 - 4 - 2) #define MZAP_MAX_BLKSHIFT SPA_MAXBLOCKSHIFT #define MZAP_MAX_BLKSZ (1 << MZAP_MAX_BLKSHIFT) +#define ZAP_NEED_CD (-1U) + typedef struct mzap_ent_phys { uint64_t mze_value; uint32_t mze_cd; @@ -72,7 +70,6 @@ typedef struct mzap_ent { mzap_ent_phys_t mze_phys; } mzap_ent_t; - /* * The (fat) zap is stored in one object. 
It is an array of * 1<> (64 - (n)))) @@ -195,6 +199,8 @@ int fzap_count(zap_t *zap, uint64_t *count); int fzap_lookup(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, void *buf, char *realname, int rn_len, boolean_t *normalization_conflictp); +int fzap_count_write(zap_name_t *zn, int add, uint64_t *towrite, + uint64_t *tooverwrite); int fzap_add(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx); int fzap_update(zap_name_t *zn, @@ -209,7 +215,8 @@ void zap_put_leaf(struct zap_leaf *l); int fzap_add_cd(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, const void *val, uint32_t cd, dmu_tx_t *tx); -void fzap_upgrade(zap_t *zap, dmu_tx_t *tx); +void fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags); +int fzap_cursor_move_to_key(zap_cursor_t *zc, zap_name_t *zn); #ifdef __cplusplus } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zap_leaf.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zap_leaf.h index 14144e059e540..173b6b195e19f 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zap_leaf.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zap_leaf.h @@ -19,20 +19,20 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_ZAP_LEAF_H #define _SYS_ZAP_LEAF_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif struct zap; +struct zap_name; +struct zap_stats; #define ZAP_LEAF_MAGIC 0x2AB1EAF @@ -129,12 +129,12 @@ typedef struct zap_leaf_phys { typedef union zap_leaf_chunk { struct zap_leaf_entry { uint8_t le_type; /* always ZAP_CHUNK_ENTRY */ - uint8_t le_int_size; /* size of ints */ + uint8_t le_value_intlen; /* size of value's ints */ uint16_t le_next; /* next entry in hash chain */ uint16_t le_name_chunk; /* first chunk of the name */ - uint16_t le_name_length; /* bytes in name, incl null */ + uint16_t le_name_numints; /* ints in name (incl null) */ uint16_t le_value_chunk; /* first chunk of the value */ - uint16_t le_value_length; /* value length in ints */ + uint16_t le_value_numints; /* value length in ints */ uint32_t le_cd; /* collision differentiator */ uint64_t le_hash; /* hash value of the name */ } l_entry; @@ -177,7 +177,7 @@ typedef struct zap_entry_handle { * value must equal zap_hash(name). */ extern int zap_leaf_lookup(zap_leaf_t *l, - zap_name_t *zn, zap_entry_handle_t *zeh); + struct zap_name *zn, zap_entry_handle_t *zeh); /* * Return a handle to the entry with this hash+cd, or the entry with the @@ -193,10 +193,10 @@ extern int zap_leaf_lookup_closest(zap_leaf_t *l, * num_integers in the attribute. */ extern int zap_entry_read(const zap_entry_handle_t *zeh, - uint8_t integer_size, uint64_t num_integers, void *buf); + uint8_t integer_size, uint64_t num_integers, void *buf); -extern int zap_entry_read_name(const zap_entry_handle_t *zeh, - uint16_t buflen, char *buf); +extern int zap_entry_read_name(struct zap *zap, const zap_entry_handle_t *zeh, + uint16_t buflen, char *buf); /* * Replace the value of an existing entry. @@ -204,7 +204,7 @@ extern int zap_entry_read_name(const zap_entry_handle_t *zeh, * zap_entry_update may fail if it runs out of space (ENOSPC). */ extern int zap_entry_update(zap_entry_handle_t *zeh, - uint8_t integer_size, uint64_t num_integers, const void *buf); + uint8_t integer_size, uint64_t num_integers, const void *buf); /* * Remove an entry. 
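The zap.h hunks above add ZAP objects keyed by raw arrays of uint64_t rather than by strings: zap_create_flags() together with the new zap_flags_t bits, and the zap_add_uint64()/zap_lookup_uint64() family. A minimal sketch of a caller follows, assuming an already-open objset and an assigned transaction; the object-type argument and the 12-bit leaf/indirect block shifts are illustrative placeholders, not values taken from this patch.

#include <sys/dmu.h>
#include <sys/zap.h>

static int
zap_uint64_key_example(objset_t *os, dmu_object_type_t ot, dmu_tx_t *tx)
{
	/* Binary 64-bit keys require zap_create_flags(), not zap_create(). */
	zap_flags_t flags = ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY |
	    ZAP_FLAG_PRE_HASHED_KEY;
	uint64_t key[4] = { 0 };	/* e.g. a 256-bit checksum */
	uint64_t val = 42;
	uint64_t obj;
	int err;

	obj = zap_create_flags(os, 0, flags, ot,
	    12 /* leaf blockshift */, 12 /* indirect blockshift */,
	    DMU_OT_NONE, 0, tx);

	/* Store one 8-byte integer under the four-word key ... */
	err = zap_add_uint64(os, obj, key, 4, sizeof (uint64_t), 1, &val, tx);
	if (err != 0)
		return (err);

	/* ... and read it back. */
	return (zap_lookup_uint64(os, obj, key, 4, sizeof (uint64_t), 1, &val));
}

Per the flag comment above, ZAP_FLAG_PRE_HASHED_KEY is only appropriate when the first word of the key is already uniformly distributed (a checksum, for instance); otherwise it should be omitted.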
@@ -216,17 +216,16 @@ extern void zap_entry_remove(zap_entry_handle_t *zeh); * belong in this leaf (according to its hash value). Fills in the * entry handle on success. Returns 0 on success or ENOSPC on failure. */ -extern int zap_entry_create(zap_leaf_t *l, - const char *name, uint64_t h, uint32_t cd, - uint8_t integer_size, uint64_t num_integers, const void *buf, - zap_entry_handle_t *zeh); +extern int zap_entry_create(zap_leaf_t *l, struct zap_name *zn, uint32_t cd, + uint8_t integer_size, uint64_t num_integers, const void *buf, + zap_entry_handle_t *zeh); /* * Return true if there are additional entries with the same normalized * form. */ extern boolean_t zap_entry_normalization_conflict(zap_entry_handle_t *zeh, - zap_name_t *zn, const char *name, zap_t *zap); + struct zap_name *zn, const char *name, struct zap *zap); /* * Other stuff. @@ -235,7 +234,8 @@ extern boolean_t zap_entry_normalization_conflict(zap_entry_handle_t *zeh, extern void zap_leaf_init(zap_leaf_t *l, boolean_t sort); extern void zap_leaf_byteswap(zap_leaf_phys_t *buf, int len); extern void zap_leaf_split(zap_leaf_t *l, zap_leaf_t *nl, boolean_t sort); -extern void zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs); +extern void zap_leaf_stats(struct zap *zap, zap_leaf_t *l, + struct zap_stats *zs); #ifdef __cplusplus } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_acl.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_acl.h index bd91b33d16886..3488962e216f0 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_acl.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_acl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -114,8 +114,6 @@ typedef struct zfs_acl_phys { uint8_t z_ace_data[ZFS_ACE_SPACE]; /* space for embedded ACEs */ } zfs_acl_phys_t; - - typedef struct acl_ops { uint32_t (*ace_mask_get) (void *acep); /* get access mask */ void (*ace_mask_set) (void *acep, @@ -161,12 +159,21 @@ typedef struct zfs_acl { zfs_acl_node_t *z_curr_node; /* current node iterator is handling */ list_t z_acl; /* chunks of ACE data */ acl_ops_t z_ops; /* ACL operations */ - boolean_t z_has_fuids; /* FUIDs present in ACL? */ } zfs_acl_t; #define ACL_DATA_ALLOCED 0x1 #define ZFS_ACL_SIZE(aclcnt) (sizeof (ace_t) * (aclcnt)) +struct zfs_fuid_info; + +typedef struct zfs_acl_ids { + uint64_t z_fuid; /* file owner fuid */ + uint64_t z_fgid; /* file group owner fuid */ + uint64_t z_mode; /* mode to set on create */ + zfs_acl_t *z_aclp; /* ACL to create with file */ + struct zfs_fuid_info *z_fuidp; /* for tracking fuids for log */ +} zfs_acl_ids_t; + /* * Property values for acl_mode and acl_inherit. 
* @@ -183,17 +190,20 @@ typedef struct zfs_acl { struct znode; struct zfsvfs; -struct zfs_fuid_info; #ifdef _KERNEL -void zfs_perm_init(struct znode *, struct znode *, int, vattr_t *, - dmu_tx_t *, cred_t *, zfs_acl_t *, zfs_fuid_info_t **); +int zfs_acl_ids_create(struct znode *, int, vattr_t *, + cred_t *, vsecattr_t *, zfs_acl_ids_t *); +void zfs_acl_ids_free(zfs_acl_ids_t *); +boolean_t zfs_acl_ids_overquota(struct zfsvfs *, zfs_acl_ids_t *); int zfs_getacl(struct znode *, vsecattr_t *, boolean_t, cred_t *); int zfs_setacl(struct znode *, vsecattr_t *, boolean_t, cred_t *); void zfs_acl_rele(void *); void zfs_oldace_byteswap(ace_t *, int); void zfs_ace_byteswap(void *, size_t, boolean_t); +extern boolean_t zfs_has_access(struct znode *zp, cred_t *cr); extern int zfs_zaccess(struct znode *, int, int, boolean_t, cred_t *); +int zfs_fastaccesschk_execute(struct znode *, cred_t *); extern int zfs_zaccess_rwx(struct znode *, mode_t, int, cred_t *); extern int zfs_zaccess_unix(struct znode *, mode_t, cred_t *); extern int zfs_acl_access(struct znode *, int, cred_t *); @@ -202,9 +212,9 @@ int zfs_zaccess_delete(struct znode *, struct znode *, cred_t *); int zfs_zaccess_rename(struct znode *, struct znode *, struct znode *, struct znode *, cred_t *cr); void zfs_acl_free(zfs_acl_t *); -int zfs_vsec_2_aclp(struct zfsvfs *, vtype_t, vsecattr_t *, zfs_acl_t **); -int zfs_aclset_common(struct znode *, zfs_acl_t *, cred_t *, - struct zfs_fuid_info **, dmu_tx_t *); +int zfs_vsec_2_aclp(struct zfsvfs *, vtype_t, vsecattr_t *, cred_t *, + struct zfs_fuid_info **, zfs_acl_t **); +int zfs_aclset_common(struct znode *, zfs_acl_t *, cred_t *, dmu_tx_t *); #endif diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_context.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_context.h index a5be3e1303db2..558e9e1884e37 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_context.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_context.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_ZFS_CONTEXT_H #define _SYS_ZFS_CONTEXT_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -62,7 +60,9 @@ extern "C" { #include #include #include +#include #include +#include #define CPU_SEQID (CPU->cpu_seqid) diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_ctldir.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_ctldir.h index ce29625d1e3ad..c15c946d5dc17 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_ctldir.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_ctldir.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _ZFS_CTLDIR_H #define _ZFS_CTLDIR_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -66,6 +64,7 @@ int zfsctl_lookup_objset(vfs_t *vfsp, uint64_t objsetid, zfsvfs_t **zfsvfsp); #define ZFSCTL_INO_ROOT 0x1 #define ZFSCTL_INO_SNAPDIR 0x2 +#define ZFSCTL_INO_SHARES 0x3 #ifdef __cplusplus } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_dir.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_dir.h index ebb66e8ae4e90..f050f7f24de44 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_dir.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_dir.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_FS_ZFS_DIR_H #define _SYS_FS_ZFS_DIR_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -44,11 +42,11 @@ extern "C" { #define ZRENAMING 0x0010 /* znode is being renamed */ #define ZCILOOK 0x0020 /* case-insensitive lookup requested */ #define ZCIEXACT 0x0040 /* c-i requires c-s match (rename) */ +#define ZHAVELOCK 0x0080 /* z_name_lock is already held */ /* mknode flags */ #define IS_ROOT_NODE 0x01 /* create a root node */ #define IS_XATTR 0x02 /* create an extended attribute node */ -#define IS_REPLAY 0x04 /* we are replaying intent log */ extern int zfs_dirent_lock(zfs_dirlock_t **, znode_t *, char *, znode_t **, int, int *, pathname_t *); @@ -59,7 +57,7 @@ extern int zfs_link_destroy(zfs_dirlock_t *, znode_t *, dmu_tx_t *, int, extern int zfs_dirlook(znode_t *, char *, vnode_t **, int, int *, pathname_t *); extern void zfs_mknode(znode_t *, vattr_t *, dmu_tx_t *, cred_t *, - uint_t, znode_t **, int, zfs_acl_t *, zfs_fuid_info_t **); + uint_t, znode_t **, int, zfs_acl_ids_t *); extern void zfs_rmnode(znode_t *); extern void zfs_dl_name_switch(zfs_dirlock_t *dl, char *new, char **old); extern boolean_t zfs_dirempty(znode_t *); diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_fuid.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_fuid.h index 810ffc81a8ccb..0feb3ce4bb7ca 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_fuid.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_fuid.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_FS_ZFS_FUID_H #define _SYS_FS_ZFS_FUID_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef _KERNEL #include #include @@ -51,11 +49,11 @@ typedef enum { * Estimate space needed for one more fuid table entry. 
* for now assume its current size + 1K */ -#define FUID_SIZE_ESTIMATE(z) (z->z_fuid_size + (SPA_MINBLOCKSIZE << 1)) +#define FUID_SIZE_ESTIMATE(z) ((z)->z_fuid_size + (SPA_MINBLOCKSIZE << 1)) -#define FUID_INDEX(x) (x >> 32) -#define FUID_RID(x) (x & 0xffffffff) -#define FUID_ENCODE(idx, rid) ((idx << 32) | rid) +#define FUID_INDEX(x) ((x) >> 32) +#define FUID_RID(x) ((x) & 0xffffffff) +#define FUID_ENCODE(idx, rid) (((uint64_t)(idx) << 32) | (rid)) /* * FUIDs cause problems for the intent log * we need to replay the creation of the FUID, @@ -102,19 +100,27 @@ typedef struct zfs_fuid_info { #ifdef _KERNEL struct znode; extern uid_t zfs_fuid_map_id(zfsvfs_t *, uint64_t, cred_t *, zfs_fuid_type_t); +extern void zfs_fuid_node_add(zfs_fuid_info_t **, const char *, uint32_t, + uint64_t, uint64_t, zfs_fuid_type_t); extern void zfs_fuid_destroy(zfsvfs_t *); extern uint64_t zfs_fuid_create_cred(zfsvfs_t *, zfs_fuid_type_t, - dmu_tx_t *, cred_t *, zfs_fuid_info_t **); + cred_t *, zfs_fuid_info_t **); extern uint64_t zfs_fuid_create(zfsvfs_t *, uint64_t, cred_t *, zfs_fuid_type_t, - dmu_tx_t *, zfs_fuid_info_t **); -extern void zfs_fuid_map_ids(struct znode *zp, cred_t *cr, uid_t *uid, - uid_t *gid); + zfs_fuid_info_t **); +extern void zfs_fuid_map_ids(struct znode *zp, cred_t *cr, + uid_t *uid, uid_t *gid); extern zfs_fuid_info_t *zfs_fuid_info_alloc(void); -extern void zfs_fuid_info_free(); +extern void zfs_fuid_info_free(zfs_fuid_info_t *); extern boolean_t zfs_groupmember(zfsvfs_t *, uint64_t, cred_t *); +void zfs_fuid_sync(zfsvfs_t *, dmu_tx_t *); +extern int zfs_fuid_find_by_domain(zfsvfs_t *, const char *domain, + char **retdomain, boolean_t addok); +extern const char *zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx); +extern void zfs_fuid_txhold(zfsvfs_t *zfsvfs, dmu_tx_t *tx); #endif char *zfs_fuid_idx_domain(avl_tree_t *, uint32_t); +void zfs_fuid_avl_tree_create(avl_tree_t *, avl_tree_t *); uint64_t zfs_fuid_table_load(objset_t *, uint64_t, avl_tree_t *, avl_tree_t *); void zfs_fuid_table_destroy(avl_tree_t *, avl_tree_t *); diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_ioctl.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_ioctl.h index 1692608bb9ce6..90eecb812f23a 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_ioctl.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_ioctl.h @@ -19,19 +19,18 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_ZFS_IOCTL_H #define _SYS_ZFS_IOCTL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include #include +#include #ifdef _KERNEL #include @@ -47,26 +46,85 @@ extern "C" { #define ZFS_SNAPDIR_HIDDEN 0 #define ZFS_SNAPDIR_VISIBLE 1 -#define DMU_BACKUP_STREAM_VERSION (1ULL) -#define DMU_BACKUP_HEADER_VERSION (2ULL) +/* + * Field manipulation macros for the drr_versioninfo field of the + * send stream header. + */ + +/* + * Header types for zfs send streams. 
+ */ +typedef enum drr_headertype { + DMU_SUBSTREAM = 0x1, + DMU_COMPOUNDSTREAM = 0x2 +} drr_headertype_t; + +#define DMU_GET_STREAM_HDRTYPE(vi) BF64_GET((vi), 0, 2) +#define DMU_SET_STREAM_HDRTYPE(vi, x) BF64_SET((vi), 0, 2, x) + +#define DMU_GET_FEATUREFLAGS(vi) BF64_GET((vi), 2, 30) +#define DMU_SET_FEATUREFLAGS(vi, x) BF64_SET((vi), 2, 30, x) + +/* + * Feature flags for zfs send streams (flags in drr_versioninfo) + */ + +#define DMU_BACKUP_FEATURE_DEDUP (0x1) +#define DMU_BACKUP_FEATURE_DEDUPPROPS (0x2) + +/* + * Mask of all supported backup features + */ +#define DMU_BACKUP_FEATURE_MASK (DMU_BACKUP_FEATURE_DEDUP | \ + DMU_BACKUP_FEATURE_DEDUPPROPS) + +/* Are all features in the given flag word currently supported? */ +#define DMU_STREAM_SUPPORTED(x) (!((x) & ~DMU_BACKUP_FEATURE_MASK)) + +/* + * The drr_versioninfo field of the dmu_replay_record has the + * following layout: + * + * 64 56 48 40 32 24 16 8 0 + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * | reserved | feature-flags |C|S| + * +-------+-------+-------+-------+-------+-------+-------+-------+ + * + * The low order two bits indicate the header type: SUBSTREAM (0x1) + * or COMPOUNDSTREAM (0x2). Using two bits for this is historical: + * this field used to be a version number, where the two version types + * were 1 and 2. Using two bits for this allows earlier versions of + * the code to be able to recognize send streams that don't use any + * of the features indicated by feature flags. + */ + #define DMU_BACKUP_MAGIC 0x2F5bacbacULL #define DRR_FLAG_CLONE (1<<0) #define DRR_FLAG_CI_DATA (1<<1) +/* + * flags in the drr_checksumflags field in the DRR_WRITE and + * DRR_WRITE_BYREF blocks + */ +#define DRR_CHECKSUM_DEDUP (1<<0) + +#define DRR_IS_DEDUP_CAPABLE(flags) ((flags) & DRR_CHECKSUM_DEDUP) + /* * zfs ioctl command structure */ typedef struct dmu_replay_record { enum { DRR_BEGIN, DRR_OBJECT, DRR_FREEOBJECTS, - DRR_WRITE, DRR_FREE, DRR_END, + DRR_WRITE, DRR_FREE, DRR_END, DRR_WRITE_BYREF, + DRR_NUMTYPES } drr_type; uint32_t drr_payloadlen; union { struct drr_begin { uint64_t drr_magic; - uint64_t drr_version; + uint64_t drr_versioninfo; /* was drr_version */ uint64_t drr_creation_time; dmu_objset_type_t drr_type; uint32_t drr_flags; @@ -76,6 +134,7 @@ typedef struct dmu_replay_record { } drr_begin; struct drr_end { zio_cksum_t drr_checksum; + uint64_t drr_toguid; } drr_end; struct drr_object { uint64_t drr_object; @@ -83,14 +142,16 @@ typedef struct dmu_replay_record { dmu_object_type_t drr_bonustype; uint32_t drr_blksz; uint32_t drr_bonuslen; - uint8_t drr_checksum; + uint8_t drr_checksumtype; uint8_t drr_compress; uint8_t drr_pad[6]; + uint64_t drr_toguid; /* bonus content follows */ } drr_object; struct drr_freeobjects { uint64_t drr_firstobj; uint64_t drr_numobjs; + uint64_t drr_toguid; } drr_freeobjects; struct drr_write { uint64_t drr_object; @@ -98,13 +159,35 @@ typedef struct dmu_replay_record { uint32_t drr_pad; uint64_t drr_offset; uint64_t drr_length; + uint64_t drr_toguid; + uint8_t drr_checksumtype; + uint8_t drr_checksumflags; + uint8_t drr_pad2[6]; + ddt_key_t drr_key; /* deduplication key */ /* content follows */ } drr_write; struct drr_free { uint64_t drr_object; uint64_t drr_offset; uint64_t drr_length; + uint64_t drr_toguid; } drr_free; + struct drr_write_byref { + /* where to put the data */ + uint64_t drr_object; + uint64_t drr_offset; + uint64_t drr_length; + uint64_t drr_toguid; + /* where to find the prior copy of the data */ + uint64_t drr_refguid; + uint64_t drr_refobject; 
+ uint64_t drr_refoffset; + /* properties of the data */ + uint8_t drr_checksumtype; + uint8_t drr_checksumflags; + uint8_t drr_pad2[6]; + ddt_key_t drr_key; /* deduplication key */ + } drr_write_byref; } drr_u; } dmu_replay_record_t; @@ -118,7 +201,11 @@ typedef struct zinject_record { uint32_t zi_error; uint64_t zi_type; uint32_t zi_freq; - uint32_t zi_pad; /* pad out to 64 bit alignment */ + uint32_t zi_failfast; + char zi_func[MAXNAMELEN]; + uint32_t zi_iotype; + int32_t zi_duration; + uint64_t zi_timer; } zinject_record_t; #define ZINJECT_NULL 0x1 @@ -148,6 +235,7 @@ typedef struct zfs_cmd { char zc_name[MAXPATHLEN]; char zc_value[MAXPATHLEN * 2]; char zc_string[MAXNAMELEN]; + char zc_top_ds[MAXPATHLEN]; uint64_t zc_guid; uint64_t zc_nvlist_conf; /* really (char *) */ uint64_t zc_nvlist_conf_size; @@ -162,15 +250,27 @@ typedef struct zfs_cmd { uint64_t zc_history_len; uint64_t zc_history_offset; uint64_t zc_obj; + uint64_t zc_iflags; /* internal to zfs(7fs) */ zfs_share_t zc_share; dmu_objset_stats_t zc_objset_stats; struct drr_begin zc_begin_record; zinject_record_t zc_inject_record; + boolean_t zc_defer_destroy; + boolean_t zc_temphold; } zfs_cmd_t; +typedef struct zfs_useracct { + char zu_domain[256]; + uid_t zu_rid; + uint32_t zu_pad; + uint64_t zu_space; +} zfs_useracct_t; + #define ZVOL_MAX_MINOR (1 << 16) #define ZFS_MIN_MINOR (ZVOL_MAX_MINOR + 1) +#define ZPOOL_EXPORT_AFTER_SPLIT 0x1 + #ifdef _KERNEL typedef struct zfs_creat { @@ -185,7 +285,7 @@ extern int zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr); extern int zfs_secpolicy_destroy_perms(const char *name, cred_t *cr); extern int zfs_busy(void); -extern int zfs_unmount_snap(char *, void *); +extern int zfs_unmount_snap(const char *, void *); #endif /* _KERNEL */ diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_vfsops.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_vfsops.h index 87b75e6e75b5a..e961b756107b3 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_vfsops.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_vfsops.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_FS_ZFS_VFSOPS_H #define _SYS_FS_ZFS_VFSOPS_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -49,13 +47,13 @@ struct zfsvfs { uint64_t z_root; /* id of root znode */ uint64_t z_unlinkedobj; /* id of unlinked zapobj */ uint64_t z_max_blksz; /* maximum block size for files */ - uint64_t z_assign; /* TXG_NOWAIT or set by zil_replay() */ uint64_t z_fuid_obj; /* fuid table object number */ uint64_t z_fuid_size; /* fuid table size */ avl_tree_t z_fuid_idx; /* fuid tree keyed by index */ avl_tree_t z_fuid_domain; /* fuid tree keyed by domain */ krwlock_t z_fuid_lock; /* fuid lock */ boolean_t z_fuid_loaded; /* fuid tables are loaded */ + boolean_t z_fuid_dirty; /* need to sync fuid table ? 
*/ struct zfs_fuid_info *z_fuid_replay; /* fuid info for replay */ zilog_t *z_log; /* intent log pointer */ uint_t z_acl_mode; /* acl chmod/mode behavior */ @@ -74,8 +72,12 @@ struct zfsvfs { boolean_t z_issnap; /* true if this is a snapshot */ boolean_t z_vscan; /* virus scan on/off */ boolean_t z_use_fuids; /* version allows fuids */ - kmutex_t z_online_recv_lock; /* recv in prog grabs as WRITER */ + boolean_t z_replay; /* set during ZIL replay */ uint64_t z_version; /* ZPL version */ + uint64_t z_shares_dir; /* hidden shares dir */ + kmutex_t z_lock; + uint64_t z_userquota_obj; + uint64_t z_groupquota_obj; #define ZFS_OBJ_MTX_SZ 64 kmutex_t z_hold_mtx[ZFS_OBJ_MTX_SZ]; /* znode hold locks */ }; @@ -130,8 +132,20 @@ typedef struct zfid_long { extern uint_t zfs_fsyncer_key; -extern int zfs_suspend_fs(zfsvfs_t *zfsvfs, char *osname, int *mode); -extern int zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname, int mode); +extern int zfs_suspend_fs(zfsvfs_t *zfsvfs); +extern int zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname); +extern int zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, + const char *domain, uint64_t rid, uint64_t *valuep); +extern int zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, + uint64_t *cookiep, void *vbuf, uint64_t *bufsizep); +extern int zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, + const char *domain, uint64_t rid, uint64_t quota); +extern boolean_t zfs_usergroup_overquota(zfsvfs_t *zfsvfs, + boolean_t isgroup, uint64_t fuid); +extern int zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers); +extern int zfsvfs_create(const char *name, zfsvfs_t **zfvp); +extern void zfsvfs_free(zfsvfs_t *zfsvfs); +extern int zfs_check_global_label(const char *dsname, const char *hexsl); #ifdef __cplusplus } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_znode.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_znode.h index a5416525c7a37..a064627f157b6 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_znode.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zfs_znode.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -57,6 +57,7 @@ extern "C" { #define ZFS_OPAQUE 0x0000010000000000 #define ZFS_AV_QUARANTINED 0x0000020000000000 #define ZFS_AV_MODIFIED 0x0000040000000000 +#define ZFS_REPARSE 0x0000080000000000 #define ZFS_ATTR_SET(zp, attr, value) \ { \ @@ -77,6 +78,7 @@ extern "C" { #define ZFS_ACL_DEFAULTED 0x20 /* ACL should be defaulted */ #define ZFS_ACL_AUTO_INHERIT 0x40 /* ACL should be inherited */ #define ZFS_BONUS_SCANSTAMP 0x80 /* Scanstamp in bonus area */ +#define ZFS_NO_EXECS_DENIED 0x100 /* exec was given to everyone */ /* * Is ID ephemeral? @@ -93,12 +95,15 @@ extern "C" { /* * Special attributes for master node. + * "userquota@" and "groupquota@" are also valid (from + * zfs_userquota_prop_prefixes[]). 
*/ #define ZFS_FSID "FSID" #define ZFS_UNLINKED_SET "DELETE_QUEUE" #define ZFS_ROOT_OBJ "ROOT" #define ZPL_VERSION_STR "VERSION" #define ZFS_FUID_TABLES "FUID" +#define ZFS_SHARES_DIR "SHARES" #define ZFS_MAX_BLOCKSIZE (SPA_MAXBLOCKSIZE) @@ -171,6 +176,7 @@ typedef struct znode_phys { typedef struct zfs_dirlock { char *dl_name; /* directory entry being locked */ uint32_t dl_sharecnt; /* 0 if exclusive, > 0 if shared */ + uint8_t dl_namelock; /* 1 if z_name_lock is NOT held */ uint16_t dl_namesize; /* set if dl_name was allocated */ kcondvar_t dl_cv; /* wait for entry to be unlocked */ struct znode *dl_dzp; /* directory znode */ @@ -182,7 +188,6 @@ typedef struct znode { vnode_t *z_vnode; uint64_t z_id; /* object ID for this znode */ kmutex_t z_lock; /* znode modification lock */ - krwlock_t z_map_lock; /* page map lock */ krwlock_t z_parent_lock; /* parent lock for directories */ krwlock_t z_name_lock; /* "master" lock for dirent locks */ zfs_dirlock_t *z_dirlocks; /* directory entry lock list */ @@ -198,6 +203,7 @@ typedef struct znode { uint64_t z_gen; /* generation (same as zp_gen) */ uint32_t z_sync_cnt; /* synchronous open count */ kmutex_t z_acl_lock; /* acl data lock */ + zfs_acl_t *z_acl_cached; /* cached acl */ list_node_t z_link_node; /* all znodes in fs link */ /* * These are dmu managed fields. @@ -310,7 +316,6 @@ extern int zfs_create_op_tables(); extern int zfs_sync(vfs_t *vfsp, short flag, cred_t *cr); extern dev_t zfs_cmpldev(uint64_t); extern int zfs_get_zplprop(objset_t *os, zfs_prop_t prop, uint64_t *value); -extern int zfs_set_version(const char *name, uint64_t newvers); extern int zfs_get_stats(objset_t *os, nvlist_t *nv); extern void zfs_znode_dmu_fini(znode_t *); @@ -337,6 +342,7 @@ extern void zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp, vsecattr_t *vsecp, zfs_fuid_info_t *fuidp); extern void zfs_xvattr_set(znode_t *zp, xvattr_t *xvap); extern void zfs_upgrade(zfsvfs_t *zfsvfs, dmu_tx_t *tx); +extern int zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx); extern caddr_t zfs_map_page(page_t *, enum seg_rw); extern void zfs_unmap_page(page_t *, caddr_t); diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zil.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zil.h index 4d02d14f70756..b603241db733c 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zil.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zil.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -55,28 +55,40 @@ typedef struct zil_header { uint64_t zh_claim_txg; /* txg in which log blocks were claimed */ uint64_t zh_replay_seq; /* highest replayed sequence number */ blkptr_t zh_log; /* log chain */ - uint64_t zh_claim_seq; /* highest claimed sequence number */ - uint64_t zh_pad[5]; + uint64_t zh_claim_blk_seq; /* highest claimed block sequence number */ + uint64_t zh_flags; /* header flags */ + uint64_t zh_claim_lr_seq; /* highest claimed lr sequence number */ + uint64_t zh_pad[3]; } zil_header_t; /* - * Log block trailer - structure at the end of the header and each log block + * zh_flags bit settings + */ +#define ZIL_REPLAY_NEEDED 0x1 /* replay needed - internal only */ +#define ZIL_CLAIM_LR_SEQ_VALID 0x2 /* zh_claim_lr_seq field is valid */ + +/* + * Log block chaining. * - * The zit_bt contains a zbt_cksum which for the intent log is + * Log blocks are chained together. 
Originally they were chained at the + * end of the block. For performance reasons the chain was moved to the + * beginning of the block which allows writes for only the data being used. + * The older position is supported for backwards compatibility. + * + * The zio_eck_t contains a zec_cksum which for the intent log is * the sequence number of this log block. A seq of 0 is invalid. - * The zbt_cksum is checked by the SPA against the sequence + * The zec_cksum is checked by the SPA against the sequence * number passed in the blk_cksum field of the blkptr_t */ -typedef struct zil_trailer { - uint64_t zit_pad; - blkptr_t zit_next_blk; /* next block in chain */ - uint64_t zit_nused; /* bytes in log block used */ - zio_block_tail_t zit_bt; /* block trailer */ -} zil_trailer_t; +typedef struct zil_chain { + uint64_t zc_pad; + blkptr_t zc_next_blk; /* next block in chain */ + uint64_t zc_nused; /* bytes in log block used */ + zio_eck_t zc_eck; /* block trailer */ +} zil_chain_t; #define ZIL_MIN_BLKSZ 4096ULL #define ZIL_MAX_BLKSZ SPA_MAXBLOCKSIZE -#define ZIL_BLK_DATA_SZ(lwb) ((lwb)->lwb_sz - sizeof (zil_trailer_t)) /* * The words of a log block checksum. @@ -133,7 +145,8 @@ typedef enum zil_create { #define TX_MKDIR_ACL 17 /* mkdir with ACL */ #define TX_MKDIR_ATTR 18 /* mkdir with attr */ #define TX_MKDIR_ACL_ATTR 19 /* mkdir with ACL + attrs */ -#define TX_MAX_TYPE 20 /* Max transaction type */ +#define TX_WRITE2 20 /* dmu_sync EALREADY write */ +#define TX_MAX_TYPE 21 /* Max transaction type */ /* * The transactions for mkdir, symlink, remove, rmdir, link, and rename @@ -142,6 +155,20 @@ typedef enum zil_create { */ #define TX_CI ((uint64_t)0x1 << 63) /* case-insensitive behavior requested */ +/* + * Transactions for write, truncate, setattr, acl_v0, and acl can be logged + * out of order. For convenience in the code, all such records must have + * lr_foid at the same offset. + */ +#define TX_OOO(txtype) \ + ((txtype) == TX_WRITE || \ + (txtype) == TX_TRUNCATE || \ + (txtype) == TX_SETATTR || \ + (txtype) == TX_ACL_V0 || \ + (txtype) == TX_ACL || \ + (txtype) == TX_WRITE2) + + /* * Format of log records. * The fields are carefully defined to allow them to be aligned @@ -161,6 +188,14 @@ typedef struct { /* common log record header */ uint64_t lrc_seq; /* see comment above */ } lr_t; +/* + * Common start of all out-of-order record types (TX_OOO() above). + */ +typedef struct { + lr_t lr_common; /* common portion of log record */ + uint64_t lr_foid; /* object id */ +} lr_ooo_t; + /* * Handle option extended vattr attributes. * @@ -251,7 +286,7 @@ typedef struct { uint64_t lr_foid; /* file object to write */ uint64_t lr_offset; /* offset to write to */ uint64_t lr_length; /* user data length to write */ - uint64_t lr_blkoff; /* offset represented by lr_blkptr */ + uint64_t lr_blkoff; /* no longer used */ blkptr_t lr_blkptr; /* spa block pointer for replay */ /* write data will follow for small writes */ } lr_write_t; @@ -299,13 +334,34 @@ typedef struct { */ /* - * ZFS intent log transaction structure + * Writes are handled in three different ways: + * + * WR_INDIRECT: + * In this mode, if we need to commit the write later, then the block + * is immediately written into the file system (using dmu_sync), + * and a pointer to the block is put into the log record. + * When the txg commits the block is linked in. + * This saves additionally writing the data into the log record. 
+ * There are a few requirements for this to occur: + * - write is greater than zfs/zvol_immediate_write_sz + * - not using slogs (as slogs are assumed to always be faster + * than writing into the main pool) + * - the write occupies only one block + * WR_COPIED: + * If we know we'll immediately be committing the + * transaction (FSYNC or FDSYNC), then we allocate a larger + * log record here for the data and copy the data in. + * WR_NEED_COPY: + * Otherwise we don't allocate a buffer, and *if* we need to + * flush the write later then a buffer is allocated and + * we retrieve the data using the dmu. */ typedef enum { WR_INDIRECT, /* indirect - a large write (dmu_sync() data */ /* and put blkptr in log, rather than actual data) */ WR_COPIED, /* immediate - data is copied into lr_write_t */ WR_NEED_COPY, /* immediate - data needs to be copied if pushed */ + WR_NUM_STATES /* number of states */ } itx_wr_state_t; typedef struct itx { @@ -318,27 +374,14 @@ typedef struct itx { /* followed by type-specific part of lr_xx_t and its immediate data */ } itx_t; - -/* - * zgd_t is passed through dmu_sync() to the callback routine zfs_get_done() - * to handle the cleanup of the dmu_sync() buffer write - */ -typedef struct { - zilog_t *zgd_zilog; /* zilog */ - blkptr_t *zgd_bp; /* block pointer */ - struct rl *zgd_rl; /* range lock */ -} zgd_t; - - -typedef void zil_parse_blk_func_t(zilog_t *zilog, blkptr_t *bp, void *arg, +typedef int zil_parse_blk_func_t(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t txg); -typedef void zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg, +typedef int zil_parse_lr_func_t(zilog_t *zilog, lr_t *lr, void *arg, uint64_t txg); typedef int zil_replay_func_t(); -typedef void zil_replay_cleaner_t(); typedef int zil_get_data_t(void *arg, lr_write_t *lr, char *dbuf, zio_t *zio); -extern uint64_t zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, +extern int zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg); extern void zil_init(void); @@ -350,28 +393,31 @@ extern void zil_free(zilog_t *zilog); extern zilog_t *zil_open(objset_t *os, zil_get_data_t *get_data); extern void zil_close(zilog_t *zilog); -extern void zil_replay(objset_t *os, void *arg, uint64_t *txgp, - zil_replay_func_t *replay_func[TX_MAX_TYPE], - zil_replay_cleaner_t *replay_cleaner); +extern void zil_replay(objset_t *os, void *arg, + zil_replay_func_t *replay_func[TX_MAX_TYPE]); +extern boolean_t zil_replaying(zilog_t *zilog, dmu_tx_t *tx); extern void zil_destroy(zilog_t *zilog, boolean_t keep_first); extern void zil_rollback_destroy(zilog_t *zilog, dmu_tx_t *tx); extern itx_t *zil_itx_create(uint64_t txtype, size_t lrsize); +extern void zil_itx_destroy(itx_t *itx); extern uint64_t zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx); extern void zil_commit(zilog_t *zilog, uint64_t seq, uint64_t oid); -extern int zil_claim(char *osname, void *txarg); -extern int zil_check_log_chain(char *osname, void *txarg); -extern int zil_clear_log_chain(char *osname, void *txarg); +extern int zil_vdev_offline(const char *osname, void *txarg); +extern int zil_claim(const char *osname, void *txarg); +extern int zil_check_log_chain(const char *osname, void *txarg); extern void zil_sync(zilog_t *zilog, dmu_tx_t *tx); extern void zil_clean(zilog_t *zilog); -extern int zil_is_committed(zilog_t *zilog); extern int zil_suspend(zilog_t *zilog); extern void zil_resume(zilog_t *zilog); -extern void zil_add_block(zilog_t *zilog, blkptr_t 
*bp); +extern void zil_add_block(zilog_t *zilog, const blkptr_t *bp); +extern int zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp); + +extern void zil_set_logbias(zilog_t *zilog, uint64_t slogval); extern int zil_disable; diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zil_impl.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zil_impl.h index 0fc800b96dea9..c46063b0527af 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zil_impl.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zil_impl.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_ZIL_IMPL_H #define _SYS_ZIL_IMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include @@ -45,8 +43,8 @@ typedef struct lwb { int lwb_sz; /* size of block and buffer */ char *lwb_buf; /* log write buffer */ zio_t *lwb_zio; /* zio for this buffer */ + dmu_tx_t *lwb_tx; /* tx for log block allocation */ uint64_t lwb_max_txg; /* highest txg in this lwb */ - txg_handle_t lwb_txgh; /* txg handle for txg_exit() */ list_node_t lwb_node; /* zilog->zl_lwb_list linkage */ } lwb_t; @@ -59,6 +57,8 @@ typedef struct zil_vdev_node { avl_node_t zv_node; /* AVL tree linkage */ } zil_vdev_node_t; +#define ZIL_PREV_BLKS 16 + /* * Stable storage intent log management structure. One per dataset. */ @@ -70,20 +70,27 @@ struct zilog { objset_t *zl_os; /* object set we're logging */ zil_get_data_t *zl_get_data; /* callback to get object content */ zio_t *zl_root_zio; /* log writer root zio */ - uint64_t zl_itx_seq; /* next itx sequence number */ + uint64_t zl_itx_seq; /* next in-core itx sequence number */ + uint64_t zl_lr_seq; /* on-disk log record sequence number */ uint64_t zl_commit_seq; /* committed upto this number */ - uint64_t zl_lr_seq; /* log record sequence number */ + uint64_t zl_commit_lr_seq; /* last committed on-disk lr seq */ uint64_t zl_destroy_txg; /* txg of last zil_destroy() */ - uint64_t zl_replay_seq[TXG_SIZE]; /* seq of last replayed rec */ + uint64_t zl_replayed_seq[TXG_SIZE]; /* last replayed rec seq */ + uint64_t zl_replaying_seq; /* current replay seq number */ uint32_t zl_suspend; /* log suspend count */ kcondvar_t zl_cv_writer; /* log writer thread completion */ kcondvar_t zl_cv_suspend; /* log suspend completion */ uint8_t zl_suspending; /* log is currently suspending */ uint8_t zl_keep_first; /* keep first log block in destroy */ - uint8_t zl_stop_replay; /* don't replay any further */ + uint8_t zl_replay; /* replaying records while set */ uint8_t zl_stop_sync; /* for debugging */ uint8_t zl_writer; /* boolean: write setup in progress */ - uint8_t zl_log_error; /* boolean: log write error */ + uint8_t zl_logbias; /* latency or throughput */ + int zl_parse_error; /* last zil_parse() error */ + uint64_t zl_parse_blk_seq; /* highest blk seq on last parse */ + uint64_t zl_parse_lr_seq; /* highest lr seq on last parse */ + uint64_t zl_parse_blk_count; /* number of blocks parsed */ + uint64_t zl_parse_lr_count; /* number of log records parsed */ list_t zl_itx_list; /* in-memory itx list */ uint64_t zl_itx_list_sz; /* total size of records on list */ uint64_t zl_cur_used; /* current commit log size used */ @@ -92,15 +99,21 @@ struct zilog { kmutex_t zl_vdev_lock; /* protects zl_vdev_tree */ avl_tree_t zl_vdev_tree; /* vdevs to flush in zil_commit() */ taskq_t *zl_clean_taskq; /* runs lwb and itx clean tasks */ - avl_tree_t zl_dva_tree; /* track DVAs 
during log parse */ + avl_tree_t zl_bp_tree; /* track bps during log parse */ clock_t zl_replay_time; /* lbolt of when replay started */ uint64_t zl_replay_blks; /* number of log blocks replayed */ + zil_header_t zl_old_header; /* debugging aid */ + uint_t zl_prev_blks[ZIL_PREV_BLKS]; /* size - sector rounded */ + uint_t zl_prev_rotor; /* rotor for zl_prev[] */ }; -typedef struct zil_dva_node { +typedef struct zil_bp_node { dva_t zn_dva; avl_node_t zn_node; -} zil_dva_node_t; +} zil_bp_node_t; + +#define ZIL_MAX_LOG_DATA (SPA_MAXBLOCKSIZE - sizeof (zil_chain_t) - \ + sizeof (lr_write_t)) #ifdef __cplusplus } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zio.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zio.h index 4de78dfee0141..b81b6a4392b0a 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zio.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zio.h @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -38,12 +38,15 @@ extern "C" { #endif -#define ZBT_MAGIC 0x210da7ab10c7a11ULL /* zio data bloc tail */ +/* + * Embedded checksum + */ +#define ZEC_MAGIC 0x210da7ab10c7a11ULL -typedef struct zio_block_tail { - uint64_t zbt_magic; /* for validation, endianness */ - zio_cksum_t zbt_cksum; /* 256-bit checksum */ -} zio_block_tail_t; +typedef struct zio_eck { + uint64_t zec_magic; /* for validation, endianness */ + zio_cksum_t zec_cksum; /* 256-bit checksum */ +} zio_eck_t; /* * Gang block headers are self-checksumming and contain an array @@ -51,16 +54,16 @@ typedef struct zio_block_tail { */ #define SPA_GANGBLOCKSIZE SPA_MINBLOCKSIZE #define SPA_GBH_NBLKPTRS ((SPA_GANGBLOCKSIZE - \ - sizeof (zio_block_tail_t)) / sizeof (blkptr_t)) + sizeof (zio_eck_t)) / sizeof (blkptr_t)) #define SPA_GBH_FILLER ((SPA_GANGBLOCKSIZE - \ - sizeof (zio_block_tail_t) - \ + sizeof (zio_eck_t) - \ (SPA_GBH_NBLKPTRS * sizeof (blkptr_t))) /\ sizeof (uint64_t)) typedef struct zio_gbh { blkptr_t zg_blkptr[SPA_GBH_NBLKPTRS]; uint64_t zg_filler[SPA_GBH_FILLER]; - zio_block_tail_t zg_tail; + zio_eck_t zg_tail; } zio_gbh_phys_t; enum zio_checksum { @@ -73,12 +76,19 @@ enum zio_checksum { ZIO_CHECKSUM_FLETCHER_2, ZIO_CHECKSUM_FLETCHER_4, ZIO_CHECKSUM_SHA256, + ZIO_CHECKSUM_ZILOG2, ZIO_CHECKSUM_FUNCTIONS }; -#define ZIO_CHECKSUM_ON_VALUE ZIO_CHECKSUM_FLETCHER_2 +#define ZIO_CHECKSUM_ON_VALUE ZIO_CHECKSUM_FLETCHER_4 #define ZIO_CHECKSUM_DEFAULT ZIO_CHECKSUM_ON +#define ZIO_CHECKSUM_MASK 0xffULL +#define ZIO_CHECKSUM_VERIFY (1 << 8) + +#define ZIO_DEDUPCHECKSUM ZIO_CHECKSUM_SHA256 +#define ZIO_DEDUPDITTO_MIN 100 + enum zio_compress { ZIO_COMPRESS_INHERIT = 0, ZIO_COMPRESS_ON, @@ -94,12 +104,19 @@ enum zio_compress { ZIO_COMPRESS_GZIP_7, ZIO_COMPRESS_GZIP_8, ZIO_COMPRESS_GZIP_9, + ZIO_COMPRESS_ZLE, ZIO_COMPRESS_FUNCTIONS }; #define ZIO_COMPRESS_ON_VALUE ZIO_COMPRESS_LZJB #define ZIO_COMPRESS_DEFAULT ZIO_COMPRESS_OFF +#define BOOTFS_COMPRESS_VALID(compress) \ + ((compress) == ZIO_COMPRESS_LZJB || \ + ((compress) == ZIO_COMPRESS_ON && \ + ZIO_COMPRESS_ON_VALUE == ZIO_COMPRESS_LZJB) || \ + (compress) == ZIO_COMPRESS_OFF) + #define ZIO_FAILURE_MODE_WAIT 0 #define ZIO_FAILURE_MODE_CONTINUE 1 #define ZIO_FAILURE_MODE_PANIC 2 @@ -107,65 +124,88 @@ enum zio_compress { #define ZIO_PRIORITY_NOW (zio_priority_table[0]) #define ZIO_PRIORITY_SYNC_READ (zio_priority_table[1]) #define ZIO_PRIORITY_SYNC_WRITE (zio_priority_table[2]) -#define ZIO_PRIORITY_ASYNC_READ 
(zio_priority_table[3]) -#define ZIO_PRIORITY_ASYNC_WRITE (zio_priority_table[4]) -#define ZIO_PRIORITY_FREE (zio_priority_table[5]) -#define ZIO_PRIORITY_CACHE_FILL (zio_priority_table[6]) -#define ZIO_PRIORITY_LOG_WRITE (zio_priority_table[7]) -#define ZIO_PRIORITY_RESILVER (zio_priority_table[8]) -#define ZIO_PRIORITY_SCRUB (zio_priority_table[9]) -#define ZIO_PRIORITY_TABLE_SIZE 10 - -#define ZIO_FLAG_MUSTSUCCEED 0x00000 -#define ZIO_FLAG_CANFAIL 0x00001 -#define ZIO_FLAG_SPECULATIVE 0x00002 -#define ZIO_FLAG_CONFIG_WRITER 0x00004 -#define ZIO_FLAG_DONT_RETRY 0x00008 - -#define ZIO_FLAG_DONT_CACHE 0x00010 -#define ZIO_FLAG_DONT_QUEUE 0x00020 -#define ZIO_FLAG_DONT_AGGREGATE 0x00040 -#define ZIO_FLAG_DONT_PROPAGATE 0x00080 - -#define ZIO_FLAG_IO_BYPASS 0x00100 -#define ZIO_FLAG_IO_REPAIR 0x00200 -#define ZIO_FLAG_IO_RETRY 0x00400 -#define ZIO_FLAG_IO_REWRITE 0x00800 - -#define ZIO_FLAG_PROBE 0x01000 -#define ZIO_FLAG_RESILVER 0x02000 -#define ZIO_FLAG_SCRUB 0x04000 -#define ZIO_FLAG_SCRUB_THREAD 0x08000 - -#define ZIO_FLAG_GANG_CHILD 0x10000 - -#define ZIO_FLAG_GANG_INHERIT \ - (ZIO_FLAG_CANFAIL | \ - ZIO_FLAG_SPECULATIVE | \ - ZIO_FLAG_CONFIG_WRITER | \ - ZIO_FLAG_DONT_RETRY | \ - ZIO_FLAG_DONT_CACHE | \ - ZIO_FLAG_DONT_AGGREGATE | \ - ZIO_FLAG_RESILVER | \ - ZIO_FLAG_SCRUB | \ - ZIO_FLAG_SCRUB_THREAD) - -#define ZIO_FLAG_VDEV_INHERIT \ - (ZIO_FLAG_GANG_INHERIT | \ - ZIO_FLAG_IO_REPAIR | \ - ZIO_FLAG_IO_RETRY | \ - ZIO_FLAG_PROBE) +#define ZIO_PRIORITY_LOG_WRITE (zio_priority_table[3]) +#define ZIO_PRIORITY_CACHE_FILL (zio_priority_table[4]) +#define ZIO_PRIORITY_AGG (zio_priority_table[5]) +#define ZIO_PRIORITY_FREE (zio_priority_table[6]) +#define ZIO_PRIORITY_ASYNC_WRITE (zio_priority_table[7]) +#define ZIO_PRIORITY_ASYNC_READ (zio_priority_table[8]) +#define ZIO_PRIORITY_RESILVER (zio_priority_table[9]) +#define ZIO_PRIORITY_SCRUB (zio_priority_table[10]) +#define ZIO_PRIORITY_TABLE_SIZE 11 #define ZIO_PIPELINE_CONTINUE 0x100 #define ZIO_PIPELINE_STOP 0x101 +enum zio_flag { + /* + * Flags inherited by gang, ddt, and vdev children, + * and that must be equal for two zios to aggregate + */ + ZIO_FLAG_DONT_AGGREGATE = 1 << 0, + ZIO_FLAG_IO_REPAIR = 1 << 1, + ZIO_FLAG_SELF_HEAL = 1 << 2, + ZIO_FLAG_RESILVER = 1 << 3, + ZIO_FLAG_SCRUB = 1 << 4, + ZIO_FLAG_SCRUB_THREAD = 1 << 5, + +#define ZIO_FLAG_AGG_INHERIT (ZIO_FLAG_CANFAIL - 1) + + /* + * Flags inherited by ddt, gang, and vdev children. + */ + ZIO_FLAG_CANFAIL = 1 << 6, /* must be first for INHERIT */ + ZIO_FLAG_SPECULATIVE = 1 << 7, + ZIO_FLAG_CONFIG_WRITER = 1 << 8, + ZIO_FLAG_DONT_RETRY = 1 << 9, + ZIO_FLAG_DONT_CACHE = 1 << 10, + ZIO_FLAG_NODATA = 1 << 11, + ZIO_FLAG_INDUCE_DAMAGE = 1 << 12, + +#define ZIO_FLAG_DDT_INHERIT (ZIO_FLAG_IO_RETRY - 1) +#define ZIO_FLAG_GANG_INHERIT (ZIO_FLAG_IO_RETRY - 1) + + /* + * Flags inherited by vdev children. + */ + ZIO_FLAG_IO_RETRY = 1 << 13, /* must be first for INHERIT */ + ZIO_FLAG_PROBE = 1 << 14, + ZIO_FLAG_TRYHARD = 1 << 15, + ZIO_FLAG_OPTIONAL = 1 << 16, + +#define ZIO_FLAG_VDEV_INHERIT (ZIO_FLAG_DONT_QUEUE - 1) + + /* + * Flags not inherited by any children. 
+ */ + ZIO_FLAG_DONT_QUEUE = 1 << 17, /* must be first for INHERIT */ + ZIO_FLAG_DONT_PROPAGATE = 1 << 18, + ZIO_FLAG_IO_BYPASS = 1 << 19, + ZIO_FLAG_IO_REWRITE = 1 << 20, + ZIO_FLAG_RAW = 1 << 21, + ZIO_FLAG_GANG_CHILD = 1 << 22, + ZIO_FLAG_DDT_CHILD = 1 << 23, + ZIO_FLAG_GODFATHER = 1 << 24 +}; + +#define ZIO_FLAG_MUSTSUCCEED 0 + +#define ZIO_DDT_CHILD_FLAGS(zio) \ + (((zio)->io_flags & ZIO_FLAG_DDT_INHERIT) | \ + ZIO_FLAG_DDT_CHILD | ZIO_FLAG_CANFAIL) + #define ZIO_GANG_CHILD_FLAGS(zio) \ (((zio)->io_flags & ZIO_FLAG_GANG_INHERIT) | \ ZIO_FLAG_GANG_CHILD | ZIO_FLAG_CANFAIL) +#define ZIO_VDEV_CHILD_FLAGS(zio) \ + (((zio)->io_flags & ZIO_FLAG_VDEV_INHERIT) | \ + ZIO_FLAG_CANFAIL) + enum zio_child { ZIO_CHILD_VDEV = 0, ZIO_CHILD_GANG, + ZIO_CHILD_DDT, ZIO_CHILD_LOGICAL, ZIO_CHILD_TYPES }; @@ -183,7 +223,6 @@ enum zio_wait_type { #define ECKSUM EBADE #define EFRAGS EBADR -typedef struct zio zio_t; typedef void zio_done_func_t(zio_t *zio); extern uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE]; @@ -192,18 +231,15 @@ extern char *zio_type_name[ZIO_TYPES]; /* * A bookmark is a four-tuple <objset, object, level, blkid> that uniquely * identifies any block in the pool. By convention, the meta-objset (MOS) - * is objset 0, the meta-dnode is object 0, the root block (osphys_t) is - * level -1 of the meta-dnode, and intent log blocks (which are chained - * off the root block) have blkid == sequence number. In summary: + * is objset 0, and the meta-dnode is object 0. This covers all blocks + * except root blocks and ZIL blocks, which are defined as follows: * - * mos is objset 0 - * meta-dnode is object 0 - * root block is <objset, 0, -1, 0> - * intent log is <objset, 0, -1, blkid> + * Root blocks (objset_phys_t) are object 0, level -1: <objset, 0, -1, 0>. + * ZIL blocks are bookmarked <objset, 0, -2, blkid == ZIL sequence number>. + * dmu_sync()ed ZIL data blocks are bookmarked <objset, object, -2, blkid == ZIL offset>. * - * Note: this structure is called a bookmark because its first purpose was - * to remember where to resume a pool-wide traverse. The absolute ordering - * for block visitation during traversal is defined in compare_bookmark(). + * Note: this structure is called a bookmark because its original purpose + * was to remember where to resume a pool-wide traverse. * * Note: this structure is passed between userland and the kernel. 
* Therefore it must not change size or alignment between 32/64 bit @@ -216,14 +252,66 @@ typedef struct zbookmark { uint64_t zb_blkid; } zbookmark_t; +#define SET_BOOKMARK(zb, objset, object, level, blkid) \ +{ \ + (zb)->zb_objset = objset; \ + (zb)->zb_object = object; \ + (zb)->zb_level = level; \ + (zb)->zb_blkid = blkid; \ +} + +#define ZB_DESTROYED_OBJSET (-1ULL) + +#define ZB_ROOT_OBJECT (0ULL) +#define ZB_ROOT_LEVEL (-1LL) +#define ZB_ROOT_BLKID (0ULL) + +#define ZB_ZIL_OBJECT (0ULL) +#define ZB_ZIL_LEVEL (-2LL) + typedef struct zio_prop { enum zio_checksum zp_checksum; enum zio_compress zp_compress; dmu_object_type_t zp_type; uint8_t zp_level; - uint8_t zp_ndvas; + uint8_t zp_copies; + uint8_t zp_dedup; + uint8_t zp_dedup_verify; } zio_prop_t; +typedef struct zio_cksum_report zio_cksum_report_t; + +typedef void zio_cksum_finish_f(zio_cksum_report_t *rep, + const void *good_data); +typedef void zio_cksum_free_f(void *cbdata, size_t size); + +struct zio_bad_cksum; /* defined in zio_checksum.h */ + +struct zio_cksum_report { + struct zio_cksum_report *zcr_next; + nvlist_t *zcr_ereport; + nvlist_t *zcr_detector; + void *zcr_cbdata; + size_t zcr_cbinfo; /* passed to zcr_free() */ + uint64_t zcr_align; + uint64_t zcr_length; + zio_cksum_finish_f *zcr_finish; + zio_cksum_free_f *zcr_free; + + /* internal use only */ + struct zio_bad_cksum *zcr_ckinfo; /* information from failure */ +}; + +typedef void zio_vsd_cksum_report_f(zio_t *zio, zio_cksum_report_t *zcr, + void *arg); + +zio_vsd_cksum_report_f zio_vsd_default_cksum_report; + +typedef struct zio_vsd_ops { + zio_done_func_t *vsd_free; + zio_vsd_cksum_report_f *vsd_cksum_report; +} zio_vsd_ops_t; + typedef struct zio_gang_node { zio_gbh_phys_t *gn_gbh; struct zio_gang_node *gn_child[SPA_GBH_NBLKPTRS]; @@ -254,6 +342,13 @@ typedef int zio_pipe_stage_t(zio_t *zio); #define ZIO_REEXECUTE_NOW 0x01 #define ZIO_REEXECUTE_SUSPEND 0x02 +typedef struct zio_link { + zio_t *zl_parent; + zio_t *zl_child; + list_node_t zl_parent_node; + list_node_t zl_child_node; +} zio_link_t; + struct zio { /* Core information about this I/O */ zbookmark_t io_bookmark; @@ -263,15 +358,15 @@ struct zio { int io_cmd; uint8_t io_priority; uint8_t io_reexecute; - uint8_t io_async_root; + uint8_t io_state[ZIO_WAIT_TYPES]; uint64_t io_txg; spa_t *io_spa; blkptr_t *io_bp; + blkptr_t *io_bp_override; blkptr_t io_bp_copy; - zio_t *io_parent; - zio_t *io_child; - zio_t *io_sibling_prev; - zio_t *io_sibling_next; + list_t io_parent_list; + list_t io_child_list; + zio_link_t *io_walk_link; zio_t *io_logical; zio_transform_t *io_transform_stack; @@ -279,35 +374,40 @@ struct zio { zio_done_func_t *io_ready; zio_done_func_t *io_done; void *io_private; + int64_t io_prev_space_delta; /* DMU private */ blkptr_t io_bp_orig; /* Data represented by this I/O */ void *io_data; + void *io_orig_data; uint64_t io_size; + uint64_t io_orig_size; /* Stuff for the vdev stack */ vdev_t *io_vd; void *io_vsd; - zio_done_func_t *io_vsd_free; + const zio_vsd_ops_t *io_vsd_ops; + uint64_t io_offset; uint64_t io_deadline; avl_node_t io_offset_node; avl_node_t io_deadline_node; avl_tree_t *io_vdev_tree; - zio_t *io_delegate_list; - zio_t *io_delegate_next; /* Internal pipeline state */ - int io_flags; - zio_stage_t io_stage; - uint32_t io_pipeline; - int io_orig_flags; - zio_stage_t io_orig_stage; - uint32_t io_orig_pipeline; + enum zio_flag io_flags; + enum zio_stage io_stage; + enum zio_stage io_pipeline; + enum zio_flag io_orig_flags; + enum zio_stage io_orig_stage; + enum zio_stage 
io_orig_pipeline; int io_error; int io_child_error[ZIO_CHILD_TYPES]; uint64_t io_children[ZIO_CHILD_TYPES][ZIO_WAIT_TYPES]; + uint64_t io_child_count; + uint64_t io_parent_count; uint64_t *io_stall; + zio_t *io_gang_leader; zio_gang_node_t *io_gang_tree; void *io_executor; void *io_waiter; @@ -315,59 +415,69 @@ struct zio { kcondvar_t io_cv; /* FMA state */ + zio_cksum_report_t *io_cksum_report; uint64_t io_ena; }; -extern zio_t *zio_null(zio_t *pio, spa_t *spa, - zio_done_func_t *done, void *private, int flags); +extern zio_t *zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, + zio_done_func_t *done, void *private, enum zio_flag flags); extern zio_t *zio_root(spa_t *spa, - zio_done_func_t *done, void *private, int flags); + zio_done_func_t *done, void *private, enum zio_flag flags); extern zio_t *zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, void *data, uint64_t size, zio_done_func_t *done, void *private, - int priority, int flags, const zbookmark_t *zb); + int priority, enum zio_flag flags, const zbookmark_t *zb); extern zio_t *zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - void *data, uint64_t size, zio_prop_t *zp, + void *data, uint64_t size, const zio_prop_t *zp, zio_done_func_t *ready, zio_done_func_t *done, void *private, - int priority, int flags, const zbookmark_t *zb); + int priority, enum zio_flag flags, const zbookmark_t *zb); extern zio_t *zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, uint64_t size, zio_done_func_t *done, void *private, - int priority, int flags, zbookmark_t *zb); + int priority, enum zio_flag flags, zbookmark_t *zb); -extern void zio_skip_write(zio_t *zio); +extern void zio_write_override(zio_t *zio, blkptr_t *bp, int copies); -extern zio_t *zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - zio_done_func_t *done, void *private, int flags); +extern void zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp); -extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - zio_done_func_t *done, void *private, int flags); +extern zio_t *zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, + const blkptr_t *bp, + zio_done_func_t *done, void *private, enum zio_flag flags); extern zio_t *zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, - zio_done_func_t *done, void *private, int priority, int flags); + zio_done_func_t *done, void *private, int priority, enum zio_flag flags); extern zio_t *zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, void *data, int checksum, - zio_done_func_t *done, void *private, int priority, int flags, + zio_done_func_t *done, void *private, int priority, enum zio_flag flags, boolean_t labels); extern zio_t *zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, void *data, int checksum, - zio_done_func_t *done, void *private, int priority, int flags, + zio_done_func_t *done, void *private, int priority, enum zio_flag flags, boolean_t labels); -extern int zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, - blkptr_t *old_bp, uint64_t txg); -extern void zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg); +extern zio_t *zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, + const blkptr_t *bp, enum zio_flag flags); + +extern int zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, + blkptr_t *old_bp, uint64_t size, boolean_t use_slog); +extern void zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp); extern void zio_flush(zio_t *zio, vdev_t *vd); +extern void zio_shrink(zio_t *zio, uint64_t size); extern int 
zio_wait(zio_t *zio); extern void zio_nowait(zio_t *zio); extern void zio_execute(zio_t *zio); extern void zio_interrupt(zio_t *zio); +extern zio_t *zio_walk_parents(zio_t *cio); +extern zio_t *zio_walk_children(zio_t *pio); +extern zio_t *zio_unique_parent(zio_t *cio); +extern void zio_add_child(zio_t *pio, zio_t *cio); + extern void *zio_buf_alloc(size_t size); extern void zio_buf_free(void *buf, size_t size); extern void *zio_data_buf_alloc(size_t size); @@ -377,11 +487,11 @@ extern void zio_resubmit_stage_async(void *); extern zio_t *zio_vdev_child_io(zio_t *zio, blkptr_t *bp, vdev_t *vd, uint64_t offset, void *data, uint64_t size, int type, int priority, - int flags, zio_done_func_t *done, void *private); + enum zio_flag flags, zio_done_func_t *done, void *private); extern zio_t *zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size, int type, int priority, - int flags, zio_done_func_t *done, void *private); + enum zio_flag flags, zio_done_func_t *done, void *private); extern void zio_vdev_io_bypass(zio_t *zio); extern void zio_vdev_io_reissue(zio_t *zio); @@ -390,11 +500,15 @@ extern void zio_vdev_io_redone(zio_t *zio); extern void zio_checksum_verified(zio_t *zio); extern int zio_worst_error(int e1, int e2); -extern uint8_t zio_checksum_select(uint8_t child, uint8_t parent); -extern uint8_t zio_compress_select(uint8_t child, uint8_t parent); +extern enum zio_checksum zio_checksum_select(enum zio_checksum child, + enum zio_checksum parent); +extern enum zio_checksum zio_checksum_dedup_select(spa_t *spa, + enum zio_checksum child, enum zio_checksum parent); +extern enum zio_compress zio_compress_select(enum zio_compress child, + enum zio_compress parent); extern void zio_suspend(spa_t *spa, zio_t *zio); -extern void zio_resume(spa_t *spa); +extern int zio_resume(spa_t *spa); extern void zio_resume_wait(spa_t *spa); /* @@ -413,9 +527,30 @@ extern int zio_inject_fault(char *name, int flags, int *id, extern int zio_inject_list_next(int *id, char *name, size_t buflen, struct zinject_record *record); extern int zio_clear_fault(int id); +extern void zio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type); extern int zio_handle_fault_injection(zio_t *zio, int error); -extern int zio_handle_device_injection(vdev_t *vd, int error); +extern int zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error); extern int zio_handle_label_injection(zio_t *zio, int error); +extern void zio_handle_ignored_writes(zio_t *zio); + +/* + * Checksum ereport functions + */ +extern void zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, struct zio *zio, + uint64_t offset, uint64_t length, void *arg, struct zio_bad_cksum *info); +extern void zfs_ereport_finish_checksum(zio_cksum_report_t *report, + const void *good_data, const void *bad_data, boolean_t drop_if_identical); + +extern void zfs_ereport_send_interim_checksum(zio_cksum_report_t *report); +extern void zfs_ereport_free_checksum(zio_cksum_report_t *report); + +/* If we have the good data in hand, this function can be used */ +extern void zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, + struct zio *zio, uint64_t offset, uint64_t length, + const void *good_data, const void *bad_data, struct zio_bad_cksum *info); + +/* Called from spa_sync(), but primarily an injection handler */ +extern void spa_handle_ignored_writes(spa_t *spa); #ifdef __cplusplus } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zio_checksum.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zio_checksum.h index 
da407399da060..d1a5f34d52234 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zio_checksum.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zio_checksum.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -27,6 +27,7 @@ #define _SYS_ZIO_CHECKSUM_H #include +#include #ifdef __cplusplus extern "C" { @@ -43,28 +44,30 @@ typedef void zio_checksum_t(const void *data, uint64_t size, zio_cksum_t *zcp); typedef struct zio_checksum_info { zio_checksum_t *ci_func[2]; /* checksum function for each byteorder */ int ci_correctable; /* number of correctable bits */ - int ci_zbt; /* uses zio block tail? */ + int ci_eck; /* uses zio embedded checksum? */ + int ci_dedup; /* strong enough for dedup? */ char *ci_name; /* descriptive name */ } zio_checksum_info_t; +typedef struct zio_bad_cksum { + zio_cksum_t zbc_expected; + zio_cksum_t zbc_actual; + const char *zbc_checksum_name; + uint8_t zbc_byteswapped; + uint8_t zbc_injected; + uint8_t zbc_has_cksum; /* expected/actual valid */ +} zio_bad_cksum_t; + extern zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS]; /* * Checksum routines. */ -extern zio_checksum_t fletcher_2_native; -extern zio_checksum_t fletcher_4_native; -extern zio_checksum_t fletcher_4_incremental_native; - -extern zio_checksum_t fletcher_2_byteswap; -extern zio_checksum_t fletcher_4_byteswap; -extern zio_checksum_t fletcher_4_incremental_byteswap; - extern zio_checksum_t zio_checksum_SHA256; extern void zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, void *data, uint64_t size); -extern int zio_checksum_error(zio_t *zio); +extern int zio_checksum_error(zio_t *zio, zio_bad_cksum_t *out); #ifdef __cplusplus } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zio_compress.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zio_compress.h index 66ee8d45b3b67..30bed1a676e32 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zio_compress.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zio_compress.h @@ -20,15 +20,13 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_ZIO_COMPRESS_H #define _SYS_ZIO_COMPRESS_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #ifdef __cplusplus @@ -66,14 +64,18 @@ extern size_t gzip_compress(void *src, void *dst, size_t s_len, size_t d_len, int level); extern int gzip_decompress(void *src, void *dst, size_t s_len, size_t d_len, int level); +extern size_t zle_compress(void *src, void *dst, size_t s_len, size_t d_len, + int level); +extern int zle_decompress(void *src, void *dst, size_t s_len, size_t d_len, + int level); /* * Compress and decompress data if necessary. 
*/ -extern int zio_compress_data(int cpfunc, void *src, uint64_t srcsize, - void **destp, uint64_t *destsizep, uint64_t *destbufsizep); -extern int zio_decompress_data(int cpfunc, void *src, uint64_t srcsize, - void *dest, uint64_t destsize); +extern size_t zio_compress_data(enum zio_compress c, void *src, void *dst, + size_t s_len); +extern int zio_decompress_data(enum zio_compress c, void *src, void *dst, + size_t s_len, size_t d_len); #ifdef __cplusplus } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zio_impl.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zio_impl.h index e7503b733cc05..d90bd8bd59217 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zio_impl.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zio_impl.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -34,104 +34,136 @@ extern "C" { #endif /* - * I/O Groups: pipeline stage definitions. + * zio pipeline stage definitions */ -typedef enum zio_stage { - ZIO_STAGE_OPEN = 0, /* RWFCI */ +enum zio_stage { + ZIO_STAGE_OPEN = 1 << 0, /* RWFCI */ - ZIO_STAGE_ISSUE_ASYNC, /* -W--- */ + ZIO_STAGE_READ_BP_INIT = 1 << 1, /* R---- */ + ZIO_STAGE_FREE_BP_INIT = 1 << 2, /* --F-- */ + ZIO_STAGE_ISSUE_ASYNC = 1 << 3, /* RWF-- */ + ZIO_STAGE_WRITE_BP_INIT = 1 << 4, /* -W--- */ - ZIO_STAGE_READ_BP_INIT, /* R---- */ - ZIO_STAGE_WRITE_BP_INIT, /* -W--- */ + ZIO_STAGE_CHECKSUM_GENERATE = 1 << 5, /* -W--- */ - ZIO_STAGE_CHECKSUM_GENERATE, /* -W--- */ + ZIO_STAGE_DDT_READ_START = 1 << 6, /* R---- */ + ZIO_STAGE_DDT_READ_DONE = 1 << 7, /* R---- */ + ZIO_STAGE_DDT_WRITE = 1 << 8, /* -W--- */ + ZIO_STAGE_DDT_FREE = 1 << 9, /* --F-- */ - ZIO_STAGE_GANG_ASSEMBLE, /* RWFC- */ - ZIO_STAGE_GANG_ISSUE, /* RWFC- */ + ZIO_STAGE_GANG_ASSEMBLE = 1 << 10, /* RWFC- */ + ZIO_STAGE_GANG_ISSUE = 1 << 11, /* RWFC- */ - ZIO_STAGE_DVA_ALLOCATE, /* -W--- */ - ZIO_STAGE_DVA_FREE, /* --F-- */ - ZIO_STAGE_DVA_CLAIM, /* ---C- */ + ZIO_STAGE_DVA_ALLOCATE = 1 << 12, /* -W--- */ + ZIO_STAGE_DVA_FREE = 1 << 13, /* --F-- */ + ZIO_STAGE_DVA_CLAIM = 1 << 14, /* ---C- */ - ZIO_STAGE_READY, /* RWFCI */ + ZIO_STAGE_READY = 1 << 15, /* RWFCI */ - ZIO_STAGE_VDEV_IO_START, /* RW--I */ - ZIO_STAGE_VDEV_IO_DONE, /* RW--I */ - ZIO_STAGE_VDEV_IO_ASSESS, /* RW--I */ + ZIO_STAGE_VDEV_IO_START = 1 << 16, /* RW--I */ + ZIO_STAGE_VDEV_IO_DONE = 1 << 17, /* RW--I */ + ZIO_STAGE_VDEV_IO_ASSESS = 1 << 18, /* RW--I */ - ZIO_STAGE_CHECKSUM_VERIFY, /* R---- */ + ZIO_STAGE_CHECKSUM_VERIFY = 1 << 19, /* R---- */ - ZIO_STAGE_DONE, /* RWFCI */ - ZIO_STAGES -} zio_stage_t; + ZIO_STAGE_DONE = 1 << 20 /* RWFCI */ +}; -#define ZIO_INTERLOCK_STAGES \ - ((1U << ZIO_STAGE_READY) | \ - (1U << ZIO_STAGE_DONE)) +#define ZIO_INTERLOCK_STAGES \ + (ZIO_STAGE_READY | \ + ZIO_STAGE_DONE) -#define ZIO_INTERLOCK_PIPELINE \ +#define ZIO_INTERLOCK_PIPELINE \ ZIO_INTERLOCK_STAGES -#define ZIO_VDEV_IO_STAGES \ - ((1U << ZIO_STAGE_VDEV_IO_START) | \ - (1U << ZIO_STAGE_VDEV_IO_DONE) | \ - (1U << ZIO_STAGE_VDEV_IO_ASSESS)) +#define ZIO_VDEV_IO_STAGES \ + (ZIO_STAGE_VDEV_IO_START | \ + ZIO_STAGE_VDEV_IO_DONE | \ + ZIO_STAGE_VDEV_IO_ASSESS) -#define ZIO_VDEV_CHILD_PIPELINE \ - (ZIO_VDEV_IO_STAGES | \ - (1U << ZIO_STAGE_DONE)) +#define ZIO_VDEV_CHILD_PIPELINE \ + (ZIO_VDEV_IO_STAGES | \ + ZIO_STAGE_DONE) -#define ZIO_READ_COMMON_STAGES \ - (ZIO_INTERLOCK_STAGES | \ - ZIO_VDEV_IO_STAGES | \ - (1U << ZIO_STAGE_CHECKSUM_VERIFY)) +#define 
ZIO_READ_COMMON_STAGES \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_VDEV_IO_STAGES | \ + ZIO_STAGE_CHECKSUM_VERIFY) -#define ZIO_READ_PHYS_PIPELINE \ +#define ZIO_READ_PHYS_PIPELINE \ ZIO_READ_COMMON_STAGES -#define ZIO_READ_PIPELINE \ - (ZIO_READ_COMMON_STAGES | \ - (1U << ZIO_STAGE_READ_BP_INIT)) +#define ZIO_READ_PIPELINE \ + (ZIO_READ_COMMON_STAGES | \ + ZIO_STAGE_READ_BP_INIT) -#define ZIO_WRITE_COMMON_STAGES \ - (ZIO_INTERLOCK_STAGES | \ - ZIO_VDEV_IO_STAGES | \ - (1U << ZIO_STAGE_ISSUE_ASYNC) | \ - (1U << ZIO_STAGE_CHECKSUM_GENERATE)) - -#define ZIO_WRITE_PHYS_PIPELINE \ - ZIO_WRITE_COMMON_STAGES - -#define ZIO_REWRITE_PIPELINE \ - (ZIO_WRITE_COMMON_STAGES | \ - (1U << ZIO_STAGE_WRITE_BP_INIT)) - -#define ZIO_WRITE_PIPELINE \ - (ZIO_WRITE_COMMON_STAGES | \ - (1U << ZIO_STAGE_WRITE_BP_INIT) | \ - (1U << ZIO_STAGE_DVA_ALLOCATE)) - -#define ZIO_GANG_STAGES \ - ((1U << ZIO_STAGE_GANG_ASSEMBLE) | \ - (1U << ZIO_STAGE_GANG_ISSUE)) +#define ZIO_DDT_CHILD_READ_PIPELINE \ + ZIO_READ_COMMON_STAGES -#define ZIO_FREE_PIPELINE \ - (ZIO_INTERLOCK_STAGES | \ - (1U << ZIO_STAGE_DVA_FREE)) +#define ZIO_DDT_READ_PIPELINE \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_STAGE_READ_BP_INIT | \ + ZIO_STAGE_DDT_READ_START | \ + ZIO_STAGE_DDT_READ_DONE) -#define ZIO_CLAIM_PIPELINE \ - (ZIO_INTERLOCK_STAGES | \ - (1U << ZIO_STAGE_DVA_CLAIM)) +#define ZIO_WRITE_COMMON_STAGES \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_VDEV_IO_STAGES | \ + ZIO_STAGE_ISSUE_ASYNC | \ + ZIO_STAGE_CHECKSUM_GENERATE) -#define ZIO_IOCTL_PIPELINE \ - (ZIO_INTERLOCK_STAGES | \ - (1U << ZIO_STAGE_VDEV_IO_START) | \ - (1U << ZIO_STAGE_VDEV_IO_ASSESS)) +#define ZIO_WRITE_PHYS_PIPELINE \ + ZIO_WRITE_COMMON_STAGES -#define ZIO_CONFIG_LOCK_BLOCKING_STAGES \ - ((1U << ZIO_STAGE_VDEV_IO_START) | \ - (1U << ZIO_STAGE_DVA_ALLOCATE) | \ - (1U << ZIO_STAGE_DVA_CLAIM)) +#define ZIO_REWRITE_PIPELINE \ + (ZIO_WRITE_COMMON_STAGES | \ + ZIO_STAGE_WRITE_BP_INIT) + +#define ZIO_WRITE_PIPELINE \ + (ZIO_WRITE_COMMON_STAGES | \ + ZIO_STAGE_WRITE_BP_INIT | \ + ZIO_STAGE_DVA_ALLOCATE) + +#define ZIO_DDT_CHILD_WRITE_PIPELINE \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_VDEV_IO_STAGES | \ + ZIO_STAGE_DVA_ALLOCATE) + +#define ZIO_DDT_WRITE_PIPELINE \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_STAGE_ISSUE_ASYNC | \ + ZIO_STAGE_WRITE_BP_INIT | \ + ZIO_STAGE_CHECKSUM_GENERATE | \ + ZIO_STAGE_DDT_WRITE) + +#define ZIO_GANG_STAGES \ + (ZIO_STAGE_GANG_ASSEMBLE | \ + ZIO_STAGE_GANG_ISSUE) + +#define ZIO_FREE_PIPELINE \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_STAGE_FREE_BP_INIT | \ + ZIO_STAGE_DVA_FREE) + +#define ZIO_DDT_FREE_PIPELINE \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_STAGE_FREE_BP_INIT | \ + ZIO_STAGE_ISSUE_ASYNC | \ + ZIO_STAGE_DDT_FREE) + +#define ZIO_CLAIM_PIPELINE \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_STAGE_DVA_CLAIM) + +#define ZIO_IOCTL_PIPELINE \ + (ZIO_INTERLOCK_STAGES | \ + ZIO_STAGE_VDEV_IO_START | \ + ZIO_STAGE_VDEV_IO_ASSESS) + +#define ZIO_BLOCKING_STAGES \ + (ZIO_STAGE_DVA_ALLOCATE | \ + ZIO_STAGE_DVA_CLAIM | \ + ZIO_STAGE_VDEV_IO_START) extern void zio_inject_init(void); extern void zio_inject_fini(void); diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zvol.h b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zvol.h index 06adc667e1243..6284a4154a661 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zvol.h +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/sys/zvol.h @@ -20,15 +20,13 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _SYS_ZVOL_H #define _SYS_ZVOL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #ifdef __cplusplus @@ -43,10 +41,10 @@ extern int zvol_check_volsize(uint64_t volsize, uint64_t blocksize); extern int zvol_check_volblocksize(uint64_t volblocksize); extern int zvol_get_stats(objset_t *os, nvlist_t *nv); extern void zvol_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx); -extern int zvol_create_minor(const char *, major_t); +extern int zvol_create_minor(const char *); extern int zvol_remove_minor(const char *); +extern void zvol_remove_minors(const char *); extern int zvol_set_volsize(const char *, major_t, uint64_t); -extern int zvol_set_volblocksize(const char *, uint64_t); extern int zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr); extern int zvol_dump(dev_t dev, caddr_t addr, daddr_t offset, int nblocks); diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/txg.c b/external/cddl/osnet/dist/uts/common/fs/zfs/txg.c index 2bbf2f086c154..fb62f108940fc 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/txg.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/txg.c @@ -19,13 +19,14 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #include #include #include +#include #include #include @@ -57,12 +58,20 @@ txg_init(dsl_pool_t *dp, uint64_t txg) for (i = 0; i < TXG_SIZE; i++) { cv_init(&tx->tx_cpu[c].tc_cv[i], NULL, CV_DEFAULT, NULL); + list_create(&tx->tx_cpu[c].tc_callbacks[i], + sizeof (dmu_tx_callback_t), + offsetof(dmu_tx_callback_t, dcb_node)); } } - rw_init(&tx->tx_suspend, NULL, RW_DEFAULT, NULL); mutex_init(&tx->tx_sync_lock, NULL, MUTEX_DEFAULT, NULL); + cv_init(&tx->tx_sync_more_cv, NULL, CV_DEFAULT, NULL); + cv_init(&tx->tx_sync_done_cv, NULL, CV_DEFAULT, NULL); + cv_init(&tx->tx_quiesce_more_cv, NULL, CV_DEFAULT, NULL); + cv_init(&tx->tx_quiesce_done_cv, NULL, CV_DEFAULT, NULL); + cv_init(&tx->tx_exit_cv, NULL, CV_DEFAULT, NULL); + tx->tx_open_txg = txg; } @@ -77,17 +86,27 @@ txg_fini(dsl_pool_t *dp) ASSERT(tx->tx_threads == 0); - rw_destroy(&tx->tx_suspend); mutex_destroy(&tx->tx_sync_lock); + cv_destroy(&tx->tx_sync_more_cv); + cv_destroy(&tx->tx_sync_done_cv); + cv_destroy(&tx->tx_quiesce_more_cv); + cv_destroy(&tx->tx_quiesce_done_cv); + cv_destroy(&tx->tx_exit_cv); + for (c = 0; c < max_ncpus; c++) { int i; mutex_destroy(&tx->tx_cpu[c].tc_lock); - for (i = 0; i < TXG_SIZE; i++) + for (i = 0; i < TXG_SIZE; i++) { cv_destroy(&tx->tx_cpu[c].tc_cv[i]); + list_destroy(&tx->tx_cpu[c].tc_callbacks[i]); + } } + if (tx->tx_commit_cb_taskq != NULL) + taskq_destroy(tx->tx_commit_cb_taskq); + kmem_free(tx->tx_cpu, max_ncpus * sizeof (tx_cpu_t)); bzero(tx, sizeof (tx_state_t)); @@ -147,7 +166,8 @@ txg_thread_wait(tx_state_t *tx, callb_cpr_t *cpr, kcondvar_t *cv, uint64_t time) CALLB_CPR_SAFE_BEGIN(cpr); if (time) - (void) cv_timedwait(cv, &tx->tx_sync_lock, lbolt + time); + (void) cv_timedwait(cv, &tx->tx_sync_lock, + ddi_get_lbolt() + time); else cv_wait(cv, &tx->tx_sync_lock); @@ -167,7 +187,11 @@ txg_sync_stop(dsl_pool_t *dp) * Finish off any work in progress. */ ASSERT(tx->tx_threads == 2); - txg_wait_synced(dp, 0); + + /* + * We need to ensure that we've vacated the deferred space_maps. + */ + txg_wait_synced(dp, tx->tx_open_txg + TXG_DEFER_SIZE); /* * Wake all sync threads and wait for them to die. 
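[Editor's illustration -- not part of the patch. The per-CPU tc_callbacks lists created in the hunk above hold DMU commit callbacks until their txg has been synced; the next hunk adds txg_register_callbacks() and the tx_commit_cb taskq that finally runs them. Below is a minimal sketch of how a caller might use this path, assuming the dmu_tx_callback_register() interface added elsewhere in the same changeset; example_commit_done() and example_touch_object() are hypothetical names.]

#include <sys/dmu.h>
#include <sys/cmn_err.h>

/*
 * Runs from the tx_commit_cb taskq after spa_sync() has written the txg
 * that carried this transaction (error is 0 on success).
 */
static void
example_commit_done(void *arg, int error)
{
	cmn_err(CE_NOTE, "commit callback fired for %p, error %d", arg, error);
}

static int
example_touch_object(objset_t *os, uint64_t object)
{
	dmu_tx_t *tx = dmu_tx_create(os);
	int error;

	dmu_tx_hold_bonus(tx, object);
	error = dmu_tx_assign(tx, TXG_WAIT);
	if (error != 0) {
		dmu_tx_abort(tx);
		return (error);
	}

	/*
	 * The callback is queued on the tx, moved onto the owning tx_cpu's
	 * tc_callbacks list at commit, and dispatched by
	 * txg_dispatch_callbacks() once the txg has synced.
	 */
	dmu_tx_callback_register(tx, example_commit_done, NULL);
	dmu_tx_commit(tx);
	return (0);
}

[End of editor's illustration.]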
@@ -216,6 +240,17 @@ txg_rele_to_quiesce(txg_handle_t *th) mutex_exit(&tc->tc_lock); } +void +txg_register_callbacks(txg_handle_t *th, list_t *tx_callbacks) +{ + tx_cpu_t *tc = th->th_cpu; + int g = th->th_txg & TXG_MASK; + + mutex_enter(&tc->tc_lock); + list_move_tail(&tc->tc_callbacks[g], tx_callbacks); + mutex_exit(&tc->tc_lock); +} + void txg_rele_to_sync(txg_handle_t *th) { @@ -266,9 +301,59 @@ txg_quiesce(dsl_pool_t *dp, uint64_t txg) } } +static void +txg_do_callbacks(list_t *cb_list) +{ + dmu_tx_do_callbacks(cb_list, 0); + + list_destroy(cb_list); + + kmem_free(cb_list, sizeof (list_t)); +} + +/* + * Dispatch the commit callbacks registered on this txg to worker threads. + */ +static void +txg_dispatch_callbacks(dsl_pool_t *dp, uint64_t txg) +{ + int c; + tx_state_t *tx = &dp->dp_tx; + list_t *cb_list; + + for (c = 0; c < max_ncpus; c++) { + tx_cpu_t *tc = &tx->tx_cpu[c]; + /* No need to lock tx_cpu_t at this point */ + + int g = txg & TXG_MASK; + + if (list_is_empty(&tc->tc_callbacks[g])) + continue; + + if (tx->tx_commit_cb_taskq == NULL) { + /* + * Commit callback taskq hasn't been created yet. + */ + tx->tx_commit_cb_taskq = taskq_create("tx_commit_cb", + max_ncpus, minclsyspri, max_ncpus, max_ncpus * 2, + TASKQ_PREPOPULATE); + } + + cb_list = kmem_alloc(sizeof (list_t), KM_SLEEP); + list_create(cb_list, sizeof (dmu_tx_callback_t), + offsetof(dmu_tx_callback_t, dcb_node)); + + list_move_tail(&tc->tc_callbacks[g], cb_list); + + (void) taskq_dispatch(tx->tx_commit_cb_taskq, (task_func_t *) + txg_do_callbacks, cb_list, TQ_SLEEP); + } +} + static void txg_sync_thread(dsl_pool_t *dp) { + spa_t *spa = dp->dp_spa; tx_state_t *tx = &dp->dp_tx; callb_cpr_t cpr; uint64_t start, delta; @@ -287,14 +372,15 @@ txg_sync_thread(dsl_pool_t *dp) */ timer = (delta >= timeout ? 0 : timeout - delta); while ((dp->dp_scrub_func == SCRUB_FUNC_NONE || - spa_shutting_down(dp->dp_spa)) && + spa_load_state(spa) != SPA_LOAD_NONE || + spa_shutting_down(spa)) && !tx->tx_exiting && timer > 0 && tx->tx_synced_txg >= tx->tx_sync_txg_waiting && tx->tx_quiesced_txg == 0) { dprintf("waiting; tx_synced=%llu waiting=%llu dp=%p\n", tx->tx_synced_txg, tx->tx_sync_txg_waiting, dp); txg_thread_wait(tx, &cpr, &tx->tx_sync_more_cv, timer); - delta = lbolt - start; + delta = ddi_get_lbolt() - start; timer = (delta > timeout ? 0 : timeout - delta); } @@ -312,8 +398,6 @@ txg_sync_thread(dsl_pool_t *dp) if (tx->tx_exiting) txg_thread_exit(tx, &cpr, &tx->tx_sync_thread); - rw_enter(&tx->tx_suspend, RW_WRITER); - /* * Consume the quiesced txg which has been handed off to * us. This may cause the quiescing thread to now be @@ -323,22 +407,24 @@ txg_sync_thread(dsl_pool_t *dp) tx->tx_quiesced_txg = 0; tx->tx_syncing_txg = txg; cv_broadcast(&tx->tx_quiesce_more_cv); - rw_exit(&tx->tx_suspend); dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", txg, tx->tx_quiesce_txg_waiting, tx->tx_sync_txg_waiting); mutex_exit(&tx->tx_sync_lock); - start = lbolt; - spa_sync(dp->dp_spa, txg); - delta = lbolt - start; + start = ddi_get_lbolt(); + spa_sync(spa, txg); + delta = ddi_get_lbolt() - start; mutex_enter(&tx->tx_sync_lock); - rw_enter(&tx->tx_suspend, RW_WRITER); tx->tx_synced_txg = txg; tx->tx_syncing_txg = 0; - rw_exit(&tx->tx_suspend); cv_broadcast(&tx->tx_sync_done_cv); + + /* + * Dispatch commit callbacks to worker threads. 
+ */ + txg_dispatch_callbacks(dp, txg); } } @@ -395,7 +481,7 @@ void txg_delay(dsl_pool_t *dp, uint64_t txg, int ticks) { tx_state_t *tx = &dp->dp_tx; - int timeout = lbolt + ticks; + int timeout = ddi_get_lbolt() + ticks; /* don't delay if this txg could transition to quiesing immediately */ if (tx->tx_open_txg > txg || @@ -408,7 +494,7 @@ txg_delay(dsl_pool_t *dp, uint64_t txg, int ticks) return; } - while (lbolt < timeout && + while (ddi_get_lbolt() < timeout && tx->tx_syncing_txg < txg-1 && !txg_stalled(dp)) (void) cv_timedwait(&tx->tx_quiesce_more_cv, &tx->tx_sync_lock, timeout); @@ -424,7 +510,7 @@ txg_wait_synced(dsl_pool_t *dp, uint64_t txg) mutex_enter(&tx->tx_sync_lock); ASSERT(tx->tx_threads == 2); if (txg == 0) - txg = tx->tx_open_txg; + txg = tx->tx_open_txg + TXG_DEFER_SIZE; if (tx->tx_sync_txg_waiting < txg) tx->tx_sync_txg_waiting = txg; dprintf("txg=%llu quiesce_txg=%llu sync_txg=%llu\n", @@ -475,21 +561,6 @@ txg_sync_waiting(dsl_pool_t *dp) tx->tx_quiesced_txg != 0); } -void -txg_suspend(dsl_pool_t *dp) -{ - tx_state_t *tx = &dp->dp_tx; - /* XXX some code paths suspend when they are already suspended! */ - rw_enter(&tx->tx_suspend, RW_READER); -} - -void -txg_resume(dsl_pool_t *dp) -{ - tx_state_t *tx = &dp->dp_tx; - rw_exit(&tx->tx_suspend); -} - /* * Per-txg object lists. */ diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/vdev.c b/external/cddl/osnet/dist/uts/common/fs/zfs/vdev.c index 16a27e514a41b..48082c8bf9479 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/vdev.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/vdev.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -39,6 +39,7 @@ #include #include #include +#include /* * Virtual device management. */ @@ -53,6 +54,7 @@ static vdev_ops_t *vdev_ops_table[] = { &vdev_disk_ops, &vdev_file_ops, &vdev_missing_ops, + &vdev_hole_ops, NULL }; @@ -83,9 +85,8 @@ vdev_default_asize(vdev_t *vd, uint64_t psize) { uint64_t asize = P2ROUNDUP(psize, 1ULL << vd->vdev_top->vdev_ashift); uint64_t csize; - uint64_t c; - for (c = 0; c < vd->vdev_children; c++) { + for (int c = 0; c < vd->vdev_children; c++) { csize = vdev_psize_to_asize(vd->vdev_child[c], psize); asize = MAX(asize, csize); } @@ -94,40 +95,47 @@ vdev_default_asize(vdev_t *vd, uint64_t psize) } /* - * Get the replaceable or attachable device size. - * If the parent is a mirror or raidz, the replaceable size is the minimum - * psize of all its children. For the rest, just return our own psize. - * - * e.g. - * psize rsize - * root - - - * mirror/raidz - - - * disk1 20g 20g - * disk2 40g 20g - * disk3 80g 80g + * Get the minimum allocatable size. We define the allocatable size as + * the vdev's asize rounded to the nearest metaslab. This allows us to + * replace or attach devices which don't have the same physical size but + * can still satisfy the same number of allocations. */ uint64_t -vdev_get_rsize(vdev_t *vd) +vdev_get_min_asize(vdev_t *vd) { - vdev_t *pvd, *cvd; - uint64_t c, rsize; + vdev_t *pvd = vd->vdev_parent; - pvd = vd->vdev_parent; + /* + * If our parent is NULL (inactive spare or cache) or is the root, + * just return our own asize. + */ + if (pvd == NULL) + return (vd->vdev_asize); + + /* + * The top-level vdev just returns the allocatable size rounded + * to the nearest metaslab. 
+ */ + if (vd == vd->vdev_top) + return (P2ALIGN(vd->vdev_asize, 1ULL << vd->vdev_ms_shift)); /* - * If our parent is NULL or the root, just return our own psize. + * The allocatable space for a raidz vdev is N * sizeof(smallest child), + * so each child must provide at least 1/Nth of its asize. */ - if (pvd == NULL || pvd->vdev_parent == NULL) - return (vd->vdev_psize); + if (pvd->vdev_ops == &vdev_raidz_ops) + return (pvd->vdev_min_asize / pvd->vdev_children); - rsize = 0; + return (pvd->vdev_min_asize); +} - for (c = 0; c < pvd->vdev_children; c++) { - cvd = pvd->vdev_child[c]; - rsize = MIN(rsize - 1, cvd->vdev_psize - 1) + 1; - } +void +vdev_set_min_asize(vdev_t *vd) +{ + vd->vdev_min_asize = vdev_get_min_asize(vd); - return (rsize); + for (int c = 0; c < vd->vdev_children; c++) + vdev_set_min_asize(vd->vdev_child[c]); } vdev_t * @@ -148,13 +156,12 @@ vdev_lookup_top(spa_t *spa, uint64_t vdev) vdev_t * vdev_lookup_by_guid(vdev_t *vd, uint64_t guid) { - int c; vdev_t *mvd; if (vd->vdev_guid == guid) return (vd); - for (c = 0; c < vd->vdev_children; c++) + for (int c = 0; c < vd->vdev_children; c++) if ((mvd = vdev_lookup_by_guid(vd->vdev_child[c], guid)) != NULL) return (mvd); @@ -250,17 +257,17 @@ vdev_compact_children(vdev_t *pvd) { vdev_t **newchild, *cvd; int oldc = pvd->vdev_children; - int newc, c; + int newc; ASSERT(spa_config_held(pvd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); - for (c = newc = 0; c < oldc; c++) + for (int c = newc = 0; c < oldc; c++) if (pvd->vdev_child[c]) newc++; newchild = kmem_alloc(newc * sizeof (vdev_t *), KM_SLEEP); - for (c = newc = 0; c < oldc; c++) { + for (int c = newc = 0; c < oldc; c++) { if ((cvd = pvd->vdev_child[c]) != NULL) { newchild[newc] = cvd; cvd->vdev_id = newc++; @@ -275,7 +282,7 @@ vdev_compact_children(vdev_t *pvd) /* * Allocate and minimally initialize a vdev_t. */ -static vdev_t * +vdev_t * vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) { vdev_t *vd; @@ -287,21 +294,18 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) spa->spa_root_vdev = vd; } - if (guid == 0) { + if (guid == 0 && ops != &vdev_hole_ops) { if (spa->spa_root_vdev == vd) { /* * The root vdev's guid will also be the pool guid, * which must be unique among all pools. */ - while (guid == 0 || spa_guid_exists(guid, 0)) - guid = spa_get_random(-1ULL); + guid = spa_generate_guid(NULL); } else { /* * Any other vdev's guid must be unique within the pool. 
*/ - while (guid == 0 || - spa_guid_exists(spa_guid(spa), guid)) - guid = spa_get_random(-1ULL); + guid = spa_generate_guid(spa); } ASSERT(!spa_guid_exists(spa_guid(spa), guid)); } @@ -312,12 +316,15 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) vd->vdev_guid_sum = guid; vd->vdev_ops = ops; vd->vdev_state = VDEV_STATE_CLOSED; + vd->vdev_ishole = (ops == &vdev_hole_ops); mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); - space_map_create(&vd->vdev_dtl_map, 0, -1ULL, 0, &vd->vdev_dtl_lock); - space_map_create(&vd->vdev_dtl_scrub, 0, -1ULL, 0, &vd->vdev_dtl_lock); + for (int t = 0; t < DTL_TYPES; t++) { + space_map_create(&vd->vdev_dtl[t], 0, -1ULL, 0, + &vd->vdev_dtl_lock); + } txg_list_create(&vd->vdev_ms_list, offsetof(struct metaslab, ms_txg_node)); txg_list_create(&vd->vdev_dtl_list, @@ -370,6 +377,9 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, } else if (alloctype == VDEV_ALLOC_L2CACHE) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) return (EINVAL); + } else if (alloctype == VDEV_ALLOC_ROOTPOOL) { + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &guid) != 0) + return (EINVAL); } /* @@ -386,6 +396,9 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, if (islog && spa_version(spa) < SPA_VERSION_SLOGS) return (ENOTSUP); + if (ops == &vdev_hole_ops && spa_version(spa) < SPA_VERSION_HOLES) + return (ENOTSUP); + /* * Set the nparity property for RAID-Z vdevs. */ @@ -393,23 +406,24 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, if (ops == &vdev_raidz_ops) { if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NPARITY, &nparity) == 0) { - /* - * Currently, we can only support 2 parity devices. - */ - if (nparity == 0 || nparity > 2) + if (nparity == 0 || nparity > VDEV_RAIDZ_MAXPARITY) return (EINVAL); /* - * Older versions can only support 1 parity device. + * Previous versions could only support 1 or 2 parity + * device. */ - if (nparity == 2 && - spa_version(spa) < SPA_VERSION_RAID6) + if (nparity > 1 && + spa_version(spa) < SPA_VERSION_RAIDZ2) + return (ENOTSUP); + if (nparity > 2 && + spa_version(spa) < SPA_VERSION_RAIDZ3) return (ENOTSUP); } else { /* * We require the parity to be specified for SPAs that * support multiple parity levels. */ - if (spa_version(spa) >= SPA_VERSION_RAID6) + if (spa_version(spa) >= SPA_VERSION_RAIDZ2) return (EINVAL); /* * Otherwise, we default to 1 parity device for RAID-Z. @@ -433,6 +447,8 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PHYS_PATH, &vd->vdev_physpath) == 0) vd->vdev_physpath = spa_strdup(vd->vdev_physpath); + if (nvlist_lookup_string(nv, ZPOOL_CONFIG_FRU, &vd->vdev_fru) == 0) + vd->vdev_fru = spa_strdup(vd->vdev_fru); /* * Set the whole_disk property. If it's not specified, leave the value @@ -446,19 +462,25 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, * Look for the 'not present' flag. This will only be set if the device * was not present at the time of import. */ - if (!spa->spa_import_faulted) - (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, - &vd->vdev_not_present); + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, + &vd->vdev_not_present); /* * Get the alignment requirement. 
*/ (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_ASHIFT, &vd->vdev_ashift); + /* + * Retrieve the vdev creation time. + */ + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, + &vd->vdev_crtxg); + /* * If we're a top-level vdev, try to load the allocation parameters. */ - if (parent && !parent->vdev_parent && alloctype == VDEV_ALLOC_LOAD) { + if (parent && !parent->vdev_parent && + (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_SPLIT)) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_ARRAY, &vd->vdev_ms_array); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_METASLAB_SHIFT, @@ -467,32 +489,63 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, &vd->vdev_asize); } + if (parent && !parent->vdev_parent) { + ASSERT(alloctype == VDEV_ALLOC_LOAD || + alloctype == VDEV_ALLOC_ADD || + alloctype == VDEV_ALLOC_SPLIT || + alloctype == VDEV_ALLOC_ROOTPOOL); + vd->vdev_mg = metaslab_group_create(islog ? + spa_log_class(spa) : spa_normal_class(spa), vd); + } + /* * If we're a leaf vdev, try to load the DTL object and other state. */ if (vd->vdev_ops->vdev_op_leaf && - (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE)) { + (alloctype == VDEV_ALLOC_LOAD || alloctype == VDEV_ALLOC_L2CACHE || + alloctype == VDEV_ALLOC_ROOTPOOL)) { if (alloctype == VDEV_ALLOC_LOAD) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DTL, - &vd->vdev_dtl.smo_object); + &vd->vdev_dtl_smo.smo_object); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_UNSPARE, &vd->vdev_unspare); } + + if (alloctype == VDEV_ALLOC_ROOTPOOL) { + uint64_t spare = 0; + + if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_IS_SPARE, + &spare) == 0 && spare) + spa_spare_add(vd); + } + (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_OFFLINE, &vd->vdev_offline); /* * When importing a pool, we want to ignore the persistent fault * state, as the diagnosis made on another system may not be - * valid in the current context. + * valid in the current context. Local vdevs will + * remain in the faulted state. */ - if (spa->spa_load_state == SPA_LOAD_OPEN) { + if (spa_load_state(spa) == SPA_LOAD_OPEN) { (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_FAULTED, &vd->vdev_faulted); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_DEGRADED, &vd->vdev_degraded); (void) nvlist_lookup_uint64(nv, ZPOOL_CONFIG_REMOVED, &vd->vdev_removed); + + if (vd->vdev_faulted || vd->vdev_degraded) { + char *aux; + + vd->vdev_label_aux = + VDEV_AUX_ERR_EXCEEDED; + if (nvlist_lookup_string(nv, + ZPOOL_CONFIG_AUX_STATE, &aux) == 0 && + strcmp(aux, "external") == 0) + vd->vdev_label_aux = VDEV_AUX_EXTERNAL; + } } } @@ -509,7 +562,6 @@ vdev_alloc(spa_t *spa, vdev_t **vdp, nvlist_t *nv, vdev_t *parent, uint_t id, void vdev_free(vdev_t *vd) { - int c; spa_t *spa = vd->vdev_spa; /* @@ -519,11 +571,12 @@ vdev_free(vdev_t *vd) vdev_close(vd); ASSERT(!list_link_active(&vd->vdev_config_dirty_node)); + ASSERT(!list_link_active(&vd->vdev_state_dirty_node)); /* * Free all children. */ - for (c = 0; c < vd->vdev_children; c++) + for (int c = 0; c < vd->vdev_children; c++) vdev_free(vd->vdev_child[c]); ASSERT(vd->vdev_child == NULL); @@ -532,8 +585,10 @@ vdev_free(vdev_t *vd) /* * Discard allocation state. 
*/ - if (vd == vd->vdev_top) + if (vd->vdev_mg != NULL) { vdev_metaslab_fini(vd); + metaslab_group_destroy(vd->vdev_mg); + } ASSERT3U(vd->vdev_stat.vs_space, ==, 0); ASSERT3U(vd->vdev_stat.vs_dspace, ==, 0); @@ -558,6 +613,8 @@ vdev_free(vdev_t *vd) spa_strfree(vd->vdev_devid); if (vd->vdev_physpath) spa_strfree(vd->vdev_physpath); + if (vd->vdev_fru) + spa_strfree(vd->vdev_fru); if (vd->vdev_isspare) spa_spare_remove(vd); @@ -566,12 +623,14 @@ vdev_free(vdev_t *vd) txg_list_destroy(&vd->vdev_ms_list); txg_list_destroy(&vd->vdev_dtl_list); + mutex_enter(&vd->vdev_dtl_lock); - space_map_unload(&vd->vdev_dtl_map); - space_map_destroy(&vd->vdev_dtl_map); - space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL); - space_map_destroy(&vd->vdev_dtl_scrub); + for (int t = 0; t < DTL_TYPES; t++) { + space_map_unload(&vd->vdev_dtl[t]); + space_map_destroy(&vd->vdev_dtl[t]); + } mutex_exit(&vd->vdev_dtl_lock); + mutex_destroy(&vd->vdev_dtl_lock); mutex_destroy(&vd->vdev_stat_lock); mutex_destroy(&vd->vdev_probe_lock); @@ -649,14 +708,12 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd) static void vdev_top_update(vdev_t *tvd, vdev_t *vd) { - int c; - if (vd == NULL) return; vd->vdev_top = tvd; - for (c = 0; c < vd->vdev_children; c++) + for (int c = 0; c < vd->vdev_children; c++) vdev_top_update(tvd, vd->vdev_child[c]); } @@ -675,8 +732,10 @@ vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops) mvd = vdev_alloc_common(spa, cvd->vdev_id, 0, ops); mvd->vdev_asize = cvd->vdev_asize; + mvd->vdev_min_asize = cvd->vdev_min_asize; mvd->vdev_ashift = cvd->vdev_ashift; mvd->vdev_state = cvd->vdev_state; + mvd->vdev_crtxg = cvd->vdev_crtxg; vdev_remove_child(pvd, cvd); vdev_add_child(pvd, mvd); @@ -709,14 +768,19 @@ vdev_remove_parent(vdev_t *cvd) vdev_remove_child(mvd, cvd); vdev_remove_child(pvd, mvd); + /* * If cvd will replace mvd as a top-level vdev, preserve mvd's guid. * Otherwise, we could have detached an offline device, and when we * go to import the pool we'll think we have two top-level vdevs, * instead of a different version of the same top-level vdev. */ - if (mvd->vdev_top == mvd) - cvd->vdev_guid = cvd->vdev_guid_sum = mvd->vdev_guid; + if (mvd->vdev_top == mvd) { + uint64_t guid_delta = mvd->vdev_guid - cvd->vdev_guid; + cvd->vdev_orig_guid = cvd->vdev_guid; + cvd->vdev_guid += guid_delta; + cvd->vdev_guid_sum += guid_delta; + } cvd->vdev_id = mvd->vdev_id; vdev_add_child(pvd, cvd); vdev_top_update(cvd->vdev_top, cvd->vdev_top); @@ -733,25 +797,32 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; objset_t *mos = spa->spa_meta_objset; - metaslab_class_t *mc; uint64_t m; uint64_t oldc = vd->vdev_ms_count; uint64_t newc = vd->vdev_asize >> vd->vdev_ms_shift; metaslab_t **mspp; int error; - if (vd->vdev_ms_shift == 0) /* not being allocated from yet */ + ASSERT(txg == 0 || spa_config_held(spa, SCL_ALLOC, RW_WRITER)); + + /* + * This vdev is not being allocated from yet or is a hole. + */ + if (vd->vdev_ms_shift == 0) return (0); - ASSERT(oldc <= newc); + ASSERT(!vd->vdev_ishole); - if (vd->vdev_islog) - mc = spa->spa_log_class; - else - mc = spa->spa_normal_class; + /* + * Compute the raidz-deflation ratio. Note, we hard-code + * in 128k (1 << 17) because it is the current "typical" blocksize. + * Even if SPA_MAXBLOCKSIZE changes, this algorithm must never change, + * or we will inconsistently account for existing bp's. 
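The ratio computed just below is a fixed-point scale factor: the 128K reference block's physical size divided by its allocated size expressed in 512-byte units. A small worked example, assuming SPA_MINBLOCKSHIFT is 9 and using an invented 25% parity overhead in place of the real vdev_psize_to_asize():

#include <stdint.h>
#include <stdio.h>

#define	SPA_MINBLOCKSHIFT	9	/* 512-byte units */

/*
 * Illustrative stand-in for vdev_psize_to_asize(): pretend the raidz
 * layout adds 25% of allocated overhead on top of the physical size.
 * Real values depend on ashift, nparity, and the number of children.
 */
static uint64_t
example_psize_to_asize(uint64_t psize)
{
	return (psize + (psize >> 2));
}

int
main(void)
{
	uint64_t psize = 1ULL << 17;	/* the hard-coded 128K reference size */
	uint64_t deflate_ratio =
	    psize / (example_psize_to_asize(psize) >> SPA_MINBLOCKSHIFT);

	/*
	 * A plain disk would give 131072 / 256 = 512; the assumed 25%
	 * overhead gives 131072 / 320 = 409, i.e. deflated space is
	 * counted at roughly 80% of raw space.
	 */
	(void) printf("deflate ratio = %llu\n",
	    (unsigned long long)deflate_ratio);
	return (0);
}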
+ */ + vd->vdev_deflate_ratio = (1 << 17) / + (vdev_psize_to_asize(vd, 1 << 17) >> SPA_MINBLOCKSHIFT); - if (vd->vdev_mg == NULL) - vd->vdev_mg = metaslab_group_create(mc, vd); + ASSERT(oldc <= newc); mspp = kmem_zalloc(newc * sizeof (*mspp), KM_SLEEP); @@ -768,7 +839,8 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) if (txg == 0) { uint64_t object = 0; error = dmu_read(mos, vd->vdev_ms_array, - m * sizeof (uint64_t), sizeof (uint64_t), &object); + m * sizeof (uint64_t), sizeof (uint64_t), &object, + DMU_READ_PREFETCH); if (error) return (error); if (object != 0) { @@ -786,6 +858,15 @@ vdev_metaslab_init(vdev_t *vd, uint64_t txg) m << vd->vdev_ms_shift, 1ULL << vd->vdev_ms_shift, txg); } + if (txg == 0) + spa_config_enter(spa, SCL_ALLOC, FTAG, RW_WRITER); + + if (oldc == 0) + metaslab_group_activate(vd->vdev_mg); + + if (txg == 0) + spa_config_exit(spa, SCL_ALLOC, FTAG); + return (0); } @@ -796,6 +877,7 @@ vdev_metaslab_fini(vdev_t *vd) uint64_t count = vd->vdev_ms_count; if (vd->vdev_ms != NULL) { + metaslab_group_passivate(vd->vdev_mg); for (m = 0; m < count; m++) if (vd->vdev_ms[m] != NULL) metaslab_fini(vd->vdev_ms[m]); @@ -808,22 +890,22 @@ typedef struct vdev_probe_stats { boolean_t vps_readable; boolean_t vps_writeable; int vps_flags; - zio_t *vps_root; - vdev_t *vps_vd; } vdev_probe_stats_t; static void vdev_probe_done(zio_t *zio) { + spa_t *spa = zio->io_spa; + vdev_t *vd = zio->io_vd; vdev_probe_stats_t *vps = zio->io_private; - vdev_t *vd = vps->vps_vd; + + ASSERT(vd->vdev_probe_zio != NULL); if (zio->io_type == ZIO_TYPE_READ) { - ASSERT(zio->io_vd == vd); if (zio->io_error == 0) vps->vps_readable = 1; - if (zio->io_error == 0 && (spa_mode & FWRITE)) { - zio_nowait(zio_write_phys(vps->vps_root, vd, + if (zio->io_error == 0 && spa_writeable(spa)) { + zio_nowait(zio_write_phys(vd->vdev_probe_zio, vd, zio->io_offset, zio->io_size, zio->io_data, ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_PRIORITY_SYNC_WRITE, vps->vps_flags, B_TRUE)); @@ -831,26 +913,34 @@ vdev_probe_done(zio_t *zio) zio_buf_free(zio->io_data, zio->io_size); } } else if (zio->io_type == ZIO_TYPE_WRITE) { - ASSERT(zio->io_vd == vd); if (zio->io_error == 0) vps->vps_writeable = 1; zio_buf_free(zio->io_data, zio->io_size); } else if (zio->io_type == ZIO_TYPE_NULL) { - ASSERT(zio->io_vd == NULL); - ASSERT(zio == vps->vps_root); + zio_t *pio; vd->vdev_cant_read |= !vps->vps_readable; vd->vdev_cant_write |= !vps->vps_writeable; if (vdev_readable(vd) && - (vdev_writeable(vd) || !(spa_mode & FWRITE))) { + (vdev_writeable(vd) || !spa_writeable(spa))) { zio->io_error = 0; } else { ASSERT(zio->io_error != 0); zfs_ereport_post(FM_EREPORT_ZFS_PROBE_FAILURE, - zio->io_spa, vd, NULL, 0, 0); + spa, vd, NULL, 0, 0); zio->io_error = ENXIO; } + + mutex_enter(&vd->vdev_probe_lock); + ASSERT(vd->vdev_probe_zio == zio); + vd->vdev_probe_zio = NULL; + mutex_exit(&vd->vdev_probe_lock); + + while ((pio = zio_walk_parents(zio)) != NULL) + if (!vdev_accessible(vd, pio)) + pio->io_error = ENXIO; + kmem_free(vps, sizeof (*vps)); } } @@ -861,53 +951,139 @@ vdev_probe_done(zio_t *zio) * but the first (which we leave alone in case it contains a VTOC). 
*/ zio_t * -vdev_probe(vdev_t *vd, zio_t *pio) +vdev_probe(vdev_t *vd, zio_t *zio) { spa_t *spa = vd->vdev_spa; - vdev_probe_stats_t *vps; - zio_t *zio; + vdev_probe_stats_t *vps = NULL; + zio_t *pio; - vps = kmem_zalloc(sizeof (*vps), KM_SLEEP); + ASSERT(vd->vdev_ops->vdev_op_leaf); - vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE | - ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE | ZIO_FLAG_DONT_RETRY; + /* + * Don't probe the probe. + */ + if (zio && (zio->io_flags & ZIO_FLAG_PROBE)) + return (NULL); - if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { - /* - * vdev_cant_read and vdev_cant_write can only transition - * from TRUE to FALSE when we have the SCL_ZIO lock as writer; - * otherwise they can only transition from FALSE to TRUE. - * This ensures that any zio looking at these values can - * assume that failures persist for the life of the I/O. - * That's important because when a device has intermittent - * connectivity problems, we want to ensure that they're - * ascribed to the device (ENXIO) and not the zio (EIO). - * - * Since we hold SCL_ZIO as writer here, clear both values - * so the probe can reevaluate from first principles. - */ - vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER; - vd->vdev_cant_read = B_FALSE; - vd->vdev_cant_write = B_FALSE; + /* + * To prevent 'probe storms' when a device fails, we create + * just one probe i/o at a time. All zios that want to probe + * this vdev will become parents of the probe io. + */ + mutex_enter(&vd->vdev_probe_lock); + + if ((pio = vd->vdev_probe_zio) == NULL) { + vps = kmem_zalloc(sizeof (*vps), KM_SLEEP); + + vps->vps_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_PROBE | + ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE | + ZIO_FLAG_TRYHARD; + + if (spa_config_held(spa, SCL_ZIO, RW_WRITER)) { + /* + * vdev_cant_read and vdev_cant_write can only + * transition from TRUE to FALSE when we have the + * SCL_ZIO lock as writer; otherwise they can only + * transition from FALSE to TRUE. This ensures that + * any zio looking at these values can assume that + * failures persist for the life of the I/O. That's + * important because when a device has intermittent + * connectivity problems, we want to ensure that + * they're ascribed to the device (ENXIO) and not + * the zio (EIO). + * + * Since we hold SCL_ZIO as writer here, clear both + * values so the probe can reevaluate from first + * principles. 
+ */ + vps->vps_flags |= ZIO_FLAG_CONFIG_WRITER; + vd->vdev_cant_read = B_FALSE; + vd->vdev_cant_write = B_FALSE; + } + + vd->vdev_probe_zio = pio = zio_null(NULL, spa, vd, + vdev_probe_done, vps, + vps->vps_flags | ZIO_FLAG_DONT_PROPAGATE); + + if (zio != NULL) { + vd->vdev_probe_wanted = B_TRUE; + spa_async_request(spa, SPA_ASYNC_PROBE); + } } - ASSERT(vd->vdev_ops->vdev_op_leaf); + if (zio != NULL) + zio_add_child(zio, pio); - zio = zio_null(pio, spa, vdev_probe_done, vps, vps->vps_flags); + mutex_exit(&vd->vdev_probe_lock); - vps->vps_root = zio; - vps->vps_vd = vd; + if (vps == NULL) { + ASSERT(zio != NULL); + return (NULL); + } for (int l = 1; l < VDEV_LABELS; l++) { - zio_nowait(zio_read_phys(zio, vd, + zio_nowait(zio_read_phys(pio, vd, vdev_label_offset(vd->vdev_psize, l, - offsetof(vdev_label_t, vl_pad)), - VDEV_SKIP_SIZE, zio_buf_alloc(VDEV_SKIP_SIZE), + offsetof(vdev_label_t, vl_pad2)), + VDEV_PAD_SIZE, zio_buf_alloc(VDEV_PAD_SIZE), ZIO_CHECKSUM_OFF, vdev_probe_done, vps, ZIO_PRIORITY_SYNC_READ, vps->vps_flags, B_TRUE)); } - return (zio); + if (zio == NULL) + return (pio); + + zio_nowait(pio); + return (NULL); +} + +static void +vdev_open_child(void *arg) +{ + vdev_t *vd = arg; + + vd->vdev_open_thread = curthread; + vd->vdev_open_error = vdev_open(vd); + vd->vdev_open_thread = NULL; +} + +boolean_t +vdev_uses_zvols(vdev_t *vd) +{ + if (vd->vdev_path && strncmp(vd->vdev_path, ZVOL_DIR, + strlen(ZVOL_DIR)) == 0) + return (B_TRUE); + for (int c = 0; c < vd->vdev_children; c++) + if (vdev_uses_zvols(vd->vdev_child[c])) + return (B_TRUE); + return (B_FALSE); +} + +void +vdev_open_children(vdev_t *vd) +{ + taskq_t *tq; + int children = vd->vdev_children; + + /* + * in order to handle pools on top of zvols, do the opens + * in a single thread so that the same thread holds the + * spa_namespace_lock + */ + if (vdev_uses_zvols(vd)) { + for (int c = 0; c < children; c++) + vd->vdev_child[c]->vdev_open_error = + vdev_open(vd->vdev_child[c]); + return; + } + tq = taskq_create("vdev_open", children, minclsyspri, + children, children, TASKQ_PREPOPULATE); + + for (int c = 0; c < children; c++) + VERIFY(taskq_dispatch(tq, vdev_open_child, vd->vdev_child[c], + TQ_SLEEP) != NULL); + + taskq_destroy(tq); } /* @@ -916,22 +1092,33 @@ vdev_probe(vdev_t *vd, zio_t *pio) int vdev_open(vdev_t *vd) { + spa_t *spa = vd->vdev_spa; int error; - int c; uint64_t osize = 0; uint64_t asize, psize; uint64_t ashift = 0; + ASSERT(vd->vdev_open_thread == curthread || + spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); ASSERT(vd->vdev_state == VDEV_STATE_CLOSED || vd->vdev_state == VDEV_STATE_CANT_OPEN || vd->vdev_state == VDEV_STATE_OFFLINE); vd->vdev_stat.vs_aux = VDEV_AUX_NONE; + vd->vdev_cant_read = B_FALSE; + vd->vdev_cant_write = B_FALSE; + vd->vdev_min_asize = vdev_get_min_asize(vd); + /* + * If this vdev is not removed, check its fault status. If it's + * faulted, bail out of the open. + */ if (!vd->vdev_removed && vd->vdev_faulted) { ASSERT(vd->vdev_children == 0); + ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || + vd->vdev_label_aux == VDEV_AUX_EXTERNAL); vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, - VDEV_AUX_ERR_EXCEEDED); + vd->vdev_label_aux); return (ENXIO); } else if (vd->vdev_offline) { ASSERT(vd->vdev_children == 0); @@ -941,8 +1128,13 @@ vdev_open(vdev_t *vd) error = vd->vdev_ops->vdev_op_open(vd, &osize, &ashift); + /* + * Reset the vdev_reopening flag so that we actually close + * the vdev on error. 
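The net effect of the logic above is that every zio that hits an error on a vdev shares the one outstanding probe and inherits its verdict, rather than each issuing its own. A toy model of that sharing with simplified stand-in structures (not the real zio_t or its interfaces):

#include <errno.h>
#include <stddef.h>
#include <stdio.h>

#define	MAX_PARENTS	8

typedef struct toy_zio {
	int io_error;
} toy_zio_t;

typedef struct toy_probe {
	toy_zio_t *parents[MAX_PARENTS];
	int nparents;
} toy_probe_t;

static toy_probe_t *probe;	/* plays the role of vd->vdev_probe_zio */

static void
toy_probe_issue(toy_zio_t *zio)
{
	static toy_probe_t the_probe;

	if (probe == NULL) {		/* first failure creates the probe */
		the_probe.nparents = 0;
		probe = &the_probe;
	}
	probe->parents[probe->nparents++] = zio;	/* later callers just join */
}

static void
toy_probe_done(int vdev_accessible)
{
	/* Mirrors the zio_walk_parents() pass in vdev_probe_done(). */
	for (int i = 0; i < probe->nparents; i++)
		if (!vdev_accessible)
			probe->parents[i]->io_error = ENXIO;
	probe = NULL;
}

int
main(void)
{
	toy_zio_t a = { 0 }, b = { 0 };

	toy_probe_issue(&a);	/* creates the probe */
	toy_probe_issue(&b);	/* reuses it -- no probe storm */
	toy_probe_done(0);	/* device inaccessible: both parents get ENXIO */
	(void) printf("a=%d b=%d\n", a.io_error, b.io_error);
	return (0);
}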
+ */ + vd->vdev_reopening = B_FALSE; if (zio_injection_enabled && error == 0) - error = zio_handle_device_injection(vd, ENXIO); + error = zio_handle_device_injection(vd, NULL, ENXIO); if (error) { if (vd->vdev_removed && @@ -956,20 +1148,40 @@ vdev_open(vdev_t *vd) vd->vdev_removed = B_FALSE; + /* + * Recheck the faulted flag now that we have confirmed that + * the vdev is accessible. If we're faulted, bail. + */ + if (vd->vdev_faulted) { + ASSERT(vd->vdev_children == 0); + ASSERT(vd->vdev_label_aux == VDEV_AUX_ERR_EXCEEDED || + vd->vdev_label_aux == VDEV_AUX_EXTERNAL); + vdev_set_state(vd, B_TRUE, VDEV_STATE_FAULTED, + vd->vdev_label_aux); + return (ENXIO); + } + if (vd->vdev_degraded) { ASSERT(vd->vdev_children == 0); vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, VDEV_AUX_ERR_EXCEEDED); } else { - vd->vdev_state = VDEV_STATE_HEALTHY; + vdev_set_state(vd, B_TRUE, VDEV_STATE_HEALTHY, 0); } - for (c = 0; c < vd->vdev_children; c++) + /* + * For hole or missing vdevs we just return success. + */ + if (vd->vdev_ishole || vd->vdev_ops == &vdev_missing_ops) + return (0); + + for (int c = 0; c < vd->vdev_children; c++) { if (vd->vdev_child[c]->vdev_state != VDEV_STATE_HEALTHY) { vdev_set_state(vd, B_TRUE, VDEV_STATE_DEGRADED, VDEV_AUX_NONE); break; } + } osize = P2ALIGN(osize, (uint64_t)sizeof (vdev_label_t)); @@ -994,6 +1206,15 @@ vdev_open(vdev_t *vd) vd->vdev_psize = psize; + /* + * Make sure the allocatable size hasn't shrunk. + */ + if (asize < vd->vdev_min_asize) { + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_BAD_LABEL); + return (EINVAL); + } + if (vd->vdev_asize == 0) { /* * This is the first-ever open, so use the computed values. @@ -1010,25 +1231,18 @@ vdev_open(vdev_t *vd) VDEV_AUX_BAD_LABEL); return (EINVAL); } + } - /* - * Make sure the device hasn't shrunk. - */ - if (asize < vd->vdev_asize) { - vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, - VDEV_AUX_BAD_LABEL); - return (EINVAL); - } + /* + * If all children are healthy and the asize has increased, + * then we've experienced dynamic LUN growth. If automatic + * expansion is enabled then use the additional space. + */ + if (vd->vdev_state == VDEV_STATE_HEALTHY && asize > vd->vdev_asize && + (vd->vdev_expanding || spa->spa_autoexpand)) + vd->vdev_asize = asize; - /* - * If all children are healthy and the asize has increased, - * then we've experienced dynamic LUN growth. - */ - if (vd->vdev_state == VDEV_STATE_HEALTHY && - asize > vd->vdev_asize) { - vd->vdev_asize = asize; - } - } + vdev_set_min_asize(vd); /* * Ensure we can issue some IO before declaring the @@ -1041,30 +1255,14 @@ vdev_open(vdev_t *vd) return (error); } - /* - * If this is a top-level vdev, compute the raidz-deflation - * ratio. Note, we hard-code in 128k (1<<17) because it is the - * current "typical" blocksize. Even if SPA_MAXBLOCKSIZE - * changes, this algorithm must never change, or we will - * inconsistently account for existing bp's. - */ - if (vd->vdev_top == vd) { - vd->vdev_deflate_ratio = (1<<17) / - (vdev_psize_to_asize(vd, 1<<17) >> SPA_MINBLOCKSHIFT); - } - /* * If a leaf vdev has a DTL, and seems healthy, then kick off a - * resilver. But don't do this if we are doing a reopen for a - * scrub, since this would just restart the scrub we are already - * doing. + * resilver. But don't do this if we are doing a reopen for a scrub, + * since this would just restart the scrub we are already doing. 
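The size checks in vdev_open() above compare the probed asize against the recorded minimum rather than the previous asize, and only consume growth when expansion was requested. A condensed sketch of that decision with simplified fields (the real code also validates the first-open asize against the label):

#include <errno.h>
#include <stdint.h>
#include <stdio.h>

typedef struct toy_vdev {
	uint64_t asize;		/* currently recorded allocatable size */
	uint64_t min_asize;	/* metaslab-rounded minimum */
	int expanding;		/* explicit expand request */
	int autoexpand;		/* pool-wide autoexpand property */
	int healthy;
} toy_vdev_t;

static int
toy_check_asize(toy_vdev_t *vd, uint64_t new_asize)
{
	if (new_asize < vd->min_asize)
		return (EINVAL);		/* device shrank below the minimum */

	if (vd->asize == 0) {
		vd->asize = new_asize;		/* first-ever open */
	} else if (vd->healthy && new_asize > vd->asize &&
	    (vd->expanding || vd->autoexpand)) {
		vd->asize = new_asize;		/* dynamic LUN growth */
	}
	return (0);
}

int
main(void)
{
	toy_vdev_t vd = { .asize = 10, .min_asize = 8, .healthy = 1 };

	(void) printf("%d\n", toy_check_asize(&vd, 7));	/* EINVAL: shrank */
	(void) printf("%d\n", toy_check_asize(&vd, 12));	/* 0: growth ignored */
	vd.autoexpand = 1;
	(void) printf("%d %llu\n", toy_check_asize(&vd, 12),
	    (unsigned long long)vd.asize);			/* 0 12: grown */
	return (0);
}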
*/ - if (vd->vdev_children == 0 && !vd->vdev_spa->spa_scrub_reopen) { - mutex_enter(&vd->vdev_dtl_lock); - if (vd->vdev_dtl_map.sm_space != 0 && vdev_writeable(vd)) - spa_async_request(vd->vdev_spa, SPA_ASYNC_RESILVER); - mutex_exit(&vd->vdev_dtl_lock); - } + if (vd->vdev_ops->vdev_op_leaf && !spa->spa_scrub_reopen && + vdev_resilver_needed(vd, NULL, NULL)) + spa_async_request(spa, SPA_ASYNC_RESILVER); return (0); } @@ -1083,12 +1281,11 @@ int vdev_validate(vdev_t *vd) { spa_t *spa = vd->vdev_spa; - int c; nvlist_t *label; - uint64_t guid, top_guid; + uint64_t guid = 0, top_guid; uint64_t state; - for (c = 0; c < vd->vdev_children; c++) + for (int c = 0; c < vd->vdev_children; c++) if (vdev_validate(vd->vdev_child[c]) != 0) return (EBADF); @@ -1098,6 +1295,8 @@ vdev_validate(vdev_t *vd) * overwrite the previous state. */ if (vd->vdev_ops->vdev_op_leaf && vdev_readable(vd)) { + uint64_t aux_guid = 0; + nvlist_t *nvl; if ((label = vdev_label_read_config(vd)) == NULL) { vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, @@ -1105,6 +1304,18 @@ vdev_validate(vdev_t *vd) return (0); } + /* + * Determine if this vdev has been split off into another + * pool. If so, then refuse to open it. + */ + if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_SPLIT_GUID, + &aux_guid) == 0 && aux_guid == spa_guid(spa)) { + vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_SPLIT_POOL); + nvlist_free(label); + return (0); + } + if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &guid) != 0 || guid != spa_guid(spa)) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, @@ -1113,6 +1324,11 @@ vdev_validate(vdev_t *vd) return (0); } + if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvl) + != 0 || nvlist_lookup_uint64(nvl, ZPOOL_CONFIG_ORIG_GUID, + &aux_guid) != 0) + aux_guid = 0; + /* * If this vdev just became a top-level vdev because its * sibling was detached, it will have adopted the parent's @@ -1120,12 +1336,16 @@ vdev_validate(vdev_t *vd) * Fortunately, either version of the label will have the * same top guid, so if we're a top-level vdev, we can * safely compare to that instead. + * + * If we split this vdev off instead, then we also check the + * original pool's guid. We don't want to consider the vdev + * corrupt if it is partway through a split operation. */ if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &guid) != 0 || nvlist_lookup_uint64(label, ZPOOL_CONFIG_TOP_GUID, &top_guid) != 0 || - (vd->vdev_guid != guid && + ((vd->vdev_guid != guid && vd->vdev_guid != aux_guid) && (vd->vdev_guid != top_guid || vd != vd->vdev_top))) { vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, VDEV_AUX_CORRUPT_DATA); @@ -1143,7 +1363,12 @@ vdev_validate(vdev_t *vd) nvlist_free(label); - if (spa->spa_load_state == SPA_LOAD_OPEN && + /* + * If spa->spa_load_verbatim is true, no need to check the + * state of the pool. + */ + if (!spa->spa_load_verbatim && + spa_load_state(spa) == SPA_LOAD_OPEN && state != POOL_STATE_ACTIVE) return (EBADF); @@ -1165,12 +1390,24 @@ vdev_validate(vdev_t *vd) void vdev_close(vdev_t *vd) { + spa_t *spa = vd->vdev_spa; + vdev_t *pvd = vd->vdev_parent; + + ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); + + /* + * If our parent is reopening, then we are as well, unless we are + * going offline. 
+ */ + if (pvd != NULL && pvd->vdev_reopening) + vd->vdev_reopening = (pvd->vdev_reopening && !vd->vdev_offline); + vd->vdev_ops->vdev_op_close(vd); vdev_cache_purge(vd); /* - * We record the previous state before we close it, so that if we are + * We record the previous state before we close it, so that if we are * doing a reopen(), we don't generate FMA ereports if we notice that * it's still faulted. */ @@ -1183,6 +1420,12 @@ vdev_close(vdev_t *vd) vd->vdev_stat.vs_aux = VDEV_AUX_NONE; } +/* + * Reopen all interior vdevs and any unopened leaves. We don't actually + * reopen leaf vdevs which had previously been opened as they might deadlock + * on the spa_config_lock. Instead we only obtain the leaf's physical size. + * If the leaf has never been opened then open it, as usual. + */ void vdev_reopen(vdev_t *vd) { @@ -1190,6 +1433,8 @@ vdev_reopen(vdev_t *vd) ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); + /* set the reopening flag unless we're taking the vdev offline */ + vd->vdev_reopening = !vd->vdev_offline; vdev_close(vd); (void) vdev_open(vd); @@ -1201,12 +1446,9 @@ vdev_reopen(vdev_t *vd) if (vd->vdev_aux) { (void) vdev_validate_aux(vd); if (vdev_readable(vd) && vdev_writeable(vd) && - !l2arc_vdev_present(vd)) { - uint64_t size = vdev_get_rsize(vd); - l2arc_add_vdev(spa, vd, - VDEV_LABEL_START_SIZE, - size - VDEV_LABEL_START_SIZE); - } + vd->vdev_aux == &spa->spa_l2cache && + !l2arc_vdev_present(vd)) + l2arc_add_vdev(spa, vd); } else { (void) vdev_validate(vd); } @@ -1246,32 +1488,21 @@ vdev_create(vdev_t *vd, uint64_t txg, boolean_t isreplacing) return (0); } -/* - * The is the latter half of vdev_create(). It is distinct because it - * involves initiating transactions in order to do metaslab creation. - * For creation, we want to try to create all vdevs at once and then undo it - * if anything fails; this is much harder if we have pending transactions. - */ void -vdev_init(vdev_t *vd, uint64_t txg) +vdev_metaslab_set_size(vdev_t *vd) { /* * Aim for roughly 200 metaslabs per vdev. */ vd->vdev_ms_shift = highbit(vd->vdev_asize / 200); vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT); - - /* - * Initialize the vdev's metaslabs. This can't fail because - * there's nothing to read when creating all new metaslabs. - */ - VERIFY(vdev_metaslab_init(vd, txg) == 0); } void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg) { ASSERT(vd == vd->vdev_top); + ASSERT(!vd->vdev_ishole); ASSERT(ISP2(flags)); if (flags & VDD_METASLAB) @@ -1283,34 +1514,88 @@ vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg) (void) txg_list_add(&vd->vdev_spa->spa_vdev_txg_list, vd, txg); } +/* + * DTLs. + * + * A vdev's DTL (dirty time log) is the set of transaction groups for which + * the vdev has less than perfect replication. There are three kinds of DTL: + * + * DTL_MISSING: txgs for which the vdev has no valid copies of the data + * + * DTL_PARTIAL: txgs for which data is available, but not fully replicated + * + * DTL_SCRUB: the txgs that could not be repaired by the last scrub; upon + * scrub completion, DTL_SCRUB replaces DTL_MISSING in the range of + * txgs that was scrubbed. + * + * DTL_OUTAGE: txgs which cannot currently be read, whether due to + * persistent errors or just some device being offline. + * Unlike the other three, the DTL_OUTAGE map is not generally + * maintained; it's only computed when needed, typically to + * determine whether a device can be detached. 
+ * + * For leaf vdevs, DTL_MISSING and DTL_PARTIAL are identical: the device + * either has the data or it doesn't. + * + * For interior vdevs such as mirror and RAID-Z the picture is more complex. + * A vdev's DTL_PARTIAL is the union of its children's DTL_PARTIALs, because + * if any child is less than fully replicated, then so is its parent. + * A vdev's DTL_MISSING is a modified union of its children's DTL_MISSINGs, + * comprising only those txgs which appear in 'maxfaults' or more children; + * those are the txgs we don't have enough replication to read. For example, + * double-parity RAID-Z can tolerate up to two missing devices (maxfaults == 2); + * thus, its DTL_MISSING consists of the set of txgs that appear in more than + * two child DTL_MISSING maps. + * + * It should be clear from the above that to compute the DTLs and outage maps + * for all vdevs, it suffices to know just the leaf vdevs' DTL_MISSING maps. + * Therefore, that is all we keep on disk. When loading the pool, or after + * a configuration change, we generate all other DTLs from first principles. + */ void -vdev_dtl_dirty(space_map_t *sm, uint64_t txg, uint64_t size) +vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) { + space_map_t *sm = &vd->vdev_dtl[t]; + + ASSERT(t < DTL_TYPES); + ASSERT(vd != vd->vdev_spa->spa_root_vdev); + mutex_enter(sm->sm_lock); if (!space_map_contains(sm, txg, size)) space_map_add(sm, txg, size); mutex_exit(sm->sm_lock); } -int -vdev_dtl_contains(space_map_t *sm, uint64_t txg, uint64_t size) +boolean_t +vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t t, uint64_t txg, uint64_t size) { - int dirty; + space_map_t *sm = &vd->vdev_dtl[t]; + boolean_t dirty = B_FALSE; - /* - * Quick test without the lock -- covers the common case that - * there are no dirty time segments. - */ - if (sm->sm_space == 0) - return (0); + ASSERT(t < DTL_TYPES); + ASSERT(vd != vd->vdev_spa->spa_root_vdev); mutex_enter(sm->sm_lock); - dirty = space_map_contains(sm, txg, size); + if (sm->sm_space != 0) + dirty = space_map_contains(sm, txg, size); mutex_exit(sm->sm_lock); return (dirty); } +boolean_t +vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t) +{ + space_map_t *sm = &vd->vdev_dtl[t]; + boolean_t empty; + + mutex_enter(sm->sm_lock); + empty = (sm->sm_space == 0); + mutex_exit(sm->sm_lock); + + return (empty); +} + /* * Reassess DTLs after a config change or scrub completion. */ @@ -1318,11 +1603,19 @@ void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) { spa_t *spa = vd->vdev_spa; - int c; + avl_tree_t reftree; + int minref; + + ASSERT(spa_config_held(spa, SCL_ALL, RW_READER) != 0); - ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); + for (int c = 0; c < vd->vdev_children; c++) + vdev_dtl_reassess(vd->vdev_child[c], txg, + scrub_txg, scrub_done); - if (vd->vdev_children == 0) { + if (vd == spa->spa_root_vdev || vd->vdev_ishole || vd->vdev_aux) + return; + + if (vd->vdev_ops->vdev_op_leaf) { mutex_enter(&vd->vdev_dtl_lock); if (scrub_txg != 0 && (spa->spa_scrub_started || spa->spa_scrub_errors == 0)) { @@ -1333,12 +1626,38 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) * will be valid, so excise the old region and * fold in the scrub dtl. Otherwise, leave the * dtl as-is if there was an error. + * + * There's little trick here: to excise the beginning + * of the DTL_MISSING map, we put it into a reference + * tree and then add a segment with refcnt -1 that + * covers the range [0, scrub_txg). 
This means + * that each txg in that range has refcnt -1 or 0. + * We then add DTL_SCRUB with a refcnt of 2, so that + * entries in the range [0, scrub_txg) will have a + * positive refcnt -- either 1 or 2. We then convert + * the reference tree into the new DTL_MISSING map. */ - space_map_excise(&vd->vdev_dtl_map, 0, scrub_txg); - space_map_union(&vd->vdev_dtl_map, &vd->vdev_dtl_scrub); + space_map_ref_create(&reftree); + space_map_ref_add_map(&reftree, + &vd->vdev_dtl[DTL_MISSING], 1); + space_map_ref_add_seg(&reftree, 0, scrub_txg, -1); + space_map_ref_add_map(&reftree, + &vd->vdev_dtl[DTL_SCRUB], 2); + space_map_ref_generate_map(&reftree, + &vd->vdev_dtl[DTL_MISSING], 1); + space_map_ref_destroy(&reftree); } + space_map_vacate(&vd->vdev_dtl[DTL_PARTIAL], NULL, NULL); + space_map_walk(&vd->vdev_dtl[DTL_MISSING], + space_map_add, &vd->vdev_dtl[DTL_PARTIAL]); if (scrub_done) - space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL); + space_map_vacate(&vd->vdev_dtl[DTL_SCRUB], NULL, NULL); + space_map_vacate(&vd->vdev_dtl[DTL_OUTAGE], NULL, NULL); + if (!vdev_readable(vd)) + space_map_add(&vd->vdev_dtl[DTL_OUTAGE], 0, -1ULL); + else + space_map_walk(&vd->vdev_dtl[DTL_MISSING], + space_map_add, &vd->vdev_dtl[DTL_OUTAGE]); mutex_exit(&vd->vdev_dtl_lock); if (txg != 0) @@ -1346,35 +1665,36 @@ vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done) return; } - /* - * Make sure the DTLs are always correct under the scrub lock. - */ - if (vd == spa->spa_root_vdev) - mutex_enter(&spa->spa_scrub_lock); - mutex_enter(&vd->vdev_dtl_lock); - space_map_vacate(&vd->vdev_dtl_map, NULL, NULL); - space_map_vacate(&vd->vdev_dtl_scrub, NULL, NULL); - mutex_exit(&vd->vdev_dtl_lock); - - for (c = 0; c < vd->vdev_children; c++) { - vdev_t *cvd = vd->vdev_child[c]; - vdev_dtl_reassess(cvd, txg, scrub_txg, scrub_done); - mutex_enter(&vd->vdev_dtl_lock); - space_map_union(&vd->vdev_dtl_map, &cvd->vdev_dtl_map); - space_map_union(&vd->vdev_dtl_scrub, &cvd->vdev_dtl_scrub); - mutex_exit(&vd->vdev_dtl_lock); + for (int t = 0; t < DTL_TYPES; t++) { + /* account for child's outage in parent's missing map */ + int s = (t == DTL_MISSING) ? DTL_OUTAGE: t; + if (t == DTL_SCRUB) + continue; /* leaf vdevs only */ + if (t == DTL_PARTIAL) + minref = 1; /* i.e. 
non-zero */ + else if (vd->vdev_nparity != 0) + minref = vd->vdev_nparity + 1; /* RAID-Z */ + else + minref = vd->vdev_children; /* any kind of mirror */ + space_map_ref_create(&reftree); + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + mutex_enter(&cvd->vdev_dtl_lock); + space_map_ref_add_map(&reftree, &cvd->vdev_dtl[s], 1); + mutex_exit(&cvd->vdev_dtl_lock); + } + space_map_ref_generate_map(&reftree, &vd->vdev_dtl[t], minref); + space_map_ref_destroy(&reftree); } - - if (vd == spa->spa_root_vdev) - mutex_exit(&spa->spa_scrub_lock); + mutex_exit(&vd->vdev_dtl_lock); } static int vdev_dtl_load(vdev_t *vd) { spa_t *spa = vd->vdev_spa; - space_map_obj_t *smo = &vd->vdev_dtl; + space_map_obj_t *smo = &vd->vdev_dtl_smo; objset_t *mos = spa->spa_meta_objset; dmu_buf_t *db; int error; @@ -1384,6 +1704,8 @@ vdev_dtl_load(vdev_t *vd) if (smo->smo_object == 0) return (0); + ASSERT(!vd->vdev_ishole); + if ((error = dmu_bonus_hold(mos, smo->smo_object, FTAG, &db)) != 0) return (error); @@ -1392,7 +1714,8 @@ vdev_dtl_load(vdev_t *vd) dmu_buf_rele(db, FTAG); mutex_enter(&vd->vdev_dtl_lock); - error = space_map_load(&vd->vdev_dtl_map, NULL, SM_ALLOC, smo, mos); + error = space_map_load(&vd->vdev_dtl[DTL_MISSING], + NULL, SM_ALLOC, smo, mos); mutex_exit(&vd->vdev_dtl_lock); return (error); @@ -1402,14 +1725,16 @@ void vdev_dtl_sync(vdev_t *vd, uint64_t txg) { spa_t *spa = vd->vdev_spa; - space_map_obj_t *smo = &vd->vdev_dtl; - space_map_t *sm = &vd->vdev_dtl_map; + space_map_obj_t *smo = &vd->vdev_dtl_smo; + space_map_t *sm = &vd->vdev_dtl[DTL_MISSING]; objset_t *mos = spa->spa_meta_objset; space_map_t smsync; kmutex_t smlock; dmu_buf_t *db; dmu_tx_t *tx; + ASSERT(!vd->vdev_ishole); + tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); if (vd->vdev_detached) { @@ -1460,6 +1785,37 @@ vdev_dtl_sync(vdev_t *vd, uint64_t txg) dmu_tx_commit(tx); } +/* + * Determine whether the specified vdev can be offlined/detached/removed + * without losing data. + */ +boolean_t +vdev_dtl_required(vdev_t *vd) +{ + spa_t *spa = vd->vdev_spa; + vdev_t *tvd = vd->vdev_top; + uint8_t cant_read = vd->vdev_cant_read; + boolean_t required; + + ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); + + if (vd == spa->spa_root_vdev || vd == tvd) + return (B_TRUE); + + /* + * Temporarily mark the device as unreadable, and then determine + * whether this results in any DTL outages in the top-level vdev. + * If not, we can safely offline/detach/remove the device. + */ + vd->vdev_cant_read = B_TRUE; + vdev_dtl_reassess(tvd, 0, 0, B_FALSE); + required = !vdev_dtl_empty(tvd, DTL_OUTAGE); + vd->vdev_cant_read = cant_read; + vdev_dtl_reassess(tvd, 0, 0, B_FALSE); + + return (required); +} + /* * Determine if resilver is needed, and if so the txg range. 
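Under the minref rule above, a txg lands in a raidz2 top-level's DTL_MISSING only when at least nparity + 1 = 3 children miss it, whereas a mirror needs every child to miss it and DTL_PARTIAL needs just one. A reference-counting toy for a single txg, standing in for the space_map_ref_*() machinery:

#include <stdio.h>

/*
 * For one txg, count how many children have it in their per-child DTL
 * and apply the parent's minref threshold.  The real code does the same
 * thing over whole segment maps via space_map_ref_*().
 */
static int
parent_missing(const int *child_missing, int children, int minref)
{
	int refs = 0;

	for (int c = 0; c < children; c++)
		refs += child_missing[c];
	return (refs >= minref);
}

int
main(void)
{
	int missing[6] = { 1, 1, 0, 0, 0, 0 };	/* txg missing on two children */
	int nparity = 2;			/* raidz2 */

	/* raidz2: needs nparity + 1 = 3 children missing -> still readable */
	(void) printf("raidz2 missing: %d\n",
	    parent_missing(missing, 6, nparity + 1));

	/* 6-way mirror: needs all children missing -> still readable */
	(void) printf("mirror missing: %d\n",
	    parent_missing(missing, 6, 6));

	/* DTL_PARTIAL: any one child missing makes the parent partial */
	(void) printf("partial: %d\n", parent_missing(missing, 6, 1));
	return (0);
}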
*/ @@ -1472,19 +1828,19 @@ vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp) if (vd->vdev_children == 0) { mutex_enter(&vd->vdev_dtl_lock); - if (vd->vdev_dtl_map.sm_space != 0 && vdev_writeable(vd)) { + if (vd->vdev_dtl[DTL_MISSING].sm_space != 0 && + vdev_writeable(vd)) { space_seg_t *ss; - ss = avl_first(&vd->vdev_dtl_map.sm_root); + ss = avl_first(&vd->vdev_dtl[DTL_MISSING].sm_root); thismin = ss->ss_start - 1; - ss = avl_last(&vd->vdev_dtl_map.sm_root); + ss = avl_last(&vd->vdev_dtl[DTL_MISSING].sm_root); thismax = ss->ss_end; needed = B_TRUE; } mutex_exit(&vd->vdev_dtl_lock); } else { - int c; - for (c = 0; c < vd->vdev_children; c++) { + for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; uint64_t cmin, cmax; @@ -1506,18 +1862,16 @@ vdev_resilver_needed(vdev_t *vd, uint64_t *minp, uint64_t *maxp) void vdev_load(vdev_t *vd) { - int c; - /* * Recursively load all children. */ - for (c = 0; c < vd->vdev_children; c++) + for (int c = 0; c < vd->vdev_children; c++) vdev_load(vd->vdev_child[c]); /* * If this is a top-level vdev, initialize its metaslabs. */ - if (vd == vd->vdev_top && + if (vd == vd->vdev_top && !vd->vdev_ishole && (vd->vdev_ashift == 0 || vd->vdev_asize == 0 || vdev_metaslab_init(vd, 0) != 0)) vdev_set_state(vd, B_FALSE, VDEV_STATE_CANT_OPEN, @@ -1573,13 +1927,55 @@ vdev_validate_aux(vdev_t *vd) return (0); } +void +vdev_remove(vdev_t *vd, uint64_t txg) +{ + spa_t *spa = vd->vdev_spa; + objset_t *mos = spa->spa_meta_objset; + dmu_tx_t *tx; + + tx = dmu_tx_create_assigned(spa_get_dsl(spa), txg); + + if (vd->vdev_dtl_smo.smo_object) { + ASSERT3U(vd->vdev_dtl_smo.smo_alloc, ==, 0); + (void) dmu_object_free(mos, vd->vdev_dtl_smo.smo_object, tx); + vd->vdev_dtl_smo.smo_object = 0; + } + + if (vd->vdev_ms != NULL) { + for (int m = 0; m < vd->vdev_ms_count; m++) { + metaslab_t *msp = vd->vdev_ms[m]; + + if (msp == NULL || msp->ms_smo.smo_object == 0) + continue; + + ASSERT3U(msp->ms_smo.smo_alloc, ==, 0); + (void) dmu_object_free(mos, msp->ms_smo.smo_object, tx); + msp->ms_smo.smo_object = 0; + } + } + + if (vd->vdev_ms_array) { + (void) dmu_object_free(mos, vd->vdev_ms_array, tx); + vd->vdev_ms_array = 0; + vd->vdev_ms_shift = 0; + } + dmu_tx_commit(tx); +} + void vdev_sync_done(vdev_t *vd, uint64_t txg) { metaslab_t *msp; + boolean_t reassess = !txg_list_empty(&vd->vdev_ms_list, TXG_CLEAN(txg)); + + ASSERT(!vd->vdev_ishole); while (msp = txg_list_remove(&vd->vdev_ms_list, TXG_CLEAN(txg))) metaslab_sync_done(msp, txg); + + if (reassess) + metaslab_sync_reassess(vd->vdev_mg); } void @@ -1590,6 +1986,8 @@ vdev_sync(vdev_t *vd, uint64_t txg) metaslab_t *msp; dmu_tx_t *tx; + ASSERT(!vd->vdev_ishole); + if (vd->vdev_ms_array == 0 && vd->vdev_ms_shift != 0) { ASSERT(vd == vd->vdev_top); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); @@ -1600,6 +1998,9 @@ vdev_sync(vdev_t *vd, uint64_t txg) dmu_tx_commit(tx); } + if (vd->vdev_removing) + vdev_remove(vd, txg); + while ((msp = txg_list_remove(&vd->vdev_ms_list, txg)) != NULL) { metaslab_sync(msp, txg); (void) txg_list_add(&vd->vdev_ms_list, msp, TXG_CLEAN(txg)); @@ -1622,11 +2023,11 @@ vdev_psize_to_asize(vdev_t *vd, uint64_t psize) * not be opened, and no I/O is attempted. 
*/ int -vdev_fault(spa_t *spa, uint64_t guid) +vdev_fault(spa_t *spa, uint64_t guid, vdev_aux_t aux) { vdev_t *vd; - spa_vdev_state_enter(spa); + spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENODEV)); @@ -1634,19 +2035,27 @@ vdev_fault(spa_t *spa, uint64_t guid) if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); + /* + * We don't directly use the aux state here, but if we do a + * vdev_reopen(), we need this value to be present to remember why we + * were faulted. + */ + vd->vdev_label_aux = aux; + /* * Faulted state takes precedence over degraded. */ vd->vdev_faulted = 1ULL; vd->vdev_degraded = 0ULL; - vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, VDEV_AUX_ERR_EXCEEDED); + vdev_set_state(vd, B_FALSE, VDEV_STATE_FAULTED, aux); /* * If marking the vdev as faulted cause the top-level vdev to become * unavailable, then back off and simply mark the vdev as degraded * instead. */ - if (vdev_is_dead(vd->vdev_top) && vd->vdev_aux == NULL) { + if (vdev_is_dead(vd->vdev_top) && !vd->vdev_islog && + vd->vdev_aux == NULL) { vd->vdev_degraded = 1ULL; vd->vdev_faulted = 0ULL; @@ -1656,10 +2065,8 @@ vdev_fault(spa_t *spa, uint64_t guid) */ vdev_reopen(vd); - if (vdev_readable(vd)) { - vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, - VDEV_AUX_ERR_EXCEEDED); - } + if (vdev_readable(vd)) + vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, aux); } return (spa_vdev_state_exit(spa, vd, 0)); @@ -1671,11 +2078,11 @@ vdev_fault(spa_t *spa, uint64_t guid) * as I/O is concerned. */ int -vdev_degrade(spa_t *spa, uint64_t guid) +vdev_degrade(spa_t *spa, uint64_t guid, vdev_aux_t aux) { vdev_t *vd; - spa_vdev_state_enter(spa); + spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENODEV)); @@ -1692,7 +2099,7 @@ vdev_degrade(spa_t *spa, uint64_t guid) vd->vdev_degraded = 1ULL; if (!vdev_is_dead(vd)) vdev_set_state(vd, B_FALSE, VDEV_STATE_DEGRADED, - VDEV_AUX_ERR_EXCEEDED); + aux); return (spa_vdev_state_exit(spa, vd, 0)); } @@ -1706,9 +2113,9 @@ vdev_degrade(spa_t *spa, uint64_t guid) int vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) { - vdev_t *vd; + vdev_t *vd, *tvd, *pvd, *rvd = spa->spa_root_vdev; - spa_vdev_state_enter(spa); + spa_vdev_state_enter(spa, SCL_NONE); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENODEV)); @@ -1716,13 +2123,26 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); + tvd = vd->vdev_top; vd->vdev_offline = B_FALSE; vd->vdev_tmpoffline = B_FALSE; vd->vdev_checkremove = !!(flags & ZFS_ONLINE_CHECKREMOVE); vd->vdev_forcefault = !!(flags & ZFS_ONLINE_FORCEFAULT); - vdev_reopen(vd->vdev_top); + + /* XXX - L2ARC 1.0 does not support expansion */ + if (!vd->vdev_aux) { + for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) + pvd->vdev_expanding = !!(flags & ZFS_ONLINE_EXPAND); + } + + vdev_reopen(tvd); vd->vdev_checkremove = vd->vdev_forcefault = B_FALSE; + if (!vd->vdev_aux) { + for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) + pvd->vdev_expanding = B_FALSE; + } + if (newstate) *newstate = vd->vdev_state; if ((flags & ZFS_ONLINE_UNSPARE) && @@ -1731,19 +2151,26 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate) vd->vdev_parent->vdev_child[0] == vd) vd->vdev_unspare = 
B_TRUE; - (void) spa_vdev_state_exit(spa, vd, 0); - - VERIFY3U(spa_scrub(spa, POOL_SCRUB_RESILVER), ==, 0); + if ((flags & ZFS_ONLINE_EXPAND) || spa->spa_autoexpand) { - return (0); + /* XXX - L2ARC 1.0 does not support expansion */ + if (vd->vdev_aux) + return (spa_vdev_state_exit(spa, vd, ENOTSUP)); + spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE); + } + return (spa_vdev_state_exit(spa, vd, 0)); } -int -vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) +static int +vdev_offline_locked(spa_t *spa, uint64_t guid, uint64_t flags) { - vdev_t *vd; + vdev_t *vd, *tvd; + int error = 0; + uint64_t generation; + metaslab_group_t *mg; - spa_vdev_state_enter(spa); +top: + spa_vdev_state_enter(spa, SCL_ALLOC); if ((vd = spa_lookup_by_guid(spa, guid, B_TRUE)) == NULL) return (spa_vdev_state_exit(spa, NULL, ENODEV)); @@ -1751,32 +2178,76 @@ vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) if (!vd->vdev_ops->vdev_op_leaf) return (spa_vdev_state_exit(spa, NULL, ENOTSUP)); + tvd = vd->vdev_top; + mg = tvd->vdev_mg; + generation = spa->spa_config_generation + 1; + /* * If the device isn't already offline, try to offline it. */ if (!vd->vdev_offline) { /* - * If this device's top-level vdev has a non-empty DTL, - * don't allow the device to be offlined. - * - * XXX -- make this more precise by allowing the offline - * as long as the remaining devices don't have any DTL holes. + * If this device has the only valid copy of some data, + * don't allow it to be offlined. Log devices are always + * expendable. */ - if (vd->vdev_top->vdev_dtl_map.sm_space != 0) + if (!tvd->vdev_islog && vd->vdev_aux == NULL && + vdev_dtl_required(vd)) return (spa_vdev_state_exit(spa, NULL, EBUSY)); + /* + * If the top-level is a slog and it has had allocations + * then proceed. We check that the vdev's metaslab group + * is not NULL since it's possible that we may have just + * added this vdev but not yet initialized its metaslabs. + */ + if (tvd->vdev_islog && mg != NULL) { + /* + * Prevent any future allocations. + */ + metaslab_group_passivate(mg); + (void) spa_vdev_state_exit(spa, vd, 0); + + error = spa_offline_log(spa); + + spa_vdev_state_enter(spa, SCL_ALLOC); + + /* + * Check to see if the config has changed. + */ + if (error || generation != spa->spa_config_generation) { + metaslab_group_activate(mg); + if (error) + return (spa_vdev_state_exit(spa, + vd, error)); + (void) spa_vdev_state_exit(spa, vd, 0); + goto top; + } + ASSERT3U(tvd->vdev_stat.vs_alloc, ==, 0); + } + /* * Offline this device and reopen its top-level vdev. - * If this action results in the top-level vdev becoming - * unusable, undo it and fail the request. + * If the top-level vdev is a log device then just offline + * it. Otherwise, if this action results in the top-level + * vdev becoming unusable, undo it and fail the request. */ vd->vdev_offline = B_TRUE; - vdev_reopen(vd->vdev_top); - if (vdev_is_dead(vd->vdev_top) && vd->vdev_aux == NULL) { + vdev_reopen(tvd); + + if (!tvd->vdev_islog && vd->vdev_aux == NULL && + vdev_is_dead(tvd)) { vd->vdev_offline = B_FALSE; - vdev_reopen(vd->vdev_top); + vdev_reopen(tvd); return (spa_vdev_state_exit(spa, NULL, EBUSY)); } + + /* + * Add the device back into the metaslab rotor so that + * once we online the device it's open for business. 
+ */ + if (tvd->vdev_islog && mg != NULL) + metaslab_group_activate(mg); } vd->vdev_tmpoffline = !!(flags & ZFS_OFFLINE_TEMPORARY); @@ -1784,6 +2255,18 @@ vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) return (spa_vdev_state_exit(spa, vd, 0)); } +int +vdev_offline(spa_t *spa, uint64_t guid, uint64_t flags) +{ + int error; + + mutex_enter(&spa->spa_vdev_top_lock); + error = vdev_offline_locked(spa, guid, flags); + mutex_exit(&spa->spa_vdev_top_lock); + + return (error); +} + /* * Clear the error counts associated with this vdev. Unlike vdev_online() and * vdev_offline(), we assume the spa config is locked. We also clear all @@ -1815,12 +2298,21 @@ vdev_clear(spa_t *spa, vdev_t *vd) if (vd->vdev_faulted || vd->vdev_degraded || !vdev_readable(vd) || !vdev_writeable(vd)) { + /* + * When reopening in reponse to a clear event, it may be due to + * a fmadm repair request. In this case, if the device is + * still broken, we want to still post the ereport again. + */ + vd->vdev_forcefault = B_TRUE; + vd->vdev_faulted = vd->vdev_degraded = 0; vd->vdev_cant_read = B_FALSE; vd->vdev_cant_write = B_FALSE; vdev_reopen(vd); + vd->vdev_forcefault = B_FALSE; + if (vd != rvd) vdev_state_dirty(vd->vdev_top); @@ -1829,12 +2321,30 @@ vdev_clear(spa_t *spa, vdev_t *vd) spa_event_notify(spa, vd, ESC_ZFS_VDEV_CLEAR); } + + /* + * When clearing a FMA-diagnosed fault, we always want to + * unspare the device, as we assume that the original spare was + * done in response to the FMA fault. + */ + if (!vdev_is_dead(vd) && vd->vdev_parent != NULL && + vd->vdev_parent->vdev_ops == &vdev_spare_ops && + vd->vdev_parent->vdev_child[0] == vd) + vd->vdev_unspare = B_TRUE; } boolean_t vdev_is_dead(vdev_t *vd) { - return (vd->vdev_state < VDEV_STATE_DEGRADED); + /* + * Holes and missing devices are always considered "dead". + * This simplifies the code since we don't have to check for + * these types of devices in the various code paths. + * Instead we rely on the fact that we skip over dead devices + * before issuing I/O to them. + */ + return (vd->vdev_state < VDEV_STATE_DEGRADED || vd->vdev_ishole || + vd->vdev_ops == &vdev_missing_ops); } boolean_t @@ -1852,14 +2362,18 @@ vdev_writeable(vdev_t *vd) boolean_t vdev_allocatable(vdev_t *vd) { + uint64_t state = vd->vdev_state; + /* - * We currently allow allocations from vdevs which maybe in the + * We currently allow allocations from vdevs which may be in the * process of reopening (i.e. VDEV_STATE_CLOSED). If the device * fails to reopen then we'll catch it later when we're holding - * the proper locks. + * the proper locks. Note that we have to get the vdev state + * in a local variable because although it changes atomically, + * we're asking two separate questions about it. 
*/ - return (!(vdev_is_dead(vd) && vd->vdev_state != VDEV_STATE_CLOSED) && - !vd->vdev_cant_write); + return (!(state < VDEV_STATE_DEGRADED && state != VDEV_STATE_CLOSED) && + !vd->vdev_cant_write && !vd->vdev_ishole && !vd->vdev_removing); } boolean_t @@ -1892,7 +2406,9 @@ vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) vs->vs_scrub_errors = vd->vdev_spa->spa_scrub_errors; vs->vs_timestamp = gethrtime() - vs->vs_timestamp; vs->vs_state = vd->vdev_state; - vs->vs_rsize = vdev_get_rsize(vd); + vs->vs_rsize = vdev_get_min_asize(vd); + if (vd->vdev_ops->vdev_op_leaf) + vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; mutex_exit(&vd->vdev_stat_lock); /* @@ -1928,7 +2444,8 @@ vdev_clear_stats(vdev_t *vd) void vdev_stat_update(zio_t *zio, uint64_t psize) { - vdev_t *rvd = zio->io_spa->spa_root_vdev; + spa_t *spa = zio->io_spa; + vdev_t *rvd = spa->spa_root_vdev; vdev_t *vd = zio->io_vd ? zio->io_vd : rvd; vdev_t *pvd; uint64_t txg = zio->io_txg; @@ -1961,61 +2478,106 @@ vdev_stat_update(zio_t *zio, uint64_t psize) return; ASSERT(vd == zio->io_vd); - if (!(flags & ZIO_FLAG_IO_BYPASS)) { - mutex_enter(&vd->vdev_stat_lock); - vs->vs_ops[type]++; - vs->vs_bytes[type] += psize; - mutex_exit(&vd->vdev_stat_lock); - } + + if (flags & ZIO_FLAG_IO_BYPASS) + return; + + mutex_enter(&vd->vdev_stat_lock); + if (flags & ZIO_FLAG_IO_REPAIR) { - ASSERT(zio->io_delegate_list == NULL); - mutex_enter(&vd->vdev_stat_lock); if (flags & ZIO_FLAG_SCRUB_THREAD) vs->vs_scrub_repaired += psize; - else + if (flags & ZIO_FLAG_SELF_HEAL) vs->vs_self_healed += psize; - mutex_exit(&vd->vdev_stat_lock); } + + vs->vs_ops[type]++; + vs->vs_bytes[type] += psize; + + mutex_exit(&vd->vdev_stat_lock); return; } if (flags & ZIO_FLAG_SPECULATIVE) return; + /* + * If this is an I/O error that is going to be retried, then ignore the + * error. Otherwise, the user may interpret B_FAILFAST I/O errors as + * hard errors, when in reality they can happen for any number of + * innocuous reasons (bus resets, MPxIO link failure, etc). + */ + if (zio->io_error == EIO && + !(zio->io_flags & ZIO_FLAG_IO_RETRY)) + return; + + /* + * Intent logs writes won't propagate their error to the root + * I/O so don't mark these types of failures as pool-level + * errors. + */ + if (zio->io_vd == NULL && (zio->io_flags & ZIO_FLAG_DONT_PROPAGATE)) + return; + mutex_enter(&vd->vdev_stat_lock); - if (type == ZIO_TYPE_READ) { + if (type == ZIO_TYPE_READ && !vdev_is_dead(vd)) { if (zio->io_error == ECKSUM) vs->vs_checksum_errors++; else vs->vs_read_errors++; } - if (type == ZIO_TYPE_WRITE) + if (type == ZIO_TYPE_WRITE && !vdev_is_dead(vd)) vs->vs_write_errors++; mutex_exit(&vd->vdev_stat_lock); - if (type == ZIO_TYPE_WRITE && txg != 0 && vd->vdev_children == 0) { - if (flags & ZIO_FLAG_SCRUB_THREAD) { - ASSERT(flags & ZIO_FLAG_IO_REPAIR); - for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent) - vdev_dtl_dirty(&pvd->vdev_dtl_scrub, txg, 1); - } - if (!(flags & ZIO_FLAG_IO_REPAIR)) { - if (vdev_dtl_contains(&vd->vdev_dtl_map, txg, 1)) + if (type == ZIO_TYPE_WRITE && txg != 0 && + (!(flags & ZIO_FLAG_IO_REPAIR) || + (flags & ZIO_FLAG_SCRUB_THREAD) || + spa->spa_claiming)) { + /* + * This is either a normal write (not a repair), or it's + * a repair induced by the scrub thread, or it's a repair + * made by zil_claim() during spa_load() in the first txg. + * In the normal case, we commit the DTL change in the same + * txg as the block was born. 
In the scrub-induced repair + * case, we know that scrubs run in first-pass syncing context, + * so we commit the DTL change in spa_syncing_txg(spa). + * In the zil_claim() case, we commit in spa_first_txg(spa). + * + * We currently do not make DTL entries for failed spontaneous + * self-healing writes triggered by normal (non-scrubbing) + * reads, because we have no transactional context in which to + * do so -- and it's not clear that it'd be desirable anyway. + */ + if (vd->vdev_ops->vdev_op_leaf) { + uint64_t commit_txg = txg; + if (flags & ZIO_FLAG_SCRUB_THREAD) { + ASSERT(flags & ZIO_FLAG_IO_REPAIR); + ASSERT(spa_sync_pass(spa) == 1); + vdev_dtl_dirty(vd, DTL_SCRUB, txg, 1); + commit_txg = spa_syncing_txg(spa); + } else if (spa->spa_claiming) { + ASSERT(flags & ZIO_FLAG_IO_REPAIR); + commit_txg = spa_first_txg(spa); + } + ASSERT(commit_txg >= spa_syncing_txg(spa)); + if (vdev_dtl_contains(vd, DTL_MISSING, txg, 1)) return; - vdev_dirty(vd->vdev_top, VDD_DTL, vd, txg); - for (pvd = vd; pvd != NULL; pvd = pvd->vdev_parent) - vdev_dtl_dirty(&pvd->vdev_dtl_map, txg, 1); + for (pvd = vd; pvd != rvd; pvd = pvd->vdev_parent) + vdev_dtl_dirty(pvd, DTL_PARTIAL, txg, 1); + vdev_dirty(vd->vdev_top, VDD_DTL, vd, commit_txg); } + if (vd != rvd) + vdev_dtl_dirty(vd, DTL_MISSING, txg, 1); } } void vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete) { - int c; vdev_stat_t *vs = &vd->vdev_stat; - for (c = 0; c < vd->vdev_children; c++) + for (int c = 0; c < vd->vdev_children; c++) vdev_scrub_stat_update(vd->vdev_child[c], type, complete); mutex_enter(&vd->vdev_stat_lock); @@ -2040,15 +2602,18 @@ vdev_scrub_stat_update(vdev_t *vd, pool_scrub_type_t type, boolean_t complete) } /* - * Update the in-core space usage stats for this vdev and the root vdev. + * Update the in-core space usage stats for this vdev, its metaslab class, + * and the root vdev. */ void -vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta, - boolean_t update_root) +vdev_space_update(vdev_t *vd, int64_t alloc_delta, int64_t defer_delta, + int64_t space_delta) { int64_t dspace_delta = space_delta; spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; + metaslab_group_t *mg = vd->vdev_mg; + metaslab_class_t *mc = mg ? mg->mg_class : NULL; ASSERT(vd == vd->vdev_top); @@ -2059,32 +2624,31 @@ vdev_space_update(vdev_t *vd, int64_t space_delta, int64_t alloc_delta, * childrens', thus not accurate enough for us. */ ASSERT((dspace_delta & (SPA_MINBLOCKSIZE-1)) == 0); + ASSERT(vd->vdev_deflate_ratio != 0 || vd->vdev_isl2cache); dspace_delta = (dspace_delta >> SPA_MINBLOCKSHIFT) * vd->vdev_deflate_ratio; mutex_enter(&vd->vdev_stat_lock); - vd->vdev_stat.vs_space += space_delta; vd->vdev_stat.vs_alloc += alloc_delta; + vd->vdev_stat.vs_space += space_delta; vd->vdev_stat.vs_dspace += dspace_delta; mutex_exit(&vd->vdev_stat_lock); - if (update_root) { - ASSERT(rvd == vd->vdev_parent); - ASSERT(vd->vdev_ms_count != 0); - - /* - * Don't count non-normal (e.g. intent log) space as part of - * the pool's capacity. 
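The commit txg chosen above depends only on what kind of write failed: normal writes use the block's birth txg, scrub-induced repairs use the syncing txg, and zil_claim() repairs use the pool's first txg. A compact restatement of that selection with assumed stand-in fields and flags (the real code keys off zio flags and spa state):

#include <stdint.h>
#include <stdio.h>

#define	FLAG_IO_REPAIR		0x1
#define	FLAG_SCRUB_THREAD	0x2

typedef struct toy_spa {
	uint64_t syncing_txg;
	uint64_t first_txg;
	int claiming;		/* zil_claim() running during spa_load() */
} toy_spa_t;

/* Pick the txg in which the DTL dirtying for a failed write is committed. */
static uint64_t
commit_txg_for(const toy_spa_t *spa, uint64_t birth_txg, int flags)
{
	if (flags & FLAG_SCRUB_THREAD)
		return (spa->syncing_txg);	/* scrub-induced repair */
	if (spa->claiming)
		return (spa->first_txg);	/* zil_claim() repair */
	return (birth_txg);			/* normal write */
}

int
main(void)
{
	toy_spa_t spa = { .syncing_txg = 1200, .first_txg = 1000, .claiming = 0 };

	(void) printf("%llu\n", (unsigned long long)
	    commit_txg_for(&spa, 1200, 0));				/* 1200 */
	(void) printf("%llu\n", (unsigned long long)
	    commit_txg_for(&spa, 950, FLAG_IO_REPAIR | FLAG_SCRUB_THREAD)); /* 1200 */
	spa.claiming = 1;
	(void) printf("%llu\n", (unsigned long long)
	    commit_txg_for(&spa, 990, FLAG_IO_REPAIR));			/* 1000 */
	return (0);
}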
- */ - if (vd->vdev_mg->mg_class != spa->spa_normal_class) - return; - + if (mc == spa_normal_class(spa)) { mutex_enter(&rvd->vdev_stat_lock); - rvd->vdev_stat.vs_space += space_delta; rvd->vdev_stat.vs_alloc += alloc_delta; + rvd->vdev_stat.vs_space += space_delta; rvd->vdev_stat.vs_dspace += dspace_delta; mutex_exit(&rvd->vdev_stat_lock); } + + if (mc != NULL) { + ASSERT(rvd == vd->vdev_parent); + ASSERT(vd->vdev_ms_count != 0); + + metaslab_class_space_update(mc, + alloc_delta, defer_delta, space_delta, dspace_delta); + } } /* @@ -2100,8 +2664,8 @@ vdev_config_dirty(vdev_t *vd) int c; /* - * If this is an aux vdev (as with l2cache devices), then we update the - * vdev config manually and set the sync flag. + * If this is an aux vdev (as with l2cache and spare devices), then we + * update the vdev config manually and set the sync flag. */ if (vd->vdev_aux != NULL) { spa_aux_vdev_t *sav = vd->vdev_aux; @@ -2123,8 +2687,11 @@ vdev_config_dirty(vdev_t *vd) sav->sav_sync = B_TRUE; - VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, - ZPOOL_CONFIG_L2CACHE, &aux, &naux) == 0); + if (nvlist_lookup_nvlist_array(sav->sav_config, + ZPOOL_CONFIG_L2CACHE, &aux, &naux) != 0) { + VERIFY(nvlist_lookup_nvlist_array(sav->sav_config, + ZPOOL_CONFIG_SPARES, &aux, &naux) == 0); + } ASSERT(c < naux); @@ -2154,7 +2721,8 @@ vdev_config_dirty(vdev_t *vd) } else { ASSERT(vd == vd->vdev_top); - if (!list_link_active(&vd->vdev_config_dirty_node)) + if (!list_link_active(&vd->vdev_config_dirty_node) && + !vd->vdev_ishole) list_insert_head(&spa->spa_config_dirty_list, vd); } } @@ -2195,7 +2763,7 @@ vdev_state_dirty(vdev_t *vd) (dsl_pool_sync_context(spa_get_dsl(spa)) && spa_config_held(spa, SCL_STATE, RW_READER))); - if (!list_link_active(&vd->vdev_state_dirty_node)) + if (!list_link_active(&vd->vdev_state_dirty_node) && !vd->vdev_ishole) list_insert_head(&spa->spa_state_dirty_list, vd); } @@ -2218,18 +2786,24 @@ vdev_state_clean(vdev_t *vd) void vdev_propagate_state(vdev_t *vd) { - vdev_t *rvd = vd->vdev_spa->spa_root_vdev; + spa_t *spa = vd->vdev_spa; + vdev_t *rvd = spa->spa_root_vdev; int degraded = 0, faulted = 0; int corrupted = 0; - int c; vdev_t *child; if (vd->vdev_children > 0) { - for (c = 0; c < vd->vdev_children; c++) { + for (int c = 0; c < vd->vdev_children; c++) { child = vd->vdev_child[c]; + /* + * Don't factor holes into the decision. + */ + if (child->vdev_ishole) + continue; + if (!vdev_readable(child) || - (!vdev_writeable(child) && (spa_mode & FWRITE))) { + (!vdev_writeable(child) && spa_writeable(spa))) { /* * Root special: if there is a top-level log * device, treat the root vdev as if it were @@ -2300,6 +2874,19 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) if (vdev_is_dead(vd) && vd->vdev_ops->vdev_op_leaf) vd->vdev_ops->vdev_op_close(vd); + /* + * If we have brought this vdev back into service, we need + * to notify fmd so that it can gracefully repair any outstanding + * cases due to a missing device. We do this in all cases, even those + * that probably don't correlate to a repaired fault. This is sure to + * catch all cases, and we let the zfs-retire agent sort it out. If + * this is a transient state it's OK, as the retire agent will + * double-check the state of the vdev before repairing it. 
+ */ + if (state == VDEV_STATE_HEALTHY && vd->vdev_ops->vdev_op_leaf && + vd->vdev_prevstate != state) + zfs_post_state_change(spa, vd); + if (vd->vdev_removed && state == VDEV_STATE_CANT_OPEN && (aux == VDEV_AUX_OPEN_FAILED || vd->vdev_checkremove)) { @@ -2315,11 +2902,6 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) vd->vdev_state = VDEV_STATE_REMOVED; vd->vdev_stat.vs_aux = VDEV_AUX_NONE; } else if (state == VDEV_STATE_REMOVED) { - /* - * Indicate to the ZFS DE that this device has been removed, and - * any recent errors should be ignored. - */ - zfs_post_remove(spa, vd); vd->vdev_removed = B_TRUE; } else if (state == VDEV_STATE_CANT_OPEN) { /* @@ -2328,8 +2910,7 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) * begin with. Failure to open such a device is not considered * an error. */ - if (spa->spa_load_state == SPA_LOAD_IMPORT && - !spa->spa_import_faulted && + if (spa_load_state(spa) == SPA_LOAD_IMPORT && vd->vdev_ops->vdev_op_leaf) vd->vdev_not_present = 1; @@ -2388,8 +2969,8 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) vd->vdev_removed = B_FALSE; } - if (!isopen) - vdev_propagate_state(vd); + if (!isopen && vd->vdev_parent) + vdev_propagate_state(vd->vdev_parent); } /* @@ -2401,8 +2982,6 @@ vdev_set_state(vdev_t *vd, boolean_t isopen, vdev_state_t state, vdev_aux_t aux) boolean_t vdev_is_bootable(vdev_t *vd) { - int c; - if (!vd->vdev_ops->vdev_op_leaf) { char *vdev_type = vd->vdev_ops->vdev_op_type; @@ -2417,9 +2996,71 @@ vdev_is_bootable(vdev_t *vd) return (B_FALSE); } - for (c = 0; c < vd->vdev_children; c++) { + for (int c = 0; c < vd->vdev_children; c++) { if (!vdev_is_bootable(vd->vdev_child[c])) return (B_FALSE); } return (B_TRUE); } + +/* + * Load the state from the original vdev tree (ovd) which + * we've retrieved from the MOS config object. If the original + * vdev was offline then we transfer that state to the device + * in the current vdev tree (nvd). + */ +void +vdev_load_log_state(vdev_t *nvd, vdev_t *ovd) +{ + spa_t *spa = nvd->vdev_spa; + + ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); + ASSERT3U(nvd->vdev_guid, ==, ovd->vdev_guid); + + for (int c = 0; c < nvd->vdev_children; c++) + vdev_load_log_state(nvd->vdev_child[c], ovd->vdev_child[c]); + + if (nvd->vdev_ops->vdev_op_leaf && ovd->vdev_offline) { + /* + * It would be nice to call vdev_offline() + * directly but the pool isn't fully loaded and + * the txg threads have not been started yet. + */ + nvd->vdev_offline = ovd->vdev_offline; + vdev_reopen(nvd->vdev_top); + } +} + +/* + * Expand a vdev if possible. + */ +void +vdev_expand(vdev_t *vd, uint64_t txg) +{ + ASSERT(vd->vdev_top == vd); + ASSERT(spa_config_held(vd->vdev_spa, SCL_ALL, RW_WRITER) == SCL_ALL); + + if ((vd->vdev_asize >> vd->vdev_ms_shift) > vd->vdev_ms_count) { + VERIFY(vdev_metaslab_init(vd, txg) == 0); + vdev_config_dirty(vd); + } +} + +/* + * Split a vdev. 
+ */ +void +vdev_split(vdev_t *vd) +{ + vdev_t *cvd, *pvd = vd->vdev_parent; + + vdev_remove_child(pvd, vd); + vdev_compact_children(pvd); + + cvd = pvd->vdev_child[0]; + if (pvd->vdev_children == 1) { + vdev_remove_parent(cvd); + cvd->vdev_splitting = B_TRUE; + } + vdev_propagate_state(cvd); +} diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_cache.c b/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_cache.c index 5a7b59f6ed845..688d541344cbc 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_cache.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_cache.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -172,7 +172,7 @@ vdev_cache_allocate(zio_t *zio) ve = kmem_zalloc(sizeof (vdev_cache_entry_t), KM_SLEEP); ve->ve_offset = offset; - ve->ve_lastused = lbolt; + ve->ve_lastused = ddi_get_lbolt(); ve->ve_data = zio_buf_alloc(VCBS); avl_add(&vc->vc_offset_tree, ve); @@ -189,9 +189,9 @@ vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio) ASSERT(MUTEX_HELD(&vc->vc_lock)); ASSERT(ve->ve_fill_io == NULL); - if (ve->ve_lastused != lbolt) { + if (ve->ve_lastused != ddi_get_lbolt()) { avl_remove(&vc->vc_lastused_tree, ve); - ve->ve_lastused = lbolt; + ve->ve_lastused = ddi_get_lbolt(); avl_add(&vc->vc_lastused_tree, ve); } @@ -203,23 +203,23 @@ vdev_cache_hit(vdev_cache_t *vc, vdev_cache_entry_t *ve, zio_t *zio) * Fill a previously allocated cache entry with data. */ static void -vdev_cache_fill(zio_t *zio) +vdev_cache_fill(zio_t *fio) { - vdev_t *vd = zio->io_vd; + vdev_t *vd = fio->io_vd; vdev_cache_t *vc = &vd->vdev_cache; - vdev_cache_entry_t *ve = zio->io_private; - zio_t *dio; + vdev_cache_entry_t *ve = fio->io_private; + zio_t *pio; - ASSERT(zio->io_size == VCBS); + ASSERT(fio->io_size == VCBS); /* * Add data to the cache. */ mutex_enter(&vc->vc_lock); - ASSERT(ve->ve_fill_io == zio); - ASSERT(ve->ve_offset == zio->io_offset); - ASSERT(ve->ve_data == zio->io_data); + ASSERT(ve->ve_fill_io == fio); + ASSERT(ve->ve_offset == fio->io_offset); + ASSERT(ve->ve_data == fio->io_data); ve->ve_fill_io = NULL; @@ -228,20 +228,13 @@ vdev_cache_fill(zio_t *zio) * any reads that were queued up before the missed update are still * valid, so we can satisfy them from this line before we evict it. 
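Reviewer note: the vdev_cache hunk below replaces the cache's private delegate list with standard ZIO parent/child links. A compact summary of the resulting relationships, as I read the change (not additional driver code):

/*
 * Delegation model for a cache line L after this change (sketch):
 *
 *	read zio A misses L      -> fill zio fio is created,
 *	                            ve_fill_io = fio,
 *	                            zio_add_child(A, fio), A bypasses
 *	                            the normal vdev I/O stage
 *	read zio B hits L while
 *	fio is still in flight   -> zio_add_child(B, fio), B bypasses too
 *	fio completes            -> vdev_cache_fill() walks its parents
 *	                            with zio_walk_parents() and satisfies
 *	                            A and B from the line via
 *	                            vdev_cache_hit()
 *
 * Completion and error propagation now ride the generic parent/child
 * machinery instead of the removed io_delegate_list walk.
 */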
*/ - for (dio = zio->io_delegate_list; dio; dio = dio->io_delegate_next) - vdev_cache_hit(vc, ve, dio); + while ((pio = zio_walk_parents(fio)) != NULL) + vdev_cache_hit(vc, ve, pio); - if (zio->io_error || ve->ve_missed_update) + if (fio->io_error || ve->ve_missed_update) vdev_cache_evict(vc, ve); mutex_exit(&vc->vc_lock); - - while ((dio = zio->io_delegate_list) != NULL) { - zio->io_delegate_list = dio->io_delegate_next; - dio->io_delegate_next = NULL; - dio->io_error = zio->io_error; - zio_execute(dio); - } } /* @@ -284,9 +277,8 @@ vdev_cache_read(zio_t *zio) } if ((fio = ve->ve_fill_io) != NULL) { - zio->io_delegate_next = fio->io_delegate_list; - fio->io_delegate_list = zio; zio_vdev_io_bypass(zio); + zio_add_child(zio, fio); mutex_exit(&vc->vc_lock); VDCSTAT_BUMP(vdc_stat_delegations); return (0); @@ -296,7 +288,6 @@ vdev_cache_read(zio_t *zio) zio_vdev_io_bypass(zio); mutex_exit(&vc->vc_lock); - zio_execute(zio); VDCSTAT_BUMP(vdc_stat_hits); return (0); } @@ -313,8 +304,8 @@ vdev_cache_read(zio_t *zio) ZIO_FLAG_DONT_CACHE, vdev_cache_fill, ve); ve->ve_fill_io = fio; - fio->io_delegate_list = zio; zio_vdev_io_bypass(zio); + zio_add_child(zio, fio); mutex_exit(&vc->vc_lock); zio_nowait(fio); diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_disk.c b/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_disk.c index 35d4e2a9200db..08e28b274902a 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_disk.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_disk.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -47,6 +47,7 @@ typedef struct vdev_disk_buf { static int vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) { + spa_t *spa = vd->vdev_spa; vdev_disk_t *dvd; struct dk_minfo dkm; int error; @@ -61,6 +62,16 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) return (EINVAL); } + /* + * Reopen the device if it's not currently open. Otherwise, + * just update the physical size of the device. + */ + if (vd->vdev_tsd != NULL) { + ASSERT(vd->vdev_reopening); + dvd = vd->vdev_tsd; + goto skip_open; + } + dvd = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_disk_t), KM_SLEEP); /* @@ -78,12 +89,6 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) * * 3. Otherwise, the device may have moved. Try opening the device * by the devid instead. - * - * If the vdev is part of the root pool, we avoid opening it by path. - * We do this because there is no /dev path available early in boot, - * and if we try to open the device by path at a later point, we can - * deadlock when devfsadm attempts to open the underlying backing store - * file. 
*/ if (vd->vdev_devid != NULL) { if (ddi_devid_str_decode(vd->vdev_devid, &dvd->vd_devid, @@ -95,7 +100,7 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) error = EINVAL; /* presume failure */ - if (vd->vdev_path != NULL && !spa_is_root(vd->vdev_spa)) { + if (vd->vdev_path != NULL) { ddi_devid_t devid; if (vd->vdev_wholedisk == -1ULL) { @@ -105,18 +110,18 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) (void) snprintf(buf, len, "%ss0", vd->vdev_path); - if (ldi_open_by_name(buf, spa_mode, kcred, + if (ldi_open_by_name(buf, spa_mode(spa), kcred, &lh, zfs_li) == 0) { spa_strfree(vd->vdev_path); vd->vdev_path = buf; vd->vdev_wholedisk = 1ULL; - (void) ldi_close(lh, spa_mode, kcred); + (void) ldi_close(lh, spa_mode(spa), kcred); } else { kmem_free(buf, len); } } - error = ldi_open_by_name(vd->vdev_path, spa_mode, kcred, + error = ldi_open_by_name(vd->vdev_path, spa_mode(spa), kcred, &dvd->vd_lh, zfs_li); /* @@ -126,7 +131,8 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) ldi_get_devid(dvd->vd_lh, &devid) == 0) { if (ddi_devid_compare(devid, dvd->vd_devid) != 0) { error = EINVAL; - (void) ldi_close(dvd->vd_lh, spa_mode, kcred); + (void) ldi_close(dvd->vd_lh, spa_mode(spa), + kcred); dvd->vd_lh = NULL; } ddi_devid_free(devid); @@ -146,7 +152,7 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) */ if (error != 0 && vd->vdev_devid != NULL) error = ldi_open_by_devid(dvd->vd_devid, dvd->vd_minor, - spa_mode, kcred, &dvd->vd_lh, zfs_li); + spa_mode(spa), kcred, &dvd->vd_lh, zfs_li); /* * If all else fails, then try opening by physical path (if available) @@ -156,8 +162,8 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) */ if (error) { if (vd->vdev_physpath != NULL && - (dev = ddi_pathname_to_dev_t(vd->vdev_physpath)) != ENODEV) - error = ldi_open_by_dev(&dev, OTYP_BLK, spa_mode, + (dev = ddi_pathname_to_dev_t(vd->vdev_physpath)) != NODEV) + error = ldi_open_by_dev(&dev, OTYP_BLK, spa_mode(spa), kcred, &dvd->vd_lh, zfs_li); /* @@ -165,10 +171,9 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) * as above. This hasn't been used in a very long time and we * don't need to propagate its oddities to this edge condition. */ - if (error && vd->vdev_path != NULL && - !spa_is_root(vd->vdev_spa)) - error = ldi_open_by_name(vd->vdev_path, spa_mode, kcred, - &dvd->vd_lh, zfs_li); + if (error && vd->vdev_path != NULL) + error = ldi_open_by_name(vd->vdev_path, spa_mode(spa), + kcred, &dvd->vd_lh, zfs_li); } if (error) { @@ -201,6 +206,7 @@ vdev_disk_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) kmem_free(physpath, MAXPATHLEN); } +skip_open: /* * Determine the actual size of the device. 
*/ @@ -243,7 +249,7 @@ vdev_disk_close(vdev_t *vd) { vdev_disk_t *dvd = vd->vdev_tsd; - if (dvd == NULL) + if (vd->vdev_reopening || dvd == NULL) return; if (dvd->vd_minor != NULL) @@ -253,7 +259,7 @@ vdev_disk_close(vdev_t *vd) ddi_devid_free(dvd->vd_devid); if (dvd->vd_lh != NULL) - (void) ldi_close(dvd->vd_lh, spa_mode, kcred); + (void) ldi_close(dvd->vd_lh, spa_mode(vd->vdev_spa), kcred); kmem_free(dvd, sizeof (vdev_disk_t)); vd->vdev_tsd = NULL; @@ -314,6 +320,11 @@ vdev_disk_ioctl_free(zio_t *zio) kmem_free(zio->io_vsd, sizeof (struct dk_callback)); } +static const zio_vsd_ops_t vdev_disk_vsd_ops = { + vdev_disk_ioctl_free, + zio_vsd_default_cksum_report +}; + static void vdev_disk_ioctl_done(void *zio_arg, int error) { @@ -354,7 +365,7 @@ vdev_disk_io_start(zio_t *zio) } zio->io_vsd = dkc = kmem_alloc(sizeof (*dkc), KM_SLEEP); - zio->io_vsd_free = vdev_disk_ioctl_free; + zio->io_vsd_ops = &vdev_disk_vsd_ops; dkc->dkc_callback = vdev_disk_ioctl_done; dkc->dkc_flag = FLUSH_VOLATILE; @@ -400,8 +411,9 @@ vdev_disk_io_start(zio_t *zio) bioinit(bp); bp->b_flags = B_BUSY | B_NOCACHE | - (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE) | - ((zio->io_flags & ZIO_FLAG_IO_RETRY) ? 0 : B_FAILFAST); + (zio->io_type == ZIO_TYPE_READ ? B_READ : B_WRITE); + if (!(zio->io_flags & (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD))) + bp->b_flags |= B_FAILFAST; bp->b_bcount = zio->io_size; bp->b_un.b_addr = zio->io_data; bp->b_lblkno = lbtodb(zio->io_offset); @@ -425,12 +437,19 @@ vdev_disk_io_done(zio_t *zio) * asynchronous removal of the device. Otherwise, probe the device and * make sure it's still accessible. */ - if (zio->io_error == EIO) { + if (zio->io_error == EIO && !vd->vdev_remove_wanted) { vdev_disk_t *dvd = vd->vdev_tsd; int state = DKIO_NONE; if (ldi_ioctl(dvd->vd_lh, DKIOCSTATE, (intptr_t)&state, FKIOCTL, kcred, NULL) == 0 && state != DKIO_INSERTED) { + /* + * We post the resource as soon as possible, instead of + * when the async removal actually happens, because the + * DE is using this information to discard previous I/O + * errors. 
+ */ + zfs_post_remove(zio->io_spa, vd); vd->vdev_remove_wanted = B_TRUE; spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); } @@ -469,7 +488,7 @@ vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config) if (devid != NULL && ddi_devid_str_decode(devid, &tmpdevid, &minor_name) == 0) { error = ldi_open_by_devid(tmpdevid, minor_name, - spa_mode, kcred, &vd_lh, zfs_li); + FREAD, kcred, &vd_lh, zfs_li); ddi_devid_free(tmpdevid); ddi_devid_str_free(minor_name); } @@ -486,14 +505,14 @@ vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config) size = P2ALIGN_TYPED(s, sizeof (vdev_label_t), uint64_t); label = kmem_alloc(sizeof (vdev_label_t), KM_SLEEP); + *config = NULL; for (l = 0; l < VDEV_LABELS; l++) { uint64_t offset, state, txg = 0; /* read vdev label */ offset = vdev_label_offset(size, l, 0); if (vdev_disk_physio(vd_lh, (caddr_t)label, - VDEV_SKIP_SIZE + VDEV_BOOT_HEADER_SIZE + - VDEV_PHYS_SIZE, offset, B_READ) != 0) + VDEV_SKIP_SIZE + VDEV_PHYS_SIZE, offset, B_READ) != 0) continue; if (nvlist_unpack(label->vl_vdev_phys.vp_nvlist, @@ -521,6 +540,8 @@ vdev_disk_read_rootlabel(char *devpath, char *devid, nvlist_t **config) kmem_free(label, sizeof (vdev_label_t)); (void) ldi_close(vd_lh, FREAD, kcred); + if (*config == NULL) + error = EIDRM; return (error); } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_file.c b/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_file.c index dc0e920bfc521..779e88edb9f24 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_file.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_file.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -51,6 +51,16 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) return (EINVAL); } + /* + * Reopen the device if it's not currently open. Otherwise, + * just update the physical size of the device. + */ + if (vd->vdev_tsd != NULL) { + ASSERT(vd->vdev_reopening); + vf = vd->vdev_tsd; + goto skip_open; + } + vf = vd->vdev_tsd = kmem_zalloc(sizeof (vdev_file_t), KM_SLEEP); /* @@ -61,7 +71,7 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) */ ASSERT(vd->vdev_path != NULL && vd->vdev_path[0] == '/'); error = vn_openat(vd->vdev_path + 1, UIO_SYSSPACE, - spa_mode | FOFFMAX, 0, &vp, 0, 0, rootdir, -1); + spa_mode(vd->vdev_spa) | FOFFMAX, 0, &vp, 0, 0, rootdir, -1); if (error) { vd->vdev_stat.vs_aux = VDEV_AUX_OPEN_FAILED; @@ -79,6 +89,8 @@ vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) return (ENODEV); } #endif + +skip_open: /* * Determine the physical size of the file. */ @@ -100,12 +112,13 @@ vdev_file_close(vdev_t *vd) { vdev_file_t *vf = vd->vdev_tsd; - if (vf == NULL) + if (vd->vdev_reopening || vf == NULL) return; if (vf->vf_vnode != NULL) { (void) VOP_PUTPAGE(vf->vf_vnode, 0, 0, B_INVAL, kcred, NULL); - (void) VOP_CLOSE(vf->vf_vnode, spa_mode, 1, 0, kcred, NULL); + (void) VOP_CLOSE(vf->vf_vnode, spa_mode(vd->vdev_spa), 1, 0, + kcred, NULL); VN_RELE(vf->vf_vnode); } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_label.c b/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_label.c index 9c56d66364d56..d11b3df7c67e4 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_label.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_label.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. 
+ * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -233,6 +233,10 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH, vd->vdev_physpath) == 0); + if (vd->vdev_fru != NULL) + VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_FRU, + vd->vdev_fru) == 0); + if (vd->vdev_nparity != 0) { ASSERT(strcmp(vd->vdev_ops->vdev_op_type, VDEV_TYPE_RAIDZ) == 0); @@ -242,8 +246,10 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, * into a crufty old storage pool. */ ASSERT(vd->vdev_nparity == 1 || - (vd->vdev_nparity == 2 && - spa_version(spa) >= SPA_VERSION_RAID6)); + (vd->vdev_nparity <= 2 && + spa_version(spa) >= SPA_VERSION_RAIDZ2) || + (vd->vdev_nparity <= 3 && + spa_version(spa) >= SPA_VERSION_RAIDZ3)); /* * Note that we'll add the nparity tag even on storage pools @@ -277,9 +283,13 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, vd->vdev_islog) == 0); } - if (vd->vdev_dtl.smo_object != 0) + if (vd->vdev_dtl_smo.smo_object != 0) VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_DTL, - vd->vdev_dtl.smo_object) == 0); + vd->vdev_dtl_smo.smo_object) == 0); + + if (vd->vdev_crtxg) + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_CREATE_TXG, + vd->vdev_crtxg) == 0); if (getstats) { vdev_stat_t vs; @@ -292,6 +302,8 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, nvlist_t **child; int c; + ASSERT(!vd->vdev_ishole); + child = kmem_alloc(vd->vdev_children * sizeof (nvlist_t *), KM_SLEEP); @@ -308,6 +320,8 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, kmem_free(child, vd->vdev_children * sizeof (nvlist_t *)); } else { + const char *aux = NULL; + if (vd->vdev_offline && !vd->vdev_tmpoffline) VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_OFFLINE, B_TRUE) == 0); @@ -323,11 +337,67 @@ vdev_config_generate(spa_t *spa, vdev_t *vd, boolean_t getstats, if (vd->vdev_unspare) VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_UNSPARE, B_TRUE) == 0); + if (vd->vdev_ishole) + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_IS_HOLE, + B_TRUE) == 0); + + switch (vd->vdev_stat.vs_aux) { + case VDEV_AUX_ERR_EXCEEDED: + aux = "err_exceeded"; + break; + + case VDEV_AUX_EXTERNAL: + aux = "external"; + break; + } + + if (aux != NULL) + VERIFY(nvlist_add_string(nv, ZPOOL_CONFIG_AUX_STATE, + aux) == 0); + + if (vd->vdev_splitting && vd->vdev_orig_guid != 0LL) { + VERIFY(nvlist_add_uint64(nv, ZPOOL_CONFIG_ORIG_GUID, + vd->vdev_orig_guid) == 0); + } } return (nv); } +/* + * Generate a view of the top-level vdevs. If we currently have holes + * in the namespace, then generate an array which contains a list of holey + * vdevs. Additionally, add the number of top-level children that currently + * exist. 
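For illustration, the two entries produced by the function that follows can be read back with ordinary libnvpair calls. A minimal, userland-style sketch (purely illustrative; "config" is assumed to be an nvlist generated below, and error handling is omitted):

static void
dump_hole_info(nvlist_t *config)
{
	uint64_t *holes;
	uint64_t children;
	uint_t nholes = 0;

	VERIFY(nvlist_lookup_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN,
	    &children) == 0);

	/* The hole array is only added when at least one hole exists. */
	if (nvlist_lookup_uint64_array(config, ZPOOL_CONFIG_HOLE_ARRAY,
	    &holes, &nholes) == 0) {
		for (uint_t i = 0; i < nholes; i++)
			(void) printf("child %llu of %llu is a hole\n",
			    (u_longlong_t)holes[i], (u_longlong_t)children);
	}
}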
+ */ +void +vdev_top_config_generate(spa_t *spa, nvlist_t *config) +{ + vdev_t *rvd = spa->spa_root_vdev; + uint64_t *array; + uint_t idx; + + array = kmem_alloc(rvd->vdev_children * sizeof (uint64_t), KM_SLEEP); + + idx = 0; + for (int c = 0; c < rvd->vdev_children; c++) { + vdev_t *tvd = rvd->vdev_child[c]; + + if (tvd->vdev_ishole) + array[idx++] = c; + } + + if (idx) { + VERIFY(nvlist_add_uint64_array(config, ZPOOL_CONFIG_HOLE_ARRAY, + array, idx) == 0); + } + + VERIFY(nvlist_add_uint64(config, ZPOOL_CONFIG_VDEV_CHILDREN, + rvd->vdev_children) == 0); + + kmem_free(array, rvd->vdev_children * sizeof (uint64_t)); +} + nvlist_t * vdev_label_read_config(vdev_t *vd) { @@ -335,8 +405,8 @@ vdev_label_read_config(vdev_t *vd) nvlist_t *config = NULL; vdev_phys_t *vp; zio_t *zio; - int flags = - ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; + int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | + ZIO_FLAG_SPECULATIVE; ASSERT(spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); @@ -345,6 +415,7 @@ vdev_label_read_config(vdev_t *vd) vp = zio_buf_alloc(sizeof (vdev_phys_t)); +retry: for (int l = 0; l < VDEV_LABELS; l++) { zio = zio_root(spa, NULL, NULL, flags); @@ -364,6 +435,11 @@ vdev_label_read_config(vdev_t *vd) } } + if (config == NULL && !(flags & ZIO_FLAG_TRYHARD)) { + flags |= ZIO_FLAG_TRYHARD; + goto retry; + } + zio_buf_free(vp, sizeof (vdev_phys_t)); return (config); @@ -488,7 +564,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) spa_t *spa = vd->vdev_spa; nvlist_t *label; vdev_phys_t *vp; - vdev_boot_header_t *vb; + char *pad2; uberblock_t *ub; zio_t *zio; char *buf; @@ -504,6 +580,9 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) crtxg, reason)) != 0) return (error); + /* Track the creation time for this vdev */ + vd->vdev_crtxg = crtxg; + if (!vd->vdev_ops->vdev_op_leaf) return (0); @@ -516,7 +595,7 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) /* * Determine if the vdev is in use. */ - if (reason != VDEV_LABEL_REMOVE && + if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPLIT && vdev_inuse(vd, crtxg, reason, &spare_guid, &l2cache_guid)) return (EBUSY); @@ -542,7 +621,8 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) */ if (reason == VDEV_LABEL_SPARE) return (0); - ASSERT(reason == VDEV_LABEL_REPLACE); + ASSERT(reason == VDEV_LABEL_REPLACE || + reason == VDEV_LABEL_SPLIT); } if (reason != VDEV_LABEL_REMOVE && reason != VDEV_LABEL_SPARE && @@ -607,7 +687,11 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0); } else { - label = spa_config_generate(spa, vd, 0ULL, B_FALSE); + uint64_t txg = 0ULL; + + if (reason == VDEV_LABEL_SPLIT) + txg = spa->spa_uberblock.ub_txg; + label = spa_config_generate(spa, vd, txg, B_FALSE); /* * Add our creation time. This allows us to detect multiple @@ -629,27 +713,22 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) return (error == EFAULT ? ENAMETOOLONG : EINVAL); } - /* - * Initialize boot block header. - */ - vb = zio_buf_alloc(sizeof (vdev_boot_header_t)); - bzero(vb, sizeof (vdev_boot_header_t)); - vb->vb_magic = VDEV_BOOT_MAGIC; - vb->vb_version = VDEV_BOOT_VERSION; - vb->vb_offset = VDEV_BOOT_OFFSET; - vb->vb_size = VDEV_BOOT_SIZE; - /* * Initialize uberblock template. 
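For orientation, the label writes below touch a layout that, per my reading of the corresponding vdev_impl.h changes (not part of this hunk), looks as follows; treat the exact sizes as assumptions rather than something this patch states:

/*
 * Assumed layout of one 256K vdev label after this change:
 *
 *	vl_pad1		  8K	skipped, never written here
 *	vl_pad2		  8K	explicitly zeroed (old boot header area)
 *	vl_vdev_phys	112K	packed config nvlist
 *	vl_uberblock	128K	uberblock ring (VDEV_UBERBLOCK_RING)
 */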
*/ - ub = zio_buf_alloc(VDEV_UBERBLOCK_SIZE(vd)); - bzero(ub, VDEV_UBERBLOCK_SIZE(vd)); + ub = zio_buf_alloc(VDEV_UBERBLOCK_RING); + bzero(ub, VDEV_UBERBLOCK_RING); *ub = spa->spa_uberblock; ub->ub_txg = 0; + /* Initialize the 2nd padding area. */ + pad2 = zio_buf_alloc(VDEV_PAD_SIZE); + bzero(pad2, VDEV_PAD_SIZE); + /* * Write everything in parallel. */ +retry: zio = zio_root(spa, NULL, NULL, flags); for (int l = 0; l < VDEV_LABELS; l++) { @@ -658,22 +737,30 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) offsetof(vdev_label_t, vl_vdev_phys), sizeof (vdev_phys_t), NULL, NULL, flags); - vdev_label_write(zio, vd, l, vb, - offsetof(vdev_label_t, vl_boot_header), - sizeof (vdev_boot_header_t), NULL, NULL, flags); + /* + * Skip the 1st padding area. + * Zero out the 2nd padding area where it might have + * left over data from previous filesystem format. + */ + vdev_label_write(zio, vd, l, pad2, + offsetof(vdev_label_t, vl_pad2), + VDEV_PAD_SIZE, NULL, NULL, flags); - for (int n = 0; n < VDEV_UBERBLOCK_COUNT(vd); n++) { - vdev_label_write(zio, vd, l, ub, - VDEV_UBERBLOCK_OFFSET(vd, n), - VDEV_UBERBLOCK_SIZE(vd), NULL, NULL, flags); - } + vdev_label_write(zio, vd, l, ub, + offsetof(vdev_label_t, vl_uberblock), + VDEV_UBERBLOCK_RING, NULL, NULL, flags); } error = zio_wait(zio); + if (error != 0 && !(flags & ZIO_FLAG_TRYHARD)) { + flags |= ZIO_FLAG_TRYHARD; + goto retry; + } + nvlist_free(label); - zio_buf_free(ub, VDEV_UBERBLOCK_SIZE(vd)); - zio_buf_free(vb, sizeof (vdev_boot_header_t)); + zio_buf_free(pad2, VDEV_PAD_SIZE); + zio_buf_free(ub, VDEV_UBERBLOCK_RING); zio_buf_free(vp, sizeof (vdev_phys_t)); /* @@ -730,6 +817,7 @@ vdev_uberblock_compare(uberblock_t *ub1, uberblock_t *ub2) static void vdev_uberblock_load_done(zio_t *zio) { + spa_t *spa = zio->io_spa; zio_t *rio = zio->io_private; uberblock_t *ub = zio->io_data; uberblock_t *ubbest = rio->io_private; @@ -738,7 +826,8 @@ vdev_uberblock_load_done(zio_t *zio) if (zio->io_error == 0 && uberblock_verify(ub) == 0) { mutex_enter(&rio->io_lock); - if (vdev_uberblock_compare(ub, ubbest) > 0) + if (ub->ub_txg <= spa->spa_load_max_txg && + vdev_uberblock_compare(ub, ubbest) > 0) *ubbest = *ub; mutex_exit(&rio->io_lock); } @@ -751,8 +840,8 @@ vdev_uberblock_load(zio_t *zio, vdev_t *vd, uberblock_t *ubbest) { spa_t *spa = vd->vdev_spa; vdev_t *rvd = spa->spa_root_vdev; - int flags = - ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE; + int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL | + ZIO_FLAG_SPECULATIVE | ZIO_FLAG_TRYHARD; if (vd == rvd) { ASSERT(zio == NULL); @@ -955,7 +1044,10 @@ vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags) for (vd = list_head(dl); vd != NULL; vd = list_next(dl, vd)) { uint64_t *good_writes = kmem_zalloc(sizeof (uint64_t), KM_SLEEP); - zio_t *vio = zio_null(zio, spa, + + ASSERT(!vd->vdev_ishole); + + zio_t *vio = zio_null(zio, spa, NULL, (vd->vdev_islog || vd->vdev_aux != NULL) ? vdev_label_sync_ignore_done : vdev_label_sync_top_done, good_writes, flags); @@ -990,7 +1082,7 @@ vdev_label_sync_list(spa_t *spa, int l, uint64_t txg, int flags) * at any time, you can just call it again, and it will resume its work. 
*/ int -vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg) +vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg, boolean_t tryhard) { spa_t *spa = svd[0]->vdev_spa; uberblock_t *ub = &spa->spa_uberblock; @@ -999,6 +1091,16 @@ vdev_config_sync(vdev_t **svd, int svdcount, uint64_t txg) int error; int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; + /* + * Normally, we don't want to try too hard to write every label and + * uberblock. If there is a flaky disk, we don't want the rest of the + * sync process to block while we retry. But if we can't write a + * single label out, we should retry with ZIO_FLAG_TRYHARD before + * bailing out and declaring the pool faulted. + */ + if (tryhard) + flags |= ZIO_FLAG_TRYHARD; + ASSERT(ub->ub_txg <= txg); /* diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_mirror.c b/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_mirror.c index c4629ff45087c..ac2a9b0f4dddf 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_mirror.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_mirror.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -60,6 +60,11 @@ vdev_mirror_map_free(zio_t *zio) kmem_free(mm, offsetof(mirror_map_t, mm_child[mm->mm_children])); } +static const zio_vsd_ops_t vdev_mirror_vsd_ops = { + vdev_mirror_map_free, + zio_vsd_default_cksum_report +}; + static mirror_map_t * vdev_mirror_map_alloc(zio_t *zio) { @@ -117,28 +122,28 @@ vdev_mirror_map_alloc(zio_t *zio) } zio->io_vsd = mm; - zio->io_vsd_free = vdev_mirror_map_free; + zio->io_vsd_ops = &vdev_mirror_vsd_ops; return (mm); } static int vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) { - vdev_t *cvd; - uint64_t c; int numerrors = 0; - int ret, lasterror = 0; + int lasterror = 0; if (vd->vdev_children == 0) { vd->vdev_stat.vs_aux = VDEV_AUX_BAD_LABEL; return (EINVAL); } - for (c = 0; c < vd->vdev_children; c++) { - cvd = vd->vdev_child[c]; + vdev_open_children(vd); - if ((ret = vdev_open(cvd)) != 0) { - lasterror = ret; + for (int c = 0; c < vd->vdev_children; c++) { + vdev_t *cvd = vd->vdev_child[c]; + + if (cvd->vdev_open_error) { + lasterror = cvd->vdev_open_error; numerrors++; continue; } @@ -158,9 +163,7 @@ vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) static void vdev_mirror_close(vdev_t *vd) { - uint64_t c; - - for (c = 0; c < vd->vdev_children; c++) + for (int c = 0; c < vd->vdev_children; c++) vdev_close(vd->vdev_child[c]); } @@ -180,11 +183,16 @@ vdev_mirror_scrub_done(zio_t *zio) mirror_child_t *mc = zio->io_private; if (zio->io_error == 0) { - zio_t *pio = zio->io_parent; - mutex_enter(&pio->io_lock); - ASSERT3U(zio->io_size, >=, pio->io_size); - bcopy(zio->io_data, pio->io_data, pio->io_size); - mutex_exit(&pio->io_lock); + zio_t *pio; + + mutex_enter(&zio->io_lock); + while ((pio = zio_walk_parents(zio)) != NULL) { + mutex_enter(&pio->io_lock); + ASSERT3U(zio->io_size, >=, pio->io_size); + bcopy(zio->io_data, pio->io_data, pio->io_size); + mutex_exit(&pio->io_lock); + } + mutex_exit(&zio->io_lock); } zio_buf_free(zio->io_data, zio->io_size); @@ -206,7 +214,7 @@ vdev_mirror_child_select(zio_t *zio) uint64_t txg = zio->io_txg; int i, c; - ASSERT(zio->io_bp == NULL || zio->io_bp->blk_birth == txg); + ASSERT(zio->io_bp == NULL || BP_PHYSICAL_BIRTH(zio->io_bp) == txg); /* * Try to find a child whose DTL doesn't contain the block to read. 
@@ -225,7 +233,7 @@ vdev_mirror_child_select(zio_t *zio) mc->mc_skipped = 1; continue; } - if (!vdev_dtl_contains(&mc->mc_vd->vdev_dtl_map, txg, 1)) + if (!vdev_dtl_contains(mc->mc_vd, DTL_MISSING, txg, 1)) return (c); mc->mc_error = ESTALE; mc->mc_skipped = 1; @@ -282,20 +290,10 @@ vdev_mirror_io_start(zio_t *zio) ASSERT(zio->io_type == ZIO_TYPE_WRITE); /* - * If this is a resilvering I/O to a replacing vdev, - * only the last child should be written -- unless the - * first child happens to have a DTL entry here as well. - * All other writes go to all children. + * Writes go to all children. */ - if ((zio->io_flags & ZIO_FLAG_RESILVER) && mm->mm_replacing && - !vdev_dtl_contains(&mm->mm_child[0].mc_vd->vdev_dtl_map, - zio->io_txg, 1)) { - c = mm->mm_children - 1; - children = 1; - } else { - c = 0; - children = mm->mm_children; - } + c = 0; + children = mm->mm_children; } while (children--) { @@ -398,7 +396,7 @@ vdev_mirror_io_done(zio_t *zio) ASSERT(zio->io_error != 0); } - if (good_copies && (spa_mode & FWRITE) && + if (good_copies && spa_writeable(zio->io_spa) && (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER) || ((zio->io_flags & ZIO_FLAG_SCRUB) && mm->mm_replacing))) { @@ -419,7 +417,7 @@ vdev_mirror_io_done(zio_t *zio) if (mc->mc_tried) continue; if (!(zio->io_flags & ZIO_FLAG_SCRUB) && - !vdev_dtl_contains(&mc->mc_vd->vdev_dtl_map, + !vdev_dtl_contains(mc->mc_vd, DTL_PARTIAL, zio->io_txg, 1)) continue; mc->mc_error = ESTALE; @@ -429,7 +427,8 @@ vdev_mirror_io_done(zio_t *zio) mc->mc_vd, mc->mc_offset, zio->io_data, zio->io_size, ZIO_TYPE_WRITE, zio->io_priority, - ZIO_FLAG_IO_REPAIR, NULL, NULL)); + ZIO_FLAG_IO_REPAIR | (unexpected_errors ? + ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); } } } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_missing.c b/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_missing.c index 731f7d3dcec90..e1bf7d86a361f 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_missing.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_missing.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -48,8 +48,8 @@ vdev_missing_open(vdev_t *vd, uint64_t *psize, uint64_t *ashift) * VDEV_AUX_BAD_GUID_SUM. So we pretend to succeed, knowing that we * will fail the GUID sum check before ever trying to open the pool. */ - *psize = SPA_MINDEVSIZE; - *ashift = SPA_MINBLOCKSHIFT; + *psize = 0; + *ashift = 0; return (0); } @@ -83,3 +83,14 @@ vdev_ops_t vdev_missing_ops = { VDEV_TYPE_MISSING, /* name of this vdev type */ B_TRUE /* leaf vdev */ }; + +vdev_ops_t vdev_hole_ops = { + vdev_missing_open, + vdev_missing_close, + vdev_default_asize, + vdev_missing_io_start, + vdev_missing_io_done, + NULL, + VDEV_TYPE_HOLE, /* name of this vdev type */ + B_TRUE /* leaf vdev */ +}; diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_queue.c b/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_queue.c index 46fca0e3b629f..5a0d3ee97029d 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_queue.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_queue.c @@ -19,12 +19,11 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #include -#include #include #include #include @@ -38,20 +37,24 @@ * of i/os pending to each device (before it starts ramping up to * max_pending). */ -int zfs_vdev_max_pending = 35; +int zfs_vdev_max_pending = 10; int zfs_vdev_min_pending = 4; -/* deadline = pri + (lbolt >> time_shift) */ +/* deadline = pri + ddi_get_lbolt64() >> time_shift) */ int zfs_vdev_time_shift = 6; /* exponential I/O issue ramp-up rate */ int zfs_vdev_ramp_rate = 2; /* - * i/os will be aggregated into a single large i/o up to - * zfs_vdev_aggregation_limit bytes long. + * To reduce IOPs, we aggregate small adjacent I/Os into one large I/O. + * For read I/Os, we also aggregate across small adjacency gaps; for writes + * we include spans of optional I/Os to aid aggregation at the disk even when + * they aren't able to help us aggregate at this level. */ int zfs_vdev_aggregation_limit = SPA_MAXBLOCKSIZE; +int zfs_vdev_read_gap_limit = 32 << 10; +int zfs_vdev_write_gap_limit = 4 << 10; /* * Virtual device vector for disk I/O scheduling. @@ -149,34 +152,36 @@ vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio) static void vdev_queue_agg_io_done(zio_t *aio) { - zio_t *dio; - uint64_t offset = 0; + zio_t *pio; - while ((dio = aio->io_delegate_list) != NULL) { + while ((pio = zio_walk_parents(aio)) != NULL) if (aio->io_type == ZIO_TYPE_READ) - bcopy((char *)aio->io_data + offset, dio->io_data, - dio->io_size); - offset += dio->io_size; - aio->io_delegate_list = dio->io_delegate_next; - dio->io_delegate_next = NULL; - dio->io_error = aio->io_error; - zio_execute(dio); - } - ASSERT3U(offset, ==, aio->io_size); + bcopy((char *)aio->io_data + (pio->io_offset - + aio->io_offset), pio->io_data, pio->io_size); zio_buf_free(aio->io_data, aio->io_size); } -#define IS_ADJACENT(io, nio) \ - ((io)->io_offset + (io)->io_size == (nio)->io_offset) +/* + * Compute the range spanned by two i/os, which is the endpoint of the last + * (lio->io_offset + lio->io_size) minus start of the first (fio->io_offset). + * Conveniently, the gap between fio and lio is given by -IO_SPAN(lio, fio); + * thus fio and lio are adjacent if and only if IO_SPAN(lio, fio) == 0. + */ +#define IO_SPAN(fio, lio) ((lio)->io_offset + (lio)->io_size - (fio)->io_offset) +#define IO_GAP(fio, lio) (-IO_SPAN(lio, fio)) static zio_t * vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit) { - zio_t *fio, *lio, *aio, *dio; - avl_tree_t *tree; - uint64_t size; - + zio_t *fio, *lio, *aio, *dio, *nio, *mio; + avl_tree_t *t; + int flags; + uint64_t maxspan = zfs_vdev_aggregation_limit; + uint64_t maxgap; + int stretch; + +again: ASSERT(MUTEX_HELD(&vq->vq_lock)); if (avl_numnodes(&vq->vq_pending_tree) >= pending_limit || @@ -185,58 +190,150 @@ vdev_queue_io_to_issue(vdev_queue_t *vq, uint64_t pending_limit) fio = lio = avl_first(&vq->vq_deadline_tree); - tree = fio->io_vdev_tree; - size = fio->io_size; + t = fio->io_vdev_tree; + flags = fio->io_flags & ZIO_FLAG_AGG_INHERIT; + maxgap = (t == &vq->vq_read_tree) ? zfs_vdev_read_gap_limit : 0; + + if (!(flags & ZIO_FLAG_DONT_AGGREGATE)) { + /* + * We can aggregate I/Os that are sufficiently adjacent and of + * the same flavor, as expressed by the AGG_INHERIT flags. + * The latter requirement is necessary so that certain + * attributes of the I/O, such as whether it's a normal I/O + * or a scrub/resilver, can be preserved in the aggregate. + * We can include optional I/Os, but don't allow them + * to begin a range as they add no benefit in that situation. + */ + + /* + * We keep track of the last non-optional I/O. 
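A quick numeric check of the two macros, using hypothetical I/Os:

/*
 * Worked example.  fio: offset 0x1000, size 0x800
 *                  lio: offset 0x2000, size 0x400
 *
 *	IO_SPAN(fio, lio) = 0x2000 + 0x400 - 0x1000 = 0x1400
 *	IO_GAP(fio, lio)  = -(0x1000 + 0x800 - 0x2000) = 0x800
 *
 * The gap is zero (the I/Os are adjacent) exactly when fio ends where
 * lio begins.  For reads, gaps up to zfs_vdev_read_gap_limit (32K) are
 * still aggregated; the gap bytes are read into the aggregate buffer
 * and simply not copied to any parent.  For writes, as I read the
 * stretch logic below, trailing ZIO_FLAG_OPTIONAL I/Os are kept only
 * when they lead, with no further gaps, to a mandatory write that
 * starts within zfs_vdev_write_gap_limit (4K) of the end of the last
 * mandatory write already in the aggregate.
 */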
+ */ + mio = (fio->io_flags & ZIO_FLAG_OPTIONAL) ? NULL : fio; + + /* + * Walk backwards through sufficiently contiguous I/Os + * recording the last non-option I/O. + */ + while ((dio = AVL_PREV(t, fio)) != NULL && + (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && + IO_SPAN(dio, lio) <= maxspan && + IO_GAP(dio, fio) <= maxgap) { + fio = dio; + if (mio == NULL && !(fio->io_flags & ZIO_FLAG_OPTIONAL)) + mio = fio; + } - while ((dio = AVL_PREV(tree, fio)) != NULL && IS_ADJACENT(dio, fio) && - !((dio->io_flags | fio->io_flags) & ZIO_FLAG_DONT_AGGREGATE) && - size + dio->io_size <= zfs_vdev_aggregation_limit) { - dio->io_delegate_next = fio; - fio = dio; - size += dio->io_size; - } + /* + * Skip any initial optional I/Os. + */ + while ((fio->io_flags & ZIO_FLAG_OPTIONAL) && fio != lio) { + fio = AVL_NEXT(t, fio); + ASSERT(fio != NULL); + } + + /* + * Walk forward through sufficiently contiguous I/Os. + */ + while ((dio = AVL_NEXT(t, lio)) != NULL && + (dio->io_flags & ZIO_FLAG_AGG_INHERIT) == flags && + IO_SPAN(fio, dio) <= maxspan && + IO_GAP(lio, dio) <= maxgap) { + lio = dio; + if (!(lio->io_flags & ZIO_FLAG_OPTIONAL)) + mio = lio; + } + + /* + * Now that we've established the range of the I/O aggregation + * we must decide what to do with trailing optional I/Os. + * For reads, there's nothing to do. While we are unable to + * aggregate further, it's possible that a trailing optional + * I/O would allow the underlying device to aggregate with + * subsequent I/Os. We must therefore determine if the next + * non-optional I/O is close enough to make aggregation + * worthwhile. + */ + stretch = B_FALSE; + if (t != &vq->vq_read_tree && mio != NULL) { + nio = lio; + while ((dio = AVL_NEXT(t, nio)) != NULL && + IO_GAP(nio, dio) == 0 && + IO_GAP(mio, dio) <= zfs_vdev_write_gap_limit) { + nio = dio; + if (!(nio->io_flags & ZIO_FLAG_OPTIONAL)) { + stretch = B_TRUE; + break; + } + } + } - while ((dio = AVL_NEXT(tree, lio)) != NULL && IS_ADJACENT(lio, dio) && - !((lio->io_flags | dio->io_flags) & ZIO_FLAG_DONT_AGGREGATE) && - size + dio->io_size <= zfs_vdev_aggregation_limit) { - lio->io_delegate_next = dio; - lio = dio; - size += dio->io_size; + if (stretch) { + /* This may be a no-op. 
*/ + VERIFY((dio = AVL_NEXT(t, lio)) != NULL); + dio->io_flags &= ~ZIO_FLAG_OPTIONAL; + } else { + while (lio != mio && lio != fio) { + ASSERT(lio->io_flags & ZIO_FLAG_OPTIONAL); + lio = AVL_PREV(t, lio); + ASSERT(lio != NULL); + } + } } if (fio != lio) { - char *buf = zio_buf_alloc(size); - uint64_t offset = 0; - + uint64_t size = IO_SPAN(fio, lio); ASSERT(size <= zfs_vdev_aggregation_limit); aio = zio_vdev_delegated_io(fio->io_vd, fio->io_offset, - buf, size, fio->io_type, ZIO_PRIORITY_NOW, - ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, + zio_buf_alloc(size), size, fio->io_type, ZIO_PRIORITY_AGG, + flags | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_QUEUE, vdev_queue_agg_io_done, NULL); - aio->io_delegate_list = fio; - - for (dio = fio; dio != NULL; dio = dio->io_delegate_next) { + nio = fio; + do { + dio = nio; + nio = AVL_NEXT(t, dio); ASSERT(dio->io_type == aio->io_type); - ASSERT(dio->io_vdev_tree == tree); - if (dio->io_type == ZIO_TYPE_WRITE) - bcopy(dio->io_data, buf + offset, dio->io_size); - offset += dio->io_size; + ASSERT(dio->io_vdev_tree == t); + + if (dio->io_flags & ZIO_FLAG_NODATA) { + ASSERT(dio->io_type == ZIO_TYPE_WRITE); + bzero((char *)aio->io_data + (dio->io_offset - + aio->io_offset), dio->io_size); + } else if (dio->io_type == ZIO_TYPE_WRITE) { + bcopy(dio->io_data, (char *)aio->io_data + + (dio->io_offset - aio->io_offset), + dio->io_size); + } + + zio_add_child(dio, aio); vdev_queue_io_remove(vq, dio); zio_vdev_io_bypass(dio); - } - - ASSERT(offset == size); + zio_execute(dio); + } while (dio != lio); avl_add(&vq->vq_pending_tree, aio); return (aio); } - ASSERT(fio->io_vdev_tree == tree); + ASSERT(fio->io_vdev_tree == t); vdev_queue_io_remove(vq, fio); + /* + * If the I/O is or was optional and therefore has no data, we need to + * simply discard it. We need to drop the vdev queue's lock to avoid a + * deadlock that we could encounter since this I/O will complete + * immediately. + */ + if (fio->io_flags & ZIO_FLAG_NODATA) { + mutex_exit(&vq->vq_lock); + zio_vdev_io_bypass(fio); + zio_execute(fio); + mutex_enter(&vq->vq_lock); + goto again; + } + avl_add(&vq->vq_pending_tree, fio); return (fio); @@ -262,7 +359,8 @@ vdev_queue_io(zio_t *zio) mutex_enter(&vq->vq_lock); - zio->io_deadline = (lbolt64 >> zfs_vdev_time_shift) + zio->io_priority; + zio->io_deadline = (ddi_get_lbolt64() >> zfs_vdev_time_shift) + + zio->io_priority; vdev_queue_io_add(vq, zio); diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_raidz.c b/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_raidz.c index 69e314468ee47..aa031dd25bd48 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_raidz.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_raidz.c @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -35,12 +35,27 @@ /* * Virtual device vector for RAID-Z. * - * This vdev supports both single and double parity. For single parity, we - * use a simple XOR of all the data columns. For double parity, we use both - * the simple XOR as well as a technique described in "The mathematics of - * RAID-6" by H. Peter Anvin. This technique defines a Galois field, GF(2^8), - * over the integers expressable in a single byte. Briefly, the operations on - * the field are defined as follows: + * This vdev supports single, double, and triple parity. For single parity, + * we use a simple XOR of all the data columns. 
For double or triple parity, + * we use a special case of Reed-Solomon coding. This extends the + * technique described in "The mathematics of RAID-6" by H. Peter Anvin by + * drawing on the system described in "A Tutorial on Reed-Solomon Coding for + * Fault-Tolerance in RAID-like Systems" by James S. Plank on which the + * former is also based. The latter is designed to provide higher performance + * for writes. + * + * Note that the Plank paper claimed to support arbitrary N+M, but was then + * amended six years later identifying a critical flaw that invalidates its + * claims. Nevertheless, the technique can be adapted to work for up to + * triple parity. For additional parity, the amendment "Note: Correction to + * the 1997 Tutorial on Reed-Solomon Coding" by James S. Plank and Ying Ding + * is viable, but the additional complexity means that write performance will + * suffer. + * + * All of the methods above operate on a Galois field, defined over the + * integers mod 2^N. In our case we choose N=8 for GF(8) so that all elements + * can be expressed with a single byte. Briefly, the operations on the + * field are defined as follows: * * o addition (+) is represented by a bitwise XOR * o subtraction (-) is therefore identical to addition: A + B = A - B @@ -55,22 +70,32 @@ * (A * 2)_0 = A_7 * * In C, multiplying by 2 is therefore ((a << 1) ^ ((a & 0x80) ? 0x1d : 0)). + * As an aside, this multiplication is derived from the error correcting + * primitive polynomial x^8 + x^4 + x^3 + x^2 + 1. * * Observe that any number in the field (except for 0) can be expressed as a * power of 2 -- a generator for the field. We store a table of the powers of * 2 and logs base 2 for quick look ups, and exploit the fact that A * B can * be rewritten as 2^(log_2(A) + log_2(B)) (where '+' is normal addition rather - * than field addition). The inverse of a field element A (A^-1) is A^254. + * than field addition). The inverse of a field element A (A^-1) is therefore + * A ^ (255 - 1) = A^254. * - * The two parity columns, P and Q, over several data columns, D_0, ... D_n-1, - * can be expressed by field operations: + * The up-to-three parity columns, P, Q, R over several data columns, + * D_0, ... D_n-1, can be expressed by field operations: * * P = D_0 + D_1 + ... + D_n-2 + D_n-1 * Q = 2^n-1 * D_0 + 2^n-2 * D_1 + ... + 2^1 * D_n-2 + 2^0 * D_n-1 * = ((...((D_0) * 2 + D_1) * 2 + ...) * 2 + D_n-2) * 2 + D_n-1 + * R = 4^n-1 * D_0 + 4^n-2 * D_1 + ... + 4^1 * D_n-2 + 4^0 * D_n-1 + * = ((...((D_0) * 4 + D_1) * 4 + ...) * 4 + D_n-2) * 4 + D_n-1 * - * See the reconstruction code below for how P and Q can used individually or - * in concert to recover missing data columns. + * We chose 1, 2, and 4 as our generators because 1 corresponds to the trival + * XOR operation, and 2 and 4 can be computed quickly and generate linearly- + * independent coefficients. (There are no additional coefficients that have + * this property which is why the uncorrected Plank method breaks down.) + * + * See the reconstruction code below for how P, Q and R can used individually + * or in concert to recover missing data columns. */ typedef struct raidz_col { @@ -78,27 +103,60 @@ typedef struct raidz_col { uint64_t rc_offset; /* device offset */ uint64_t rc_size; /* I/O size */ void *rc_data; /* I/O data */ + void *rc_gdata; /* used to store the "good" version */ int rc_error; /* I/O error for this device */ uint8_t rc_tried; /* Did we attempt this I/O column? */ uint8_t rc_skipped; /* Did we skip this I/O column? 
*/ } raidz_col_t; typedef struct raidz_map { - uint64_t rm_cols; /* Column count */ + uint64_t rm_cols; /* Regular column count */ + uint64_t rm_scols; /* Count including skipped columns */ uint64_t rm_bigcols; /* Number of oversized columns */ uint64_t rm_asize; /* Actual total I/O size */ uint64_t rm_missingdata; /* Count of missing data devices */ uint64_t rm_missingparity; /* Count of missing parity devices */ uint64_t rm_firstdatacol; /* First data column/parity count */ + uint64_t rm_nskip; /* Skipped sectors for padding */ + uint64_t rm_skipstart; /* Column index of padding start */ + void *rm_datacopy; /* rm_asize-buffer of copied data */ + uintptr_t rm_reports; /* # of referencing checksum reports */ + uint8_t rm_freed; /* map no longer has referencing ZIO */ + uint8_t rm_ecksuminjected; /* checksum error was injected */ raidz_col_t rm_col[1]; /* Flexible array of I/O columns */ } raidz_map_t; #define VDEV_RAIDZ_P 0 #define VDEV_RAIDZ_Q 1 +#define VDEV_RAIDZ_R 2 + +#define VDEV_RAIDZ_MUL_2(x) (((x) << 1) ^ (((x) & 0x80) ? 0x1d : 0)) +#define VDEV_RAIDZ_MUL_4(x) (VDEV_RAIDZ_MUL_2(VDEV_RAIDZ_MUL_2(x))) + +/* + * We provide a mechanism to perform the field multiplication operation on a + * 64-bit value all at once rather than a byte at a time. This works by + * creating a mask from the top bit in each byte and using that to + * conditionally apply the XOR of 0x1d. + */ +#define VDEV_RAIDZ_64MUL_2(x, mask) \ +{ \ + (mask) = (x) & 0x8080808080808080ULL; \ + (mask) = ((mask) << 1) - ((mask) >> 7); \ + (x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \ + ((mask) & 0x1d1d1d1d1d1d1d1d); \ +} -#define VDEV_RAIDZ_MAXPARITY 2 +#define VDEV_RAIDZ_64MUL_4(x, mask) \ +{ \ + VDEV_RAIDZ_64MUL_2((x), mask); \ + VDEV_RAIDZ_64MUL_2((x), mask); \ +} -#define VDEV_RAIDZ_MUL_2(a) (((a) << 1) ^ (((a) & 0x80) ? 0x1d : 0)) +/* + * Force reconstruction to use the general purpose method. + */ +int vdev_raidz_default_to_general; /* * These two tables represent powers and logs of 2 in the Galois field defined @@ -173,6 +231,8 @@ static const uint8_t vdev_raidz_log2[256] = { 0x74, 0xd6, 0xf4, 0xea, 0xa8, 0x50, 0x58, 0xaf, }; +static void vdev_raidz_generate_parity(raidz_map_t *rm); + /* * Multiply a given number by 2 raised to the given power. 
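As a concrete check of the field arithmetic described above, the following stand-alone snippet (illustrative only, not part of the driver; the data bytes are arbitrary) computes P and Q for three one-byte columns and verifies that the 64-bit macro is just the byte-at-a-time multiply applied to eight packed bytes:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Copied from the definition above. */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

/* Multiply by 2 in GF(2^8), one byte at a time, as described above. */
static uint8_t
gf_mul2(uint8_t a)
{
	return ((a << 1) ^ ((a & 0x80) ? 0x1d : 0));
}

int
main(void)
{
	uint8_t d[3] = { 0x01, 0x02, 0x04 };	/* hypothetical data columns */
	uint8_t b[8] = { 0x00, 0x01, 0x7f, 0x80, 0x8e, 0xc3, 0xff, 0x55 };
	uint64_t x, mask;

	/* P is the plain XOR; Q is Horner-evaluated: (D0*2 + D1)*2 + D2. */
	uint8_t p = d[0] ^ d[1] ^ d[2];
	uint8_t q = gf_mul2(gf_mul2(d[0]) ^ d[1]) ^ d[2];
	(void) printf("P = %02x, Q = %02x\n", p, q);	/* P = 07, Q = 04 */

	/* The 64-bit macro matches gf_mul2() in every byte lane. */
	(void) memcpy(&x, b, sizeof (x));
	VDEV_RAIDZ_64MUL_2(x, mask);
	for (int i = 0; i < 8; i++)
		assert(((uint8_t *)&x)[i] == gf_mul2(b[i]));

	return (0);
}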
*/ @@ -193,17 +253,184 @@ vdev_raidz_exp2(uint_t a, int exp) } static void -vdev_raidz_map_free(zio_t *zio) +vdev_raidz_map_free(raidz_map_t *rm) { - raidz_map_t *rm = zio->io_vsd; int c; + size_t size; - for (c = 0; c < rm->rm_firstdatacol; c++) + for (c = 0; c < rm->rm_firstdatacol; c++) { zio_buf_free(rm->rm_col[c].rc_data, rm->rm_col[c].rc_size); - kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_cols])); + if (rm->rm_col[c].rc_gdata != NULL) + zio_buf_free(rm->rm_col[c].rc_gdata, + rm->rm_col[c].rc_size); + } + + size = 0; + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) + size += rm->rm_col[c].rc_size; + + if (rm->rm_datacopy != NULL) + zio_buf_free(rm->rm_datacopy, size); + + kmem_free(rm, offsetof(raidz_map_t, rm_col[rm->rm_scols])); +} + +static void +vdev_raidz_map_free_vsd(zio_t *zio) +{ + raidz_map_t *rm = zio->io_vsd; + + ASSERT3U(rm->rm_freed, ==, 0); + rm->rm_freed = 1; + + if (rm->rm_reports == 0) + vdev_raidz_map_free(rm); +} + +/*ARGSUSED*/ +static void +vdev_raidz_cksum_free(void *arg, size_t ignored) +{ + raidz_map_t *rm = arg; + + ASSERT3U(rm->rm_reports, >, 0); + + if (--rm->rm_reports == 0 && rm->rm_freed != 0) + vdev_raidz_map_free(rm); +} + +static void +vdev_raidz_cksum_finish(zio_cksum_report_t *zcr, const void *good_data) +{ + raidz_map_t *rm = zcr->zcr_cbdata; + size_t c = zcr->zcr_cbinfo; + size_t x; + + const char *good = NULL; + const char *bad = rm->rm_col[c].rc_data; + + if (good_data == NULL) { + zfs_ereport_finish_checksum(zcr, NULL, NULL, B_FALSE); + return; + } + + if (c < rm->rm_firstdatacol) { + /* + * The first time through, calculate the parity blocks for + * the good data (this relies on the fact that the good + * data never changes for a given logical ZIO) + */ + if (rm->rm_col[0].rc_gdata == NULL) { + char *bad_parity[VDEV_RAIDZ_MAXPARITY]; + char *buf; + + /* + * Set up the rm_col[]s to generate the parity for + * good_data, first saving the parity bufs and + * replacing them with buffers to hold the result. + */ + for (x = 0; x < rm->rm_firstdatacol; x++) { + bad_parity[x] = rm->rm_col[x].rc_data; + rm->rm_col[x].rc_data = rm->rm_col[x].rc_gdata = + zio_buf_alloc(rm->rm_col[x].rc_size); + } + + /* fill in the data columns from good_data */ + buf = (char *)good_data; + for (; x < rm->rm_cols; x++) { + rm->rm_col[x].rc_data = buf; + buf += rm->rm_col[x].rc_size; + } + + /* + * Construct the parity from the good data. + */ + vdev_raidz_generate_parity(rm); + + /* restore everything back to its original state */ + for (x = 0; x < rm->rm_firstdatacol; x++) + rm->rm_col[x].rc_data = bad_parity[x]; + + buf = rm->rm_datacopy; + for (x = rm->rm_firstdatacol; x < rm->rm_cols; x++) { + rm->rm_col[x].rc_data = buf; + buf += rm->rm_col[x].rc_size; + } + } + + ASSERT3P(rm->rm_col[c].rc_gdata, !=, NULL); + good = rm->rm_col[c].rc_gdata; + } else { + /* adjust good_data to point at the start of our column */ + good = good_data; + + for (x = rm->rm_firstdatacol; x < c; x++) + good += rm->rm_col[x].rc_size; + } + + /* we drop the ereport if it ends up that the data was good */ + zfs_ereport_finish_checksum(zcr, good, bad, B_TRUE); +} + +/* + * Invoked indirectly by zfs_ereport_start_checksum(), called + * below when our read operation fails completely. The main point + * is to keep a copy of everything we read from disk, so that at + * vdev_raidz_cksum_finish() time we can compare it with the good data. 
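To summarize the resulting object lifetime (my reading of the functions above and below, not an authoritative statement):

/*
 * raidz_map_t lifetime with checksum reports outstanding (sketch):
 *
 *	vdev_raidz_cksum_report()	rm_reports++, copy data aside
 *	vdev_raidz_map_free_vsd()	rm_freed = 1; frees immediately
 *					only if rm_reports == 0
 *	vdev_raidz_cksum_free()		--rm_reports; frees when it hits 0
 *					and rm_freed is already set
 *
 * Whichever of the ZIO and the last referencing checksum report is
 * torn down second ends up calling vdev_raidz_map_free().
 */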
+ */ +static void +vdev_raidz_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *arg) +{ + size_t c = (size_t)(uintptr_t)arg; + caddr_t buf; + + raidz_map_t *rm = zio->io_vsd; + size_t size; + + /* set up the report and bump the refcount */ + zcr->zcr_cbdata = rm; + zcr->zcr_cbinfo = c; + zcr->zcr_finish = vdev_raidz_cksum_finish; + zcr->zcr_free = vdev_raidz_cksum_free; + + rm->rm_reports++; + ASSERT3U(rm->rm_reports, >, 0); + + if (rm->rm_datacopy != NULL) + return; + + /* + * It's the first time we're called for this raidz_map_t, so we need + * to copy the data aside; there's no guarantee that our zio's buffer + * won't be re-used for something else. + * + * Our parity data is already in separate buffers, so there's no need + * to copy them. + */ + + size = 0; + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) + size += rm->rm_col[c].rc_size; + + buf = rm->rm_datacopy = zio_buf_alloc(size); + + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + raidz_col_t *col = &rm->rm_col[c]; + + bcopy(col->rc_data, buf, col->rc_size); + col->rc_data = buf; + + buf += col->rc_size; + } + ASSERT3P(buf - (caddr_t)rm->rm_datacopy, ==, size); } +static const zio_vsd_ops_t vdev_raidz_vsd_ops = { + vdev_raidz_map_free_vsd, + vdev_raidz_cksum_report +}; + static raidz_map_t * vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, uint64_t nparity) @@ -213,24 +440,40 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, uint64_t s = zio->io_size >> unit_shift; uint64_t f = b % dcols; uint64_t o = (b / dcols) << unit_shift; - uint64_t q, r, c, bc, col, acols, coff, devidx; + uint64_t q, r, c, bc, col, acols, scols, coff, devidx, asize, tot; q = s / (dcols - nparity); r = s - q * (dcols - nparity); bc = (r == 0 ? 0 : r + nparity); + tot = s + nparity * (q + (r == 0 ? 0 : 1)); + + if (q == 0) { + acols = bc; + scols = MIN(dcols, roundup(bc, nparity + 1)); + } else { + acols = dcols; + scols = dcols; + } - acols = (q == 0 ? 
bc : dcols); + ASSERT3U(acols, <=, scols); - rm = kmem_alloc(offsetof(raidz_map_t, rm_col[acols]), KM_SLEEP); + rm = kmem_alloc(offsetof(raidz_map_t, rm_col[scols]), KM_SLEEP); rm->rm_cols = acols; + rm->rm_scols = scols; rm->rm_bigcols = bc; - rm->rm_asize = 0; + rm->rm_skipstart = bc; rm->rm_missingdata = 0; rm->rm_missingparity = 0; rm->rm_firstdatacol = nparity; + rm->rm_datacopy = NULL; + rm->rm_reports = 0; + rm->rm_freed = 0; + rm->rm_ecksuminjected = 0; - for (c = 0; c < acols; c++) { + asize = 0; + + for (c = 0; c < scols; c++) { col = f + c; coff = o; if (col >= dcols) { @@ -239,15 +482,27 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, } rm->rm_col[c].rc_devidx = col; rm->rm_col[c].rc_offset = coff; - rm->rm_col[c].rc_size = (q + (c < bc)) << unit_shift; rm->rm_col[c].rc_data = NULL; + rm->rm_col[c].rc_gdata = NULL; rm->rm_col[c].rc_error = 0; rm->rm_col[c].rc_tried = 0; rm->rm_col[c].rc_skipped = 0; - rm->rm_asize += rm->rm_col[c].rc_size; + + if (c >= acols) + rm->rm_col[c].rc_size = 0; + else if (c < bc) + rm->rm_col[c].rc_size = (q + 1) << unit_shift; + else + rm->rm_col[c].rc_size = q << unit_shift; + + asize += rm->rm_col[c].rc_size; } - rm->rm_asize = roundup(rm->rm_asize, (nparity + 1) << unit_shift); + ASSERT3U(asize, ==, tot << unit_shift); + rm->rm_asize = roundup(asize, (nparity + 1) << unit_shift); + rm->rm_nskip = roundup(tot, nparity + 1) - tot; + ASSERT3U(rm->rm_asize - asize, ==, rm->rm_nskip << unit_shift); + ASSERT3U(rm->rm_nskip, <=, nparity); for (c = 0; c < rm->rm_firstdatacol; c++) rm->rm_col[c].rc_data = zio_buf_alloc(rm->rm_col[c].rc_size); @@ -272,6 +527,11 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, * Unfortunately, this decision created an implicit on-disk format * requirement that we need to support for all eternity, but only * for single-parity RAID-Z. + * + * If we intend to skip a sector in the zeroth column for padding + * we must make sure to note this swap. We will never intend to + * skip the first column since at least one data and one parity + * column must appear in each row. 
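A worked example may make the geometry above easier to follow; the vdev parameters are hypothetical:

/*
 * Example: 5-wide raidz1 (dcols = 5, nparity = 1), 512-byte sectors
 * (unit_shift = 9), and a 1K write, i.e. s = 2 sectors:
 *
 *	q     = s / (dcols - nparity)        = 2 / 4 = 0
 *	r     = s - q * (dcols - nparity)    = 2
 *	bc    = r + nparity                  = 3
 *	tot   = s + nparity * (q + 1)        = 3
 *	acols = bc                           = 3  (P plus two data columns)
 *	scols = MIN(dcols, roundup(bc, 2))   = 4  (one zero-size skip column)
 *
 *	asize    = 3 * 512                   = 1536
 *	rm_asize = roundup(1536, 2 * 512)    = 2048
 *	rm_nskip = roundup(3, 2) - 3         = 1  (one padding sector)
 *
 * The padding rounds each allocation up to a multiple of (nparity + 1)
 * sectors, which, as I read it, keeps free space from being chopped
 * into runs too small to hold one data sector plus its parity.
 */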
*/ ASSERT(rm->rm_cols >= 2); ASSERT(rm->rm_col[0].rc_size == rm->rm_col[1].rc_size); @@ -283,10 +543,13 @@ vdev_raidz_map_alloc(zio_t *zio, uint64_t unit_shift, uint64_t dcols, rm->rm_col[0].rc_offset = rm->rm_col[1].rc_offset; rm->rm_col[1].rc_devidx = devidx; rm->rm_col[1].rc_offset = o; + + if (rm->rm_skipstart == 0) + rm->rm_skipstart = 1; } zio->io_vsd = rm; - zio->io_vsd_free = vdev_raidz_map_free; + zio->io_vsd_ops = &vdev_raidz_vsd_ops; return (rm); } @@ -305,12 +568,12 @@ vdev_raidz_generate_parity_p(raidz_map_t *rm) if (c == rm->rm_firstdatacol) { ASSERT(ccount == pcount); - for (i = 0; i < ccount; i++, p++, src++) { + for (i = 0; i < ccount; i++, src++, p++) { *p = *src; } } else { ASSERT(ccount <= pcount); - for (i = 0; i < ccount; i++, p++, src++) { + for (i = 0; i < ccount; i++, src++, p++) { *p ^= *src; } } @@ -320,10 +583,10 @@ vdev_raidz_generate_parity_p(raidz_map_t *rm) static void vdev_raidz_generate_parity_pq(raidz_map_t *rm) { - uint64_t *q, *p, *src, pcount, ccount, mask, i; + uint64_t *p, *q, *src, pcnt, ccnt, mask, i; int c; - pcount = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); + pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == rm->rm_col[VDEV_RAIDZ_Q].rc_size); @@ -331,55 +594,138 @@ vdev_raidz_generate_parity_pq(raidz_map_t *rm) src = rm->rm_col[c].rc_data; p = rm->rm_col[VDEV_RAIDZ_P].rc_data; q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; - ccount = rm->rm_col[c].rc_size / sizeof (src[0]); + + ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); if (c == rm->rm_firstdatacol) { - ASSERT(ccount == pcount || ccount == 0); - for (i = 0; i < ccount; i++, p++, q++, src++) { - *q = *src; + ASSERT(ccnt == pcnt || ccnt == 0); + for (i = 0; i < ccnt; i++, src++, p++, q++) { *p = *src; + *q = *src; } - for (; i < pcount; i++, p++, q++, src++) { - *q = 0; + for (; i < pcnt; i++, src++, p++, q++) { *p = 0; + *q = 0; } } else { - ASSERT(ccount <= pcount); + ASSERT(ccnt <= pcnt); /* - * Rather than multiplying each byte individually (as - * described above), we are able to handle 8 at once - * by generating a mask based on the high bit in each - * byte and using that to conditionally XOR in 0x1d. + * Apply the algorithm described above by multiplying + * the previous result and adding in the new value. */ - for (i = 0; i < ccount; i++, p++, q++, src++) { - mask = *q & 0x8080808080808080ULL; - mask = (mask << 1) - (mask >> 7); - *q = ((*q << 1) & 0xfefefefefefefefeULL) ^ - (mask & 0x1d1d1d1d1d1d1d1dULL); + for (i = 0; i < ccnt; i++, src++, p++, q++) { + *p ^= *src; + + VDEV_RAIDZ_64MUL_2(*q, mask); *q ^= *src; + } + + /* + * Treat short columns as though they are full of 0s. + * Note that there's therefore nothing needed for P. 
+ */ + for (; i < pcnt; i++, q++) { + VDEV_RAIDZ_64MUL_2(*q, mask); + } + } + } +} + +static void +vdev_raidz_generate_parity_pqr(raidz_map_t *rm) +{ + uint64_t *p, *q, *r, *src, pcnt, ccnt, mask, i; + int c; + + pcnt = rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0]); + ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == + rm->rm_col[VDEV_RAIDZ_Q].rc_size); + ASSERT(rm->rm_col[VDEV_RAIDZ_P].rc_size == + rm->rm_col[VDEV_RAIDZ_R].rc_size); + + for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + src = rm->rm_col[c].rc_data; + p = rm->rm_col[VDEV_RAIDZ_P].rc_data; + q = rm->rm_col[VDEV_RAIDZ_Q].rc_data; + r = rm->rm_col[VDEV_RAIDZ_R].rc_data; + + ccnt = rm->rm_col[c].rc_size / sizeof (src[0]); + + if (c == rm->rm_firstdatacol) { + ASSERT(ccnt == pcnt || ccnt == 0); + for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { + *p = *src; + *q = *src; + *r = *src; + } + for (; i < pcnt; i++, src++, p++, q++, r++) { + *p = 0; + *q = 0; + *r = 0; + } + } else { + ASSERT(ccnt <= pcnt); + + /* + * Apply the algorithm described above by multiplying + * the previous result and adding in the new value. + */ + for (i = 0; i < ccnt; i++, src++, p++, q++, r++) { *p ^= *src; + + VDEV_RAIDZ_64MUL_2(*q, mask); + *q ^= *src; + + VDEV_RAIDZ_64MUL_4(*r, mask); + *r ^= *src; } /* * Treat short columns as though they are full of 0s. + * Note that there's therefore nothing needed for P. */ - for (; i < pcount; i++, q++) { - mask = *q & 0x8080808080808080ULL; - mask = (mask << 1) - (mask >> 7); - *q = ((*q << 1) & 0xfefefefefefefefeULL) ^ - (mask & 0x1d1d1d1d1d1d1d1dULL); + for (; i < pcnt; i++, q++, r++) { + VDEV_RAIDZ_64MUL_2(*q, mask); + VDEV_RAIDZ_64MUL_4(*r, mask); } } } } +/* + * Generate RAID parity in the first virtual columns according to the number of + * parity columns available. + */ static void -vdev_raidz_reconstruct_p(raidz_map_t *rm, int x) +vdev_raidz_generate_parity(raidz_map_t *rm) +{ + switch (rm->rm_firstdatacol) { + case 1: + vdev_raidz_generate_parity_p(rm); + break; + case 2: + vdev_raidz_generate_parity_pq(rm); + break; + case 3: + vdev_raidz_generate_parity_pqr(rm); + break; + default: + cmn_err(CE_PANIC, "invalid RAID-Z configuration"); + } +} + +static int +vdev_raidz_reconstruct_p(raidz_map_t *rm, int *tgts, int ntgts) { uint64_t *dst, *src, xcount, ccount, count, i; + int x = tgts[0]; int c; + ASSERT(ntgts == 1); + ASSERT(x >= rm->rm_firstdatacol); + ASSERT(x < rm->rm_cols); + xcount = rm->rm_col[x].rc_size / sizeof (src[0]); ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_P].rc_size / sizeof (src[0])); ASSERT(xcount > 0); @@ -404,15 +750,20 @@ vdev_raidz_reconstruct_p(raidz_map_t *rm, int x) *dst ^= *src; } } + + return (1 << VDEV_RAIDZ_P); } -static void -vdev_raidz_reconstruct_q(raidz_map_t *rm, int x) +static int +vdev_raidz_reconstruct_q(raidz_map_t *rm, int *tgts, int ntgts) { uint64_t *dst, *src, xcount, ccount, count, mask, i; uint8_t *b; + int x = tgts[0]; int c, j, exp; + ASSERT(ntgts == 1); + xcount = rm->rm_col[x].rc_size / sizeof (src[0]); ASSERT(xcount <= rm->rm_col[VDEV_RAIDZ_Q].rc_size / sizeof (src[0])); @@ -436,23 +787,13 @@ vdev_raidz_reconstruct_q(raidz_map_t *rm, int x) } } else { - /* - * For an explanation of this, see the comment in - * vdev_raidz_generate_parity_pq() above. 
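The open-coded doubling removed in this hunk is what the new VDEV_RAIDZ_64MUL_2()/VDEV_RAIDZ_64MUL_4() macros stand for. Their definitions are not shown in this hunk, so the following is only a sketch reconstructed from the deleted open-coded lines:

/*
 * Multiply the eight packed bytes of a 64-bit word by 2 in GF(2^8),
 * reducing with the 0x11d polynomial; the multiply-by-4 form is simply
 * two doublings.
 */
#define	VDEV_RAIDZ_64MUL_2(x, mask) \
{ \
	(mask) = (x) & 0x8080808080808080ULL; \
	(mask) = ((mask) << 1) - ((mask) >> 7); \
	(x) = (((x) << 1) & 0xfefefefefefefefeULL) ^ \
	    ((mask) & 0x1d1d1d1d1d1d1d1dULL); \
}

#define	VDEV_RAIDZ_64MUL_4(x, mask) \
{ \
	VDEV_RAIDZ_64MUL_2((x), mask); \
	VDEV_RAIDZ_64MUL_2((x), mask); \
}

This is the per-word step the Q parity column needs once per data column; applying it twice gives the multiply-by-4 used for the new R column.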
- */ for (i = 0; i < count; i++, dst++, src++) { - mask = *dst & 0x8080808080808080ULL; - mask = (mask << 1) - (mask >> 7); - *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^ - (mask & 0x1d1d1d1d1d1d1d1dULL); + VDEV_RAIDZ_64MUL_2(*dst, mask); *dst ^= *src; } for (; i < xcount; i++, dst++) { - mask = *dst & 0x8080808080808080ULL; - mask = (mask << 1) - (mask >> 7); - *dst = ((*dst << 1) & 0xfefefefefefefefeULL) ^ - (mask & 0x1d1d1d1d1d1d1d1dULL); + VDEV_RAIDZ_64MUL_2(*dst, mask); } } } @@ -467,15 +808,20 @@ vdev_raidz_reconstruct_q(raidz_map_t *rm, int x) *b = vdev_raidz_exp2(*b, exp); } } + + return (1 << VDEV_RAIDZ_Q); } -static void -vdev_raidz_reconstruct_pq(raidz_map_t *rm, int x, int y) +static int +vdev_raidz_reconstruct_pq(raidz_map_t *rm, int *tgts, int ntgts) { uint8_t *p, *q, *pxy, *qxy, *xd, *yd, tmp, a, b, aexp, bexp; void *pdata, *qdata; uint64_t xsize, ysize, i; + int x = tgts[0]; + int y = tgts[1]; + ASSERT(ntgts == 2); ASSERT(x < y); ASSERT(x >= rm->rm_firstdatacol); ASSERT(y < rm->rm_cols); @@ -553,15 +899,554 @@ vdev_raidz_reconstruct_pq(raidz_map_t *rm, int x, int y) */ rm->rm_col[VDEV_RAIDZ_P].rc_data = pdata; rm->rm_col[VDEV_RAIDZ_Q].rc_data = qdata; + + return ((1 << VDEV_RAIDZ_P) | (1 << VDEV_RAIDZ_Q)); +} + +/* BEGIN CSTYLED */ +/* + * In the general case of reconstruction, we must solve the system of linear + * equations defined by the coeffecients used to generate parity as well as + * the contents of the data and parity disks. This can be expressed with + * vectors for the original data (D) and the actual data (d) and parity (p) + * and a matrix composed of the identity matrix (I) and a dispersal matrix (V): + * + * __ __ __ __ + * | | __ __ | p_0 | + * | V | | D_0 | | p_m-1 | + * | | x | : | = | d_0 | + * | I | | D_n-1 | | : | + * | | ~~ ~~ | d_n-1 | + * ~~ ~~ ~~ ~~ + * + * I is simply a square identity matrix of size n, and V is a vandermonde + * matrix defined by the coeffecients we chose for the various parity columns + * (1, 2, 4). Note that these values were chosen both for simplicity, speedy + * computation as well as linear separability. + * + * __ __ __ __ + * | 1 .. 1 1 1 | | p_0 | + * | 2^n-1 .. 4 2 1 | __ __ | : | + * | 4^n-1 .. 16 4 1 | | D_0 | | p_m-1 | + * | 1 .. 0 0 0 | | D_1 | | d_0 | + * | 0 .. 0 0 0 | x | D_2 | = | d_1 | + * | : : : : | | : | | d_2 | + * | 0 .. 1 0 0 | | D_n-1 | | : | + * | 0 .. 0 1 0 | ~~ ~~ | : | + * | 0 .. 0 0 1 | | d_n-1 | + * ~~ ~~ ~~ ~~ + * + * Note that I, V, d, and p are known. To compute D, we must invert the + * matrix and use the known data and parity values to reconstruct the unknown + * data values. We begin by removing the rows in V|I and d|p that correspond + * to failed or missing columns; we then make V|I square (n x n) and d|p + * sized n by removing rows corresponding to unused parity from the bottom up + * to generate (V|I)' and (d|p)'. We can then generate the inverse of (V|I)' + * using Gauss-Jordan elimination. 
In the example below we use m=3 parity + * columns, n=8 data columns, with errors in d_1, d_2, and p_1: + * __ __ + * | 1 1 1 1 1 1 1 1 | + * | 128 64 32 16 8 4 2 1 | <-----+-+-- missing disks + * | 19 205 116 29 64 16 4 1 | / / + * | 1 0 0 0 0 0 0 0 | / / + * | 0 1 0 0 0 0 0 0 | <--' / + * (V|I) = | 0 0 1 0 0 0 0 0 | <---' + * | 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * __ __ + * | 1 1 1 1 1 1 1 1 | + * | 128 64 32 16 8 4 2 1 | + * | 19 205 116 29 64 16 4 1 | + * | 1 0 0 0 0 0 0 0 | + * | 0 1 0 0 0 0 0 0 | + * (V|I)' = | 0 0 1 0 0 0 0 0 | + * | 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * + * Here we employ Gauss-Jordan elimination to find the inverse of (V|I)'. We + * have carefully chosen the seed values 1, 2, and 4 to ensure that this + * matrix is not singular. + * __ __ + * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | + * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | + * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | + * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * __ __ + * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | + * | 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 | + * | 19 205 116 29 64 16 4 1 0 1 0 0 0 0 0 0 | + * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * __ __ + * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | + * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | + * | 0 205 116 0 0 0 0 0 0 1 19 29 64 16 4 1 | + * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * __ __ + * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | + * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | + * | 0 0 185 0 0 0 0 0 205 1 222 208 141 221 201 204 | + * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * __ __ + * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | + * | 0 1 1 0 0 0 0 0 1 0 1 1 1 1 1 1 | + * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | + * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * __ __ + * | 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 | + * | 0 1 0 0 0 0 0 0 167 100 5 41 159 169 217 208 | + * | 0 0 1 0 0 0 0 0 166 100 4 40 158 168 216 209 | + * | 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * __ __ + * | 0 0 1 0 0 0 0 0 | + * | 167 100 5 41 159 169 217 208 | + * | 166 100 4 40 158 168 216 209 | + * (V|I)'^-1 = | 0 0 0 1 0 0 0 0 | + * | 0 0 0 0 1 0 0 0 | + * | 0 0 0 0 0 1 0 0 | + * | 0 0 0 0 0 0 1 0 | + * | 0 0 0 0 0 0 0 1 | + * ~~ ~~ + * + * We can then simply compute D = (V|I)'^-1 x (d|p)' to discover the values + * of the missing data. 
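As a concrete check on the rows above: the second row holds powers of 2 in GF(2^8) (2^7 ... 2^0 = 128, 64, 32, 16, 8, 4, 2, 1 for n = 8) and the third row holds powers of 4. Because 2^8 reduces to 0x1d = 29 under the 0x11d polynomial, 4^4 = 29, 4^5 = 116, 4^6 = 205 and 4^7 = 19, which is where the otherwise odd-looking row 19 205 116 29 64 16 4 1 comes from; vdev_raidz_matrix_init() below regenerates these entries from vdev_raidz_pow2[] rather than storing them.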
+ * + * As is apparent from the example above, the only non-trivial rows in the + * inverse matrix correspond to the data disks that we're trying to + * reconstruct. Indeed, those are the only rows we need as the others would + * only be useful for reconstructing data known or assumed to be valid. For + * that reason, we only build the coefficients in the rows that correspond to + * targeted columns. + */ +/* END CSTYLED */ + +static void +vdev_raidz_matrix_init(raidz_map_t *rm, int n, int nmap, int *map, + uint8_t **rows) +{ + int i, j; + int pow; + + ASSERT(n == rm->rm_cols - rm->rm_firstdatacol); + + /* + * Fill in the missing rows of interest. + */ + for (i = 0; i < nmap; i++) { + ASSERT3S(0, <=, map[i]); + ASSERT3S(map[i], <=, 2); + + pow = map[i] * n; + if (pow > 255) + pow -= 255; + ASSERT(pow <= 255); + + for (j = 0; j < n; j++) { + pow -= map[i]; + if (pow < 0) + pow += 255; + rows[i][j] = vdev_raidz_pow2[pow]; + } + } +} + +static void +vdev_raidz_matrix_invert(raidz_map_t *rm, int n, int nmissing, int *missing, + uint8_t **rows, uint8_t **invrows, const uint8_t *used) +{ + int i, j, ii, jj; + uint8_t log; + + /* + * Assert that the first nmissing entries from the array of used + * columns correspond to parity columns and that subsequent entries + * correspond to data columns. + */ + for (i = 0; i < nmissing; i++) { + ASSERT3S(used[i], <, rm->rm_firstdatacol); + } + for (; i < n; i++) { + ASSERT3S(used[i], >=, rm->rm_firstdatacol); + } + + /* + * First initialize the storage where we'll compute the inverse rows. + */ + for (i = 0; i < nmissing; i++) { + for (j = 0; j < n; j++) { + invrows[i][j] = (i == j) ? 1 : 0; + } + } + + /* + * Subtract all trivial rows from the rows of consequence. + */ + for (i = 0; i < nmissing; i++) { + for (j = nmissing; j < n; j++) { + ASSERT3U(used[j], >=, rm->rm_firstdatacol); + jj = used[j] - rm->rm_firstdatacol; + ASSERT3S(jj, <, n); + invrows[i][j] = rows[i][jj]; + rows[i][jj] = 0; + } + } + + /* + * For each of the rows of interest, we must normalize it and subtract + * a multiple of it from the other rows. + */ + for (i = 0; i < nmissing; i++) { + for (j = 0; j < missing[i]; j++) { + ASSERT3U(rows[i][j], ==, 0); + } + ASSERT3U(rows[i][missing[i]], !=, 0); + + /* + * Compute the inverse of the first element and multiply each + * element in the row by that value. + */ + log = 255 - vdev_raidz_log2[rows[i][missing[i]]]; + + for (j = 0; j < n; j++) { + rows[i][j] = vdev_raidz_exp2(rows[i][j], log); + invrows[i][j] = vdev_raidz_exp2(invrows[i][j], log); + } + + for (ii = 0; ii < nmissing; ii++) { + if (i == ii) + continue; + + ASSERT3U(rows[ii][missing[i]], !=, 0); + + log = vdev_raidz_log2[rows[ii][missing[i]]]; + + for (j = 0; j < n; j++) { + rows[ii][j] ^= + vdev_raidz_exp2(rows[i][j], log); + invrows[ii][j] ^= + vdev_raidz_exp2(invrows[i][j], log); + } + } + } + + /* + * Verify that the data that is left in the rows are properly part of + * an identity matrix. 
+ */ + for (i = 0; i < nmissing; i++) { + for (j = 0; j < n; j++) { + if (j == missing[i]) { + ASSERT3U(rows[i][j], ==, 1); + } else { + ASSERT3U(rows[i][j], ==, 0); + } + } + } +} + +static void +vdev_raidz_matrix_reconstruct(raidz_map_t *rm, int n, int nmissing, + int *missing, uint8_t **invrows, const uint8_t *used) +{ + int i, j, x, cc, c; + uint8_t *src; + uint64_t ccount; + uint8_t *dst[VDEV_RAIDZ_MAXPARITY]; + uint64_t dcount[VDEV_RAIDZ_MAXPARITY]; + uint8_t log, val; + int ll; + uint8_t *invlog[VDEV_RAIDZ_MAXPARITY]; + uint8_t *p, *pp; + size_t psize; + + psize = sizeof (invlog[0][0]) * n * nmissing; + p = kmem_alloc(psize, KM_SLEEP); + + for (pp = p, i = 0; i < nmissing; i++) { + invlog[i] = pp; + pp += n; + } + + for (i = 0; i < nmissing; i++) { + for (j = 0; j < n; j++) { + ASSERT3U(invrows[i][j], !=, 0); + invlog[i][j] = vdev_raidz_log2[invrows[i][j]]; + } + } + + for (i = 0; i < n; i++) { + c = used[i]; + ASSERT3U(c, <, rm->rm_cols); + + src = rm->rm_col[c].rc_data; + ccount = rm->rm_col[c].rc_size; + for (j = 0; j < nmissing; j++) { + cc = missing[j] + rm->rm_firstdatacol; + ASSERT3U(cc, >=, rm->rm_firstdatacol); + ASSERT3U(cc, <, rm->rm_cols); + ASSERT3U(cc, !=, c); + + dst[j] = rm->rm_col[cc].rc_data; + dcount[j] = rm->rm_col[cc].rc_size; + } + + ASSERT(ccount >= rm->rm_col[missing[0]].rc_size || i > 0); + + for (x = 0; x < ccount; x++, src++) { + if (*src != 0) + log = vdev_raidz_log2[*src]; + + for (cc = 0; cc < nmissing; cc++) { + if (x >= dcount[cc]) + continue; + + if (*src == 0) { + val = 0; + } else { + if ((ll = log + invlog[cc][i]) >= 255) + ll -= 255; + val = vdev_raidz_pow2[ll]; + } + + if (i == 0) + dst[cc][x] = val; + else + dst[cc][x] ^= val; + } + } + } + + kmem_free(p, psize); +} + +static int +vdev_raidz_reconstruct_general(raidz_map_t *rm, int *tgts, int ntgts) +{ + int n, i, c, t, tt; + int nmissing_rows; + int missing_rows[VDEV_RAIDZ_MAXPARITY]; + int parity_map[VDEV_RAIDZ_MAXPARITY]; + + uint8_t *p, *pp; + size_t psize; + + uint8_t *rows[VDEV_RAIDZ_MAXPARITY]; + uint8_t *invrows[VDEV_RAIDZ_MAXPARITY]; + uint8_t *used; + + int code = 0; + + + n = rm->rm_cols - rm->rm_firstdatacol; + + /* + * Figure out which data columns are missing. + */ + nmissing_rows = 0; + for (t = 0; t < ntgts; t++) { + if (tgts[t] >= rm->rm_firstdatacol) { + missing_rows[nmissing_rows++] = + tgts[t] - rm->rm_firstdatacol; + } + } + + /* + * Figure out which parity columns to use to help generate the missing + * data columns. + */ + for (tt = 0, c = 0, i = 0; i < nmissing_rows; c++) { + ASSERT(tt < ntgts); + ASSERT(c < rm->rm_firstdatacol); + + /* + * Skip any targeted parity columns. + */ + if (c == tgts[tt]) { + tt++; + continue; + } + + code |= 1 << c; + + parity_map[i] = c; + i++; + } + + ASSERT(code != 0); + ASSERT3U(code, <, 1 << VDEV_RAIDZ_MAXPARITY); + + psize = (sizeof (rows[0][0]) + sizeof (invrows[0][0])) * + nmissing_rows * n + sizeof (used[0]) * n; + p = kmem_alloc(psize, KM_SLEEP); + + for (pp = p, i = 0; i < nmissing_rows; i++) { + rows[i] = pp; + pp += n; + invrows[i] = pp; + pp += n; + } + used = pp; + + for (i = 0; i < nmissing_rows; i++) { + used[i] = parity_map[i]; + } + + for (tt = 0, c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { + if (tt < nmissing_rows && + c == missing_rows[tt] + rm->rm_firstdatacol) { + tt++; + continue; + } + + ASSERT3S(i, <, n); + used[i] = c; + i++; + } + + /* + * Initialize the interesting rows of the matrix. + */ + vdev_raidz_matrix_init(rm, n, nmissing_rows, parity_map, rows); + + /* + * Invert the matrix. 
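The matrix inversion above scales and combines rows with vdev_raidz_exp2(), and vdev_raidz_matrix_reconstruct() works in log space through vdev_raidz_log2[]/vdev_raidz_pow2[]. vdev_raidz_exp2() itself is not shown in this hunk; the copy below is only a reference sketch of the helper being relied on:

/*
 * Sketch: multiply a GF(2^8) element by 2^exp using the log/antilog tables.
 */
static uint8_t
vdev_raidz_exp2(uint_t a, int exp)
{
	if (a == 0)
		return (0);

	ASSERT(exp >= 0);
	ASSERT(vdev_raidz_log2[a] > 0 || a == 1);

	exp += vdev_raidz_log2[a];
	if (exp > 255)
		exp -= 255;

	return (vdev_raidz_pow2[exp]);
}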
+ */ + vdev_raidz_matrix_invert(rm, n, nmissing_rows, missing_rows, rows, + invrows, used); + + /* + * Reconstruct the missing data using the generated matrix. + */ + vdev_raidz_matrix_reconstruct(rm, n, nmissing_rows, missing_rows, + invrows, used); + + kmem_free(p, psize); + + return (code); } +static int +vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt) +{ + int tgts[VDEV_RAIDZ_MAXPARITY], *dt; + int ntgts; + int i, c; + int code; + int nbadparity, nbaddata; + int parity_valid[VDEV_RAIDZ_MAXPARITY]; + + /* + * The tgts list must already be sorted. + */ + for (i = 1; i < nt; i++) { + ASSERT(t[i] > t[i - 1]); + } + + nbadparity = rm->rm_firstdatacol; + nbaddata = rm->rm_cols - nbadparity; + ntgts = 0; + for (i = 0, c = 0; c < rm->rm_cols; c++) { + if (c < rm->rm_firstdatacol) + parity_valid[c] = B_FALSE; + + if (i < nt && c == t[i]) { + tgts[ntgts++] = c; + i++; + } else if (rm->rm_col[c].rc_error != 0) { + tgts[ntgts++] = c; + } else if (c >= rm->rm_firstdatacol) { + nbaddata--; + } else { + parity_valid[c] = B_TRUE; + nbadparity--; + } + } + + ASSERT(ntgts >= nt); + ASSERT(nbaddata >= 0); + ASSERT(nbaddata + nbadparity == ntgts); + + dt = &tgts[nbadparity]; + + /* + * See if we can use any of our optimized reconstruction routines. + */ + if (!vdev_raidz_default_to_general) { + switch (nbaddata) { + case 1: + if (parity_valid[VDEV_RAIDZ_P]) + return (vdev_raidz_reconstruct_p(rm, dt, 1)); + + ASSERT(rm->rm_firstdatacol > 1); + + if (parity_valid[VDEV_RAIDZ_Q]) + return (vdev_raidz_reconstruct_q(rm, dt, 1)); + + ASSERT(rm->rm_firstdatacol > 2); + break; + + case 2: + ASSERT(rm->rm_firstdatacol > 1); + + if (parity_valid[VDEV_RAIDZ_P] && + parity_valid[VDEV_RAIDZ_Q]) + return (vdev_raidz_reconstruct_pq(rm, dt, 2)); + + ASSERT(rm->rm_firstdatacol > 2); + + break; + } + } + + code = vdev_raidz_reconstruct_general(rm, tgts, ntgts); + ASSERT(code < (1 << VDEV_RAIDZ_MAXPARITY)); + ASSERT(code > 0); + return (code); +} static int vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) { vdev_t *cvd; uint64_t nparity = vd->vdev_nparity; - int c, error; + int c; int lasterror = 0; int numerrors = 0; @@ -573,11 +1458,13 @@ vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) return (EINVAL); } + vdev_open_children(vd); + for (c = 0; c < vd->vdev_children; c++) { cvd = vd->vdev_child[c]; - if ((error = vdev_open(cvd)) != 0) { - lasterror = error; + if (cvd->vdev_open_error != 0) { + lasterror = cvd->vdev_open_error; numerrors++; continue; } @@ -636,10 +1523,9 @@ vdev_raidz_io_start(zio_t *zio) vdev_t *vd = zio->io_vd; vdev_t *tvd = vd->vdev_top; vdev_t *cvd; - blkptr_t *bp = zio->io_bp; raidz_map_t *rm; raidz_col_t *rc; - int c; + int c, i; rm = vdev_raidz_map_alloc(zio, tvd->vdev_ashift, vd->vdev_children, vd->vdev_nparity); @@ -647,13 +1533,7 @@ vdev_raidz_io_start(zio_t *zio) ASSERT3U(rm->rm_asize, ==, vdev_psize_to_asize(vd, zio->io_size)); if (zio->io_type == ZIO_TYPE_WRITE) { - /* - * Generate RAID parity in the first virtual columns. - */ - if (rm->rm_firstdatacol == 1) - vdev_raidz_generate_parity_p(rm); - else - vdev_raidz_generate_parity_pq(rm); + vdev_raidz_generate_parity(rm); for (c = 0; c < rm->rm_cols; c++) { rc = &rm->rm_col[c]; @@ -664,6 +1544,23 @@ vdev_raidz_io_start(zio_t *zio) vdev_raidz_child_done, rc)); } + /* + * Generate optional I/Os for any skipped sectors to improve + * aggregation contiguity. 
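The loop that follows issues one optional child write per pad sector: a NULL data buffer of 1 << ashift bytes, flagged ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, placed immediately after the column it pads. The intent stated above is aggregation: the vdev queue can use these writes to merge the neighbouring column writes into one larger I/O, and because they are optional it remains free to drop them when they do not help.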
+ */ + for (c = rm->rm_skipstart, i = 0; i < rm->rm_nskip; c++, i++) { + ASSERT(c <= rm->rm_scols); + if (c == rm->rm_scols) + c = 0; + rc = &rm->rm_col[c]; + cvd = vd->vdev_child[rc->rc_devidx]; + zio_nowait(zio_vdev_child_io(zio, NULL, cvd, + rc->rc_offset + rc->rc_size, NULL, + 1 << tvd->vdev_ashift, + zio->io_type, zio->io_priority, + ZIO_FLAG_NODATA | ZIO_FLAG_OPTIONAL, NULL, NULL)); + } + return (ZIO_PIPELINE_CONTINUE); } @@ -671,8 +1568,7 @@ vdev_raidz_io_start(zio_t *zio) /* * Iterate over the columns in reverse order so that we hit the parity - * last -- any errors along the way will force us to read the parity - * data. + * last -- any errors along the way will force us to read the parity. */ for (c = rm->rm_cols - 1; c >= 0; c--) { rc = &rm->rm_col[c]; @@ -687,7 +1583,7 @@ vdev_raidz_io_start(zio_t *zio) rc->rc_skipped = 1; continue; } - if (vdev_dtl_contains(&cvd->vdev_dtl_map, bp->blk_birth, 1)) { + if (vdev_dtl_contains(cvd, DTL_MISSING, zio->io_txg, 1)) { if (c >= rm->rm_firstdatacol) rm->rm_missingdata++; else @@ -697,7 +1593,7 @@ vdev_raidz_io_start(zio_t *zio) continue; } if (c >= rm->rm_firstdatacol || rm->rm_missingdata > 0 || - (zio->io_flags & ZIO_FLAG_SCRUB)) { + (zio->io_flags & (ZIO_FLAG_SCRUB | ZIO_FLAG_RESILVER))) { zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, rc->rc_data, rc->rc_size, zio->io_type, zio->io_priority, 0, @@ -712,19 +1608,42 @@ vdev_raidz_io_start(zio_t *zio) * Report a checksum error for a child of a RAID-Z device. */ static void -raidz_checksum_error(zio_t *zio, raidz_col_t *rc) +raidz_checksum_error(zio_t *zio, raidz_col_t *rc, void *bad_data) { vdev_t *vd = zio->io_vd->vdev_child[rc->rc_devidx]; if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { + zio_bad_cksum_t zbc; + raidz_map_t *rm = zio->io_vsd; + mutex_enter(&vd->vdev_stat_lock); vd->vdev_stat.vs_checksum_errors++; mutex_exit(&vd->vdev_stat_lock); + + zbc.zbc_has_cksum = 0; + zbc.zbc_injected = rm->rm_ecksuminjected; + + zfs_ereport_post_checksum(zio->io_spa, vd, zio, + rc->rc_offset, rc->rc_size, rc->rc_data, bad_data, + &zbc); } +} + +/* + * We keep track of whether or not there were any injected errors, so that + * any ereports we generate can note it. + */ +static int +raidz_checksum_verify(zio_t *zio) +{ + zio_bad_cksum_t zbc; + raidz_map_t *rm = zio->io_vsd; + + int ret = zio_checksum_error(zio, &zbc); + if (ret != 0 && zbc.zbc_injected != 0) + rm->rm_ecksuminjected = 1; - if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) - zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, - zio->io_spa, vd, zio, rc->rc_offset, rc->rc_size); + return (ret); } /* @@ -748,17 +1667,14 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm) bcopy(rc->rc_data, orig[c], rc->rc_size); } - if (rm->rm_firstdatacol == 1) - vdev_raidz_generate_parity_p(rm); - else - vdev_raidz_generate_parity_pq(rm); + vdev_raidz_generate_parity(rm); for (c = 0; c < rm->rm_firstdatacol; c++) { rc = &rm->rm_col[c]; if (!rc->rc_tried || rc->rc_error != 0) continue; if (bcmp(orig[c], rc->rc_data, rc->rc_size) != 0) { - raidz_checksum_error(zio, rc); + raidz_checksum_error(zio, rc, orig[c]); rc->rc_error = ECKSUM; ret++; } @@ -768,9 +1684,10 @@ raidz_parity_verify(zio_t *zio, raidz_map_t *rm) return (ret); } -static uint64_t raidz_corrected_p; -static uint64_t raidz_corrected_q; -static uint64_t raidz_corrected_pq; +/* + * Keep statistics on all the ways that we used parity to correct data. 
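Since vdev_raidz_reconstruct_p() returns (1 << VDEV_RAIDZ_P), _q() returns (1 << VDEV_RAIDZ_Q), _pq() returns both bits, and the general routine ORs in one bit per parity column it consumed, the array declared below is indexed by that bitmask: entry 3, for instance, counts reconstructions that drew on both P and Q. This single array subsumes the three raidz_corrected_p/q/pq counters being removed here.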
+ */ +static uint64_t raidz_corrected[1 << VDEV_RAIDZ_MAXPARITY]; static int vdev_raidz_worst_error(raidz_map_t *rm) @@ -783,19 +1700,177 @@ vdev_raidz_worst_error(raidz_map_t *rm) return (error); } +/* + * Iterate over all combinations of bad data and attempt a reconstruction. + * Note that the algorithm below is non-optimal because it doesn't take into + * account how reconstruction is actually performed. For example, with + * triple-parity RAID-Z the reconstruction procedure is the same if column 4 + * is targeted as invalid as if columns 1 and 4 are targeted since in both + * cases we'd only use parity information in column 0. + */ +static int +vdev_raidz_combrec(zio_t *zio, int total_errors, int data_errors) +{ + raidz_map_t *rm = zio->io_vsd; + raidz_col_t *rc; + void *orig[VDEV_RAIDZ_MAXPARITY]; + int tstore[VDEV_RAIDZ_MAXPARITY + 2]; + int *tgts = &tstore[1]; + int current, next, i, c, n; + int code, ret = 0; + + ASSERT(total_errors < rm->rm_firstdatacol); + + /* + * This simplifies one edge condition. + */ + tgts[-1] = -1; + + for (n = 1; n <= rm->rm_firstdatacol - total_errors; n++) { + /* + * Initialize the targets array by finding the first n columns + * that contain no error. + * + * If there were no data errors, we need to ensure that we're + * always explicitly attempting to reconstruct at least one + * data column. To do this, we simply push the highest target + * up into the data columns. + */ + for (c = 0, i = 0; i < n; i++) { + if (i == n - 1 && data_errors == 0 && + c < rm->rm_firstdatacol) { + c = rm->rm_firstdatacol; + } + + while (rm->rm_col[c].rc_error != 0) { + c++; + ASSERT3S(c, <, rm->rm_cols); + } + + tgts[i] = c++; + } + + /* + * Setting tgts[n] simplifies the other edge condition. + */ + tgts[n] = rm->rm_cols; + + /* + * These buffers were allocated in previous iterations. + */ + for (i = 0; i < n - 1; i++) { + ASSERT(orig[i] != NULL); + } + + orig[n - 1] = zio_buf_alloc(rm->rm_col[0].rc_size); + + current = 0; + next = tgts[current]; + + while (current != n) { + tgts[current] = next; + current = 0; + + /* + * Save off the original data that we're going to + * attempt to reconstruct. + */ + for (i = 0; i < n; i++) { + ASSERT(orig[i] != NULL); + c = tgts[i]; + ASSERT3S(c, >=, 0); + ASSERT3S(c, <, rm->rm_cols); + rc = &rm->rm_col[c]; + bcopy(rc->rc_data, orig[i], rc->rc_size); + } + + /* + * Attempt a reconstruction and exit the outer loop on + * success. + */ + code = vdev_raidz_reconstruct(rm, tgts, n); + if (raidz_checksum_verify(zio) == 0) { + atomic_inc_64(&raidz_corrected[code]); + + for (i = 0; i < n; i++) { + c = tgts[i]; + rc = &rm->rm_col[c]; + ASSERT(rc->rc_error == 0); + if (rc->rc_tried) + raidz_checksum_error(zio, rc, + orig[i]); + rc->rc_error = ECKSUM; + } + + ret = code; + goto done; + } + + /* + * Restore the original data. + */ + for (i = 0; i < n; i++) { + c = tgts[i]; + rc = &rm->rm_col[c]; + bcopy(orig[i], rc->rc_data, rc->rc_size); + } + + do { + /* + * Find the next valid column after the current + * position.. + */ + for (next = tgts[current] + 1; + next < rm->rm_cols && + rm->rm_col[next].rc_error != 0; next++) + continue; + + ASSERT(next <= tgts[current + 1]); + + /* + * If that spot is available, we're done here. + */ + if (next != tgts[current + 1]) + break; + + /* + * Otherwise, find the next valid column after + * the previous position. 
+ */ + for (c = tgts[current - 1] + 1; + rm->rm_col[c].rc_error != 0; c++) + continue; + + tgts[current] = c; + current++; + + } while (current != n); + } + } + n--; +done: + for (i = 0; i < n; i++) { + zio_buf_free(orig[i], rm->rm_col[0].rc_size); + } + + return (ret); +} + static void vdev_raidz_io_done(zio_t *zio) { vdev_t *vd = zio->io_vd; vdev_t *cvd; raidz_map_t *rm = zio->io_vsd; - raidz_col_t *rc, *rc1; + raidz_col_t *rc; int unexpected_errors = 0; int parity_errors = 0; int parity_untried = 0; int data_errors = 0; int total_errors = 0; - int n, c, c1; + int n, c; + int tgts[VDEV_RAIDZ_MAXPARITY]; + int code; ASSERT(zio->io_bp != NULL); /* XXX need to add code to enforce this */ @@ -859,9 +1934,8 @@ vdev_raidz_io_done(zio_t *zio) * any errors. */ if (total_errors <= rm->rm_firstdatacol - parity_untried) { - switch (data_errors) { - case 0: - if (zio_checksum_error(zio) == 0) { + if (data_errors == 0) { + if (raidz_checksum_verify(zio) == 0) { /* * If we read parity information (unnecessarily * as it happens since no reconstruction was @@ -880,9 +1954,7 @@ vdev_raidz_io_done(zio_t *zio) } goto done; } - break; - - case 1: + } else { /* * We either attempt to read all the parity columns or * none of them. If we didn't try to read parity, we @@ -894,45 +1966,38 @@ vdev_raidz_io_done(zio_t *zio) ASSERT(parity_errors < rm->rm_firstdatacol); /* - * Find the column that reported the error. + * Identify the data columns that reported an error. */ + n = 0; for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { rc = &rm->rm_col[c]; - if (rc->rc_error != 0) - break; + if (rc->rc_error != 0) { + ASSERT(n < VDEV_RAIDZ_MAXPARITY); + tgts[n++] = c; + } } - ASSERT(c != rm->rm_cols); - ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || - rc->rc_error == ESTALE); - if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) { - vdev_raidz_reconstruct_p(rm, c); - } else { - ASSERT(rm->rm_firstdatacol > 1); - vdev_raidz_reconstruct_q(rm, c); - } + ASSERT(rm->rm_firstdatacol >= n); - if (zio_checksum_error(zio) == 0) { - if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) - atomic_inc_64(&raidz_corrected_p); - else - atomic_inc_64(&raidz_corrected_q); + code = vdev_raidz_reconstruct(rm, tgts, n); + + if (raidz_checksum_verify(zio) == 0) { + atomic_inc_64(&raidz_corrected[code]); /* - * If there's more than one parity disk that - * was successfully read, confirm that the - * other parity disk produced the correct data. - * This routine is suboptimal in that it - * regenerates both the parity we wish to test - * as well as the parity we just used to - * perform the reconstruction, but this should - * be a relatively uncommon case, and can be - * optimized if it becomes a problem. - * We also regenerate parity when resilvering - * so we can write it out to the failed device - * later. + * If we read more parity disks than were used + * for reconstruction, confirm that the other + * parity disks produced correct data. This + * routine is suboptimal in that it regenerates + * the parity that we already used in addition + * to the parity that we're attempting to + * verify, but this should be a relatively + * uncommon case, and can be optimized if it + * becomes a problem. Note that we regenerate + * parity when resilvering so we can write it + * out to failed devices later. 
*/ - if (parity_errors < rm->rm_firstdatacol - 1 || + if (parity_errors < rm->rm_firstdatacol - n || (zio->io_flags & ZIO_FLAG_RESILVER)) { n = raidz_parity_verify(zio, rm); unexpected_errors += n; @@ -942,46 +2007,6 @@ vdev_raidz_io_done(zio_t *zio) goto done; } - break; - - case 2: - /* - * Two data column errors require double parity. - */ - ASSERT(rm->rm_firstdatacol == 2); - - /* - * Find the two columns that reported errors. - */ - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - rc = &rm->rm_col[c]; - if (rc->rc_error != 0) - break; - } - ASSERT(c != rm->rm_cols); - ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || - rc->rc_error == ESTALE); - - for (c1 = c++; c < rm->rm_cols; c++) { - rc = &rm->rm_col[c]; - if (rc->rc_error != 0) - break; - } - ASSERT(c != rm->rm_cols); - ASSERT(!rc->rc_skipped || rc->rc_error == ENXIO || - rc->rc_error == ESTALE); - - vdev_raidz_reconstruct_pq(rm, c1, c); - - if (zio_checksum_error(zio) == 0) { - atomic_inc_64(&raidz_corrected_pq); - goto done; - } - break; - - default: - ASSERT(rm->rm_firstdatacol <= 2); - ASSERT(0); } } @@ -1020,152 +2045,61 @@ vdev_raidz_io_done(zio_t *zio) * errors we detected, and we've attempted to read all columns. There * must, therefore, be one or more additional problems -- silent errors * resulting in invalid data rather than explicit I/O errors resulting - * in absent data. Before we attempt combinatorial reconstruction make - * sure we have a chance of coming up with the right answer. + * in absent data. We check if there is enough additional data to + * possibly reconstruct the data and then perform combinatorial + * reconstruction over all possible combinations. If that fails, + * we're cooked. */ - if (total_errors >= rm->rm_firstdatacol) { + if (total_errors > rm->rm_firstdatacol) { zio->io_error = vdev_raidz_worst_error(rm); - /* - * If there were exactly as many device errors as parity - * columns, yet we couldn't reconstruct the data, then at - * least one device must have returned bad data silently. - */ - if (total_errors == rm->rm_firstdatacol) - zio->io_error = zio_worst_error(zio->io_error, ECKSUM); - goto done; - } - - if (rm->rm_col[VDEV_RAIDZ_P].rc_error == 0) { - /* - * Attempt to reconstruct the data from parity P. - */ - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - void *orig; - rc = &rm->rm_col[c]; - - orig = zio_buf_alloc(rc->rc_size); - bcopy(rc->rc_data, orig, rc->rc_size); - vdev_raidz_reconstruct_p(rm, c); - - if (zio_checksum_error(zio) == 0) { - zio_buf_free(orig, rc->rc_size); - atomic_inc_64(&raidz_corrected_p); - - /* - * If this child didn't know that it returned - * bad data, inform it. - */ - if (rc->rc_tried && rc->rc_error == 0) - raidz_checksum_error(zio, rc); - rc->rc_error = ECKSUM; - goto done; - } - - bcopy(orig, rc->rc_data, rc->rc_size); - zio_buf_free(orig, rc->rc_size); - } - } - if (rm->rm_firstdatacol > 1 && rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) { + } else if (total_errors < rm->rm_firstdatacol && + (code = vdev_raidz_combrec(zio, total_errors, data_errors)) != 0) { /* - * Attempt to reconstruct the data from parity Q. + * If we didn't use all the available parity for the + * combinatorial reconstruction, verify that the remaining + * parity is correct. 
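Concretely: on a triple-parity map (rm_firstdatacol == 3) where the combinatorial pass succeeded using only P and Q, the returned code is 0x3 rather than 0x7, so the check below regenerates and compares the untouched R parity via raidz_parity_verify() before the block is accepted.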
*/ - for (c = rm->rm_firstdatacol; c < rm->rm_cols; c++) { - void *orig; - rc = &rm->rm_col[c]; - - orig = zio_buf_alloc(rc->rc_size); - bcopy(rc->rc_data, orig, rc->rc_size); - vdev_raidz_reconstruct_q(rm, c); - - if (zio_checksum_error(zio) == 0) { - zio_buf_free(orig, rc->rc_size); - atomic_inc_64(&raidz_corrected_q); - - /* - * If this child didn't know that it returned - * bad data, inform it. - */ - if (rc->rc_tried && rc->rc_error == 0) - raidz_checksum_error(zio, rc); - rc->rc_error = ECKSUM; - goto done; - } - - bcopy(orig, rc->rc_data, rc->rc_size); - zio_buf_free(orig, rc->rc_size); - } - } - - if (rm->rm_firstdatacol > 1 && - rm->rm_col[VDEV_RAIDZ_P].rc_error == 0 && - rm->rm_col[VDEV_RAIDZ_Q].rc_error == 0) { + if (code != (1 << rm->rm_firstdatacol) - 1) + (void) raidz_parity_verify(zio, rm); + } else { /* - * Attempt to reconstruct the data from both P and Q. + * We're here because either: + * + * total_errors == rm_first_datacol, or + * vdev_raidz_combrec() failed + * + * In either case, there is enough bad data to prevent + * reconstruction. + * + * Start checksum ereports for all children which haven't + * failed, and the IO wasn't speculative. */ - for (c = rm->rm_firstdatacol; c < rm->rm_cols - 1; c++) { - void *orig, *orig1; - rc = &rm->rm_col[c]; - - orig = zio_buf_alloc(rc->rc_size); - bcopy(rc->rc_data, orig, rc->rc_size); - - for (c1 = c + 1; c1 < rm->rm_cols; c1++) { - rc1 = &rm->rm_col[c1]; - - orig1 = zio_buf_alloc(rc1->rc_size); - bcopy(rc1->rc_data, orig1, rc1->rc_size); - - vdev_raidz_reconstruct_pq(rm, c, c1); + zio->io_error = ECKSUM; - if (zio_checksum_error(zio) == 0) { - zio_buf_free(orig, rc->rc_size); - zio_buf_free(orig1, rc1->rc_size); - atomic_inc_64(&raidz_corrected_pq); - - /* - * If these children didn't know they - * returned bad data, inform them. - */ - if (rc->rc_tried && rc->rc_error == 0) - raidz_checksum_error(zio, rc); - if (rc1->rc_tried && rc1->rc_error == 0) - raidz_checksum_error(zio, rc1); - - rc->rc_error = ECKSUM; - rc1->rc_error = ECKSUM; - - goto done; + if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { + for (c = 0; c < rm->rm_cols; c++) { + rc = &rm->rm_col[c]; + if (rc->rc_error == 0) { + zio_bad_cksum_t zbc; + zbc.zbc_has_cksum = 0; + zbc.zbc_injected = + rm->rm_ecksuminjected; + + zfs_ereport_start_checksum( + zio->io_spa, + vd->vdev_child[rc->rc_devidx], + zio, rc->rc_offset, rc->rc_size, + (void *)(uintptr_t)c, &zbc); } - - bcopy(orig1, rc1->rc_data, rc1->rc_size); - zio_buf_free(orig1, rc1->rc_size); } - - bcopy(orig, rc->rc_data, rc->rc_size); - zio_buf_free(orig, rc->rc_size); - } - } - - /* - * All combinations failed to checksum. Generate checksum ereports for - * all children. - */ - zio->io_error = ECKSUM; - - if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { - for (c = 0; c < rm->rm_cols; c++) { - rc = &rm->rm_col[c]; - zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, - zio->io_spa, vd->vdev_child[rc->rc_devidx], zio, - rc->rc_offset, rc->rc_size); } } done: zio_checksum_verified(zio); - if (zio->io_error == 0 && (spa_mode & FWRITE) && + if (zio->io_error == 0 && spa_writeable(zio->io_spa) && (unexpected_errors || (zio->io_flags & ZIO_FLAG_RESILVER))) { /* * Use the good data we have in hand to repair damaged children. @@ -1180,7 +2114,8 @@ vdev_raidz_io_done(zio_t *zio) zio_nowait(zio_vdev_child_io(zio, NULL, cvd, rc->rc_offset, rc->rc_data, rc->rc_size, ZIO_TYPE_WRITE, zio->io_priority, - ZIO_FLAG_IO_REPAIR, NULL, NULL)); + ZIO_FLAG_IO_REPAIR | (unexpected_errors ? 
+ ZIO_FLAG_SELF_HEAL : 0), NULL, NULL)); } } } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_root.c b/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_root.c index 88383f002b805..524c8e60601d9 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_root.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/vdev_root.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -52,7 +52,6 @@ too_many_errors(vdev_t *vd, int numerrors) static int vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) { - int c; int lasterror = 0; int numerrors = 0; @@ -61,15 +60,14 @@ vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) return (EINVAL); } - for (c = 0; c < vd->vdev_children; c++) { + vdev_open_children(vd); + + for (int c = 0; c < vd->vdev_children; c++) { vdev_t *cvd = vd->vdev_child[c]; - int error; - if ((error = vdev_open(cvd)) != 0 && - !cvd->vdev_islog) { - lasterror = error; + if (cvd->vdev_open_error && !cvd->vdev_islog) { + lasterror = cvd->vdev_open_error; numerrors++; - continue; } } @@ -87,9 +85,7 @@ vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *ashift) static void vdev_root_close(vdev_t *vd) { - int c; - - for (c = 0; c < vd->vdev_children; c++) + for (int c = 0; c < vd->vdev_children; c++) vdev_close(vd->vdev_child[c]); } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/zap.c b/external/cddl/osnet/dist/uts/common/fs/zfs/zap.c index ca859ec355dab..3be29e971c2d1 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/zap.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/zap.c @@ -19,13 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - - /* * This file contains the top half of the zfs directory structure * implementation. The bottom half is in zap_leaf.c. @@ -45,6 +42,7 @@ #include #include #include +#include #include #include #include @@ -72,7 +70,7 @@ fzap_byteswap(void *vbuf, size_t size) } void -fzap_upgrade(zap_t *zap, dmu_tx_t *tx) +fzap_upgrade(zap_t *zap, dmu_tx_t *tx, zap_flags_t flags) { dmu_buf_t *db; zap_leaf_t *l; @@ -104,6 +102,7 @@ fzap_upgrade(zap_t *zap, dmu_tx_t *tx) zp->zap_num_entries = 0; zp->zap_salt = zap->zap_salt; zp->zap_normflags = zap->zap_normflags; + zp->zap_flags = flags; /* block 1 will be the first leaf */ for (i = 0; i < (1<zap_ptrtbl.zt_shift); i++) @@ -317,8 +316,13 @@ zap_ptrtbl_transfer(const uint64_t *src, uint64_t *dst, int n) static int zap_grow_ptrtbl(zap_t *zap, dmu_tx_t *tx) { - /* In case things go horribly wrong. */ - if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift >= ZAP_HASHBITS-2) + /* + * The pointer table should never use more hash bits than we + * have (otherwise we'd be using useless zero bits to index it). + * If we are within 2 bits of running out, stop growing, since + * this is already an aberrant condition. 
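zap_hashbits(), added later in this patch in zap_micro.c, returns the historical 28 bits for an ordinary ZAP and 48 bits when the new ZAP_FLAG_HASH64 flag is set, so this bound now stops pointer-table growth two bits short of whichever hash width actually applies, instead of the old hard-coded ZAP_HASHBITS - 2.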
+ */ + if (zap->zap_f.zap_phys->zap_ptrtbl.zt_shift >= zap_hashbits(zap) - 2) return (ENOSPC); if (zap->zap_f.zap_phys->zap_ptrtbl.zt_numblks == 0) { @@ -702,13 +706,17 @@ zap_put_leaf_maybe_grow_ptrtbl(zap_name_t *zn, zap_leaf_t *l, dmu_tx_t *tx) } } - static int -fzap_checksize(const char *name, uint64_t integer_size, uint64_t num_integers) +fzap_checkname(zap_name_t *zn) { - if (name && strlen(name) > ZAP_MAXNAMELEN) - return (E2BIG); + if (zn->zn_key_orig_numints * zn->zn_key_intlen > ZAP_MAXNAMELEN) + return (ENAMETOOLONG); + return (0); +} +static int +fzap_checksize(uint64_t integer_size, uint64_t num_integers) +{ /* Only integer sizes supported by C */ switch (integer_size) { case 1: @@ -726,6 +734,16 @@ fzap_checksize(const char *name, uint64_t integer_size, uint64_t num_integers) return (0); } +static int +fzap_check(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers) +{ + int err; + + if ((err = fzap_checkname(zn)) != 0) + return (err); + return (fzap_checksize(integer_size, num_integers)); +} + /* * Routines for manipulating attributes. */ @@ -738,8 +756,7 @@ fzap_lookup(zap_name_t *zn, int err; zap_entry_handle_t zeh; - err = fzap_checksize(zn->zn_name_orij, integer_size, num_integers); - if (err != 0) + if ((err = fzap_checkname(zn)) != 0) return (err); err = zap_deref_leaf(zn->zn_zap, zn->zn_hash, NULL, RW_READER, &l); @@ -747,8 +764,13 @@ fzap_lookup(zap_name_t *zn, return (err); err = zap_leaf_lookup(l, zn, &zeh); if (err == 0) { + if ((err = fzap_checksize(integer_size, num_integers)) != 0) { + zap_put_leaf(l); + return (err); + } + err = zap_entry_read(&zeh, integer_size, num_integers, buf); - (void) zap_entry_read_name(&zeh, rn_len, realname); + (void) zap_entry_read_name(zn->zn_zap, &zeh, rn_len, realname); if (ncp) { *ncp = zap_entry_normalization_conflict(&zeh, zn, NULL, zn->zn_zap); @@ -771,8 +793,7 @@ fzap_add_cd(zap_name_t *zn, ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); ASSERT(!zap->zap_ismicro); - ASSERT(fzap_checksize(zn->zn_name_orij, - integer_size, num_integers) == 0); + ASSERT(fzap_check(zn, integer_size, num_integers) == 0); err = zap_deref_leaf(zap, zn->zn_hash, tx, RW_WRITER, &l); if (err != 0) @@ -786,7 +807,7 @@ fzap_add_cd(zap_name_t *zn, if (err != ENOENT) goto out; - err = zap_entry_create(l, zn->zn_name_orij, zn->zn_hash, cd, + err = zap_entry_create(l, zn, cd, integer_size, num_integers, val, &zeh); if (err == 0) { @@ -809,12 +830,12 @@ fzap_add(zap_name_t *zn, uint64_t integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) { - int err = fzap_checksize(zn->zn_name_orij, integer_size, num_integers); + int err = fzap_check(zn, integer_size, num_integers); if (err != 0) return (err); return (fzap_add_cd(zn, integer_size, num_integers, - val, ZAP_MAXCD, tx)); + val, ZAP_NEED_CD, tx)); } int @@ -827,7 +848,7 @@ fzap_update(zap_name_t *zn, zap_t *zap = zn->zn_zap; ASSERT(RW_LOCK_HELD(&zap->zap_rwlock)); - err = fzap_checksize(zn->zn_name_orij, integer_size, num_integers); + err = fzap_check(zn, integer_size, num_integers); if (err != 0) return (err); @@ -840,8 +861,8 @@ fzap_update(zap_name_t *zn, ASSERT(err == 0 || err == ENOENT); if (create) { - err = zap_entry_create(l, zn->zn_name_orij, zn->zn_hash, - ZAP_MAXCD, integer_size, num_integers, val, &zeh); + err = zap_entry_create(l, zn, ZAP_NEED_CD, + integer_size, num_integers, val, &zeh); if (err == 0) zap_increment_num_entries(zap, 1, tx); } else { @@ -980,6 +1001,30 @@ zap_lookup_int(objset_t *os, uint64_t obj, uint64_t value) return (zap_lookup(os, obj, name, 8, 1, &value)); } 
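The zap_increment_int() helper added in the next hunk read-modify-writes an 8-byte value keyed by the hex-formatted integer, creating the entry on first use and removing it once the count returns to zero. A minimal calling sketch; os, tx, counts_obj and key are hypothetical names assumed to refer to an open objset, an assigned transaction and an existing ZAP object:

	int err;

	/* First increment creates the entry with value 5. */
	err = zap_increment_int(os, counts_obj, key, 5, tx);
	if (err == 0) {
		/* Dropping it back to zero removes the entry entirely. */
		err = zap_increment_int(os, counts_obj, key, -5, tx);
	}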
+int +zap_increment_int(objset_t *os, uint64_t obj, uint64_t key, int64_t delta, + dmu_tx_t *tx) +{ + char name[20]; + uint64_t value = 0; + int err; + + if (delta == 0) + return (0); + + (void) snprintf(name, sizeof (name), "%llx", (longlong_t)key); + err = zap_lookup(os, obj, name, 8, 1, &value); + if (err != 0 && err != ENOENT) + return (err); + value += delta; + if (value == 0) + err = zap_remove(os, obj, name, tx); + else + err = zap_update(os, obj, name, 8, 1, &value, tx); + return (err); +} + + /* * Routines for iterating over the attributes. */ @@ -1041,7 +1086,7 @@ fzap_cursor_retrieve(zap_t *zap, zap_cursor_t *zc, zap_attribute_t *za) err = zap_entry_read(&zeh, 8, 1, &za->za_first_integer); ASSERT(err == 0 || err == EOVERFLOW); } - err = zap_entry_read_name(&zeh, + err = zap_entry_read_name(zap, &zeh, sizeof (za->za_name), za->za_name); ASSERT(err == 0); @@ -1080,6 +1125,31 @@ zap_stats_ptrtbl(zap_t *zap, uint64_t *tbl, int len, zap_stats_t *zs) } } +int +fzap_cursor_move_to_key(zap_cursor_t *zc, zap_name_t *zn) +{ + int err; + zap_leaf_t *l; + zap_entry_handle_t zeh; + + if (zn->zn_key_orig_numints * zn->zn_key_intlen > ZAP_MAXNAMELEN) + return (ENAMETOOLONG); + + err = zap_deref_leaf(zc->zc_zap, zn->zn_hash, NULL, RW_READER, &l); + if (err != 0) + return (err); + + err = zap_leaf_lookup(l, zn, &zeh); + if (err != 0) + return (err); + + zc->zc_leaf = l; + zc->zc_hash = zeh.zeh_hash; + zc->zc_cd = zeh.zeh_cd; + + return (err); +} + void fzap_get_stats(zap_t *zap, zap_stats_t *zs) { @@ -1134,3 +1204,58 @@ fzap_get_stats(zap_t *zap, zap_stats_t *zs) } } } + +int +fzap_count_write(zap_name_t *zn, int add, uint64_t *towrite, + uint64_t *tooverwrite) +{ + zap_t *zap = zn->zn_zap; + zap_leaf_t *l; + int err; + + /* + * Account for the header block of the fatzap. + */ + if (!add && dmu_buf_freeable(zap->zap_dbuf)) { + *tooverwrite += zap->zap_dbuf->db_size; + } else { + *towrite += zap->zap_dbuf->db_size; + } + + /* + * Account for the pointer table blocks. + * If we are adding we need to account for the following cases : + * - If the pointer table is embedded, this operation could force an + * external pointer table. + * - If this already has an external pointer table this operation + * could extend the table. + */ + if (add) { + if (zap->zap_f.zap_phys->zap_ptrtbl.zt_blk == 0) + *towrite += zap->zap_dbuf->db_size; + else + *towrite += (zap->zap_dbuf->db_size * 3); + } + + /* + * Now, check if the block containing leaf is freeable + * and account accordingly. + */ + err = zap_deref_leaf(zap, zn->zn_hash, NULL, RW_READER, &l); + if (err != 0) { + return (err); + } + + if (!add && dmu_buf_freeable(l->l_dbuf)) { + *tooverwrite += l->l_dbuf->db_size; + } else { + /* + * If this an add operation, the leaf block could split. + * Hence, we need to account for an additional leaf block. + */ + *towrite += (add ? 2 : 1) * l->l_dbuf->db_size; + } + + zap_put_leaf(l); + return (0); +} diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/zap_leaf.c b/external/cddl/osnet/dist/uts/common/fs/zfs/zap_leaf.c index da498b6bc9e3f..285d9c56742b1 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/zap_leaf.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/zap_leaf.c @@ -19,24 +19,24 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * The 512-byte leaf is broken into 32 16-byte chunks. 
* chunk number n means l_chunk[n], even though the header precedes it. * the names are stored null-terminated. */ +#include +#include +#include #include +#include #include #include #include -#include -#include static uint16_t *zap_leaf_rehash_entry(zap_leaf_t *l, uint16_t entry); @@ -127,12 +127,12 @@ zap_leaf_byteswap(zap_leaf_phys_t *buf, int size) le = &lc->l_entry; le->le_type = BSWAP_8(le->le_type); - le->le_int_size = BSWAP_8(le->le_int_size); + le->le_value_intlen = BSWAP_8(le->le_value_intlen); le->le_next = BSWAP_16(le->le_next); le->le_name_chunk = BSWAP_16(le->le_name_chunk); - le->le_name_length = BSWAP_16(le->le_name_length); + le->le_name_numints = BSWAP_16(le->le_name_numints); le->le_value_chunk = BSWAP_16(le->le_value_chunk); - le->le_value_length = BSWAP_16(le->le_value_length); + le->le_value_numints = BSWAP_16(le->le_value_numints); le->le_cd = BSWAP_32(le->le_cd); le->le_hash = BSWAP_64(le->le_hash); break; @@ -215,7 +215,7 @@ zap_leaf_chunk_free(zap_leaf_t *l, uint16_t chunk) static uint16_t zap_leaf_array_create(zap_leaf_t *l, const char *buf, - int integer_size, int num_integers) + int integer_size, int num_integers) { uint16_t chunk_head; uint16_t *chunkp = &chunk_head; @@ -273,11 +273,12 @@ zap_leaf_array_free(zap_leaf_t *l, uint16_t *chunkp) static void zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk, int array_int_len, int array_len, int buf_int_len, uint64_t buf_len, - char *buf) + void *buf) { int len = MIN(array_len, buf_len); int byten = 0; uint64_t value = 0; + char *p = buf; ASSERT3U(array_int_len, <=, buf_int_len); @@ -285,7 +286,7 @@ zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk, if (array_int_len == 8 && buf_int_len == 8 && len == 1) { struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; uint8_t *ip = la->la_array; - uint64_t *buf64 = (uint64_t *)buf; + uint64_t *buf64 = buf; *buf64 = (uint64_t)ip[0] << 56 | (uint64_t)ip[1] << 48 | (uint64_t)ip[2] << 40 | (uint64_t)ip[3] << 32 | @@ -300,8 +301,8 @@ zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk, while (chunk != CHAIN_END) { struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; - bcopy(la->la_array, buf, ZAP_LEAF_ARRAY_BYTES); - buf += ZAP_LEAF_ARRAY_BYTES; + bcopy(la->la_array, p, ZAP_LEAF_ARRAY_BYTES); + p += ZAP_LEAF_ARRAY_BYTES; chunk = la->la_next; } return; @@ -316,50 +317,69 @@ zap_leaf_array_read(zap_leaf_t *l, uint16_t chunk, value = (value << 8) | la->la_array[i]; byten++; if (byten == array_int_len) { - stv(buf_int_len, buf, value); + stv(buf_int_len, p, value); byten = 0; len--; if (len == 0) return; - buf += buf_int_len; + p += buf_int_len; } } chunk = la->la_next; } } -/* - * Only to be used on 8-bit arrays. - * array_len is actual len in bytes (not encoded le_value_length). - * namenorm is null-terminated. 
- */ static boolean_t -zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn, int chunk, int array_len) +zap_leaf_array_match(zap_leaf_t *l, zap_name_t *zn, + int chunk, int array_numints) { int bseen = 0; + if (zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY) { + uint64_t *thiskey; + boolean_t match; + + ASSERT(zn->zn_key_intlen == sizeof (*thiskey)); + thiskey = kmem_alloc(array_numints * sizeof (*thiskey), + KM_SLEEP); + + zap_leaf_array_read(l, chunk, sizeof (*thiskey), array_numints, + sizeof (*thiskey), array_numints, thiskey); + match = bcmp(thiskey, zn->zn_key_orig, + array_numints * sizeof (*thiskey)) == 0; + kmem_free(thiskey, array_numints * sizeof (*thiskey)); + return (match); + } + + ASSERT(zn->zn_key_intlen == 1); if (zn->zn_matchtype == MT_FIRST) { - char *thisname = kmem_alloc(array_len, KM_SLEEP); + char *thisname = kmem_alloc(array_numints, KM_SLEEP); boolean_t match; - zap_leaf_array_read(l, chunk, 1, array_len, 1, - array_len, thisname); + zap_leaf_array_read(l, chunk, sizeof (char), array_numints, + sizeof (char), array_numints, thisname); match = zap_match(zn, thisname); - kmem_free(thisname, array_len); + kmem_free(thisname, array_numints); return (match); } - /* Fast path for exact matching */ - while (bseen < array_len) { + /* + * Fast path for exact matching. + * First check that the lengths match, so that we don't read + * past the end of the zn_key_orig array. + */ + if (array_numints != zn->zn_key_orig_numints) + return (B_FALSE); + while (bseen < array_numints) { struct zap_leaf_array *la = &ZAP_LEAF_CHUNK(l, chunk).l_array; - int toread = MIN(array_len - bseen, ZAP_LEAF_ARRAY_BYTES); + int toread = MIN(array_numints - bseen, ZAP_LEAF_ARRAY_BYTES); ASSERT3U(chunk, <, ZAP_LEAF_NUMCHUNKS(l)); - if (bcmp(la->la_array, zn->zn_name_orij + bseen, toread)) + if (bcmp(la->la_array, (char *)zn->zn_key_orig + bseen, toread)) break; chunk = la->la_next; bseen += toread; } - return (bseen == array_len); + return (bseen == array_numints); } /* @@ -394,9 +414,9 @@ zap_leaf_lookup(zap_leaf_t *l, zap_name_t *zn, zap_entry_handle_t *zeh) ASSERT(zn->zn_matchtype == MT_EXACT || (l->l_phys->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED)); if (zap_leaf_array_match(l, zn, le->le_name_chunk, - le->le_name_length)) { - zeh->zeh_num_integers = le->le_value_length; - zeh->zeh_integer_size = le->le_int_size; + le->le_name_numints)) { + zeh->zeh_num_integers = le->le_value_numints; + zeh->zeh_integer_size = le->le_value_intlen; zeh->zeh_cd = le->le_cd; zeh->zeh_hash = le->le_hash; zeh->zeh_chunkp = chunkp; @@ -427,7 +447,7 @@ zap_leaf_lookup_closest(zap_leaf_t *l, { uint16_t chunk; uint64_t besth = -1ULL; - uint32_t bestcd = ZAP_MAXCD; + uint32_t bestcd = -1U; uint16_t bestlh = ZAP_LEAF_HASH_NUMENTRIES(l)-1; uint16_t lh; struct zap_leaf_entry *le; @@ -449,8 +469,8 @@ zap_leaf_lookup_closest(zap_leaf_t *l, besth = le->le_hash; bestcd = le->le_cd; - zeh->zeh_num_integers = le->le_value_length; - zeh->zeh_integer_size = le->le_int_size; + zeh->zeh_num_integers = le->le_value_numints; + zeh->zeh_integer_size = le->le_value_intlen; zeh->zeh_cd = le->le_cd; zeh->zeh_hash = le->le_hash; zeh->zeh_fakechunk = chunk; @@ -460,7 +480,7 @@ zap_leaf_lookup_closest(zap_leaf_t *l, } } - return (bestcd == ZAP_MAXCD ? ENOENT : 0); + return (bestcd == -1U ? 
ENOENT : 0); } int @@ -471,11 +491,12 @@ zap_entry_read(const zap_entry_handle_t *zeh, ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp); ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); - if (le->le_int_size > integer_size) + if (le->le_value_intlen > integer_size) return (EINVAL); - zap_leaf_array_read(zeh->zeh_leaf, le->le_value_chunk, le->le_int_size, - le->le_value_length, integer_size, num_integers, buf); + zap_leaf_array_read(zeh->zeh_leaf, le->le_value_chunk, + le->le_value_intlen, le->le_value_numints, + integer_size, num_integers, buf); if (zeh->zeh_num_integers > num_integers) return (EOVERFLOW); @@ -484,15 +505,21 @@ zap_entry_read(const zap_entry_handle_t *zeh, } int -zap_entry_read_name(const zap_entry_handle_t *zeh, uint16_t buflen, char *buf) +zap_entry_read_name(zap_t *zap, const zap_entry_handle_t *zeh, uint16_t buflen, + char *buf) { struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(zeh->zeh_leaf, *zeh->zeh_chunkp); ASSERT3U(le->le_type, ==, ZAP_CHUNK_ENTRY); - zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 1, - le->le_name_length, 1, buflen, buf); - if (le->le_name_length > buflen) + if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) { + zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 8, + le->le_name_numints, 8, buflen / 8, buf); + } else { + zap_leaf_array_read(zeh->zeh_leaf, le->le_name_chunk, 1, + le->le_name_numints, 1, buflen, buf); + } + if (le->le_name_numints > buflen) return (EOVERFLOW); return (0); } @@ -506,7 +533,7 @@ zap_entry_update(zap_entry_handle_t *zeh, struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, *zeh->zeh_chunkp); delta_chunks = ZAP_LEAF_ARRAY_NCHUNKS(num_integers * integer_size) - - ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_length * le->le_int_size); + ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints * le->le_value_intlen); if ((int)l->l_phys->l_hdr.lh_nfree < delta_chunks) return (EAGAIN); @@ -522,8 +549,8 @@ zap_entry_update(zap_entry_handle_t *zeh, zap_leaf_array_free(l, &le->le_value_chunk); le->le_value_chunk = zap_leaf_array_create(l, buf, integer_size, num_integers); - le->le_value_length = num_integers; - le->le_int_size = integer_size; + le->le_value_numints = num_integers; + le->le_value_intlen = integer_size; return (0); } @@ -550,26 +577,25 @@ zap_entry_remove(zap_entry_handle_t *zeh) } int -zap_entry_create(zap_leaf_t *l, const char *name, uint64_t h, uint32_t cd, +zap_entry_create(zap_leaf_t *l, zap_name_t *zn, uint32_t cd, uint8_t integer_size, uint64_t num_integers, const void *buf, zap_entry_handle_t *zeh) { uint16_t chunk; uint16_t *chunkp; struct zap_leaf_entry *le; - uint64_t namelen, valuelen; + uint64_t valuelen; int numchunks; + uint64_t h = zn->zn_hash; valuelen = integer_size * num_integers; - namelen = strlen(name) + 1; - ASSERT(namelen >= 2); - numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(namelen) + - ZAP_LEAF_ARRAY_NCHUNKS(valuelen); + numchunks = 1 + ZAP_LEAF_ARRAY_NCHUNKS(zn->zn_key_orig_numints * + zn->zn_key_intlen) + ZAP_LEAF_ARRAY_NCHUNKS(valuelen); if (numchunks > ZAP_LEAF_NUMCHUNKS(l)) return (E2BIG); - if (cd == ZAP_MAXCD) { + if (cd == ZAP_NEED_CD) { /* find the lowest unused cd */ if (l->l_phys->l_hdr.lh_flags & ZLF_ENTRIES_CDSORTED) { cd = 0; @@ -586,7 +612,7 @@ zap_entry_create(zap_leaf_t *l, const char *name, uint64_t h, uint32_t cd, } } else { /* old unsorted format; do it the O(n^2) way */ - for (cd = 0; cd < ZAP_MAXCD; cd++) { + for (cd = 0; ; cd++) { for (chunk = *LEAF_HASH_ENTPTR(l, h); chunk != CHAIN_END; chunk = le->le_next) { le = ZAP_LEAF_ENTRY(l, chunk); @@ -601,10 +627,10 @@ zap_entry_create(zap_leaf_t *l, const char 
*name, uint64_t h, uint32_t cd, } } /* - * we would run out of space in a block before we could - * have ZAP_MAXCD entries + * We would run out of space in a block before we could + * store enough entries to run out of CD values. */ - ASSERT3U(cd, <, ZAP_MAXCD); + ASSERT3U(cd, <, zap_maxcd(zn->zn_zap)); } if (l->l_phys->l_hdr.lh_nfree < numchunks) @@ -614,12 +640,13 @@ zap_entry_create(zap_leaf_t *l, const char *name, uint64_t h, uint32_t cd, chunk = zap_leaf_chunk_alloc(l); le = ZAP_LEAF_ENTRY(l, chunk); le->le_type = ZAP_CHUNK_ENTRY; - le->le_name_chunk = zap_leaf_array_create(l, name, 1, namelen); - le->le_name_length = namelen; + le->le_name_chunk = zap_leaf_array_create(l, zn->zn_key_orig, + zn->zn_key_intlen, zn->zn_key_orig_numints); + le->le_name_numints = zn->zn_key_orig_numints; le->le_value_chunk = zap_leaf_array_create(l, buf, integer_size, num_integers); - le->le_value_length = num_integers; - le->le_int_size = integer_size; + le->le_value_numints = num_integers; + le->le_value_intlen = integer_size; le->le_hash = h; le->le_cd = cd; @@ -631,7 +658,7 @@ zap_entry_create(zap_leaf_t *l, const char *name, uint64_t h, uint32_t cd, zeh->zeh_leaf = l; zeh->zeh_num_integers = num_integers; - zeh->zeh_integer_size = le->le_int_size; + zeh->zeh_integer_size = le->le_value_intlen; zeh->zeh_cd = le->le_cd; zeh->zeh_hash = le->le_hash; zeh->zeh_chunkp = chunkp; @@ -673,7 +700,7 @@ zap_entry_normalization_conflict(zap_entry_handle_t *zeh, zap_name_t *zn, allocdzn = B_TRUE; } if (zap_leaf_array_match(zeh->zeh_leaf, zn, - le->le_name_chunk, le->le_name_length)) { + le->le_name_chunk, le->le_name_numints)) { if (allocdzn) zap_name_free(zn); return (B_TRUE); @@ -836,9 +863,9 @@ zap_leaf_stats(zap_t *zap, zap_leaf_t *l, zap_stats_t *zs) struct zap_leaf_entry *le = ZAP_LEAF_ENTRY(l, chunk); - n = 1 + ZAP_LEAF_ARRAY_NCHUNKS(le->le_name_length) + - ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_length * - le->le_int_size); + n = 1 + ZAP_LEAF_ARRAY_NCHUNKS(le->le_name_numints) + + ZAP_LEAF_ARRAY_NCHUNKS(le->le_value_numints * + le->le_value_intlen); n = MIN(n, ZAP_HISTOGRAM_SIZE-1); zs->zs_entries_using_n_chunks[n]++; diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/zap_micro.c b/external/cddl/osnet/dist/uts/common/fs/zfs/zap_micro.c index abba42775bb76..32ffc966f6a15 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/zap_micro.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/zap_micro.c @@ -19,12 +19,11 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - +#include #include #include #include @@ -38,33 +37,92 @@ #include #endif -static int mzap_upgrade(zap_t **zapp, dmu_tx_t *tx); +static int mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags); +uint64_t +zap_getflags(zap_t *zap) +{ + if (zap->zap_ismicro) + return (0); + return (zap->zap_u.zap_fat.zap_phys->zap_flags); +} -static uint64_t -zap_hash(zap_t *zap, const char *normname) +int +zap_hashbits(zap_t *zap) { - const uint8_t *cp; - uint8_t c; - uint64_t crc = zap->zap_salt; + if (zap_getflags(zap) & ZAP_FLAG_HASH64) + return (48); + else + return (28); +} - /* NB: name must already be normalized, if necessary */ +uint32_t +zap_maxcd(zap_t *zap) +{ + if (zap_getflags(zap) & ZAP_FLAG_HASH64) + return ((1<<16)-1); + else + return (-1U); +} - ASSERT(crc != 0); - ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); - for (cp = (const uint8_t *)normname; (c = *cp) != '\0'; cp++) { - crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ c) & 0xFF]; - } +static uint64_t +zap_hash(zap_name_t *zn) +{ + zap_t *zap = zn->zn_zap; + uint64_t h = 0; + if (zap_getflags(zap) & ZAP_FLAG_PRE_HASHED_KEY) { + ASSERT(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY); + h = *(uint64_t *)zn->zn_key_orig; + } else { + h = zap->zap_salt; + ASSERT(h != 0); + ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); + + if (zap_getflags(zap) & ZAP_FLAG_UINT64_KEY) { + int i; + const uint64_t *wp = zn->zn_key_norm; + + ASSERT(zn->zn_key_intlen == 8); + for (i = 0; i < zn->zn_key_norm_numints; wp++, i++) { + int j; + uint64_t word = *wp; + + for (j = 0; j < zn->zn_key_intlen; j++) { + h = (h >> 8) ^ + zfs_crc64_table[(h ^ word) & 0xFF]; + word >>= NBBY; + } + } + } else { + int i, len; + const uint8_t *cp = zn->zn_key_norm; + + /* + * We previously stored the terminating null on + * disk, but didn't hash it, so we need to + * continue to not hash it. (The + * zn_key_*_numints includes the terminating + * null for non-binary keys.) + */ + len = zn->zn_key_norm_numints - 1; + + ASSERT(zn->zn_key_intlen == 1); + for (i = 0; i < len; cp++, i++) { + h = (h >> 8) ^ + zfs_crc64_table[(h ^ *cp) & 0xFF]; + } + } + } /* - * Only use 28 bits, since we need 4 bits in the cookie for the - * collision differentiator. We MUST use the high bits, since - * those are the ones that we first pay attention to when + * Don't use all 64 bits, since we need some in the cookie for + * the collision differentiator. We MUST use the high bits, + * since those are the ones that we first pay attention to when * chosing the bucket. 
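The rewritten zap_hash() above now has three key flavours: a pre-hashed uint64 key is used as-is, a binary uint64 key is fed through the CRC a byte at a time, and a string key keeps the historical behaviour of hashing everything except the terminating null. For orientation, a self-contained sketch of the string case follows; the function names and the polynomial constant are illustrative assumptions, not taken from this patch, and the real table lives in zfs_crc64_table.

#include <stdint.h>

#define	EXAMPLE_CRC64_POLY	0xC96C5795D7870F42ULL	/* assumed value of ZFS_CRC64_POLY */

static uint64_t example_crc64_table[256];

static void
example_crc64_init(void)
{
	for (int i = 0; i < 256; i++) {
		uint64_t c = (uint64_t)i;
		for (int j = 0; j < 8; j++)
			c = (c & 1) ? (c >> 1) ^ EXAMPLE_CRC64_POLY : c >> 1;
		example_crc64_table[i] = c;
	}
}

/* hashbits is zap_hashbits(): 28 for a normal fat ZAP, 48 with ZAP_FLAG_HASH64 */
static uint64_t
example_zap_hash(uint64_t salt, const char *name, int hashbits)
{
	uint64_t h = salt;

	for (const uint8_t *cp = (const uint8_t *)name; *cp != '\0'; cp++)
		h = (h >> 8) ^ example_crc64_table[(h ^ *cp) & 0xFF];

	/* keep only the high-order bits; the low bits stay free for the cd */
	return (h & ~((1ULL << (64 - hashbits)) - 1));
}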
*/ - crc &= ~((1ULL << (64 - ZAP_HASHBITS)) - 1); + h &= ~((1ULL << (64 - zap_hashbits(zap))) - 1); - return (crc); + return (h); } static int @@ -73,13 +131,15 @@ zap_normalize(zap_t *zap, const char *name, char *namenorm) size_t inlen, outlen; int err; + ASSERT(!(zap_getflags(zap) & ZAP_FLAG_UINT64_KEY)); + inlen = strlen(name) + 1; outlen = ZAP_MAXNAMELEN; err = 0; (void) u8_textprep_str((char *)name, &inlen, namenorm, &outlen, - zap->zap_normflags | U8_TEXTPREP_IGNORE_NULL, U8_UNICODE_LATEST, - &err); + zap->zap_normflags | U8_TEXTPREP_IGNORE_NULL | + U8_TEXTPREP_IGNORE_INVALID, U8_UNICODE_LATEST, &err); return (err); } @@ -87,16 +147,18 @@ zap_normalize(zap_t *zap, const char *name, char *namenorm) boolean_t zap_match(zap_name_t *zn, const char *matchname) { + ASSERT(!(zap_getflags(zn->zn_zap) & ZAP_FLAG_UINT64_KEY)); + if (zn->zn_matchtype == MT_FIRST) { char norm[ZAP_MAXNAMELEN]; if (zap_normalize(zn->zn_zap, matchname, norm) != 0) return (B_FALSE); - return (strcmp(zn->zn_name_norm, norm) == 0); + return (strcmp(zn->zn_key_norm, norm) == 0); } else { /* MT_BEST or MT_EXACT */ - return (strcmp(zn->zn_name_orij, matchname) == 0); + return (strcmp(zn->zn_key_orig, matchname) == 0); } } @@ -106,30 +168,49 @@ zap_name_free(zap_name_t *zn) kmem_free(zn, sizeof (zap_name_t)); } -/* XXX combine this with zap_lockdir()? */ zap_name_t * -zap_name_alloc(zap_t *zap, const char *name, matchtype_t mt) +zap_name_alloc(zap_t *zap, const char *key, matchtype_t mt) { zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP); zn->zn_zap = zap; - zn->zn_name_orij = name; + zn->zn_key_intlen = sizeof (*key); + zn->zn_key_orig = key; + zn->zn_key_orig_numints = strlen(zn->zn_key_orig) + 1; zn->zn_matchtype = mt; if (zap->zap_normflags) { - if (zap_normalize(zap, name, zn->zn_normbuf) != 0) { + if (zap_normalize(zap, key, zn->zn_normbuf) != 0) { zap_name_free(zn); return (NULL); } - zn->zn_name_norm = zn->zn_normbuf; + zn->zn_key_norm = zn->zn_normbuf; + zn->zn_key_norm_numints = strlen(zn->zn_key_norm) + 1; } else { if (mt != MT_EXACT) { zap_name_free(zn); return (NULL); } - zn->zn_name_norm = zn->zn_name_orij; + zn->zn_key_norm = zn->zn_key_orig; + zn->zn_key_norm_numints = zn->zn_key_orig_numints; } - zn->zn_hash = zap_hash(zap, zn->zn_name_norm); + zn->zn_hash = zap_hash(zn); + return (zn); +} + +zap_name_t * +zap_name_alloc_uint64(zap_t *zap, const uint64_t *key, int numints) +{ + zap_name_t *zn = kmem_alloc(sizeof (zap_name_t), KM_SLEEP); + + ASSERT(zap->zap_normflags == 0); + zn->zn_zap = zap; + zn->zn_key_intlen = sizeof (*key); + zn->zn_key_orig = zn->zn_key_norm = key; + zn->zn_key_orig_numints = zn->zn_key_norm_numints = numints; + zn->zn_matchtype = MT_EXACT; + + zn->zn_hash = zap_hash(zn); return (zn); } @@ -188,7 +269,7 @@ mze_insert(zap_t *zap, int chunkid, uint64_t hash, mzap_ent_phys_t *mzep) ASSERT(zap->zap_ismicro); ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); - ASSERT(mzep->mze_cd < ZAP_MAXCD); + ASSERT(mzep->mze_cd < zap_maxcd(zap)); mze = kmem_alloc(sizeof (mzap_ent_t), KM_SLEEP); mze->mze_chunkid = chunkid; @@ -208,9 +289,6 @@ mze_find(zap_name_t *zn) ASSERT(zn->zn_zap->zap_ismicro); ASSERT(RW_LOCK_HELD(&zn->zn_zap->zap_rwlock)); - if (strlen(zn->zn_name_norm) >= sizeof (mze_tofind.mze_phys.mze_name)) - return (NULL); - mze_tofind.mze_hash = zn->zn_hash; mze_tofind.mze_phys.mze_cd = 0; @@ -423,7 +501,7 @@ zap_lockdir(objset_t *os, uint64_t obj, dmu_tx_t *tx, dprintf("upgrading obj %llu: num_entries=%u\n", obj, zap->zap_m.zap_num_entries); *zapp = zap; - return (mzap_upgrade(zapp, 
tx)); + return (mzap_upgrade(zapp, tx, 0)); } err = dmu_object_set_blocksize(os, obj, newsz, 0, tx); ASSERT3U(err, ==, 0); @@ -443,10 +521,11 @@ zap_unlockdir(zap_t *zap) } static int -mzap_upgrade(zap_t **zapp, dmu_tx_t *tx) +mzap_upgrade(zap_t **zapp, dmu_tx_t *tx, zap_flags_t flags) { mzap_phys_t *mzp; - int i, sz, nchunks, err; + int i, sz, nchunks; + int err = 0; zap_t *zap = *zapp; ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); @@ -456,11 +535,13 @@ mzap_upgrade(zap_t **zapp, dmu_tx_t *tx) bcopy(zap->zap_dbuf->db_data, mzp, sz); nchunks = zap->zap_m.zap_num_chunks; - err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object, - 1ULL << fzap_default_block_shift, 0, tx); - if (err) { - kmem_free(mzp, sz); - return (err); + if (!flags) { + err = dmu_object_set_blocksize(zap->zap_objset, zap->zap_object, + 1ULL << fzap_default_block_shift, 0, tx); + if (err) { + kmem_free(mzp, sz); + return (err); + } } dprintf("upgrading obj=%llu with %u chunks\n", @@ -468,10 +549,9 @@ mzap_upgrade(zap_t **zapp, dmu_tx_t *tx) /* XXX destroy the avl later, so we can use the stored hash value */ mze_destroy(zap); - fzap_upgrade(zap, tx); + fzap_upgrade(zap, tx, flags); for (i = 0; i < nchunks; i++) { - int err; mzap_ent_phys_t *mze = &mzp->mz_chunk[i]; zap_name_t *zn; if (mze->mze_name[0] == 0) @@ -491,7 +571,8 @@ mzap_upgrade(zap_t **zapp, dmu_tx_t *tx) } static void -mzap_create_impl(objset_t *os, uint64_t obj, int normflags, dmu_tx_t *tx) +mzap_create_impl(objset_t *os, uint64_t obj, int normflags, zap_flags_t flags, + dmu_tx_t *tx) { dmu_buf_t *db; mzap_phys_t *zp; @@ -512,6 +593,15 @@ mzap_create_impl(objset_t *os, uint64_t obj, int normflags, dmu_tx_t *tx) zp->mz_salt = ((uintptr_t)db ^ (uintptr_t)tx ^ (obj << 1)) | 1ULL; zp->mz_normflags = normflags; dmu_buf_rele(db, FTAG); + + if (flags != 0) { + zap_t *zap; + /* Only fat zap supports flags; upgrade immediately. 
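Only the fat ZAP on-disk format has anywhere to record zap_flags_t, which is why a flags-bearing creation is upgraded out of the microzap on the spot, as shown above. A hypothetical in-kernel caller of the flag- and uint64-key-aware interfaces added a little further down in this file (zap_create_flags(), zap_add_uint64(), zap_lookup_uint64()) might look roughly like this; the object type, block shifts and bonus settings are placeholder assumptions, not a recommendation.

static int
example_uint64_keyed_zap(objset_t *os, dmu_object_type_t ot, dmu_tx_t *tx)
{
	uint64_t key[2] = { 0x1234, 0x5678 };	/* arbitrary two-word binary key */
	uint64_t val = 42, readback = 0;
	uint64_t obj;
	int err;

	obj = zap_create_flags(os, 0, ZAP_FLAG_HASH64 | ZAP_FLAG_UINT64_KEY,
	    ot, 12 /* 4K leaf blocks */, 12 /* 4K indirect blocks */,
	    DMU_OT_NONE, 0, tx);

	err = zap_add_uint64(os, obj, key, 2, sizeof (uint64_t), 1, &val, tx);
	if (err == 0)
		err = zap_lookup_uint64(os, obj, key, 2,
		    sizeof (uint64_t), 1, &readback);
	return (err);
}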
*/ + VERIFY(0 == zap_lockdir(os, obj, tx, RW_WRITER, + B_FALSE, B_FALSE, &zap)); + VERIFY3U(0, ==, mzap_upgrade(&zap, tx, flags)); + zap_unlockdir(zap); + } } int @@ -532,7 +622,7 @@ zap_create_claim_norm(objset_t *os, uint64_t obj, int normflags, err = dmu_object_claim(os, obj, ot, 0, bonustype, bonuslen, tx); if (err != 0) return (err); - mzap_create_impl(os, obj, normflags, tx); + mzap_create_impl(os, obj, normflags, 0, tx); return (0); } @@ -549,7 +639,26 @@ zap_create_norm(objset_t *os, int normflags, dmu_object_type_t ot, { uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx); - mzap_create_impl(os, obj, normflags, tx); + mzap_create_impl(os, obj, normflags, 0, tx); + return (obj); +} + +uint64_t +zap_create_flags(objset_t *os, int normflags, zap_flags_t flags, + dmu_object_type_t ot, int leaf_blockshift, int indirect_blockshift, + dmu_object_type_t bonustype, int bonuslen, dmu_tx_t *tx) +{ + uint64_t obj = dmu_object_alloc(os, ot, 0, bonustype, bonuslen, tx); + + ASSERT(leaf_blockshift >= SPA_MINBLOCKSHIFT && + leaf_blockshift <= SPA_MAXBLOCKSHIFT && + indirect_blockshift >= SPA_MINBLOCKSHIFT && + indirect_blockshift <= SPA_MAXBLOCKSHIFT); + + VERIFY(dmu_object_set_blocksize(os, obj, + 1ULL << leaf_blockshift, indirect_blockshift, tx) == 0); + + mzap_create_impl(os, obj, normflags, flags, tx); return (obj); } @@ -700,6 +809,40 @@ zap_lookup_norm(objset_t *os, uint64_t zapobj, const char *name, return (err); } +int +zap_lookup_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, uint64_t integer_size, uint64_t num_integers, void *buf) +{ + zap_t *zap; + int err; + zap_name_t *zn; + + err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); + if (err) + return (err); + zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlockdir(zap); + return (ENOTSUP); + } + + err = fzap_lookup(zn, integer_size, num_integers, buf, + NULL, 0, NULL); + zap_name_free(zn); + zap_unlockdir(zap); + return (err); +} + +int +zap_contains(objset_t *os, uint64_t zapobj, const char *name) +{ + int err = (zap_lookup_norm(os, zapobj, name, 0, + 0, NULL, MT_EXACT, NULL, 0, NULL)); + if (err == EOVERFLOW || err == EINVAL) + err = 0; /* found, but skipped reading the value */ + return (err); +} + int zap_length(objset_t *os, uint64_t zapobj, const char *name, uint64_t *integer_size, uint64_t *num_integers) @@ -735,6 +878,28 @@ zap_length(objset_t *os, uint64_t zapobj, const char *name, return (err); } +int +zap_length_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, uint64_t *integer_size, uint64_t *num_integers) +{ + zap_t *zap; + int err; + zap_name_t *zn; + + err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); + if (err) + return (err); + zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlockdir(zap); + return (ENOTSUP); + } + err = fzap_length(zn, integer_size, num_integers); + zap_name_free(zn); + zap_unlockdir(zap); + return (err); +} + static void mzap_addent(zap_name_t *zn, uint64_t value) { @@ -743,20 +908,18 @@ mzap_addent(zap_name_t *zn, uint64_t value) int start = zap->zap_m.zap_alloc_next; uint32_t cd; - dprintf("obj=%llu %s=%llu\n", zap->zap_object, - zn->zn_name_orij, value); ASSERT(RW_WRITE_HELD(&zap->zap_rwlock)); #ifdef ZFS_DEBUG for (i = 0; i < zap->zap_m.zap_num_chunks; i++) { mzap_ent_phys_t *mze = &zap->zap_m.zap_phys->mz_chunk[i]; - ASSERT(strcmp(zn->zn_name_orij, mze->mze_name) != 0); + ASSERT(strcmp(zn->zn_key_orig, mze->mze_name) != 0); } 
#endif cd = mze_find_unused_cd(zap, zn->zn_hash); /* given the limited size of the microzap, this can't happen */ - ASSERT(cd != ZAP_MAXCD); + ASSERT(cd < zap_maxcd(zap)); again: for (i = start; i < zap->zap_m.zap_num_chunks; i++) { @@ -764,7 +927,7 @@ mzap_addent(zap_name_t *zn, uint64_t value) if (mze->mze_name[0] == 0) { mze->mze_value = value; mze->mze_cd = cd; - (void) strcpy(mze->mze_name, zn->zn_name_orij); + (void) strcpy(mze->mze_name, zn->zn_key_orig); zap->zap_m.zap_num_entries++; zap->zap_m.zap_alloc_next = i+1; if (zap->zap_m.zap_alloc_next == @@ -782,7 +945,7 @@ mzap_addent(zap_name_t *zn, uint64_t value) } int -zap_add(objset_t *os, uint64_t zapobj, const char *name, +zap_add(objset_t *os, uint64_t zapobj, const char *key, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) { @@ -795,7 +958,7 @@ zap_add(objset_t *os, uint64_t zapobj, const char *name, err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap); if (err) return (err); - zn = zap_name_alloc(zap, name, MT_EXACT); + zn = zap_name_alloc(zap, key, MT_EXACT); if (zn == NULL) { zap_unlockdir(zap); return (ENOTSUP); @@ -804,10 +967,8 @@ zap_add(objset_t *os, uint64_t zapobj, const char *name, err = fzap_add(zn, integer_size, num_integers, val, tx); zap = zn->zn_zap; /* fzap_add() may change zap */ } else if (integer_size != 8 || num_integers != 1 || - strlen(name) >= MZAP_NAME_LEN) { - dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n", - zapobj, integer_size, num_integers, name); - err = mzap_upgrade(&zn->zn_zap, tx); + strlen(key) >= MZAP_NAME_LEN) { + err = mzap_upgrade(&zn->zn_zap, tx, 0); if (err == 0) err = fzap_add(zn, integer_size, num_integers, val, tx); zap = zn->zn_zap; /* fzap_add() may change zap */ @@ -826,6 +987,31 @@ zap_add(objset_t *os, uint64_t zapobj, const char *name, return (err); } +int +zap_add_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, int integer_size, uint64_t num_integers, + const void *val, dmu_tx_t *tx) +{ + zap_t *zap; + int err; + zap_name_t *zn; + + err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap); + if (err) + return (err); + zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlockdir(zap); + return (ENOTSUP); + } + err = fzap_add(zn, integer_size, num_integers, val, tx); + zap = zn->zn_zap; /* fzap_add() may change zap */ + zap_name_free(zn); + if (zap != NULL) /* may be NULL if fzap_add() failed */ + zap_unlockdir(zap); + return (err); +} + int zap_update(objset_t *os, uint64_t zapobj, const char *name, int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) @@ -851,7 +1037,7 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name, strlen(name) >= MZAP_NAME_LEN) { dprintf("upgrading obj %llu: intsz=%u numint=%llu name=%s\n", zapobj, integer_size, num_integers, name); - err = mzap_upgrade(&zn->zn_zap, tx); + err = mzap_upgrade(&zn->zn_zap, tx, 0); if (err == 0) err = fzap_update(zn, integer_size, num_integers, val, tx); @@ -873,6 +1059,31 @@ zap_update(objset_t *os, uint64_t zapobj, const char *name, return (err); } +int +zap_update_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, + int integer_size, uint64_t num_integers, const void *val, dmu_tx_t *tx) +{ + zap_t *zap; + zap_name_t *zn; + int err; + + err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, TRUE, &zap); + if (err) + return (err); + zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlockdir(zap); + return (ENOTSUP); + } + err = 
fzap_update(zn, integer_size, num_integers, val, tx); + zap = zn->zn_zap; /* fzap_update() may change zap */ + zap_name_free(zn); + if (zap != NULL) /* may be NULL if fzap_upgrade() failed */ + zap_unlockdir(zap); + return (err); +} + int zap_remove(objset_t *os, uint64_t zapobj, const char *name, dmu_tx_t *tx) { @@ -914,17 +1125,32 @@ zap_remove_norm(objset_t *os, uint64_t zapobj, const char *name, return (err); } +int +zap_remove_uint64(objset_t *os, uint64_t zapobj, const uint64_t *key, + int key_numints, dmu_tx_t *tx) +{ + zap_t *zap; + int err; + zap_name_t *zn; + + err = zap_lockdir(os, zapobj, tx, RW_WRITER, TRUE, FALSE, &zap); + if (err) + return (err); + zn = zap_name_alloc_uint64(zap, key, key_numints); + if (zn == NULL) { + zap_unlockdir(zap); + return (ENOTSUP); + } + err = fzap_remove(zn, tx); + zap_name_free(zn); + zap_unlockdir(zap); + return (err); +} + /* * Routines for iterating over the attributes. */ -/* - * We want to keep the high 32 bits of the cursor zero if we can, so - * that 32-bit programs can access this. So use a small hash value so - * we can fit 4 bits of cd into the 32-bit cursor. - * - * [ 4 zero bits | 32-bit collision differentiator | 28-bit hash value ] - */ void zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj, uint64_t serialized) @@ -933,15 +1159,9 @@ zap_cursor_init_serialized(zap_cursor_t *zc, objset_t *os, uint64_t zapobj, zc->zc_zap = NULL; zc->zc_leaf = NULL; zc->zc_zapobj = zapobj; - if (serialized == -1ULL) { - zc->zc_hash = -1ULL; - zc->zc_cd = 0; - } else { - zc->zc_hash = serialized << (64-ZAP_HASHBITS); - zc->zc_cd = serialized >> ZAP_HASHBITS; - if (zc->zc_cd >= ZAP_MAXCD) /* corrupt serialized */ - zc->zc_cd = 0; - } + zc->zc_serialized = serialized; + zc->zc_hash = 0; + zc->zc_cd = 0; } void @@ -971,10 +1191,21 @@ zap_cursor_serialize(zap_cursor_t *zc) { if (zc->zc_hash == -1ULL) return (-1ULL); - ASSERT((zc->zc_hash & (ZAP_MAXCD-1)) == 0); - ASSERT(zc->zc_cd < ZAP_MAXCD); - return ((zc->zc_hash >> (64-ZAP_HASHBITS)) | - ((uint64_t)zc->zc_cd << ZAP_HASHBITS)); + if (zc->zc_zap == NULL) + return (zc->zc_serialized); + ASSERT((zc->zc_hash & zap_maxcd(zc->zc_zap)) == 0); + ASSERT(zc->zc_cd < zap_maxcd(zc->zc_zap)); + + /* + * We want to keep the high 32 bits of the cursor zero if we can, so + * that 32-bit programs can access this. So usually use a small + * (28-bit) hash value so we can fit 4 bits of cd into the low 32-bits + * of the cursor. + * + * [ collision differentiator | zap_hashbits()-bit hash value ] + */ + return ((zc->zc_hash >> (64 - zap_hashbits(zc->zc_zap))) | + ((uint64_t)zc->zc_cd << zap_hashbits(zc->zc_zap))); } int @@ -989,10 +1220,23 @@ zap_cursor_retrieve(zap_cursor_t *zc, zap_attribute_t *za) return (ENOENT); if (zc->zc_zap == NULL) { + int hb; err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL, RW_READER, TRUE, FALSE, &zc->zc_zap); if (err) return (err); + + /* + * To support zap_cursor_init_serialized, advance, retrieve, + * we must add to the existing zc_cd, which may already + * be 1 due to the zap_cursor_advance. 
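Because the hash width is no longer a compile-time constant, a serialized cursor cannot be decoded until the ZAP is locked and its flags are known, which is why the unpacking now happens in zap_cursor_retrieve() instead of at init time. The arithmetic itself round-trips as in this self-contained sketch (invented names; note that the real retrieve path adds the unpacked cd onto zc_cd, which may already be 1 after a zap_cursor_advance()).

#include <stdint.h>

/* hb is zap_hashbits(): 28 normally, 48 for ZAP_FLAG_HASH64 objects */
static uint64_t
example_cursor_pack(uint64_t hash, uint32_t cd, int hb)
{
	/* low hb bits carry the hash, everything above them carries the cd */
	return ((hash >> (64 - hb)) | ((uint64_t)cd << hb));
}

static void
example_cursor_unpack(uint64_t serialized, int hb, uint64_t *hashp, uint32_t *cdp)
{
	*hashp = serialized << (64 - hb);
	*cdp = (uint32_t)(serialized >> hb);
}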
+ */ + ASSERT(zc->zc_hash == 0); + hb = zap_hashbits(zc->zc_zap); + zc->zc_hash = zc->zc_serialized << (64 - hb); + zc->zc_cd += zc->zc_serialized >> hb; + if (zc->zc_cd >= zap_maxcd(zc->zc_zap)) /* corrupt serialized */ + zc->zc_cd = 0; } else { rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); } @@ -1037,12 +1281,46 @@ zap_cursor_advance(zap_cursor_t *zc) if (zc->zc_hash == -1ULL) return; zc->zc_cd++; - if (zc->zc_cd >= ZAP_MAXCD) { - zc->zc_cd = 0; - zc->zc_hash += 1ULL<<(64-ZAP_HASHBITS); - if (zc->zc_hash == 0) /* EOF */ - zc->zc_hash = -1ULL; +} + +int +zap_cursor_move_to_key(zap_cursor_t *zc, const char *name, matchtype_t mt) +{ + int err = 0; + mzap_ent_t *mze; + zap_name_t *zn; + + if (zc->zc_zap == NULL) { + err = zap_lockdir(zc->zc_objset, zc->zc_zapobj, NULL, + RW_READER, TRUE, FALSE, &zc->zc_zap); + if (err) + return (err); + } else { + rw_enter(&zc->zc_zap->zap_rwlock, RW_READER); } + + zn = zap_name_alloc(zc->zc_zap, name, mt); + if (zn == NULL) { + rw_exit(&zc->zc_zap->zap_rwlock); + return (ENOTSUP); + } + + if (!zc->zc_zap->zap_ismicro) { + err = fzap_cursor_move_to_key(zc, zn); + } else { + mze = mze_find(zn); + if (mze == NULL) { + err = ENOENT; + goto out; + } + zc->zc_hash = mze->mze_hash; + zc->zc_cd = mze->mze_phys.mze_cd; + } + +out: + zap_name_free(zn); + rw_exit(&zc->zc_zap->zap_rwlock); + return (err); } int @@ -1067,3 +1345,79 @@ zap_get_stats(objset_t *os, uint64_t zapobj, zap_stats_t *zs) zap_unlockdir(zap); return (0); } + +int +zap_count_write(objset_t *os, uint64_t zapobj, const char *name, int add, + uint64_t *towrite, uint64_t *tooverwrite) +{ + zap_t *zap; + int err = 0; + + + /* + * Since, we don't have a name, we cannot figure out which blocks will + * be affected in this operation. So, account for the worst case : + * - 3 blocks overwritten: target leaf, ptrtbl block, header block + * - 4 new blocks written if adding: + * - 2 blocks for possibly split leaves, + * - 2 grown ptrtbl blocks + * + * This also accomodates the case where an add operation to a fairly + * large microzap results in a promotion to fatzap. + */ + if (name == NULL) { + *towrite += (3 + (add ? 4 : 0)) * SPA_MAXBLOCKSIZE; + return (err); + } + + /* + * We lock the zap with adding == FALSE. Because, if we pass + * the actual value of add, it could trigger a mzap_upgrade(). + * At present we are just evaluating the possibility of this operation + * and hence we donot want to trigger an upgrade. + */ + err = zap_lockdir(os, zapobj, NULL, RW_READER, TRUE, FALSE, &zap); + if (err) + return (err); + + if (!zap->zap_ismicro) { + zap_name_t *zn = zap_name_alloc(zap, name, MT_EXACT); + if (zn) { + err = fzap_count_write(zn, add, towrite, + tooverwrite); + zap_name_free(zn); + } else { + /* + * We treat this case as similar to (name == NULL) + */ + *towrite += (3 + (add ? 4 : 0)) * SPA_MAXBLOCKSIZE; + } + } else { + /* + * We are here if (name != NULL) and this is a micro-zap. + * We account for the header block depending on whether it + * is freeable. + * + * Incase of an add-operation it is hard to find out + * if this add will promote this microzap to fatzap. + * Hence, we consider the worst case and account for the + * blocks assuming this microzap would be promoted to a + * fatzap. 
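For the name == NULL branch above the reservation is a fixed pessimistic figure rather than anything derived from the object itself. Restated as self-contained arithmetic (SPA_MAXBLOCKSIZE is 128K at the time of this change; the macro below is an invented stand-in):

#include <stdint.h>

#define	EXAMPLE_SPA_MAXBLOCKSIZE	(128ULL * 1024)

static uint64_t
example_zap_blind_towrite(int add)
{
	/*
	 * 3 overwritten blocks: target leaf, ptrtbl block, header block;
	 * plus 4 new blocks when adding: 2 split leaves, 2 grown ptrtbl blocks.
	 */
	return ((3 + (add ? 4 : 0)) * EXAMPLE_SPA_MAXBLOCKSIZE);
}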
+ * + * 1 block overwritten : header block + * 4 new blocks written : 2 new split leaf, 2 grown + * ptrtbl blocks + */ + if (dmu_buf_freeable(zap->zap_dbuf)) + *tooverwrite += SPA_MAXBLOCKSIZE; + else + *towrite += SPA_MAXBLOCKSIZE; + + if (add) { + *towrite += 4 * SPA_MAXBLOCKSIZE; + } + } + + zap_unlockdir(zap); + return (err); +} diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_acl.c b/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_acl.c index 341dc4dfe7436..36e39a320cba0 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_acl.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_acl.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -65,15 +65,16 @@ ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS) #define OWNER_ALLOW_MASK (ACE_WRITE_ACL | ACE_WRITE_OWNER | \ ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS) -#define WRITE_MASK_DATA (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS) #define ZFS_CHECKED_MASKS (ACE_READ_ACL|ACE_READ_ATTRIBUTES|ACE_READ_DATA| \ ACE_READ_NAMED_ATTRS|ACE_WRITE_DATA|ACE_WRITE_ATTRIBUTES| \ ACE_WRITE_NAMED_ATTRS|ACE_APPEND_DATA|ACE_EXECUTE|ACE_WRITE_OWNER| \ ACE_WRITE_ACL|ACE_DELETE|ACE_DELETE_CHILD|ACE_SYNCHRONIZE) -#define WRITE_MASK (WRITE_MASK_DATA|ACE_WRITE_ATTRIBUTES|ACE_WRITE_ACL|\ - ACE_WRITE_OWNER|ACE_DELETE|ACE_DELETE_CHILD) +#define WRITE_MASK_DATA (ACE_WRITE_DATA|ACE_APPEND_DATA|ACE_WRITE_NAMED_ATTRS) +#define WRITE_MASK_ATTRS (ACE_WRITE_ACL|ACE_WRITE_OWNER|ACE_WRITE_ATTRIBUTES| \ + ACE_DELETE|ACE_DELETE_CHILD) +#define WRITE_MASK (WRITE_MASK_DATA|WRITE_MASK_ATTRS) #define OGE_CLEAR (ACE_READ_DATA|ACE_LIST_DIRECTORY|ACE_WRITE_DATA| \ ACE_ADD_FILE|ACE_APPEND_DATA|ACE_ADD_SUBDIRECTORY|ACE_EXECUTE) @@ -92,6 +93,8 @@ #define ZFS_ACL_WIDE_FLAGS (V4_ACL_WIDE_FLAGS|ZFS_ACL_TRIVIAL|ZFS_INHERIT_ACE|\ ZFS_ACL_OBJ_ACE) +#define ALL_MODE_EXECS (S_IXUSR | S_IXGRP | S_IXOTH) + static uint16_t zfs_ace_v0_get_type(void *acep) { @@ -538,8 +541,9 @@ zfs_acl_curr_node(zfs_acl_t *aclp) * ACE FUIDs will be created later. */ int -zfs_copy_ace_2_fuid(vtype_t obj_type, zfs_acl_t *aclp, void *datap, - zfs_ace_t *z_acl, int aclcnt, size_t *size) +zfs_copy_ace_2_fuid(zfsvfs_t *zfsvfs, vtype_t obj_type, zfs_acl_t *aclp, + void *datap, zfs_ace_t *z_acl, int aclcnt, size_t *size, + zfs_fuid_info_t **fuidp, cred_t *cr) { int i; uint16_t entry_type; @@ -555,9 +559,9 @@ zfs_copy_ace_2_fuid(vtype_t obj_type, zfs_acl_t *aclp, void *datap, entry_type = aceptr->z_hdr.z_flags & ACE_TYPE_FLAGS; if (entry_type != ACE_OWNER && entry_type != OWNING_GROUP && entry_type != ACE_EVERYONE) { - if (!aclp->z_has_fuids) - aclp->z_has_fuids = IS_EPHEMERAL(acep->a_who); - aceptr->z_fuid = (uint64_t)acep->a_who; + aceptr->z_fuid = zfs_fuid_create(zfsvfs, acep->a_who, + cr, (entry_type == 0) ? 
+ ZFS_ACE_USER : ZFS_ACE_GROUP, fuidp); } /* @@ -682,7 +686,7 @@ zfs_copy_ace_2_oldace(vtype_t obj_type, zfs_acl_t *aclp, ace_t *acep, * convert old ACL format to new */ void -zfs_acl_xform(znode_t *zp, zfs_acl_t *aclp) +zfs_acl_xform(znode_t *zp, zfs_acl_t *aclp, cred_t *cr) { zfs_oldace_t *oldaclp; int i; @@ -714,9 +718,9 @@ zfs_acl_xform(znode_t *zp, zfs_acl_t *aclp) newaclnode = zfs_acl_node_alloc(aclp->z_acl_count * sizeof (zfs_object_ace_t)); aclp->z_ops = zfs_acl_fuid_ops; - VERIFY(zfs_copy_ace_2_fuid(ZTOV(zp)->v_type, aclp, oldaclp, - newaclnode->z_acldata, aclp->z_acl_count, - &newaclnode->z_size) == 0); + VERIFY(zfs_copy_ace_2_fuid(zp->z_zfsvfs, ZTOV(zp)->v_type, aclp, + oldaclp, newaclnode->z_acldata, aclp->z_acl_count, + &newaclnode->z_size, NULL, cr) == 0); newaclnode->z_ace_count = aclp->z_acl_count; aclp->z_version = ZFS_ACL_VERSION; kmem_free(oldaclp, aclp->z_acl_count * sizeof (zfs_oldace_t)); @@ -770,8 +774,7 @@ zfs_set_ace(zfs_acl_t *aclp, void *acep, uint32_t access_mask, * Also, create FUIDs for any User/Group ACEs */ static uint64_t -zfs_mode_fuid_compute(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, - zfs_fuid_info_t **fuidp, dmu_tx_t *tx) +zfs_mode_compute(znode_t *zp, zfs_acl_t *aclp) { int entry_type; mode_t mode; @@ -780,6 +783,7 @@ zfs_mode_fuid_compute(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, uint64_t who; uint16_t iflags, type; uint32_t access_mask; + boolean_t an_exec_denied = B_FALSE; mode = (zp->z_phys->zp_mode & (S_IFMT | S_ISUID | S_ISGID | S_ISVTX)); @@ -904,17 +908,32 @@ zfs_mode_fuid_compute(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, } } } - } - /* - * Now handle FUID create for user/group ACEs - */ - if (entry_type == 0 || entry_type == ACE_IDENTIFIER_GROUP) { - aclp->z_ops.ace_who_set(acep, - zfs_fuid_create(zp->z_zfsvfs, who, cr, - (entry_type == 0) ? ZFS_ACE_USER : ZFS_ACE_GROUP, - tx, fuidp)); + } else { + /* + * Only care if this IDENTIFIER_GROUP or + * USER ACE denies execute access to someone, + * mode is not affected + */ + if ((access_mask & ACE_EXECUTE) && type == DENY) + an_exec_denied = B_TRUE; } } + + /* + * Failure to allow is effectively a deny, so execute permission + * is denied if it was never mentioned or if we explicitly + * weren't allowed it. + */ + if (!an_exec_denied && + ((seen & ALL_MODE_EXECS) != ALL_MODE_EXECS || + (mode & ALL_MODE_EXECS) != ALL_MODE_EXECS)) + an_exec_denied = B_TRUE; + + if (an_exec_denied) + zp->z_phys->zp_flags &= ~ZFS_NO_EXECS_DENIED; + else + zp->z_phys->zp_flags |= ZFS_NO_EXECS_DENIED; + return (mode); } @@ -954,7 +973,8 @@ zfs_acl_node_read_internal(znode_t *zp, boolean_t will_modify) } /* - * Read an external acl object. + * Read an external acl object. If the intent is to modify, always + * create a new acl and leave any cached acl in place. 
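The caching rule just stated boils down to: readers may share zp->z_acl_cached, writers always get a private copy, and only readers ever populate the cache (zfs_aclset_common(), just below, additionally drops the cached copy when a new ACL is committed). A generic, self-contained sketch of that pattern with invented names:

#include <stdlib.h>

struct example_acl {
	int	placeholder;		/* payload not modelled */
};

struct example_node {
	struct example_acl *cached;	/* plays the role of z_acl_cached */
};

static struct example_acl *
example_acl_load(void)			/* stands in for the dmu_read() path */
{
	return (calloc(1, sizeof (struct example_acl)));
}

static struct example_acl *
example_acl_get(struct example_node *np, int will_modify)
{
	if (np->cached != NULL && !will_modify)
		return (np->cached);		/* shared, read-only copy */

	struct example_acl *aclp = example_acl_load();	/* private copy */

	if (aclp != NULL && !will_modify)
		np->cached = aclp;		/* only readers fill the cache */
	return (aclp);
}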
*/ static int zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify) @@ -968,8 +988,15 @@ zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify) ASSERT(MUTEX_HELD(&zp->z_acl_lock)); + if (zp->z_acl_cached && !will_modify) { + *aclpp = zp->z_acl_cached; + return (0); + } + if (zp->z_phys->zp_acl.z_acl_extern_obj == 0) { *aclpp = zfs_acl_node_read_internal(zp, will_modify); + if (!will_modify) + zp->z_acl_cached = *aclpp; return (0); } @@ -989,7 +1016,7 @@ zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify) aclnode = zfs_acl_node_alloc(aclsize); list_insert_head(&aclp->z_acl, aclnode); error = dmu_read(zp->z_zfsvfs->z_os, extacl, 0, - aclsize, aclnode->z_acldata); + aclsize, aclnode->z_acldata, DMU_READ_PREFETCH); aclnode->z_ace_count = acl_count; aclp->z_acl_count = acl_count; aclp->z_acl_bytes = aclsize; @@ -1003,6 +1030,8 @@ zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify) } *aclpp = aclp; + if (!will_modify) + zp->z_acl_cached = aclp; return (0); } @@ -1014,8 +1043,7 @@ zfs_acl_node_read(znode_t *zp, zfs_acl_t **aclpp, boolean_t will_modify) * already checked the acl and knows whether to inherit. */ int -zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, - zfs_fuid_info_t **fuidp, dmu_tx_t *tx) +zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_tx_t *tx) { int error; znode_phys_t *zphys = zp->z_phys; @@ -1026,16 +1054,18 @@ zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, dmu_object_type_t otype; zfs_acl_node_t *aclnode; - ASSERT(MUTEX_HELD(&zp->z_lock)); - ASSERT(MUTEX_HELD(&zp->z_acl_lock)); - dmu_buf_will_dirty(zp->z_dbuf, tx); - zphys->zp_mode = zfs_mode_fuid_compute(zp, aclp, cr, fuidp, tx); + if (zp->z_acl_cached) { + zfs_acl_free(zp->z_acl_cached); + zp->z_acl_cached = NULL; + } + + zphys->zp_mode = zfs_mode_compute(zp, aclp); /* - * Decide which opbject type to use. If we are forced to - * use old ACL format than transform ACL into zfs_oldace_t + * Decide which object type to use. If we are forced to + * use old ACL format then transform ACL into zfs_oldace_t * layout. 
*/ if (!zfsvfs->z_use_fuids) { @@ -1043,7 +1073,7 @@ zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, } else { if ((aclp->z_version == ZFS_ACL_VERSION_INITIAL) && (zfsvfs->z_version >= ZPL_VERSION_FUID)) - zfs_acl_xform(zp, aclp); + zfs_acl_xform(zp, aclp, cr); ASSERT(aclp->z_version >= ZFS_ACL_VERSION_FUID); otype = DMU_OT_ACL; } @@ -1125,7 +1155,6 @@ zfs_aclset_common(znode_t *zp, zfs_acl_t *aclp, cred_t *cr, if (ace_trivial_common(aclp, 0, zfs_ace_walk) == 0) zp->z_phys->zp_flags |= ZFS_ACL_TRIVIAL; - zfs_time_stamper_locked(zp, STATE_CHANGED, tx); return (0); } @@ -1336,7 +1365,7 @@ zfs_acl_ace_insert(zfs_acl_t *aclp, void *acep) * Prepend deny ACE */ static void * -zfs_acl_prepend_deny(znode_t *zp, zfs_acl_t *aclp, void *acep, +zfs_acl_prepend_deny(uint64_t uid, zfs_acl_t *aclp, void *acep, mode_t mode) { zfs_acl_node_t *aclnode; @@ -1349,7 +1378,7 @@ zfs_acl_prepend_deny(znode_t *zp, zfs_acl_t *aclp, void *acep, fuid = aclp->z_ops.ace_who_get(acep); flags = aclp->z_ops.ace_flags_get(acep); zfs_set_ace(aclp, newacep, 0, DENY, fuid, (flags & ACE_TYPE_FLAGS)); - zfs_acl_prepend_fixup(aclp, newacep, acep, mode, zp->z_phys->zp_uid); + zfs_acl_prepend_fixup(aclp, newacep, acep, mode, uid); return (newacep); } @@ -1473,9 +1502,9 @@ zfs_fixup_group_entries(zfs_acl_t *aclp, void *acep, void *prevacep, * in PSARC/2002/240 */ static void -zfs_acl_chmod(znode_t *zp, uint64_t mode, zfs_acl_t *aclp) +zfs_acl_chmod(zfsvfs_t *zfsvfs, uint64_t uid, + uint64_t mode, zfs_acl_t *aclp) { - zfsvfs_t *zfsvfs = zp->z_zfsvfs; void *acep = NULL, *prevacep = NULL; uint64_t who; int i; @@ -1485,11 +1514,6 @@ zfs_acl_chmod(znode_t *zp, uint64_t mode, zfs_acl_t *aclp) uint16_t iflags, type; uint32_t access_mask; - ASSERT(MUTEX_HELD(&zp->z_acl_lock)); - ASSERT(MUTEX_HELD(&zp->z_lock)); - - aclp->z_hints = (zp->z_phys->zp_flags & V4_ACL_WIDE_FLAGS); - /* * If discard then just discard all ACL nodes which * represent the ACEs. @@ -1554,17 +1578,15 @@ zfs_acl_chmod(znode_t *zp, uint64_t mode, zfs_acl_t *aclp) if (!reuse_deny) { prevacep = - zfs_acl_prepend_deny(zp, + zfs_acl_prepend_deny(uid, aclp, acep, mode); } else { zfs_acl_prepend_fixup( aclp, prevacep, - acep, mode, - zp->z_phys->zp_uid); + acep, mode, uid); } zfs_fixup_group_entries(aclp, acep, prevacep, mode); - } } } @@ -1623,8 +1645,10 @@ zfs_acl_chmod_setattr(znode_t *zp, zfs_acl_t **aclp, uint64_t mode) mutex_enter(&zp->z_acl_lock); *aclp = NULL; error = zfs_acl_node_read(zp, aclp, B_TRUE); - if (error == 0) - zfs_acl_chmod(zp, mode, *aclp); + if (error == 0) { + (*aclp)->z_hints = zp->z_phys->zp_flags & V4_ACL_WIDE_FLAGS; + zfs_acl_chmod(zp->z_zfsvfs, zp->z_phys->zp_uid, mode, *aclp); + } mutex_exit(&zp->z_acl_lock); mutex_exit(&zp->z_lock); return (error); @@ -1649,9 +1673,8 @@ zfs_restricted_update(zfsvfs_t *zfsvfs, zfs_acl_t *aclp, void *acep) * Should ACE be inherited? 
*/ static int -zfs_ace_can_use(znode_t *zp, uint16_t acep_flags) +zfs_ace_can_use(vtype_t vtype, uint16_t acep_flags) { - int vtype = ZTOV(zp)->v_type; int iflags = (acep_flags & 0xf); if ((vtype == VDIR) && (iflags & ACE_DIRECTORY_INHERIT_ACE)) @@ -1666,10 +1689,9 @@ zfs_ace_can_use(znode_t *zp, uint16_t acep_flags) * inherit inheritable ACEs from parent */ static zfs_acl_t * -zfs_acl_inherit(znode_t *zp, zfs_acl_t *paclp, uint64_t mode, - boolean_t *need_chmod) +zfs_acl_inherit(zfsvfs_t *zfsvfs, vtype_t vtype, zfs_acl_t *paclp, + uint64_t mode, boolean_t *need_chmod) { - zfsvfs_t *zfsvfs = zp->z_zfsvfs; void *pacep; void *acep, *acep2; zfs_acl_node_t *aclnode, *aclnode2; @@ -1680,8 +1702,8 @@ zfs_acl_inherit(znode_t *zp, zfs_acl_t *paclp, uint64_t mode, size_t ace_size; void *data1, *data2; size_t data1sz, data2sz; - boolean_t vdir = ZTOV(zp)->v_type == VDIR; - boolean_t vreg = ZTOV(zp)->v_type == VREG; + boolean_t vdir = vtype == VDIR; + boolean_t vreg = vtype == VREG; boolean_t passthrough, passthrough_x, noallow; passthrough_x = @@ -1710,7 +1732,7 @@ zfs_acl_inherit(znode_t *zp, zfs_acl_t *paclp, uint64_t mode, ace_size = aclp->z_ops.ace_size(pacep); - if (!zfs_ace_can_use(zp, iflags)) + if (!zfs_ace_can_use(vtype, iflags)) continue; /* @@ -1806,55 +1828,73 @@ zfs_acl_inherit(znode_t *zp, zfs_acl_t *paclp, uint64_t mode, * Create file system object initial permissions * including inheritable ACEs. */ -void -zfs_perm_init(znode_t *zp, znode_t *parent, int flag, - vattr_t *vap, dmu_tx_t *tx, cred_t *cr, - zfs_acl_t *setaclp, zfs_fuid_info_t **fuidp) +int +zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr, + vsecattr_t *vsecp, zfs_acl_ids_t *acl_ids) { - uint64_t mode, fuid, fgid; int error; - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - zfs_acl_t *aclp = NULL; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; zfs_acl_t *paclp; - xvattr_t *xvap = (xvattr_t *)vap; gid_t gid; boolean_t need_chmod = B_TRUE; - if (setaclp) - aclp = setaclp; + bzero(acl_ids, sizeof (zfs_acl_ids_t)); + acl_ids->z_mode = MAKEIMODE(vap->va_type, vap->va_mode); - mode = MAKEIMODE(vap->va_type, vap->va_mode); + if (vsecp) + if ((error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, cr, + &acl_ids->z_fuidp, &acl_ids->z_aclp)) != 0) + return (error); /* * Determine uid and gid. 
*/ - if ((flag & (IS_ROOT_NODE | IS_REPLAY)) || + if ((flag & IS_ROOT_NODE) || zfsvfs->z_replay || ((flag & IS_XATTR) && (vap->va_type == VDIR))) { - fuid = zfs_fuid_create(zfsvfs, vap->va_uid, cr, - ZFS_OWNER, tx, fuidp); - fgid = zfs_fuid_create(zfsvfs, vap->va_gid, cr, - ZFS_GROUP, tx, fuidp); + acl_ids->z_fuid = zfs_fuid_create(zfsvfs, + (uint64_t)vap->va_uid, cr, + ZFS_OWNER, &acl_ids->z_fuidp); + acl_ids->z_fgid = zfs_fuid_create(zfsvfs, + (uint64_t)vap->va_gid, cr, + ZFS_GROUP, &acl_ids->z_fuidp); gid = vap->va_gid; } else { - fuid = zfs_fuid_create_cred(zfsvfs, ZFS_OWNER, tx, cr, fuidp); - fgid = 0; + acl_ids->z_fuid = zfs_fuid_create_cred(zfsvfs, ZFS_OWNER, + cr, &acl_ids->z_fuidp); + acl_ids->z_fgid = 0; if (vap->va_mask & AT_GID) { - fgid = zfs_fuid_create(zfsvfs, vap->va_gid, cr, - ZFS_GROUP, tx, fuidp); + acl_ids->z_fgid = zfs_fuid_create(zfsvfs, + (uint64_t)vap->va_gid, + cr, ZFS_GROUP, &acl_ids->z_fuidp); gid = vap->va_gid; - if (fgid != parent->z_phys->zp_gid && + if (acl_ids->z_fgid != dzp->z_phys->zp_gid && !groupmember(vap->va_gid, cr) && secpolicy_vnode_create_gid(cr) != 0) - fgid = 0; + acl_ids->z_fgid = 0; } - if (fgid == 0) { - if (parent->z_phys->zp_mode & S_ISGID) { - fgid = parent->z_phys->zp_gid; - gid = zfs_fuid_map_id(zfsvfs, fgid, + if (acl_ids->z_fgid == 0) { + if (dzp->z_phys->zp_mode & S_ISGID) { + char *domain; + uint32_t rid; + + acl_ids->z_fgid = dzp->z_phys->zp_gid; + gid = zfs_fuid_map_id(zfsvfs, acl_ids->z_fgid, cr, ZFS_GROUP); + + if (zfsvfs->z_use_fuids && + IS_EPHEMERAL(acl_ids->z_fgid)) { + domain = zfs_fuid_idx_domain( + &zfsvfs->z_fuid_idx, + FUID_INDEX(acl_ids->z_fgid)); + rid = FUID_RID(acl_ids->z_fgid); + zfs_fuid_node_add(&acl_ids->z_fuidp, + domain, rid, + FUID_INDEX(acl_ids->z_fgid), + acl_ids->z_fgid, ZFS_GROUP); + } } else { - fgid = zfs_fuid_create_cred(zfsvfs, - ZFS_GROUP, tx, cr, fuidp); + acl_ids->z_fgid = zfs_fuid_create_cred(zfsvfs, + ZFS_GROUP, cr, &acl_ids->z_fuidp); gid = crgetgid(cr); } } @@ -1867,57 +1907,60 @@ zfs_perm_init(znode_t *zp, znode_t *parent, int flag, * file's new group, clear the file's set-GID bit. 
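zfs_acl_ids_create() replaces the old zfs_perm_init() flow: the mode, FUIDs and inherited ACL are worked out before any transaction is dirtied, so a caller can still bail out cheaply, for example on a quota check. A hypothetical caller shape using the helpers introduced in this part of the patch (zfs_acl_ids_overquota() and zfs_acl_ids_free() follow just below); returning EDQUOT on the over-quota path is an assumption about the callers elsewhere in the patch.

static int
example_create_with_acl_ids(znode_t *dzp, vattr_t *vap, cred_t *cr,
    vsecattr_t *vsecp, dmu_tx_t *tx)
{
	zfs_acl_ids_t acl_ids;
	int err;

	err = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp, &acl_ids);
	if (err != 0)
		return (err);

	if (zfs_acl_ids_overquota(dzp->z_zfsvfs, &acl_ids)) {
		zfs_acl_ids_free(&acl_ids);
		return (EDQUOT);
	}

	/* ... allocate the znode and apply acl_ids inside the tx here ... */

	zfs_acl_ids_free(&acl_ids);
	return (0);
}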
*/ - if ((parent->z_phys->zp_mode & S_ISGID) && (vap->va_type == VDIR)) { - mode |= S_ISGID; + if (!(flag & IS_ROOT_NODE) && (dzp->z_phys->zp_mode & S_ISGID) && + (vap->va_type == VDIR)) { + acl_ids->z_mode |= S_ISGID; } else { - if ((mode & S_ISGID) && + if ((acl_ids->z_mode & S_ISGID) && secpolicy_vnode_setids_setgids(cr, gid) != 0) - mode &= ~S_ISGID; - } - - zp->z_phys->zp_uid = fuid; - zp->z_phys->zp_gid = fgid; - zp->z_phys->zp_mode = mode; - - if (aclp == NULL) { - mutex_enter(&parent->z_lock); - if ((ZTOV(parent)->v_type == VDIR && - (parent->z_phys->zp_flags & ZFS_INHERIT_ACE)) && - !(zp->z_phys->zp_flags & ZFS_XATTR)) { - mutex_enter(&parent->z_acl_lock); - VERIFY(0 == zfs_acl_node_read(parent, &paclp, B_FALSE)); - mutex_exit(&parent->z_acl_lock); - aclp = zfs_acl_inherit(zp, paclp, mode, &need_chmod); - zfs_acl_free(paclp); + acl_ids->z_mode &= ~S_ISGID; + } + + if (acl_ids->z_aclp == NULL) { + mutex_enter(&dzp->z_lock); + if (!(flag & IS_ROOT_NODE) && (ZTOV(dzp)->v_type == VDIR && + (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE)) && + !(dzp->z_phys->zp_flags & ZFS_XATTR)) { + mutex_enter(&dzp->z_acl_lock); + VERIFY(0 == zfs_acl_node_read(dzp, &paclp, B_FALSE)); + mutex_exit(&dzp->z_acl_lock); + acl_ids->z_aclp = zfs_acl_inherit(zfsvfs, + vap->va_type, paclp, acl_ids->z_mode, &need_chmod); } else { - aclp = zfs_acl_alloc(zfs_acl_version_zp(zp)); + acl_ids->z_aclp = + zfs_acl_alloc(zfs_acl_version_zp(dzp)); + } + mutex_exit(&dzp->z_lock); + if (need_chmod) { + acl_ids->z_aclp->z_hints = (vap->va_type == VDIR) ? + ZFS_ACL_AUTO_INHERIT : 0; + zfs_acl_chmod(zfsvfs, acl_ids->z_fuid, + acl_ids->z_mode, acl_ids->z_aclp); } - mutex_exit(&parent->z_lock); - mutex_enter(&zp->z_lock); - mutex_enter(&zp->z_acl_lock); - if (need_chmod) - zfs_acl_chmod(zp, mode, aclp); - } else { - mutex_enter(&zp->z_lock); - mutex_enter(&zp->z_acl_lock); } - /* Force auto_inherit on all new directory objects */ - if (vap->va_type == VDIR) - aclp->z_hints |= ZFS_ACL_AUTO_INHERIT; - - error = zfs_aclset_common(zp, aclp, cr, fuidp, tx); - - /* Set optional attributes if any */ - if (vap->va_mask & AT_XVATTR) - zfs_xvattr_set(zp, xvap); + return (0); +} - mutex_exit(&zp->z_lock); - mutex_exit(&zp->z_acl_lock); - ASSERT3U(error, ==, 0); +/* + * Free ACL and fuid_infop, but not the acl_ids structure + */ +void +zfs_acl_ids_free(zfs_acl_ids_t *acl_ids) +{ + if (acl_ids->z_aclp) + zfs_acl_free(acl_ids->z_aclp); + if (acl_ids->z_fuidp) + zfs_fuid_info_free(acl_ids->z_fuidp); + acl_ids->z_aclp = NULL; + acl_ids->z_fuidp = NULL; +} - if (aclp != setaclp) - zfs_acl_free(aclp); +boolean_t +zfs_acl_ids_overquota(zfsvfs_t *zfsvfs, zfs_acl_ids_t *acl_ids) +{ + return (zfs_usergroup_overquota(zfsvfs, B_FALSE, acl_ids->z_fuid) || + zfs_usergroup_overquota(zfsvfs, B_TRUE, acl_ids->z_fgid)); } /* @@ -1983,8 +2026,6 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) if (mask & VSA_ACE) { size_t aclsz; - zfs_acl_node_t *aclnode = list_head(&aclp->z_acl); - aclsz = count * sizeof (ace_t) + sizeof (ace_object_t) * largeace; @@ -1995,8 +2036,17 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) zfs_copy_fuid_2_ace(zp->z_zfsvfs, aclp, cr, vsecp->vsa_aclentp, !(mask & VSA_ACE_ALLTYPES)); else { - bcopy(aclnode->z_acldata, vsecp->vsa_aclentp, - count * sizeof (ace_t)); + zfs_acl_node_t *aclnode; + void *start = vsecp->vsa_aclentp; + + for (aclnode = list_head(&aclp->z_acl); aclnode; + aclnode = list_next(&aclp->z_acl, aclnode)) { + bcopy(aclnode->z_acldata, start, + aclnode->z_size); 
+ start = (caddr_t)start + aclnode->z_size; + } + ASSERT((caddr_t)start - (caddr_t)vsecp->vsa_aclentp == + aclp->z_acl_bytes); } } if (mask & VSA_ACE_ACLFLAGS) { @@ -2011,14 +2061,12 @@ zfs_getacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) mutex_exit(&zp->z_acl_lock); - zfs_acl_free(aclp); - return (0); } int zfs_vsec_2_aclp(zfsvfs_t *zfsvfs, vtype_t obj_type, - vsecattr_t *vsecp, zfs_acl_t **zaclp) + vsecattr_t *vsecp, cred_t *cr, zfs_fuid_info_t **fuidp, zfs_acl_t **zaclp) { zfs_acl_t *aclp; zfs_acl_node_t *aclnode; @@ -2041,9 +2089,9 @@ zfs_vsec_2_aclp(zfsvfs_t *zfsvfs, vtype_t obj_type, return (error); } } else { - if ((error = zfs_copy_ace_2_fuid(obj_type, aclp, + if ((error = zfs_copy_ace_2_fuid(zfsvfs, obj_type, aclp, vsecp->vsa_aclentp, aclnode->z_acldata, aclcnt, - &aclnode->z_size)) != 0) { + &aclnode->z_size, fuidp, cr)) != 0) { zfs_acl_free(aclp); zfs_acl_node_free(aclnode); return (error); @@ -2084,6 +2132,7 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) int error; zfs_acl_t *aclp; zfs_fuid_info_t *fuidp = NULL; + boolean_t fuid_dirtied; if (mask == 0) return (ENOSYS); @@ -2094,7 +2143,8 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) if (error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr)) return (error); - error = zfs_vsec_2_aclp(zfsvfs, ZTOV(zp)->v_type, vsecp, &aclp); + error = zfs_vsec_2_aclp(zfsvfs, ZTOV(zp)->v_type, vsecp, cr, &fuidp, + &aclp); if (error) return (error); @@ -2106,11 +2156,6 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) aclp->z_hints |= (zp->z_phys->zp_flags & V4_ACL_WIDE_FLAGS); } top: - if (error = zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr)) { - zfs_acl_free(aclp); - return (error); - } - mutex_enter(&zp->z_lock); mutex_enter(&zp->z_acl_lock); @@ -2135,25 +2180,16 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) } else if (aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, aclp->z_acl_bytes); } - if (aclp->z_has_fuids) { - if (zfsvfs->z_fuid_obj == 0) { - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); - } else { - dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); - dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - } - } + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); - error = dmu_tx_assign(tx, zfsvfs->z_assign); + error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { mutex_exit(&zp->z_acl_lock); mutex_exit(&zp->z_lock); - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; @@ -2163,14 +2199,18 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) return (error); } - error = zfs_aclset_common(zp, aclp, cr, &fuidp, tx); + error = zfs_aclset_common(zp, aclp, cr, tx); ASSERT(error == 0); + zp->z_acl_cached = aclp; + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + + zfs_time_stamper_locked(zp, STATE_CHANGED, tx); zfs_log_acl(zilog, tx, zp, vsecp, fuidp); if (fuidp) zfs_fuid_info_free(fuidp); - zfs_acl_free(aclp); dmu_tx_commit(tx); done: mutex_exit(&zp->z_acl_lock); @@ -2180,45 +2220,17 @@ zfs_setacl(znode_t *zp, vsecattr_t *vsecp, boolean_t skipaclchk, cred_t *cr) } /* - * working_mode returns the permissions that were not granted + * Check accesses of interest (AoI) against attributes 
of the dataset + * such as read-only. Returns zero if no AoI conflict with dataset + * attributes, otherwise an appropriate errno is returned. */ static int -zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode, - boolean_t *check_privs, boolean_t skipaclchk, cred_t *cr) +zfs_zaccess_dataset_check(znode_t *zp, uint32_t v4_mode) { - zfs_acl_t *aclp; - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - int error; - uid_t uid = crgetuid(cr); - uint64_t who; - uint16_t type, iflags; - uint16_t entry_type; - uint32_t access_mask; - uint32_t deny_mask = 0; - zfs_ace_hdr_t *acep = NULL; - boolean_t checkit; - uid_t fowner; - uid_t gowner; - - /* - * Short circuit empty requests - */ - if (v4_mode == 0) - return (0); - - *check_privs = B_TRUE; - - if (zfsvfs->z_assign >= TXG_INITIAL) { /* ZIL replay */ - *working_mode = 0; - return (0); - } - - *working_mode = v4_mode; - if ((v4_mode & WRITE_MASK) && (zp->z_zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) && - (!IS_DEVVP(ZTOV(zp)))) { - *check_privs = B_FALSE; + (!IS_DEVVP(ZTOV(zp)) || + (IS_DEVVP(ZTOV(zp)) && (v4_mode & WRITE_MASK_ATTRS)))) { return (EROFS); } @@ -2230,31 +2242,64 @@ zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode, (zp->z_phys->zp_flags & (ZFS_READONLY | ZFS_IMMUTABLE))) || (ZTOV(zp)->v_type == VDIR && (zp->z_phys->zp_flags & ZFS_IMMUTABLE)))) { - *check_privs = B_FALSE; return (EPERM); } if ((v4_mode & (ACE_DELETE | ACE_DELETE_CHILD)) && (zp->z_phys->zp_flags & ZFS_NOUNLINK)) { - *check_privs = B_FALSE; return (EPERM); } if (((v4_mode & (ACE_READ_DATA|ACE_EXECUTE)) && (zp->z_phys->zp_flags & ZFS_AV_QUARANTINED))) { - *check_privs = B_FALSE; return (EACCES); } - /* - * The caller requested that the ACL check be skipped. This - * would only happen if the caller checked VOP_ACCESS() with a - * 32 bit ACE mask and already had the appropriate permissions. - */ - if (skipaclchk) { - *working_mode = 0; - return (0); - } + return (0); +} + +/* + * The primary usage of this function is to loop through all of the + * ACEs in the znode, determining what accesses of interest (AoI) to + * the caller are allowed or denied. The AoI are expressed as bits in + * the working_mode parameter. As each ACE is processed, bits covered + * by that ACE are removed from the working_mode. This removal + * facilitates two things. The first is that when the working mode is + * empty (= 0), we know we've looked at all the AoI. The second is + * that the ACE interpretation rules don't allow a later ACE to undo + * something granted or denied by an earlier ACE. Removing the + * discovered access or denial enforces this rule. At the end of + * processing the ACEs, all AoI that were found to be denied are + * placed into the working_mode, giving the caller a mask of denied + * accesses. Returns: + * 0 if all AoI granted + * EACCESS if the denied mask is non-zero + * other error if abnormal failure (e.g., IO error) + * + * A secondary usage of the function is to determine if any of the + * AoI are granted. If an ACE grants any access in + * the working_mode, we immediately short circuit out of the function. + * This mode is chosen by setting anyaccess to B_TRUE. The + * working_mode is not a denied access mask upon exit if the function + * is used in this manner. 
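The comment above treats working_mode as a shrinking to-do list: each ACE can only decide bits that are still outstanding, and whatever was denied is put back at the end so the caller can see exactly which accesses failed. Stripped of the ZFS types, the bookkeeping behaves like this self-contained model (invented names; the real function additionally matches owner@/group@/everyone@ entry types and supports the anyaccess short-circuit):

#include <stdint.h>

struct example_ace {
	uint32_t mask;		/* access bits this ACE covers */
	int	 deny;		/* nonzero = DENY, zero = ALLOW */
};

/* returns 0 when every requested bit was explicitly allowed */
static uint32_t
example_aces_check(const struct example_ace *aces, int naces, uint32_t requested)
{
	uint32_t working_mode = requested;
	uint32_t deny_mask = 0;

	for (int i = 0; i < naces && working_mode != 0; i++) {
		uint32_t matched = aces[i].mask & working_mode;

		if (matched == 0)
			continue;		/* ACE affects none of the AoI */
		if (aces[i].deny)
			deny_mask |= matched;
		working_mode &= ~matched;	/* these bits are now decided */
	}

	/* undecided bits stay set; explicit denies are put back on top */
	return (working_mode | deny_mask);
}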
+ */ +static int +zfs_zaccess_aces_check(znode_t *zp, uint32_t *working_mode, + boolean_t anyaccess, cred_t *cr) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + zfs_acl_t *aclp; + int error; + uid_t uid = crgetuid(cr); + uint64_t who; + uint16_t type, iflags; + uint16_t entry_type; + uint32_t access_mask; + uint32_t deny_mask = 0; + zfs_ace_hdr_t *acep = NULL; + boolean_t checkit; + uid_t fowner; + uid_t gowner; zfs_fuid_map_ids(zp, cr, &fowner, &gowner); @@ -2268,6 +2313,7 @@ zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode, while (acep = zfs_acl_next_ace(aclp, acep, &who, &access_mask, &iflags, &type)) { + uint32_t mask_matched; if (!zfs_acl_valid_ace_type(type, iflags)) continue; @@ -2275,6 +2321,11 @@ zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode, if (ZTOV(zp)->v_type == VDIR && (iflags & ACE_INHERIT_ONLY_ACE)) continue; + /* Skip ACE if it does not affect any AoI */ + mask_matched = (access_mask & *working_mode); + if (!mask_matched) + continue; + entry_type = (iflags & ACE_TYPE_FLAGS); checkit = B_FALSE; @@ -2306,21 +2357,29 @@ zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode, checkit = B_TRUE; break; } else { - zfs_acl_free(aclp); mutex_exit(&zp->z_acl_lock); return (EIO); } } if (checkit) { - uint32_t mask_matched = (access_mask & *working_mode); - - if (mask_matched) { - if (type == DENY) - deny_mask |= mask_matched; - - *working_mode &= ~mask_matched; + if (type == DENY) { + DTRACE_PROBE3(zfs__ace__denies, + znode_t *, zp, + zfs_ace_hdr_t *, acep, + uint32_t, mask_matched); + deny_mask |= mask_matched; + } else { + DTRACE_PROBE3(zfs__ace__allows, + znode_t *, zp, + zfs_ace_hdr_t *, acep, + uint32_t, mask_matched); + if (anyaccess) { + mutex_exit(&zp->z_acl_lock); + return (0); + } } + *working_mode &= ~mask_matched; } /* Are we done? */ @@ -2329,7 +2388,6 @@ zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode, } mutex_exit(&zp->z_acl_lock); - zfs_acl_free(aclp); /* Put the found 'denies' back on the working mode */ if (deny_mask) { @@ -2342,6 +2400,68 @@ zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode, return (0); } +/* + * Return true if any access whatsoever granted, we don't actually + * care what access is granted. + */ +boolean_t +zfs_has_access(znode_t *zp, cred_t *cr) +{ + uint32_t have = ACE_ALL_PERMS; + + if (zfs_zaccess_aces_check(zp, &have, B_TRUE, cr) != 0) { + uid_t owner; + + owner = zfs_fuid_map_id(zp->z_zfsvfs, + zp->z_phys->zp_uid, cr, ZFS_OWNER); + + return ( + secpolicy_vnode_access(cr, ZTOV(zp), owner, VREAD) == 0 || + secpolicy_vnode_access(cr, ZTOV(zp), owner, VWRITE) == 0 || + secpolicy_vnode_access(cr, ZTOV(zp), owner, VEXEC) == 0 || + secpolicy_vnode_chown(cr, owner) == 0 || + secpolicy_vnode_setdac(cr, owner) == 0 || + secpolicy_vnode_remove(cr) == 0); + } + return (B_TRUE); +} + +static int +zfs_zaccess_common(znode_t *zp, uint32_t v4_mode, uint32_t *working_mode, + boolean_t *check_privs, boolean_t skipaclchk, cred_t *cr) +{ + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int err; + + *working_mode = v4_mode; + *check_privs = B_TRUE; + + /* + * Short circuit empty requests + */ + if (v4_mode == 0 || zfsvfs->z_replay) { + *working_mode = 0; + return (0); + } + + if ((err = zfs_zaccess_dataset_check(zp, v4_mode)) != 0) { + *check_privs = B_FALSE; + return (err); + } + + /* + * The caller requested that the ACL check be skipped. 
This + * would only happen if the caller checked VOP_ACCESS() with a + * 32 bit ACE mask and already had the appropriate permissions. + */ + if (skipaclchk) { + *working_mode = 0; + return (0); + } + + return (zfs_zaccess_aces_check(zp, working_mode, B_FALSE, cr)); +} + static int zfs_zaccess_append(znode_t *zp, uint32_t *working_mode, boolean_t *check_privs, cred_t *cr) @@ -2353,6 +2473,78 @@ zfs_zaccess_append(znode_t *zp, uint32_t *working_mode, boolean_t *check_privs, check_privs, B_FALSE, cr)); } +int +zfs_fastaccesschk_execute(znode_t *zdp, cred_t *cr) +{ + boolean_t owner = B_FALSE; + boolean_t groupmbr = B_FALSE; + boolean_t is_attr; + uid_t fowner; + uid_t gowner; + uid_t uid = crgetuid(cr); + int error; + + if (zdp->z_phys->zp_flags & ZFS_AV_QUARANTINED) + return (EACCES); + + is_attr = ((zdp->z_phys->zp_flags & ZFS_XATTR) && + (ZTOV(zdp)->v_type == VDIR)); + if (is_attr) + goto slow; + + mutex_enter(&zdp->z_acl_lock); + + if (zdp->z_phys->zp_flags & ZFS_NO_EXECS_DENIED) { + mutex_exit(&zdp->z_acl_lock); + return (0); + } + + if (FUID_INDEX(zdp->z_phys->zp_uid) != 0 || + FUID_INDEX(zdp->z_phys->zp_gid) != 0) { + mutex_exit(&zdp->z_acl_lock); + goto slow; + } + + fowner = (uid_t)zdp->z_phys->zp_uid; + gowner = (uid_t)zdp->z_phys->zp_gid; + + if (uid == fowner) { + owner = B_TRUE; + if (zdp->z_phys->zp_mode & S_IXUSR) { + mutex_exit(&zdp->z_acl_lock); + return (0); + } else { + mutex_exit(&zdp->z_acl_lock); + goto slow; + } + } + if (groupmember(gowner, cr)) { + groupmbr = B_TRUE; + if (zdp->z_phys->zp_mode & S_IXGRP) { + mutex_exit(&zdp->z_acl_lock); + return (0); + } else { + mutex_exit(&zdp->z_acl_lock); + goto slow; + } + } + if (!owner && !groupmbr) { + if (zdp->z_phys->zp_mode & S_IXOTH) { + mutex_exit(&zdp->z_acl_lock); + return (0); + } + } + + mutex_exit(&zdp->z_acl_lock); + +slow: + DTRACE_PROBE(zfs__fastpath__execute__access__miss); + ZFS_ENTER(zdp->z_zfsvfs); + error = zfs_zaccess(zdp, ACE_EXECUTE, 0, B_FALSE, cr); + ZFS_EXIT(zdp->z_zfsvfs); + return (error); +} + /* * Determine whether Access should be granted/denied, invoking least * priv subsytem when a deny is determined. @@ -2447,7 +2639,7 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) owner, checkmode); if (error == 0 && (working_mode & ACE_WRITE_OWNER)) - error = secpolicy_vnode_chown(cr, B_TRUE); + error = secpolicy_vnode_chown(cr, owner); if (error == 0 && (working_mode & ACE_WRITE_ACL)) error = secpolicy_vnode_setdac(cr, owner); @@ -2456,7 +2648,7 @@ zfs_zaccess(znode_t *zp, int mode, int flags, boolean_t skipaclchk, cred_t *cr) error = secpolicy_vnode_remove(cr); if (error == 0 && (working_mode & ACE_SYNCHRONIZE)) { - error = secpolicy_vnode_chown(cr, B_FALSE); + error = secpolicy_vnode_chown(cr, owner); } if (error == 0) { /* diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_byteswap.c b/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_byteswap.c index ab97f83eb0af2..cd36696f95007 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_byteswap.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_byteswap.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -63,6 +61,20 @@ zfs_ace_byteswap(void *buf, size_t size, boolean_t zfs_layout) while (ptr < end) { if (zfs_layout) { + /* + * Avoid overrun. 
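The guard described above is the standard defensive pattern for byte-swapping a variable number of records inside a fixed-size buffer: stop as soon as a full record no longer fits before the end pointer and leave the zeroed tail alone. Reduced to a self-contained toy with one fixed record size (the real loop additionally chooses between sizeof (zfs_ace_t), sizeof (ace_t) and the object-ACE sizes per entry):

#include <stddef.h>
#include <stdint.h>

struct example_rec {
	uint32_t a;
	uint16_t b;
	uint16_t c;
};

static uint16_t
example_bswap16(uint16_t x)
{
	return ((uint16_t)((x << 8) | (x >> 8)));
}

static uint32_t
example_bswap32(uint32_t x)
{
	return (((uint32_t)example_bswap16(x & 0xffff) << 16) |
	    example_bswap16(x >> 16));
}

static void
example_swap_records(void *buf, size_t buflen)
{
	uint8_t *ptr = buf;
	uint8_t *end = ptr + buflen;

	while (ptr + sizeof (struct example_rec) <= end) {
		struct example_rec *rec = (struct example_rec *)(void *)ptr;

		rec->a = example_bswap32(rec->a);
		rec->b = example_bswap16(rec->b);
		rec->c = example_bswap16(rec->c);
		ptr += sizeof (*rec);
	}
	/* any partial record or zero padding at the tail is never touched */
}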
Embedded aces can have one + * of several sizes. We don't know exactly + * how many our present, only the size of the + * buffer containing them. That size may be + * larger than needed to hold the aces + * present. As long as we do not do any + * swapping beyond the end of our block we are + * okay. It it safe to swap any non-ace data + * within the block since it is just zeros. + */ + if (ptr + sizeof (zfs_ace_hdr_t) > end) { + break; + } zacep = (zfs_ace_t *)ptr; zacep->z_hdr.z_access_mask = BSWAP_32(zacep->z_hdr.z_access_mask); @@ -71,6 +83,10 @@ zfs_ace_byteswap(void *buf, size_t size, boolean_t zfs_layout) BSWAP_16(zacep->z_hdr.z_type); entry_type = zacep->z_hdr.z_flags & ACE_TYPE_FLAGS; } else { + /* Overrun avoidance */ + if (ptr + sizeof (ace_t) > end) { + break; + } acep = (ace_t *)ptr; acep->a_access_mask = BSWAP_32(acep->a_access_mask); acep->a_flags = BSWAP_16(acep->a_flags); @@ -87,8 +103,14 @@ zfs_ace_byteswap(void *buf, size_t size, boolean_t zfs_layout) break; case ACE_IDENTIFIER_GROUP: default: + /* Overrun avoidance */ if (zfs_layout) { - zacep->z_fuid = BSWAP_64(zacep->z_fuid); + if (ptr + sizeof (zfs_ace_t) <= end) { + zacep->z_fuid = BSWAP_64(zacep->z_fuid); + } else { + entry_size = sizeof (zfs_ace_t); + break; + } } switch (ace_type) { case ACE_ACCESS_ALLOWED_OBJECT_ACE_TYPE: @@ -169,7 +191,8 @@ zfs_znode_byteswap(void *buf, size_t size) if (zp->zp_acl.z_acl_version == ZFS_ACL_VERSION) { zfs_acl_byteswap((void *)&zp->zp_acl.z_ace_data[0], ZFS_ACE_SPACE); - } else + } else { zfs_oldace_byteswap((ace_t *)&zp->zp_acl.z_ace_data[0], ACE_SLOT_CNT); + } } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_ctldir.c b/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_ctldir.c index 208fc36295d07..d09309a3f2cc5 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_ctldir.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_ctldir.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * ZFS control directory (a.k.a. ".zfs") * @@ -116,12 +114,16 @@ snapentry_compare(const void *a, const void *b) vnodeops_t *zfsctl_ops_root; vnodeops_t *zfsctl_ops_snapdir; vnodeops_t *zfsctl_ops_snapshot; +vnodeops_t *zfsctl_ops_shares; +vnodeops_t *zfsctl_ops_shares_dir; static const fs_operation_def_t zfsctl_tops_root[]; static const fs_operation_def_t zfsctl_tops_snapdir[]; static const fs_operation_def_t zfsctl_tops_snapshot[]; +static const fs_operation_def_t zfsctl_tops_shares[]; static vnode_t *zfsctl_mknode_snapdir(vnode_t *); +static vnode_t *zfsctl_mknode_shares(vnode_t *); static vnode_t *zfsctl_snapshot_mknode(vnode_t *, uint64_t objset); static int zfsctl_unmount_snap(zfs_snapentry_t *, int, cred_t *); @@ -129,14 +131,18 @@ static gfs_opsvec_t zfsctl_opsvec[] = { { ".zfs", zfsctl_tops_root, &zfsctl_ops_root }, { ".zfs/snapshot", zfsctl_tops_snapdir, &zfsctl_ops_snapdir }, { ".zfs/snapshot/vnode", zfsctl_tops_snapshot, &zfsctl_ops_snapshot }, + { ".zfs/shares", zfsctl_tops_shares, &zfsctl_ops_shares_dir }, + { ".zfs/shares/vnode", zfsctl_tops_shares, &zfsctl_ops_shares }, { NULL } }; /* - * Root directory elements. We have only a single static entry, 'snapshot'. + * Root directory elements. We only have two entries + * snapshot and shares. 
*/ static gfs_dirent_t zfsctl_root_entries[] = { { "snapshot", zfsctl_mknode_snapdir, GFS_CACHE_VNODE }, + { "shares", zfsctl_mknode_shares, GFS_CACHE_VNODE }, { NULL } }; @@ -168,21 +174,34 @@ zfsctl_fini(void) vn_freevnodeops(zfsctl_ops_snapdir); if (zfsctl_ops_snapshot) vn_freevnodeops(zfsctl_ops_snapshot); + if (zfsctl_ops_shares) + vn_freevnodeops(zfsctl_ops_shares); + if (zfsctl_ops_shares_dir) + vn_freevnodeops(zfsctl_ops_shares_dir); zfsctl_ops_root = NULL; zfsctl_ops_snapdir = NULL; zfsctl_ops_snapshot = NULL; + zfsctl_ops_shares = NULL; + zfsctl_ops_shares_dir = NULL; } /* - * Return the inode number associated with the 'snapshot' directory. + * Return the inode number associated with the 'snapshot' or + * 'shares' directory. */ /* ARGSUSED */ static ino64_t zfsctl_root_inode_cb(vnode_t *vp, int index) { - ASSERT(index == 0); - return (ZFSCTL_INO_SNAPDIR); + zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; + + ASSERT(index <= 2); + + if (index == 0) + return (ZFSCTL_INO_SNAPDIR); + + return (zfsvfs->z_shares_dir); } /* @@ -275,8 +294,13 @@ static int zfsctl_common_access(vnode_t *vp, int mode, int flags, cred_t *cr, caller_context_t *ct) { - if (mode & VWRITE) - return (EACCES); + if (flags & V_ACE_MASK) { + if (mode & ACE_ALL_WRITE_PERMS) + return (EACCES); + } else { + if (mode & VWRITE) + return (EACCES); + } return (0); } @@ -287,14 +311,13 @@ zfsctl_common_access(vnode_t *vp, int mode, int flags, cred_t *cr, static void zfsctl_common_getattr(vnode_t *vp, vattr_t *vap) { - zfsctl_node_t *zcp = vp->v_data; timestruc_t now; vap->va_uid = 0; vap->va_gid = 0; vap->va_rdev = 0; /* - * We are a purly virtual object, so we have no + * We are a purely virtual object, so we have no * blocksize or allocated blocks. */ vap->va_blksize = 0; @@ -309,7 +332,6 @@ zfsctl_common_getattr(vnode_t *vp, vattr_t *vap) */ gethrestime(&now); vap->va_atime = now; - vap->va_mtime = vap->va_ctime = zcp->zc_cmtime; } /*ARGSUSED*/ @@ -345,6 +367,30 @@ zfsctl_common_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct) return (0); } + +/*ARGSUSED*/ +static int +zfsctl_shares_fid(vnode_t *vp, fid_t *fidp, caller_context_t *ct) +{ + zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; + znode_t *dzp; + int error; + + ZFS_ENTER(zfsvfs); + + if (zfsvfs->z_shares_dir == 0) { + ZFS_EXIT(zfsvfs); + return (ENOTSUP); + } + + if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) { + error = VOP_FID(ZTOV(dzp), fidp, ct); + VN_RELE(ZTOV(dzp)); + } + + ZFS_EXIT(zfsvfs); + return (error); +} /* * .zfs inode namespace * @@ -368,10 +414,12 @@ zfsctl_root_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, caller_context_t *ct) { zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; + zfsctl_node_t *zcp = vp->v_data; ZFS_ENTER(zfsvfs); vap->va_nodeid = ZFSCTL_INO_ROOT; vap->va_nlink = vap->va_size = NROOT_ENTRIES; + vap->va_mtime = vap->va_ctime = zcp->zc_cmtime; zfsctl_common_getattr(vp, vap); ZFS_EXIT(zfsvfs); @@ -411,6 +459,22 @@ zfsctl_root_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, return (err); } +static int +zfsctl_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, + caller_context_t *ct) +{ + /* + * We only care about ACL_ENABLED so that libsec can + * display ACL correctly and not default to POSIX draft. 
+ */ + if (cmd == _PC_ACL_ENABLED) { + *valp = _ACL_ACE_ENABLED; + return (0); + } + + return (fs_pathconf(vp, cmd, valp, cr, ct)); +} + static const fs_operation_def_t zfsctl_tops_root[] = { { VOPNAME_OPEN, { .vop_open = zfsctl_common_open } }, { VOPNAME_CLOSE, { .vop_close = zfsctl_common_close } }, @@ -421,6 +485,7 @@ static const fs_operation_def_t zfsctl_tops_root[] = { { VOPNAME_LOOKUP, { .vop_lookup = zfsctl_root_lookup } }, { VOPNAME_SEEK, { .vop_seek = fs_seek } }, { VOPNAME_INACTIVE, { .vop_inactive = gfs_vop_inactive } }, + { VOPNAME_PATHCONF, { .vop_pathconf = zfsctl_pathconf } }, { VOPNAME_FID, { .vop_fid = zfsctl_common_fid } }, { NULL } }; @@ -458,7 +523,7 @@ zfsctl_unmount_snap(zfs_snapentry_t *sep, int fflags, cred_t *cr) VN_RELE(svp); return (error); } - VFS_RELE(svp->v_vfsp); + /* * We can't use VN_RELE(), as that will try to invoke * zfsctl_snapdir_inactive(), which would cause us to destroy @@ -635,7 +700,7 @@ zfsctl_snapdir_remove(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr, if (err) avl_add(&sdp->sd_snaps, sep); else - err = dmu_objset_destroy(snapname); + err = dmu_objset_destroy(snapname, B_FALSE); } else { err = ENOENT; } @@ -671,7 +736,7 @@ zfsctl_snapdir_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, return (err); if (err == 0) { - err = dmu_objset_snapshot(name, dirname, B_FALSE); + err = dmu_objset_snapshot(name, dirname, NULL, B_FALSE); if (err) return (err); err = lookupnameat(dirname, seg, follow, NULL, vpp, dvp); @@ -712,9 +777,6 @@ zfsctl_snapdir_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, ASSERT(dvp->v_type == VDIR); - if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0) - return (0); - /* * If we get a recursive call, that means we got called * from the domount() code while it was trying to look up the @@ -726,6 +788,11 @@ zfsctl_snapdir_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, ZFS_ENTER(zfsvfs); + if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0) { + ZFS_EXIT(zfsvfs); + return (0); + } + if (flags & FIGNORECASE) { boolean_t conflict = B_FALSE; @@ -786,8 +853,7 @@ zfsctl_snapdir_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, */ return (err == EILSEQ ? ENOENT : err); } - if (dmu_objset_open(snapname, DMU_OST_ZFS, - DS_MODE_USER | DS_MODE_READONLY, &snap) != 0) { + if (dmu_objset_hold(snapname, FTAG, &snap) != 0) { mutex_exit(&sdp->sd_lock); ZFS_EXIT(zfsvfs); return (ENOENT); @@ -799,7 +865,7 @@ zfsctl_snapdir_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, *vpp = sep->se_root = zfsctl_snapshot_mknode(dvp, dmu_objset_id(snap)); avl_insert(&sdp->sd_snaps, sep, where); - dmu_objset_close(snap); + dmu_objset_rele(snap, FTAG); domount: mountpoint_len = strlen(refstr_value(dvp->v_vfsp->vfs_mntpt)) + strlen("/.zfs/snapshot/") + strlen(nm) + 1; @@ -824,7 +890,7 @@ zfsctl_snapdir_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, * Return the mounted root rather than the covered mount point. * Takes the GFS vnode at .zfs/snapshot/ and returns * the ZFS vnode mounted on top of the GFS node. This ZFS - * vnode is the root the newly created vfsp. + * vnode is the root of the newly created vfsp. 
*/ VFS_RELE(vfsp); err = traverse(vpp); @@ -857,6 +923,37 @@ zfsctl_snapdir_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, return (err); } +/* ARGSUSED */ +static int +zfsctl_shares_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, pathname_t *pnp, + int flags, vnode_t *rdir, cred_t *cr, caller_context_t *ct, + int *direntflags, pathname_t *realpnp) +{ + zfsvfs_t *zfsvfs = dvp->v_vfsp->vfs_data; + znode_t *dzp; + int error; + + ZFS_ENTER(zfsvfs); + + if (gfs_lookup_dot(vpp, dvp, zfsvfs->z_ctldir, nm) == 0) { + ZFS_EXIT(zfsvfs); + return (0); + } + + if (zfsvfs->z_shares_dir == 0) { + ZFS_EXIT(zfsvfs); + return (ENOTSUP); + } + if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) + error = VOP_LOOKUP(ZTOV(dzp), nm, vpp, pnp, + flags, rdir, cr, ct, direntflags, realpnp); + + VN_RELE(ZTOV(dzp)); + ZFS_EXIT(zfsvfs); + + return (error); +} + /* ARGSUSED */ static int zfsctl_snapdir_readdir_cb(vnode_t *vp, void *dp, int *eofp, @@ -901,6 +998,33 @@ zfsctl_snapdir_readdir_cb(vnode_t *vp, void *dp, int *eofp, return (0); } +/* ARGSUSED */ +static int +zfsctl_shares_readdir(vnode_t *vp, uio_t *uiop, cred_t *cr, int *eofp, + caller_context_t *ct, int flags) +{ + zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; + znode_t *dzp; + int error; + + ZFS_ENTER(zfsvfs); + + if (zfsvfs->z_shares_dir == 0) { + ZFS_EXIT(zfsvfs); + return (ENOTSUP); + } + if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) { + error = VOP_READDIR(ZTOV(dzp), uiop, cr, eofp, ct, flags); + VN_RELE(ZTOV(dzp)); + } else { + *eofp = 1; + error = ENOENT; + } + + ZFS_EXIT(zfsvfs); + return (error); +} + /* * pvp is the '.zfs' directory (zfsctl_node_t). * Creates vp, which is '.zfs/snapshot' (zfsctl_snapdir_t). @@ -926,6 +1050,45 @@ zfsctl_mknode_snapdir(vnode_t *pvp) return (vp); } +vnode_t * +zfsctl_mknode_shares(vnode_t *pvp) +{ + vnode_t *vp; + zfsctl_node_t *sdp; + + vp = gfs_dir_create(sizeof (zfsctl_node_t), pvp, + zfsctl_ops_shares, NULL, NULL, MAXNAMELEN, + NULL, NULL); + sdp = vp->v_data; + sdp->zc_cmtime = ((zfsctl_node_t *)pvp->v_data)->zc_cmtime; + return (vp); + +} + +/* ARGSUSED */ +static int +zfsctl_shares_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, + caller_context_t *ct) +{ + zfsvfs_t *zfsvfs = vp->v_vfsp->vfs_data; + znode_t *dzp; + int error; + + ZFS_ENTER(zfsvfs); + if (zfsvfs->z_shares_dir == 0) { + ZFS_EXIT(zfsvfs); + return (ENOTSUP); + } + if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &dzp)) == 0) { + error = VOP_GETATTR(ZTOV(dzp), vap, flags, cr, ct); + VN_RELE(ZTOV(dzp)); + } + ZFS_EXIT(zfsvfs); + return (error); + + +} + /* ARGSUSED */ static int zfsctl_snapdir_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, @@ -938,6 +1101,7 @@ zfsctl_snapdir_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, zfsctl_common_getattr(vp, vap); vap->va_nodeid = gfs_file_inode(vp); vap->va_nlink = vap->va_size = avl_numnodes(&sdp->sd_snaps) + 2; + vap->va_ctime = vap->va_mtime = dmu_objset_snap_cmtime(zfsvfs->z_os); ZFS_EXIT(zfsvfs); return (0); @@ -976,6 +1140,20 @@ static const fs_operation_def_t zfsctl_tops_snapdir[] = { { NULL } }; +static const fs_operation_def_t zfsctl_tops_shares[] = { + { VOPNAME_OPEN, { .vop_open = zfsctl_common_open } }, + { VOPNAME_CLOSE, { .vop_close = zfsctl_common_close } }, + { VOPNAME_IOCTL, { .error = fs_inval } }, + { VOPNAME_GETATTR, { .vop_getattr = zfsctl_shares_getattr } }, + { VOPNAME_ACCESS, { .vop_access = zfsctl_common_access } }, + { VOPNAME_READDIR, { .vop_readdir = zfsctl_shares_readdir } }, + { VOPNAME_LOOKUP, { 
.vop_lookup = zfsctl_shares_lookup } }, + { VOPNAME_SEEK, { .vop_seek = fs_seek } }, + { VOPNAME_INACTIVE, { .vop_inactive = gfs_vop_inactive } }, + { VOPNAME_FID, { .vop_fid = zfsctl_shares_fid } }, + { NULL } +}; + /* * pvp is the GFS vnode '.zfs/snapshot'. * @@ -993,7 +1171,6 @@ zfsctl_snapshot_mknode(vnode_t *pvp, uint64_t objset) zfsctl_ops_snapshot, NULL, NULL, MAXNAMELEN, NULL, NULL); zcp = vp->v_data; zcp->zc_id = objset; - VFS_HOLD(vp->v_vfsp); return (vp); } @@ -1032,7 +1209,6 @@ zfsctl_snapshot_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) mutex_exit(&sdp->sd_lock); VN_RELE(dvp); - VFS_RELE(vp->v_vfsp); /* * Dispose of the vnode for the snapshot mount point. diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_dir.c b/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_dir.c index 1ec4932646e90..2e3725c2bf1c3 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_dir.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_dir.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -114,6 +114,8 @@ zfs_match_find(zfsvfs_t *zfsvfs, znode_t *dzp, char *name, boolean_t exact, * ZCIEXACT: On a purely case-insensitive file system, * this lookup should be case-sensitive. * ZRENAMING: we are locking for renaming, force narrow locks + * ZHAVELOCK: Don't grab the z_name_lock for this call. The + * current thread already holds it. * * Output arguments: * zpp - pointer to the znode for the entry (NULL if there isn't one) @@ -208,13 +210,20 @@ zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp, /* * Wait until there are no locks on this name. + * + * Don't grab the lock if it is already held. However, we cannot + * have both ZSHARED and ZHAVELOCK together. */ - rw_enter(&dzp->z_name_lock, RW_READER); + ASSERT(!(flag & ZSHARED) || !(flag & ZHAVELOCK)); + if (!(flag & ZHAVELOCK)) + rw_enter(&dzp->z_name_lock, RW_READER); + mutex_enter(&dzp->z_lock); for (;;) { if (dzp->z_unlinked) { mutex_exit(&dzp->z_lock); - rw_exit(&dzp->z_name_lock); + if (!(flag & ZHAVELOCK)) + rw_exit(&dzp->z_name_lock); return (ENOENT); } for (dl = dzp->z_dirlocks; dl != NULL; dl = dl->dl_next) { @@ -224,7 +233,8 @@ zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp, } if (error != 0) { mutex_exit(&dzp->z_lock); - rw_exit(&dzp->z_name_lock); + if (!(flag & ZHAVELOCK)) + rw_exit(&dzp->z_name_lock); return (ENOENT); } if (dl == NULL) { @@ -235,6 +245,7 @@ zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp, cv_init(&dl->dl_cv, NULL, CV_DEFAULT, NULL); dl->dl_name = name; dl->dl_sharecnt = 0; + dl->dl_namelock = 0; dl->dl_namesize = 0; dl->dl_dzp = dzp; dl->dl_next = dzp->z_dirlocks; @@ -246,6 +257,12 @@ zfs_dirent_lock(zfs_dirlock_t **dlpp, znode_t *dzp, char *name, znode_t **zpp, cv_wait(&dl->dl_cv, &dzp->z_lock); } + /* + * If the z_name_lock was NOT held for this dirlock, record it. + */ + if (flag & ZHAVELOCK) + dl->dl_namelock = 1; + if ((flag & ZSHARED) && ++dl->dl_sharecnt > 1 && dl->dl_namesize == 0) { /* * We're the second shared reference to dl.
Make a copy of @@ -325,7 +342,10 @@ zfs_dirent_unlock(zfs_dirlock_t *dl) zfs_dirlock_t **prev_dl, *cur_dl; mutex_enter(&dzp->z_lock); - rw_exit(&dzp->z_name_lock); + + if (!dl->dl_namelock) + rw_exit(&dzp->z_name_lock); + if (dl->dl_sharecnt > 1) { dl->dl_sharecnt--; mutex_exit(&dzp->z_lock); @@ -561,24 +581,6 @@ zfs_rmnode(znode_t *zp) ASSERT(ZTOV(zp)->v_count == 0); ASSERT(zp->z_phys->zp_links == 0); - /* - * If this is a ZIL replay then leave the object in the unlinked set. - * Otherwise we can get a deadlock, because the delete can be - * quite large and span multiple tx's and txgs, but each replay - * creates a tx to atomically run the replay function and mark the - * replay record as complete. We deadlock trying to start a tx in - * a new txg to further the deletion but can't because the replay - * tx hasn't finished. - * - * We actually delete the object if we get a failure to create an - * object in zil_replay_log_record(), or after calling zil_replay(). - */ - if (zfsvfs->z_assign >= TXG_INITIAL) { - zfs_znode_dmu_fini(zp); - zfs_znode_free(zp); - return; - } - /* * If this is an attribute directory, purge its contents. */ @@ -823,44 +825,49 @@ zfs_make_xattrdir(znode_t *zp, vattr_t *vap, vnode_t **xvpp, cred_t *cr) znode_t *xzp; dmu_tx_t *tx; int error; - zfs_fuid_info_t *fuidp = NULL; + zfs_acl_ids_t acl_ids; + boolean_t fuid_dirtied; *xvpp = NULL; if (error = zfs_zaccess(zp, ACE_WRITE_NAMED_ATTRS, 0, B_FALSE, cr)) return (error); + if ((error = zfs_acl_ids_create(zp, IS_XATTR, vap, cr, NULL, + &acl_ids)) != 0) + return (error); + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { + zfs_acl_ids_free(&acl_ids); + return (EDQUOT); + } + tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_bonus(tx, zp->z_id); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); - if (IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr))) { - if (zfsvfs->z_fuid_obj == 0) { - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); - } else { - dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); - dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - } - } - error = dmu_tx_assign(tx, zfsvfs->z_assign); + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) + zfs_acl_ids_free(&acl_ids); + if (error == ERESTART) dmu_tx_wait(tx); dmu_tx_abort(tx); return (error); } - zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, 0, NULL, &fuidp); + zfs_mknode(zp, vap, tx, cr, IS_XATTR, &xzp, 0, &acl_ids); + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + ASSERT(xzp->z_phys->zp_parent == zp->z_id); dmu_buf_will_dirty(zp->z_dbuf, tx); zp->z_phys->zp_xattr = xzp->z_id; (void) zfs_log_create(zfsvfs->z_log, tx, TX_MKXATTR, zp, - xzp, "", NULL, fuidp, vap); - if (fuidp) - zfs_fuid_info_free(fuidp); + xzp, "", NULL, acl_ids.z_fuidp, vap); + + zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); *xvpp = ZTOV(xzp); @@ -930,7 +937,7 @@ zfs_get_xattrdir(znode_t *zp, vnode_t **xvpp, cred_t *cr, int flags) error = zfs_make_xattrdir(zp, &va, xvpp, cr); zfs_dirent_unlock(dl); - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + if (error == ERESTART) { /* NB: we already did dmu_tx_wait() if necessary */ goto top; } @@ -959,7 +966,7 @@ zfs_sticky_remove_access(znode_t *zdp, znode_t *zp, cred_t *cr) uid_t fowner; zfsvfs_t *zfsvfs = zdp->z_zfsvfs; - if 
(zdp->z_zfsvfs->z_assign >= TXG_INITIAL) /* ZIL replay */ + if (zdp->z_zfsvfs->z_replay) return (0); if ((zdp->z_phys->zp_mode & S_ISVTX) == 0) diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_fm.c b/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_fm.c index 236d69e7e6f07..0b4812666442d 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_fm.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_fm.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -87,21 +88,32 @@ * this pointer is set to NULL, and no ereport will be generated (since it * doesn't actually correspond to any particular device or piece of data, * and the caller will always retry without caching or queueing anyway). + * + * For checksum errors, we want to include more information about the actual + * error which occurs. Accordingly, we build an ereport when the error is + * noticed, but instead of sending it in immediately, we hang it off of the + * io_cksum_report field of the logical IO. When the logical IO completes + * (successfully or not), zfs_ereport_finish_checksum() is called with the + * good and bad versions of the buffer (if available), and we annotate the + * ereport with information about the differences. */ -void -zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, +#ifdef _KERNEL +static void +zfs_ereport_start(nvlist_t **ereport_out, nvlist_t **detector_out, + const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, uint64_t stateoroffset, uint64_t size) { -#ifdef _KERNEL nvlist_t *ereport, *detector; + uint64_t ena; char class[64]; - int state; /* - * If we are doing a spa_tryimport(), ignore errors. + * If we are doing a spa_tryimport() or in recovery mode, + * ignore errors. */ - if (spa->spa_load_state == SPA_LOAD_TRYIMPORT) + if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT || + spa_load_state(spa) == SPA_LOAD_RECOVER) return; /* @@ -109,7 +121,7 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, * failed, don't bother logging any new ereports - we're just going to * get the same diagnosis anyway. */ - if (spa->spa_load_state != SPA_LOAD_NONE && + if (spa_load_state(spa) != SPA_LOAD_NONE && spa->spa_last_open_failed) return; @@ -130,17 +142,48 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, return; /* - * If the vdev has already been marked as failing due to a - * failed probe, then ignore any subsequent I/O errors, as the - * DE will automatically fault the vdev on the first such - * failure. + * If this I/O is not a retry I/O, don't post an ereport. + * Otherwise, we risk making bad diagnoses based on B_FAILFAST + * I/Os. */ - if (vd != NULL && - (!vdev_readable(vd) || !vdev_writeable(vd)) && - strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) != 0) + if (zio->io_error == EIO && + !(zio->io_flags & ZIO_FLAG_IO_RETRY)) return; + + if (vd != NULL) { + /* + * If the vdev has already been marked as failing due + * to a failed probe, then ignore any subsequent I/O + * errors, as the DE will automatically fault the vdev + * on the first such failure. This also catches cases + * where vdev_remove_wanted is set and the device has + * not yet been asynchronously placed into the REMOVED + * state. 
+ */ + if (zio->io_vd == vd && !vdev_accessible(vd, zio)) + return; + + /* + * Ignore checksum errors for reads from DTL regions of + * leaf vdevs. + */ + if (zio->io_type == ZIO_TYPE_READ && + zio->io_error == ECKSUM && + vd->vdev_ops->vdev_op_leaf && + vdev_dtl_contains(vd, DTL_MISSING, zio->io_txg, 1)) + return; + } } + /* + * For probe failure, we want to avoid posting ereports if we've + * already removed the device in the meantime. + */ + if (vd != NULL && + strcmp(subclass, FM_EREPORT_ZFS_PROBE_FAILURE) == 0 && + (vd->vdev_remove_wanted || vd->vdev_state == VDEV_STATE_REMOVED)) + return; + if ((ereport = fm_nvlist_create(NULL)) == NULL) return; @@ -159,7 +202,7 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, * state, use a SPA-wide ENA. Otherwise, if we are in an I/O state, use * a root zio-wide ENA. Otherwise, simply use a unique ENA. */ - if (spa->spa_load_state != SPA_LOAD_NONE) { + if (spa_load_state(spa) != SPA_LOAD_NONE) { if (spa->spa_ena == 0) spa->spa_ena = fm_ena_generate(0, FM_ENA_FMT1); ena = spa->spa_ena; @@ -188,14 +231,6 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, * passed in. */ - /* - * If we are importing a faulted pool, then we treat it like an open, - * not an import. Otherwise, the DE will ignore all faults during - * import, since the default behavior is to mark the devices as - * persistently unavailable, not leave them in the faulted state. - */ - state = spa->spa_import_faulted ? SPA_LOAD_OPEN : spa->spa_load_state; - /* * Generic payload members common to all ereports. */ @@ -203,7 +238,7 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, DATA_TYPE_STRING, spa_name(spa), FM_EREPORT_PAYLOAD_ZFS_POOL_GUID, DATA_TYPE_UINT64, spa_guid(spa), FM_EREPORT_PAYLOAD_ZFS_POOL_CONTEXT, DATA_TYPE_INT32, - state, NULL); + spa_load_state(spa), NULL); if (spa != NULL) { fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_POOL_FAILMODE, @@ -222,14 +257,18 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, DATA_TYPE_UINT64, vd->vdev_guid, FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE, DATA_TYPE_STRING, vd->vdev_ops->vdev_op_type, NULL); - if (vd->vdev_path) + if (vd->vdev_path != NULL) fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, DATA_TYPE_STRING, vd->vdev_path, NULL); - if (vd->vdev_devid) + if (vd->vdev_devid != NULL) fm_payload_set(ereport, FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID, DATA_TYPE_STRING, vd->vdev_devid, NULL); + if (vd->vdev_fru != NULL) + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU, + DATA_TYPE_STRING, vd->vdev_fru, NULL); if (pvd != NULL) { fm_payload_set(ereport, @@ -303,8 +342,339 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, FM_EREPORT_PAYLOAD_ZFS_PREV_STATE, DATA_TYPE_UINT64, stateoroffset, NULL); } + mutex_exit(&spa->spa_errlist_lock); + *ereport_out = ereport; + *detector_out = detector; +} + +/* if it's <= 128 bytes, save the corruption directly */ +#define ZFM_MAX_INLINE (128 / sizeof (uint64_t)) + +#define MAX_RANGES 16 + +typedef struct zfs_ecksum_info { + /* histograms of set and cleared bits by bit number in a 64-bit word */ + uint16_t zei_histogram_set[sizeof (uint64_t) * NBBY]; + uint16_t zei_histogram_cleared[sizeof (uint64_t) * NBBY]; + + /* inline arrays of bits set and cleared. */ + uint64_t zei_bits_set[ZFM_MAX_INLINE]; + uint64_t zei_bits_cleared[ZFM_MAX_INLINE]; + + /* + * for each range, the number of bits set and cleared. 
The Hamming + * distance between the good and bad buffers is the sum of them all. + */ + uint32_t zei_range_sets[MAX_RANGES]; + uint32_t zei_range_clears[MAX_RANGES]; + + struct zei_ranges { + uint32_t zr_start; + uint32_t zr_end; + } zei_ranges[MAX_RANGES]; + + size_t zei_range_count; + uint32_t zei_mingap; + uint32_t zei_allowed_mingap; + +} zfs_ecksum_info_t; + +static void +update_histogram(uint64_t value_arg, uint16_t *hist, uint32_t *count) +{ + size_t i; + size_t bits = 0; + uint64_t value = BE_64(value_arg); + + /* We store the bits in big-endian (largest-first) order */ + for (i = 0; i < 64; i++) { + if (value & (1ull << i)) { + hist[63 - i]++; + ++bits; + } + } + /* update the count of bits changed */ + *count += bits; +} + +/* + * We've now filled up the range array, and need to increase "mingap" and + * shrink the range list accordingly. zei_mingap is always the smallest + * distance between array entries, so we set the new_allowed_gap to be + * one greater than that. We then go through the list, joining together + * any ranges which are closer than the new_allowed_gap. + * + * By construction, there will be at least one. We also update zei_mingap + * to the new smallest gap, to prepare for our next invocation. + */ +static void +shrink_ranges(zfs_ecksum_info_t *eip) +{ + uint32_t mingap = UINT32_MAX; + uint32_t new_allowed_gap = eip->zei_mingap + 1; + + size_t idx, output; + size_t max = eip->zei_range_count; + + struct zei_ranges *r = eip->zei_ranges; + + ASSERT3U(eip->zei_range_count, >, 0); + ASSERT3U(eip->zei_range_count, <=, MAX_RANGES); + + output = idx = 0; + while (idx < max - 1) { + uint32_t start = r[idx].zr_start; + uint32_t end = r[idx].zr_end; + + while (idx < max - 1) { + idx++; + + uint32_t nstart = r[idx].zr_start; + uint32_t nend = r[idx].zr_end; + + uint32_t gap = nstart - end; + if (gap < new_allowed_gap) { + end = nend; + continue; + } + if (gap < mingap) + mingap = gap; + break; + } + r[output].zr_start = start; + r[output].zr_end = end; + output++; + } + ASSERT3U(output, <, eip->zei_range_count); + eip->zei_range_count = output; + eip->zei_mingap = mingap; + eip->zei_allowed_mingap = new_allowed_gap; +} + +static void +add_range(zfs_ecksum_info_t *eip, int start, int end) +{ + struct zei_ranges *r = eip->zei_ranges; + size_t count = eip->zei_range_count; + + if (count >= MAX_RANGES) { + shrink_ranges(eip); + count = eip->zei_range_count; + } + if (count == 0) { + eip->zei_mingap = UINT32_MAX; + eip->zei_allowed_mingap = 1; + } else { + int gap = start - r[count - 1].zr_end; + + if (gap < eip->zei_allowed_mingap) { + r[count - 1].zr_end = end; + return; + } + if (gap < eip->zei_mingap) + eip->zei_mingap = gap; + } + r[count].zr_start = start; + r[count].zr_end = end; + eip->zei_range_count++; +} + +static size_t +range_total_size(zfs_ecksum_info_t *eip) +{ + struct zei_ranges *r = eip->zei_ranges; + size_t count = eip->zei_range_count; + size_t result = 0; + size_t idx; + + for (idx = 0; idx < count; idx++) + result += (r[idx].zr_end - r[idx].zr_start); + + return (result); +} + +static zfs_ecksum_info_t * +annotate_ecksum(nvlist_t *ereport, zio_bad_cksum_t *info, + const uint8_t *goodbuf, const uint8_t *badbuf, size_t size, + boolean_t drop_if_identical) +{ + const uint64_t *good = (const uint64_t *)goodbuf; + const uint64_t *bad = (const uint64_t *)badbuf; + + uint64_t allset = 0; + uint64_t allcleared = 0; + + size_t nui64s = size / sizeof (uint64_t); + + size_t inline_size; + int no_inline = 0; + size_t idx; + size_t range; + + size_t offset = 0; + 
ssize_t start = -1; + + zfs_ecksum_info_t *eip = kmem_zalloc(sizeof (*eip), KM_SLEEP); + + /* don't do any annotation for injected checksum errors */ + if (info != NULL && info->zbc_injected) + return (eip); + + if (info != NULL && info->zbc_has_cksum) { + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED, + DATA_TYPE_UINT64_ARRAY, + sizeof (info->zbc_expected) / sizeof (uint64_t), + (uint64_t *)&info->zbc_expected, + FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL, + DATA_TYPE_UINT64_ARRAY, + sizeof (info->zbc_actual) / sizeof (uint64_t), + (uint64_t *)&info->zbc_actual, + FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO, + DATA_TYPE_STRING, + info->zbc_checksum_name, + NULL); + + if (info->zbc_byteswapped) { + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP, + DATA_TYPE_BOOLEAN, 1, + NULL); + } + } + + if (badbuf == NULL || goodbuf == NULL) + return (eip); + + ASSERT3U(nui64s, <=, UINT16_MAX); + ASSERT3U(size, ==, nui64s * sizeof (uint64_t)); + ASSERT3U(size, <=, SPA_MAXBLOCKSIZE); + ASSERT3U(size, <=, UINT32_MAX); + + /* build up the range list by comparing the two buffers. */ + for (idx = 0; idx < nui64s; idx++) { + if (good[idx] == bad[idx]) { + if (start == -1) + continue; + + add_range(eip, start, idx); + start = -1; + } else { + if (start != -1) + continue; + + start = idx; + } + } + if (start != -1) + add_range(eip, start, idx); + + /* See if it will fit in our inline buffers */ + inline_size = range_total_size(eip); + if (inline_size > ZFM_MAX_INLINE) + no_inline = 1; + + /* + * If there is no change and we want to drop if the buffers are + * identical, do so. + */ + if (inline_size == 0 && drop_if_identical) { + kmem_free(eip, sizeof (*eip)); + return (NULL); + } + + /* + * Now walk through the ranges, filling in the details of the + * differences. Also convert our uint64_t-array offsets to byte + * offsets. 
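(A stand-alone illustration of the set/cleared computation used in the loop below; the program and its sample words are hypothetical and are not part of this change.)

#include <inttypes.h>
#include <stdio.h>

int
main(void)
{
	uint64_t good = 0xff00ff00ff00ff00ULL;	/* hypothetical expected word */
	uint64_t bad  = 0xff01ff00fe00ff00ULL;	/* hypothetical on-disk word */

	uint64_t set = ~good & bad;		/* bits set in bad but not in good */
	uint64_t cleared = good & ~bad;		/* bits set in good but not in bad */

	(void) printf("set=%016" PRIx64 " cleared=%016" PRIx64 "\n",
	    set, cleared);
	return (0);
}

With these sample words the program prints set=0001000000000000 and cleared=0000000001000000, which is the per-word information the loop below accumulates into the inline buffers and bit histograms.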
+ */ + for (range = 0; range < eip->zei_range_count; range++) { + size_t start = eip->zei_ranges[range].zr_start; + size_t end = eip->zei_ranges[range].zr_end; + + for (idx = start; idx < end; idx++) { + uint64_t set, cleared; + + // bits set in bad, but not in good + set = ((~good[idx]) & bad[idx]); + // bits set in good, but not in bad + cleared = (good[idx] & (~bad[idx])); + + allset |= set; + allcleared |= cleared; + + if (!no_inline) { + ASSERT3U(offset, <, inline_size); + eip->zei_bits_set[offset] = set; + eip->zei_bits_cleared[offset] = cleared; + offset++; + } + + update_histogram(set, eip->zei_histogram_set, + &eip->zei_range_sets[range]); + update_histogram(cleared, eip->zei_histogram_cleared, + &eip->zei_range_clears[range]); + } + + /* convert to byte offsets */ + eip->zei_ranges[range].zr_start *= sizeof (uint64_t); + eip->zei_ranges[range].zr_end *= sizeof (uint64_t); + } + eip->zei_allowed_mingap *= sizeof (uint64_t); + inline_size *= sizeof (uint64_t); + + /* fill in ereport */ + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES, + DATA_TYPE_UINT32_ARRAY, 2 * eip->zei_range_count, + (uint32_t *)eip->zei_ranges, + FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_MIN_GAP, + DATA_TYPE_UINT32, eip->zei_allowed_mingap, + FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_SETS, + DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_sets, + FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS, + DATA_TYPE_UINT32_ARRAY, eip->zei_range_count, eip->zei_range_clears, + NULL); + + if (!no_inline) { + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS, + DATA_TYPE_UINT8_ARRAY, + inline_size, (uint8_t *)eip->zei_bits_set, + FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS, + DATA_TYPE_UINT8_ARRAY, + inline_size, (uint8_t *)eip->zei_bits_cleared, + NULL); + } else { + fm_payload_set(ereport, + FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM, + DATA_TYPE_UINT16_ARRAY, + NBBY * sizeof (uint64_t), eip->zei_histogram_set, + FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM, + DATA_TYPE_UINT16_ARRAY, + NBBY * sizeof (uint64_t), eip->zei_histogram_cleared, + NULL); + } + return (eip); +} +#endif + +void +zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, + uint64_t stateoroffset, uint64_t size) +{ +#ifdef _KERNEL + nvlist_t *ereport = NULL; + nvlist_t *detector = NULL; + + zfs_ereport_start(&ereport, &detector, + subclass, spa, vd, zio, stateoroffset, size); + + if (ereport == NULL) + return; + fm_ereport_post(ereport, EVCH_SLEEP); fm_nvlist_destroy(ereport, FM_NVA_FREE); @@ -312,6 +682,122 @@ zfs_ereport_post(const char *subclass, spa_t *spa, vdev_t *vd, zio_t *zio, #endif } +void +zfs_ereport_start_checksum(spa_t *spa, vdev_t *vd, + struct zio *zio, uint64_t offset, uint64_t length, void *arg, + zio_bad_cksum_t *info) +{ + zio_cksum_report_t *report = kmem_zalloc(sizeof (*report), KM_SLEEP); + + if (zio->io_vsd != NULL) + zio->io_vsd_ops->vsd_cksum_report(zio, report, arg); + else + zio_vsd_default_cksum_report(zio, report, arg); + + /* copy the checksum failure information if it was provided */ + if (info != NULL) { + report->zcr_ckinfo = kmem_zalloc(sizeof (*info), KM_SLEEP); + bcopy(info, report->zcr_ckinfo, sizeof (*info)); + } + + report->zcr_align = 1ULL << vd->vdev_top->vdev_ashift; + report->zcr_length = length; + +#ifdef _KERNEL + zfs_ereport_start(&report->zcr_ereport, &report->zcr_detector, + FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio, offset, length); + + if (report->zcr_ereport == NULL) { + report->zcr_free(report->zcr_cbdata, report->zcr_cbinfo); + kmem_free(report, sizeof (*report)); + 
return; + } +#endif + + mutex_enter(&spa->spa_errlist_lock); + report->zcr_next = zio->io_logical->io_cksum_report; + zio->io_logical->io_cksum_report = report; + mutex_exit(&spa->spa_errlist_lock); +} + +void +zfs_ereport_finish_checksum(zio_cksum_report_t *report, + const void *good_data, const void *bad_data, boolean_t drop_if_identical) +{ +#ifdef _KERNEL + zfs_ecksum_info_t *info = NULL; + info = annotate_ecksum(report->zcr_ereport, report->zcr_ckinfo, + good_data, bad_data, report->zcr_length, drop_if_identical); + + if (info != NULL) + fm_ereport_post(report->zcr_ereport, EVCH_SLEEP); + + fm_nvlist_destroy(report->zcr_ereport, FM_NVA_FREE); + fm_nvlist_destroy(report->zcr_detector, FM_NVA_FREE); + report->zcr_ereport = report->zcr_detector = NULL; + + if (info != NULL) + kmem_free(info, sizeof (*info)); +#endif +} + +void +zfs_ereport_free_checksum(zio_cksum_report_t *rpt) +{ +#ifdef _KERNEL + if (rpt->zcr_ereport != NULL) { + fm_nvlist_destroy(rpt->zcr_ereport, + FM_NVA_FREE); + fm_nvlist_destroy(rpt->zcr_detector, + FM_NVA_FREE); + } +#endif + rpt->zcr_free(rpt->zcr_cbdata, rpt->zcr_cbinfo); + + if (rpt->zcr_ckinfo != NULL) + kmem_free(rpt->zcr_ckinfo, sizeof (*rpt->zcr_ckinfo)); + + kmem_free(rpt, sizeof (*rpt)); +} + +void +zfs_ereport_send_interim_checksum(zio_cksum_report_t *report) +{ +#ifdef _KERNEL + fm_ereport_post(report->zcr_ereport, EVCH_SLEEP); +#endif +} + +void +zfs_ereport_post_checksum(spa_t *spa, vdev_t *vd, + struct zio *zio, uint64_t offset, uint64_t length, + const void *good_data, const void *bad_data, zio_bad_cksum_t *zbc) +{ +#ifdef _KERNEL + nvlist_t *ereport = NULL; + nvlist_t *detector = NULL; + zfs_ecksum_info_t *info; + + zfs_ereport_start(&ereport, &detector, + FM_EREPORT_ZFS_CHECKSUM, spa, vd, zio, offset, length); + + if (ereport == NULL) + return; + + info = annotate_ecksum(ereport, zbc, good_data, bad_data, length, + B_FALSE); + + if (info != NULL) + fm_ereport_post(ereport, EVCH_SLEEP); + + fm_nvlist_destroy(ereport, FM_NVA_FREE); + fm_nvlist_destroy(detector, FM_NVA_FREE); + + if (info != NULL) + kmem_free(info, sizeof (*info)); +#endif +} + static void zfs_post_common(spa_t *spa, vdev_t *vd, const char *name) { @@ -319,6 +805,9 @@ zfs_post_common(spa_t *spa, vdev_t *vd, const char *name) nvlist_t *resource; char class[64]; + if (spa_load_state(spa) == SPA_LOAD_TRYIMPORT) + return; + if ((resource = fm_nvlist_create(NULL)) == NULL) return; @@ -360,3 +849,15 @@ zfs_post_autoreplace(spa_t *spa, vdev_t *vd) { zfs_post_common(spa, vd, FM_RESOURCE_AUTOREPLACE); } + +/* + * The 'resource.fs.zfs.statechange' event is an internal signal that the + * given vdev has transitioned its state to DEGRADED or HEALTHY. This will + * cause the retire agent to repair any outstanding fault management cases + * open because the device was not found (fault.fs.zfs.device). + */ +void +zfs_post_state_change(spa_t *spa, vdev_t *vd) +{ + zfs_post_common(spa, vd, FM_RESOURCE_STATECHANGE); +} diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_fuid.c b/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_fuid.c index 7cb505258d8f7..dfa4f8daef58d 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_fuid.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_fuid.c @@ -19,12 +19,11 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #include -#include #include #include #include @@ -47,8 +46,10 @@ * During file system initialization the nvlist(s) are read and * two AVL trees are created. One tree is keyed by the index number * and the other by the domain string. Nodes are never removed from - * trees, but new entries may be added. If a new entry is added then the - * on-disk packed nvlist will also be updated. + * trees, but new entries may be added. If a new entry is added then + * the zfsvfs->z_fuid_dirty flag is set to true and the caller will then + * be responsible for calling zfs_fuid_sync() to sync the changes to disk. + * */ #define FUID_IDX "fuid_idx" @@ -97,6 +98,15 @@ domain_compare(const void *arg1, const void *arg2) return (val > 0 ? 1 : -1); } +void +zfs_fuid_avl_tree_create(avl_tree_t *idx_tree, avl_tree_t *domain_tree) +{ + avl_create(idx_tree, idx_compare, + sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_idxnode)); + avl_create(domain_tree, domain_compare, + sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_domnode)); +} + /* * load initial fuid domain and idx trees. This function is used by * both the kernel and zdb. @@ -108,12 +118,9 @@ zfs_fuid_table_load(objset_t *os, uint64_t fuid_obj, avl_tree_t *idx_tree, dmu_buf_t *db; uint64_t fuid_size; - avl_create(idx_tree, idx_compare, - sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_idxnode)); - avl_create(domain_tree, domain_compare, - sizeof (fuid_domain_t), offsetof(fuid_domain_t, f_domnode)); - - VERIFY(0 == dmu_bonus_hold(os, fuid_obj, FTAG, &db)); + ASSERT(fuid_obj != 0); + VERIFY(0 == dmu_bonus_hold(os, fuid_obj, + FTAG, &db)); fuid_size = *(uint64_t *)db->db_data; dmu_buf_rele(db, FTAG); @@ -125,7 +132,8 @@ zfs_fuid_table_load(objset_t *os, uint64_t fuid_obj, avl_tree_t *idx_tree, int i; packed = kmem_alloc(fuid_size, KM_SLEEP); - VERIFY(dmu_read(os, fuid_obj, 0, fuid_size, packed) == 0); + VERIFY(dmu_read(os, fuid_obj, 0, + fuid_size, packed, DMU_READ_PREFETCH) == 0); VERIFY(nvlist_unpack(packed, fuid_size, &nvp, 0) == 0); VERIFY(nvlist_lookup_nvlist_array(nvp, FUID_NVP_ARRAY, @@ -189,10 +197,8 @@ zfs_fuid_idx_domain(avl_tree_t *idx_tree, uint32_t idx) * Load the fuid table(s) into memory. */ static void -zfs_fuid_init(zfsvfs_t *zfsvfs, dmu_tx_t *tx) +zfs_fuid_init(zfsvfs_t *zfsvfs) { - int error = 0; - rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER); if (zfsvfs->z_fuid_loaded) { @@ -200,41 +206,101 @@ zfs_fuid_init(zfsvfs_t *zfsvfs, dmu_tx_t *tx) return; } - if (zfsvfs->z_fuid_obj == 0) { - - /* first make sure we need to allocate object */ - - error = zap_lookup(zfsvfs->z_os, MASTER_NODE_OBJ, - ZFS_FUID_TABLES, 8, 1, &zfsvfs->z_fuid_obj); - if (error == ENOENT && tx != NULL) { - zfsvfs->z_fuid_obj = dmu_object_alloc(zfsvfs->z_os, - DMU_OT_FUID, 1 << 14, DMU_OT_FUID_SIZE, - sizeof (uint64_t), tx); - VERIFY(zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, - ZFS_FUID_TABLES, sizeof (uint64_t), 1, - &zfsvfs->z_fuid_obj, tx) == 0); - } - } + zfs_fuid_avl_tree_create(&zfsvfs->z_fuid_idx, &zfsvfs->z_fuid_domain); + (void) zap_lookup(zfsvfs->z_os, MASTER_NODE_OBJ, + ZFS_FUID_TABLES, 8, 1, &zfsvfs->z_fuid_obj); if (zfsvfs->z_fuid_obj != 0) { zfsvfs->z_fuid_size = zfs_fuid_table_load(zfsvfs->z_os, zfsvfs->z_fuid_obj, &zfsvfs->z_fuid_idx, &zfsvfs->z_fuid_domain); - zfsvfs->z_fuid_loaded = B_TRUE; } + zfsvfs->z_fuid_loaded = B_TRUE; + rw_exit(&zfsvfs->z_fuid_lock); +} + +/* + * sync out AVL trees to persistent storage. 
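(A minimal sketch of the caller contract described above, modeled on the zfs_make_xattrdir() hunk earlier in this patch; the function name and the trimmed error handling are illustrative assumptions, not part of the change.)

/*
 * Sketch only: a transaction that may have dirtied the in-core FUID
 * tables (e.g. via zfs_fuid_create()) is expected to reserve space with
 * zfs_fuid_txhold() and push the tables out with zfs_fuid_sync().
 */
static int
example_fuid_tx(zfsvfs_t *zfsvfs, dmu_tx_t *tx)
{
	boolean_t fuid_dirtied = zfsvfs->z_fuid_dirty;

	if (fuid_dirtied)
		zfs_fuid_txhold(zfsvfs, tx);	/* reserve space for the table */

	if (dmu_tx_assign(tx, TXG_NOWAIT) != 0) {
		dmu_tx_abort(tx);
		return (ERESTART);
	}

	/* ... create objects, log the change, etc. ... */

	if (fuid_dirtied)
		zfs_fuid_sync(zfsvfs, tx);	/* write the AVL trees to disk */

	dmu_tx_commit(tx);
	return (0);
}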
+ */ +void +zfs_fuid_sync(zfsvfs_t *zfsvfs, dmu_tx_t *tx) +{ + nvlist_t *nvp; + nvlist_t **fuids; + size_t nvsize = 0; + char *packed; + dmu_buf_t *db; + fuid_domain_t *domnode; + int numnodes; + int i; + + if (!zfsvfs->z_fuid_dirty) { + return; + } + + rw_enter(&zfsvfs->z_fuid_lock, RW_WRITER); + + /* + * First see if table needs to be created? + */ + if (zfsvfs->z_fuid_obj == 0) { + zfsvfs->z_fuid_obj = dmu_object_alloc(zfsvfs->z_os, + DMU_OT_FUID, 1 << 14, DMU_OT_FUID_SIZE, + sizeof (uint64_t), tx); + VERIFY(zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, + ZFS_FUID_TABLES, sizeof (uint64_t), 1, + &zfsvfs->z_fuid_obj, tx) == 0); + } + + VERIFY(nvlist_alloc(&nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + numnodes = avl_numnodes(&zfsvfs->z_fuid_idx); + fuids = kmem_alloc(numnodes * sizeof (void *), KM_SLEEP); + for (i = 0, domnode = avl_first(&zfsvfs->z_fuid_domain); domnode; i++, + domnode = AVL_NEXT(&zfsvfs->z_fuid_domain, domnode)) { + VERIFY(nvlist_alloc(&fuids[i], NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_uint64(fuids[i], FUID_IDX, + domnode->f_idx) == 0); + VERIFY(nvlist_add_uint64(fuids[i], FUID_OFFSET, 0) == 0); + VERIFY(nvlist_add_string(fuids[i], FUID_DOMAIN, + domnode->f_ksid->kd_name) == 0); + } + VERIFY(nvlist_add_nvlist_array(nvp, FUID_NVP_ARRAY, + fuids, numnodes) == 0); + for (i = 0; i != numnodes; i++) + nvlist_free(fuids[i]); + kmem_free(fuids, numnodes * sizeof (void *)); + VERIFY(nvlist_size(nvp, &nvsize, NV_ENCODE_XDR) == 0); + packed = kmem_alloc(nvsize, KM_SLEEP); + VERIFY(nvlist_pack(nvp, &packed, &nvsize, + NV_ENCODE_XDR, KM_SLEEP) == 0); + nvlist_free(nvp); + zfsvfs->z_fuid_size = nvsize; + dmu_write(zfsvfs->z_os, zfsvfs->z_fuid_obj, 0, + zfsvfs->z_fuid_size, packed, tx); + kmem_free(packed, zfsvfs->z_fuid_size); + VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, zfsvfs->z_fuid_obj, + FTAG, &db)); + dmu_buf_will_dirty(db, tx); + *(uint64_t *)db->db_data = zfsvfs->z_fuid_size; + dmu_buf_rele(db, FTAG); + + zfsvfs->z_fuid_dirty = B_FALSE; rw_exit(&zfsvfs->z_fuid_lock); } /* * Query domain table for a given domain. * - * If domain isn't found it is added to AVL trees and - * the results are pushed out to disk. + * If domain isn't found and addok is set, it is added to AVL trees and + * the zfsvfs->z_fuid_dirty flag will be set to TRUE. It will then be + * necessary for the caller or another thread to detect the dirty table + * and sync out the changes. */ int -zfs_fuid_find_by_domain(zfsvfs_t *zfsvfs, const char *domain, char **retdomain, - dmu_tx_t *tx) +zfs_fuid_find_by_domain(zfsvfs_t *zfsvfs, const char *domain, + char **retdomain, boolean_t addok) { fuid_domain_t searchnode, *findnode; avl_index_t loc; @@ -246,16 +312,16 @@ zfs_fuid_find_by_domain(zfsvfs_t *zfsvfs, const char *domain, char **retdomain, * for the user nobody. 
*/ if (domain[0] == '\0') { - *retdomain = nulldomain; + if (retdomain) + *retdomain = nulldomain; return (0); } searchnode.f_ksid = ksid_lookupdomain(domain); - if (retdomain) { + if (retdomain) *retdomain = searchnode.f_ksid->kd_name; - } if (!zfsvfs->z_fuid_loaded) - zfs_fuid_init(zfsvfs, tx); + zfs_fuid_init(zfsvfs); retry: rw_enter(&zfsvfs->z_fuid_lock, rw); @@ -265,15 +331,9 @@ zfs_fuid_find_by_domain(zfsvfs_t *zfsvfs, const char *domain, char **retdomain, rw_exit(&zfsvfs->z_fuid_lock); ksiddomain_rele(searchnode.f_ksid); return (findnode->f_idx); - } else { + } else if (addok) { fuid_domain_t *domnode; - nvlist_t *nvp; - nvlist_t **fuids; uint64_t retidx; - size_t nvsize = 0; - char *packed; - dmu_buf_t *db; - int i = 0; if (rw == RW_READER && !rw_tryupgrade(&zfsvfs->z_fuid_lock)) { rw_exit(&zfsvfs->z_fuid_lock); @@ -288,46 +348,12 @@ zfs_fuid_find_by_domain(zfsvfs_t *zfsvfs, const char *domain, char **retdomain, avl_add(&zfsvfs->z_fuid_domain, domnode); avl_add(&zfsvfs->z_fuid_idx, domnode); - /* - * Now resync the on-disk nvlist. - */ - VERIFY(nvlist_alloc(&nvp, NV_UNIQUE_NAME, KM_SLEEP) == 0); - - domnode = avl_first(&zfsvfs->z_fuid_domain); - fuids = kmem_alloc(retidx * sizeof (void *), KM_SLEEP); - while (domnode) { - VERIFY(nvlist_alloc(&fuids[i], - NV_UNIQUE_NAME, KM_SLEEP) == 0); - VERIFY(nvlist_add_uint64(fuids[i], FUID_IDX, - domnode->f_idx) == 0); - VERIFY(nvlist_add_uint64(fuids[i], - FUID_OFFSET, 0) == 0); - VERIFY(nvlist_add_string(fuids[i++], FUID_DOMAIN, - domnode->f_ksid->kd_name) == 0); - domnode = AVL_NEXT(&zfsvfs->z_fuid_domain, domnode); - } - VERIFY(nvlist_add_nvlist_array(nvp, FUID_NVP_ARRAY, - fuids, retidx) == 0); - for (i = 0; i != retidx; i++) - nvlist_free(fuids[i]); - kmem_free(fuids, retidx * sizeof (void *)); - VERIFY(nvlist_size(nvp, &nvsize, NV_ENCODE_XDR) == 0); - packed = kmem_alloc(nvsize, KM_SLEEP); - VERIFY(nvlist_pack(nvp, &packed, &nvsize, - NV_ENCODE_XDR, KM_SLEEP) == 0); - nvlist_free(nvp); - zfsvfs->z_fuid_size = nvsize; - dmu_write(zfsvfs->z_os, zfsvfs->z_fuid_obj, 0, - zfsvfs->z_fuid_size, packed, tx); - kmem_free(packed, zfsvfs->z_fuid_size); - VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, zfsvfs->z_fuid_obj, - FTAG, &db)); - dmu_buf_will_dirty(db, tx); - *(uint64_t *)db->db_data = zfsvfs->z_fuid_size; - dmu_buf_rele(db, FTAG); - + zfsvfs->z_fuid_dirty = B_TRUE; rw_exit(&zfsvfs->z_fuid_lock); return (retidx); + } else { + rw_exit(&zfsvfs->z_fuid_lock); + return (-1); } } @@ -337,7 +363,7 @@ zfs_fuid_find_by_domain(zfsvfs_t *zfsvfs, const char *domain, char **retdomain, * Returns a pointer from an avl node of the domain string. * */ -static char * +const char * zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx) { char *domain; @@ -346,7 +372,7 @@ zfs_fuid_find_by_idx(zfsvfs_t *zfsvfs, uint32_t idx) return (NULL); if (!zfsvfs->z_fuid_loaded) - zfs_fuid_init(zfsvfs, NULL); + zfs_fuid_init(zfsvfs); rw_enter(&zfsvfs->z_fuid_lock, RW_READER); @@ -374,7 +400,7 @@ zfs_fuid_map_id(zfsvfs_t *zfsvfs, uint64_t fuid, cred_t *cr, zfs_fuid_type_t type) { uint32_t index = FUID_INDEX(fuid); - char *domain; + const char *domain; uid_t id; if (index == 0) @@ -400,7 +426,7 @@ zfs_fuid_map_id(zfsvfs_t *zfsvfs, uint64_t fuid, * If ACL has multiple domains, then keep only one copy of each unique * domain. 
*/ -static void +void zfs_fuid_node_add(zfs_fuid_info_t **fuidpp, const char *domain, uint32_t rid, uint64_t idx, uint64_t id, zfs_fuid_type_t type) { @@ -439,6 +465,7 @@ zfs_fuid_node_add(zfs_fuid_info_t **fuidpp, const char *domain, uint32_t rid, } if (type == ZFS_ACE_USER || type == ZFS_ACE_GROUP) { + /* * Now allocate fuid entry and add it on the end of the list */ @@ -463,7 +490,7 @@ zfs_fuid_node_add(zfs_fuid_info_t **fuidpp, const char *domain, uint32_t rid, */ uint64_t zfs_fuid_create_cred(zfsvfs_t *zfsvfs, zfs_fuid_type_t type, - dmu_tx_t *tx, cred_t *cr, zfs_fuid_info_t **fuidp) + cred_t *cr, zfs_fuid_info_t **fuidp) { uint64_t idx; ksid_t *ksid; @@ -482,6 +509,11 @@ zfs_fuid_create_cred(zfsvfs_t *zfsvfs, zfs_fuid_type_t type, id = crgetuid(cr); else id = crgetgid(cr); + + if (IS_EPHEMERAL(id)) { + return ((uint64_t)(type == ZFS_OWNER ? + UID_NOBODY : GID_NOBODY)); + } } if (!zfsvfs->z_use_fuids || (!IS_EPHEMERAL(id))) @@ -490,7 +522,7 @@ zfs_fuid_create_cred(zfsvfs_t *zfsvfs, zfs_fuid_type_t type, rid = ksid_getrid(ksid); domain = ksid_getdomain(ksid); - idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, tx); + idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, B_TRUE); zfs_fuid_node_add(fuidp, kdomain, rid, idx, id, type); @@ -511,7 +543,7 @@ zfs_fuid_create_cred(zfsvfs_t *zfsvfs, zfs_fuid_type_t type, */ uint64_t zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr, - zfs_fuid_type_t type, dmu_tx_t *tx, zfs_fuid_info_t **fuidpp) + zfs_fuid_type_t type, zfs_fuid_info_t **fuidpp) { const char *domain; char *kdomain; @@ -519,7 +551,6 @@ zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr, uint32_t rid; idmap_stat status; uint64_t idx; - boolean_t is_replay = (zfsvfs->z_assign >= TXG_INITIAL); zfs_fuid_t *zfuid = NULL; zfs_fuid_info_t *fuidp; @@ -534,7 +565,7 @@ zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr, if (!zfsvfs->z_use_fuids || !IS_EPHEMERAL(id) || fuid_idx != 0) return (id); - if (is_replay) { + if (zfsvfs->z_replay) { fuidp = zfsvfs->z_fuid_replay; /* @@ -582,10 +613,11 @@ zfs_fuid_create(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr, } } - idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, tx); + idx = zfs_fuid_find_by_domain(zfsvfs, domain, &kdomain, B_TRUE); - if (!is_replay) - zfs_fuid_node_add(fuidpp, kdomain, rid, idx, id, type); + if (!zfsvfs->z_replay) + zfs_fuid_node_add(fuidpp, kdomain, + rid, idx, id, type); else if (zfuid != NULL) { list_remove(&fuidp->z_fuids, zfuid); kmem_free(zfuid, sizeof (zfs_fuid_t)); @@ -659,16 +691,15 @@ boolean_t zfs_groupmember(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr) { ksid_t *ksid = crgetsid(cr, KSID_GROUP); + ksidlist_t *ksidlist = crgetsidlist(cr); uid_t gid; - if (ksid) { + if (ksid && ksidlist) { int i; ksid_t *ksid_groups; - ksidlist_t *ksidlist = crgetsidlist(cr); uint32_t idx = FUID_INDEX(id); uint32_t rid = FUID_RID(id); - ASSERT(ksidlist); ksid_groups = ksidlist->ksl_sids; for (i = 0; i != ksidlist->ksl_nsid; i++) { @@ -678,7 +709,7 @@ zfs_groupmember(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr) return (B_TRUE); } } else { - char *domain; + const char *domain; domain = zfs_fuid_find_by_idx(zfsvfs, idx); ASSERT(domain != NULL); @@ -701,4 +732,19 @@ zfs_groupmember(zfsvfs_t *zfsvfs, uint64_t id, cred_t *cr) gid = zfs_fuid_map_id(zfsvfs, id, cr, ZFS_GROUP); return (groupmember(gid, cr)); } + +void +zfs_fuid_txhold(zfsvfs_t *zfsvfs, dmu_tx_t *tx) +{ + if (zfsvfs->z_fuid_obj == 0) { + dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + 
FUID_SIZE_ESTIMATE(zfsvfs)); + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); + } else { + dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); + dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, + FUID_SIZE_ESTIMATE(zfsvfs)); + } +} #endif diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_ioctl.c b/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_ioctl.c index c77892f90ab5b..f3c2c1d1bb349 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_ioctl.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_ioctl.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -36,12 +36,13 @@ #include #include #include +#include #include #include #include #include #include -#include +#include #include #include #include @@ -79,21 +80,36 @@ dev_info_t *zfs_dip; typedef int zfs_ioc_func_t(zfs_cmd_t *); typedef int zfs_secpolicy_func_t(zfs_cmd_t *, cred_t *); +typedef enum { + NO_NAME, + POOL_NAME, + DATASET_NAME +} zfs_ioc_namecheck_t; + typedef struct zfs_ioc_vec { zfs_ioc_func_t *zvec_func; zfs_secpolicy_func_t *zvec_secpolicy; - enum { - NO_NAME, - POOL_NAME, - DATASET_NAME - } zvec_namecheck; + zfs_ioc_namecheck_t zvec_namecheck; boolean_t zvec_his_log; + boolean_t zvec_pool_check; } zfs_ioc_vec_t; -static void clear_props(char *dataset, nvlist_t *props); +/* This array is indexed by zfs_userquota_prop_t */ +static const char *userquota_perms[] = { + ZFS_DELEG_PERM_USERUSED, + ZFS_DELEG_PERM_USERQUOTA, + ZFS_DELEG_PERM_GROUPUSED, + ZFS_DELEG_PERM_GROUPQUOTA, +}; + +static int zfs_ioc_userspace_upgrade(zfs_cmd_t *zc); +static int zfs_check_settable(const char *name, nvpair_t *property, + cred_t *cr); +static int zfs_check_clearable(char *dataset, nvlist_t *props, + nvlist_t **errors); static int zfs_fill_zplprops_root(uint64_t, nvlist_t *, nvlist_t *, boolean_t *); -int zfs_set_prop_nvlist(const char *, nvlist_t *); +int zfs_set_prop_nvlist(const char *, zprop_source_t, nvlist_t *, nvlist_t **); /* _NOTE(PRINTFLIKE(4)) - this is printf-like, but lint is too whiney */ void @@ -163,22 +179,15 @@ history_str_get(zfs_cmd_t *zc) static boolean_t zfs_is_bootfs(const char *name) { - spa_t *spa; - boolean_t ret = B_FALSE; - - if (spa_open(name, &spa, FTAG) == 0) { - if (spa->spa_bootfs) { - objset_t *os; + objset_t *os; - if (dmu_objset_open(name, DMU_OST_ZFS, - DS_MODE_USER | DS_MODE_READONLY, &os) == 0) { - ret = (dmu_objset_id(os) == spa->spa_bootfs); - dmu_objset_close(os); - } - } - spa_close(spa, FTAG); + if (dmu_objset_hold(name, FTAG, &os) == 0) { + boolean_t ret; + ret = (dmu_objset_id(os) == spa_bootfs(dmu_objset_spa(os))); + dmu_objset_rele(os, FTAG); + return (ret); } - return (ret); + return (B_FALSE); } /* @@ -212,13 +221,17 @@ zpl_earlier_version(const char *name, int version) objset_t *os; boolean_t rc = B_TRUE; - if (dmu_objset_open(name, DMU_OST_ANY, - DS_MODE_USER | DS_MODE_READONLY, &os) == 0) { + if (dmu_objset_hold(name, FTAG, &os) == 0) { uint64_t zplversion; + if (dmu_objset_type(os) != DMU_OST_ZFS) { + dmu_objset_rele(os, FTAG); + return (B_TRUE); + } + /* XXX reading from non-owned objset */ if (zfs_get_zplprop(os, ZFS_PROP_VERSION, &zplversion) == 0) rc = zplversion < version; - dmu_objset_close(os); + dmu_objset_rele(os, FTAG); } return (rc); } @@ -318,9 +331,109 @@ zfs_secpolicy_write_perms(const char *name, const char *perm, cred_t *cr) return (error); } +/* + * Policy for setting the security label property. 
+ * + * Returns 0 for success, non-zero for access and other errors. + */ +static int +zfs_set_slabel_policy(const char *name, char *strval, cred_t *cr) +{ + char ds_hexsl[MAXNAMELEN]; + bslabel_t ds_sl, new_sl; + boolean_t new_default = FALSE; + uint64_t zoned; + int needed_priv = -1; + int error; + + /* First get the existing dataset label. */ + error = dsl_prop_get(name, zfs_prop_to_name(ZFS_PROP_MLSLABEL), + 1, sizeof (ds_hexsl), &ds_hexsl, NULL); + if (error) + return (EPERM); + + if (strcasecmp(strval, ZFS_MLSLABEL_DEFAULT) == 0) + new_default = TRUE; + + /* The label must be translatable */ + if (!new_default && (hexstr_to_label(strval, &new_sl) != 0)) + return (EINVAL); + + /* + * In a non-global zone, disallow attempts to set a label that + * doesn't match that of the zone; otherwise no other checks + * are needed. + */ + if (!INGLOBALZONE(curproc)) { + if (new_default || !blequal(&new_sl, CR_SL(CRED()))) + return (EPERM); + return (0); + } + + /* + * For global-zone datasets (i.e., those whose zoned property is + * "off", verify that the specified new label is valid for the + * global zone. + */ + if (dsl_prop_get_integer(name, + zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL)) + return (EPERM); + if (!zoned) { + if (zfs_check_global_label(name, strval) != 0) + return (EPERM); + } + + /* + * If the existing dataset label is nondefault, check if the + * dataset is mounted (label cannot be changed while mounted). + * Get the zfsvfs; if there isn't one, then the dataset isn't + * mounted (or isn't a dataset, doesn't exist, ...). + */ + if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) != 0) { + objset_t *os; + static char *setsl_tag = "setsl_tag"; + + /* + * Try to own the dataset; abort if there is any error, + * (e.g., already mounted, in use, or other error). + */ + error = dmu_objset_own(name, DMU_OST_ZFS, B_TRUE, + setsl_tag, &os); + if (error) + return (EPERM); + + dmu_objset_disown(os, setsl_tag); + + if (new_default) { + needed_priv = PRIV_FILE_DOWNGRADE_SL; + goto out_check; + } + + if (hexstr_to_label(strval, &new_sl) != 0) + return (EPERM); + + if (blstrictdom(&ds_sl, &new_sl)) + needed_priv = PRIV_FILE_DOWNGRADE_SL; + else if (blstrictdom(&new_sl, &ds_sl)) + needed_priv = PRIV_FILE_UPGRADE_SL; + } else { + /* dataset currently has a default label */ + if (!new_default) + needed_priv = PRIV_FILE_UPGRADE_SL; + } + +out_check: + if (needed_priv != -1) + return (PRIV_POLICY(cr, needed_priv, B_FALSE, EPERM, NULL)); + return (0); +} + static int -zfs_secpolicy_setprop(const char *name, zfs_prop_t prop, cred_t *cr) +zfs_secpolicy_setprop(const char *dsname, zfs_prop_t prop, nvpair_t *propval, + cred_t *cr) { + char *strval; + /* * Check permissions for special properties. */ @@ -342,16 +455,29 @@ zfs_secpolicy_setprop(const char *name, zfs_prop_t prop, cred_t *cr) * quota on things *under* (ie. contained by) * the thing they own. 
*/ - if (dsl_prop_get_integer(name, "zoned", &zoned, + if (dsl_prop_get_integer(dsname, "zoned", &zoned, setpoint)) return (EPERM); - if (!zoned || strlen(name) <= strlen(setpoint)) + if (!zoned || strlen(dsname) <= strlen(setpoint)) return (EPERM); } break; + + case ZFS_PROP_MLSLABEL: + if (!is_system_labeled()) + return (EPERM); + + if (nvpair_value_string(propval, &strval) == 0) { + int err; + + err = zfs_set_slabel_policy(dsname, strval, CRED()); + if (err != 0) + return (err); + } + break; } - return (zfs_secpolicy_write_perms(name, zfs_prop_to_name(prop), cr)); + return (zfs_secpolicy_write_perms(dsname, zfs_prop_to_name(prop), cr)); } int @@ -373,13 +499,8 @@ zfs_secpolicy_fsacl(zfs_cmd_t *zc, cred_t *cr) int zfs_secpolicy_rollback(zfs_cmd_t *zc, cred_t *cr) { - int error; - error = zfs_secpolicy_write_perms(zc->zc_name, - ZFS_DELEG_PERM_ROLLBACK, cr); - if (error == 0) - error = zfs_secpolicy_write_perms(zc->zc_name, - ZFS_DELEG_PERM_MOUNT, cr); - return (error); + return (zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_ROLLBACK, cr)); } int @@ -389,6 +510,30 @@ zfs_secpolicy_send(zfs_cmd_t *zc, cred_t *cr) ZFS_DELEG_PERM_SEND, cr)); } +static int +zfs_secpolicy_deleg_share(zfs_cmd_t *zc, cred_t *cr) +{ + vnode_t *vp; + int error; + + if ((error = lookupname(zc->zc_value, UIO_SYSSPACE, + NO_FOLLOW, NULL, &vp)) != 0) + return (error); + + /* Now make sure mntpnt and dataset are ZFS */ + + if (vp->v_vfsp->vfs_fstype != zfsfstype || + (strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource), + zc->zc_name) != 0)) { + VN_RELE(vp); + return (EPERM); + } + + VN_RELE(vp); + return (dsl_deleg_access(zc->zc_name, + ZFS_DELEG_PERM_SHARE, cr)); +} + int zfs_secpolicy_share(zfs_cmd_t *zc, cred_t *cr) { @@ -398,25 +543,20 @@ zfs_secpolicy_share(zfs_cmd_t *zc, cred_t *cr) if (secpolicy_nfs(cr) == 0) { return (0); } else { - vnode_t *vp; - int error; - - if ((error = lookupname(zc->zc_value, UIO_SYSSPACE, - NO_FOLLOW, NULL, &vp)) != 0) - return (error); - - /* Now make sure mntpnt and dataset are ZFS */ + return (zfs_secpolicy_deleg_share(zc, cr)); + } +} - if (vp->v_vfsp->vfs_fstype != zfsfstype || - (strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource), - zc->zc_name) != 0)) { - VN_RELE(vp); - return (EPERM); - } +int +zfs_secpolicy_smb_acl(zfs_cmd_t *zc, cred_t *cr) +{ + if (!INGLOBALZONE(curproc)) + return (EPERM); - VN_RELE(vp); - return (dsl_deleg_access(zc->zc_name, - ZFS_DELEG_PERM_SHARE, cr)); + if (secpolicy_smb(cr) == 0) { + return (0); + } else { + return (zfs_secpolicy_deleg_share(zc, cr)); } } @@ -460,6 +600,31 @@ zfs_secpolicy_destroy(zfs_cmd_t *zc, cred_t *cr) return (zfs_secpolicy_destroy_perms(zc->zc_name, cr)); } +/* + * Destroying snapshots with delegated permissions requires + * descendent mount and destroy permissions. + * Reassemble the full filesystem@snap name so dsl_deleg_access() + * can do the correct permission check. + * + * Since this routine is used when doing a recursive destroy of snapshots + * and destroying snapshots requires descendent permissions, a successfull + * check of the top level snapshot applies to snapshots of all descendent + * datasets as well. 
+ */ +static int +zfs_secpolicy_destroy_snaps(zfs_cmd_t *zc, cred_t *cr) +{ + int error; + char *dsname; + + dsname = kmem_asprintf("%s@%s", zc->zc_name, zc->zc_value); + + error = zfs_secpolicy_destroy_perms(dsname, cr); + + strfree(dsname); + return (error); +} + /* * Must have sys_config privilege to check the iscsi permission */ @@ -473,7 +638,7 @@ zfs_secpolicy_iscsi(zfs_cmd_t *zc, cred_t *cr) int zfs_secpolicy_rename_perms(const char *from, const char *to, cred_t *cr) { - char parentname[MAXNAMELEN]; + char parentname[MAXNAMELEN]; int error; if ((error = zfs_secpolicy_write_perms(from, @@ -508,7 +673,7 @@ zfs_secpolicy_rename(zfs_cmd_t *zc, cred_t *cr) static int zfs_secpolicy_promote(zfs_cmd_t *zc, cred_t *cr) { - char parentname[MAXNAMELEN]; + char parentname[MAXNAMELEN]; objset_t *clone; int error; @@ -517,20 +682,19 @@ zfs_secpolicy_promote(zfs_cmd_t *zc, cred_t *cr) if (error) return (error); - error = dmu_objset_open(zc->zc_name, DMU_OST_ANY, - DS_MODE_USER | DS_MODE_READONLY, &clone); + error = dmu_objset_hold(zc->zc_name, FTAG, &clone); if (error == 0) { dsl_dataset_t *pclone = NULL; dsl_dir_t *dd; - dd = clone->os->os_dsl_dataset->ds_dir; + dd = clone->os_dsl_dataset->ds_dir; rw_enter(&dd->dd_pool->dp_config_rwlock, RW_READER); error = dsl_dataset_hold_obj(dd->dd_pool, dd->dd_phys->dd_origin_obj, FTAG, &pclone); rw_exit(&dd->dd_pool->dp_config_rwlock); if (error) { - dmu_objset_close(clone); + dmu_objset_rele(clone, FTAG); return (error); } @@ -538,7 +702,7 @@ zfs_secpolicy_promote(zfs_cmd_t *zc, cred_t *cr) ZFS_DELEG_PERM_MOUNT, cr); dsl_dataset_name(pclone, parentname); - dmu_objset_close(clone); + dmu_objset_rele(clone, FTAG); dsl_dataset_rele(pclone, FTAG); if (error == 0) error = zfs_secpolicy_write_perms(parentname, @@ -567,16 +731,8 @@ zfs_secpolicy_receive(zfs_cmd_t *zc, cred_t *cr) int zfs_secpolicy_snapshot_perms(const char *name, cred_t *cr) { - int error; - - if ((error = zfs_secpolicy_write_perms(name, - ZFS_DELEG_PERM_SNAPSHOT, cr)) != 0) - return (error); - - error = zfs_secpolicy_write_perms(name, - ZFS_DELEG_PERM_MOUNT, cr); - - return (error); + return (zfs_secpolicy_write_perms(name, + ZFS_DELEG_PERM_SNAPSHOT, cr)); } static int @@ -589,8 +745,8 @@ zfs_secpolicy_snapshot(zfs_cmd_t *zc, cred_t *cr) static int zfs_secpolicy_create(zfs_cmd_t *zc, cred_t *cr) { - char parentname[MAXNAMELEN]; - int error; + char parentname[MAXNAMELEN]; + int error; if ((error = zfs_get_parent(zc->zc_name, parentname, sizeof (parentname))) != 0) @@ -638,22 +794,6 @@ zfs_secpolicy_config(zfs_cmd_t *zc, cred_t *cr) return (0); } -/* - * Just like zfs_secpolicy_config, except that we will check for - * mount permission on the dataset for permission to create/remove - * the minor nodes. - */ -static int -zfs_secpolicy_minor(zfs_cmd_t *zc, cred_t *cr) -{ - if (secpolicy_sys_config(cr, B_FALSE) != 0) { - return (dsl_deleg_access(zc->zc_name, - ZFS_DELEG_PERM_MOUNT, cr)); - } - - return (0); -} - /* * Policy for fault injection. Requires all privileges. 
*/ @@ -675,17 +815,80 @@ zfs_secpolicy_inherit(zfs_cmd_t *zc, cred_t *cr) return (zfs_secpolicy_write_perms(zc->zc_name, ZFS_DELEG_PERM_USERPROP, cr)); } else { - if (!zfs_prop_inheritable(prop)) - return (EINVAL); - return (zfs_secpolicy_setprop(zc->zc_name, prop, cr)); + return (zfs_secpolicy_setprop(zc->zc_name, prop, + NULL, cr)); + } +} + +static int +zfs_secpolicy_userspace_one(zfs_cmd_t *zc, cred_t *cr) +{ + int err = zfs_secpolicy_read(zc, cr); + if (err) + return (err); + + if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) + return (EINVAL); + + if (zc->zc_value[0] == 0) { + /* + * They are asking about a posix uid/gid. If it's + * themself, allow it. + */ + if (zc->zc_objset_type == ZFS_PROP_USERUSED || + zc->zc_objset_type == ZFS_PROP_USERQUOTA) { + if (zc->zc_guid == crgetuid(cr)) + return (0); + } else { + if (groupmember(zc->zc_guid, cr)) + return (0); + } } + + return (zfs_secpolicy_write_perms(zc->zc_name, + userquota_perms[zc->zc_objset_type], cr)); +} + +static int +zfs_secpolicy_userspace_many(zfs_cmd_t *zc, cred_t *cr) +{ + int err = zfs_secpolicy_read(zc, cr); + if (err) + return (err); + + if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) + return (EINVAL); + + return (zfs_secpolicy_write_perms(zc->zc_name, + userquota_perms[zc->zc_objset_type], cr)); +} + +static int +zfs_secpolicy_userspace_upgrade(zfs_cmd_t *zc, cred_t *cr) +{ + return (zfs_secpolicy_setprop(zc->zc_name, ZFS_PROP_VERSION, + NULL, cr)); +} + +static int +zfs_secpolicy_hold(zfs_cmd_t *zc, cred_t *cr) +{ + return (zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_HOLD, cr)); +} + +static int +zfs_secpolicy_release(zfs_cmd_t *zc, cred_t *cr) +{ + return (zfs_secpolicy_write_perms(zc->zc_name, + ZFS_DELEG_PERM_RELEASE, cr)); } /* * Returns the nvlist as specified by the user in the zfs_cmd_t. 
*/ static int -get_nvlist(uint64_t nvl, uint64_t size, nvlist_t **nvp) +get_nvlist(uint64_t nvl, uint64_t size, int iflag, nvlist_t **nvp) { char *packed; int error; @@ -699,7 +902,8 @@ get_nvlist(uint64_t nvl, uint64_t size, nvlist_t **nvp) packed = kmem_alloc(size, KM_SLEEP); - if ((error = xcopyin((void *)(uintptr_t)nvl, packed, size)) != 0) { + if ((error = ddi_copyin((void *)(uintptr_t)nvl, packed, size, + iflag)) != 0) { kmem_free(packed, size); return (error); } @@ -715,6 +919,41 @@ get_nvlist(uint64_t nvl, uint64_t size, nvlist_t **nvp) return (0); } +static int +fit_error_list(zfs_cmd_t *zc, nvlist_t **errors) +{ + size_t size; + + VERIFY(nvlist_size(*errors, &size, NV_ENCODE_NATIVE) == 0); + + if (size > zc->zc_nvlist_dst_size) { + nvpair_t *more_errors; + int n = 0; + + if (zc->zc_nvlist_dst_size < 1024) + return (ENOMEM); + + VERIFY(nvlist_add_int32(*errors, ZPROP_N_MORE_ERRORS, 0) == 0); + more_errors = nvlist_prev_nvpair(*errors, NULL); + + do { + nvpair_t *pair = nvlist_prev_nvpair(*errors, + more_errors); + VERIFY(nvlist_remove_nvpair(*errors, pair) == 0); + n++; + VERIFY(nvlist_size(*errors, &size, + NV_ENCODE_NATIVE) == 0); + } while (size > zc->zc_nvlist_dst_size); + + VERIFY(nvlist_remove_nvpair(*errors, more_errors) == 0); + VERIFY(nvlist_add_int32(*errors, ZPROP_N_MORE_ERRORS, n) == 0); + ASSERT(nvlist_size(*errors, &size, NV_ENCODE_NATIVE) == 0); + ASSERT(size <= zc->zc_nvlist_dst_size); + } + + return (0); +} + static int put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl) { @@ -730,8 +969,8 @@ put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl) packed = kmem_alloc(size, KM_SLEEP); VERIFY(nvlist_pack(nvl, &packed, &size, NV_ENCODE_NATIVE, KM_SLEEP) == 0); - error = xcopyout(packed, (void *)(uintptr_t)zc->zc_nvlist_dst, - size); + error = ddi_copyout(packed, + (void *)(uintptr_t)zc->zc_nvlist_dst, size, zc->zc_iflags); kmem_free(packed, size); } @@ -739,6 +978,71 @@ put_nvlist(zfs_cmd_t *zc, nvlist_t *nvl) return (error); } +static int +getzfsvfs(const char *dsname, zfsvfs_t **zfvp) +{ + objset_t *os; + int error; + + error = dmu_objset_hold(dsname, FTAG, &os); + if (error) + return (error); + if (dmu_objset_type(os) != DMU_OST_ZFS) { + dmu_objset_rele(os, FTAG); + return (EINVAL); + } + + mutex_enter(&os->os_user_ptr_lock); + *zfvp = dmu_objset_get_user(os); + if (*zfvp) { + VFS_HOLD((*zfvp)->z_vfs); + } else { + error = ESRCH; + } + mutex_exit(&os->os_user_ptr_lock); + dmu_objset_rele(os, FTAG); + return (error); +} + +/* + * Find a zfsvfs_t for a mounted filesystem, or create our own, in which + * case its z_vfs will be NULL, and it will be opened as the owner. + */ +static int +zfsvfs_hold(const char *name, void *tag, zfsvfs_t **zfvp) +{ + int error = 0; + + if (getzfsvfs(name, zfvp) != 0) + error = zfsvfs_create(name, zfvp); + if (error == 0) { + rrw_enter(&(*zfvp)->z_teardown_lock, RW_READER, tag); + if ((*zfvp)->z_unmounted) { + /* + * XXX we could probably try again, since the unmounting + * thread should be just about to disassociate the + * objset from the zfsvfs. 
+ */ + rrw_exit(&(*zfvp)->z_teardown_lock, tag); + return (EBUSY); + } + } + return (error); +} + +static void +zfsvfs_rele(zfsvfs_t *zfsvfs, void *tag) +{ + rrw_exit(&zfsvfs->z_teardown_lock, tag); + + if (zfsvfs->z_vfs) { + VFS_RELE(zfsvfs->z_vfs); + } else { + dmu_objset_disown(zfsvfs->z_os, zfsvfs); + zfsvfs_free(zfsvfs); + } +} + static int zfs_ioc_pool_create(zfs_cmd_t *zc) { @@ -749,11 +1053,12 @@ zfs_ioc_pool_create(zfs_cmd_t *zc) char *buf; if (error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, - &config)) + zc->zc_iflags, &config)) return (error); if (zc->zc_nvlist_src_size != 0 && (error = - get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, &props))) { + get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + zc->zc_iflags, &props))) { nvlist_free(config); return (error); } @@ -792,8 +1097,8 @@ zfs_ioc_pool_create(zfs_cmd_t *zc) /* * Set the remaining root properties */ - if (!error && - (error = zfs_set_prop_nvlist(zc->zc_name, rootprops)) != 0) + if (!error && (error = zfs_set_prop_nvlist(zc->zc_name, + ZPROP_SRC_LOCAL, rootprops, NULL)) != 0) (void) spa_destroy(zc->zc_name); if (buf != NULL) @@ -814,22 +1119,25 @@ zfs_ioc_pool_destroy(zfs_cmd_t *zc) int error; zfs_log_history(zc); error = spa_destroy(zc->zc_name); + if (error == 0) + zvol_remove_minors(zc->zc_name); return (error); } static int zfs_ioc_pool_import(zfs_cmd_t *zc) { - int error; nvlist_t *config, *props = NULL; uint64_t guid; + int error; if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, - &config)) != 0) + zc->zc_iflags, &config)) != 0) return (error); if (zc->zc_nvlist_src_size != 0 && (error = - get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, &props))) { + get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + zc->zc_iflags, &props))) { nvlist_free(config); return (error); } @@ -838,11 +1146,13 @@ zfs_ioc_pool_import(zfs_cmd_t *zc) guid != zc->zc_guid) error = EINVAL; else if (zc->zc_cookie) - error = spa_import_faulted(zc->zc_name, config, - props); + error = spa_import_verbatim(zc->zc_name, config, props); else error = spa_import(zc->zc_name, config, props); + if (zc->zc_nvlist_dst != 0) + (void) put_nvlist(zc, config); + nvlist_free(config); if (props) @@ -856,9 +1166,12 @@ zfs_ioc_pool_export(zfs_cmd_t *zc) { int error; boolean_t force = (boolean_t)zc->zc_cookie; + boolean_t hardforce = (boolean_t)zc->zc_guid; zfs_log_history(zc); - error = spa_export(zc->zc_name, NULL, force); + error = spa_export(zc->zc_name, NULL, force, hardforce); + if (error == 0) + zvol_remove_minors(zc->zc_name); return (error); } @@ -916,7 +1229,7 @@ zfs_ioc_pool_tryimport(zfs_cmd_t *zc) int error; if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, - &tryconfig)) != 0) + zc->zc_iflags, &tryconfig)) != 0) return (error); config = spa_tryimport(tryconfig); @@ -1004,9 +1317,9 @@ zfs_ioc_pool_get_history(zfs_cmd_t *zc) hist_buf = kmem_alloc(size, KM_SLEEP); if ((error = spa_history_get(spa, &zc->zc_history_offset, &zc->zc_history_len, hist_buf)) == 0) { - error = xcopyout(hist_buf, - (char *)(uintptr_t)zc->zc_history, - zc->zc_history_len); + error = ddi_copyout(hist_buf, + (void *)(uintptr_t)zc->zc_history, + zc->zc_history_len, zc->zc_iflags); } spa_close(spa, FTAG); @@ -1025,18 +1338,30 @@ zfs_ioc_dsobj_to_dsname(zfs_cmd_t *zc) return (0); } +/* + * inputs: + * zc_name name of filesystem + * zc_obj object to find + * + * outputs: + * zc_value name of object + */ static int zfs_ioc_obj_to_path(zfs_cmd_t *zc) { - objset_t *osp; + objset_t *os; int error; - if ((error = 
dmu_objset_open(zc->zc_name, DMU_OST_ZFS, - DS_MODE_USER | DS_MODE_READONLY, &osp)) != 0) + /* XXX reading from objset not owned */ + if ((error = dmu_objset_hold(zc->zc_name, FTAG, &os)) != 0) return (error); - error = zfs_obj_to_path(osp, zc->zc_obj, zc->zc_value, + if (dmu_objset_type(os) != DMU_OST_ZFS) { + dmu_objset_rele(os, FTAG); + return (EINVAL); + } + error = zfs_obj_to_path(os, zc->zc_obj, zc->zc_value, sizeof (zc->zc_value)); - dmu_objset_close(osp); + dmu_objset_rele(os, FTAG); return (error); } @@ -1054,7 +1379,7 @@ zfs_ioc_vdev_add(zfs_cmd_t *zc) return (error); error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, - &config); + zc->zc_iflags, &config); (void) nvlist_lookup_nvlist_array(config, ZPOOL_CONFIG_L2CACHE, &l2cache, &nl2cache); @@ -1071,7 +1396,8 @@ zfs_ioc_vdev_add(zfs_cmd_t *zc) * * l2cache and spare devices are ok to be added to a rootpool. */ - if (spa->spa_bootfs != 0 && nl2cache == 0 && nspares == 0) { + if (spa_bootfs(spa) != 0 && nl2cache == 0 && nspares == 0) { + nvlist_free(config); spa_close(spa, FTAG); return (EDOM); } @@ -1117,11 +1443,19 @@ zfs_ioc_vdev_set_state(zfs_cmd_t *zc) break; case VDEV_STATE_FAULTED: - error = vdev_fault(spa, zc->zc_guid); + if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED && + zc->zc_obj != VDEV_AUX_EXTERNAL) + zc->zc_obj = VDEV_AUX_ERR_EXCEEDED; + + error = vdev_fault(spa, zc->zc_guid, zc->zc_obj); break; case VDEV_STATE_DEGRADED: - error = vdev_degrade(spa, zc->zc_guid); + if (zc->zc_obj != VDEV_AUX_ERR_EXCEEDED && + zc->zc_obj != VDEV_AUX_EXTERNAL) + zc->zc_obj = VDEV_AUX_ERR_EXCEEDED; + + error = vdev_degrade(spa, zc->zc_guid, zc->zc_obj); break; default: @@ -1144,7 +1478,7 @@ zfs_ioc_vdev_attach(zfs_cmd_t *zc) return (error); if ((error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, - &config)) == 0) { + zc->zc_iflags, &config)) == 0) { error = spa_vdev_attach(spa, zc->zc_guid, config, replacing); nvlist_free(config); } @@ -1162,35 +1496,87 @@ zfs_ioc_vdev_detach(zfs_cmd_t *zc) if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); - error = spa_vdev_detach(spa, zc->zc_guid, B_FALSE); + error = spa_vdev_detach(spa, zc->zc_guid, 0, B_FALSE); spa_close(spa, FTAG); return (error); } static int -zfs_ioc_vdev_setpath(zfs_cmd_t *zc) +zfs_ioc_vdev_split(zfs_cmd_t *zc) { spa_t *spa; - char *path = zc->zc_value; - uint64_t guid = zc->zc_guid; + nvlist_t *config, *props = NULL; int error; + boolean_t exp = !!(zc->zc_cookie & ZPOOL_EXPORT_AFTER_SPLIT); - error = spa_open(zc->zc_name, &spa, FTAG); - if (error != 0) + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) return (error); - error = spa_vdev_setpath(spa, guid, path); - spa_close(spa, FTAG); - return (error); -} + if (error = get_nvlist(zc->zc_nvlist_conf, zc->zc_nvlist_conf_size, + zc->zc_iflags, &config)) { + spa_close(spa, FTAG); + return (error); + } -/* - * inputs: - * zc_name name of filesystem - * zc_nvlist_dst_size size of buffer for property nvlist - * - * outputs: + if (zc->zc_nvlist_src_size != 0 && (error = + get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + zc->zc_iflags, &props))) { + spa_close(spa, FTAG); + nvlist_free(config); + return (error); + } + + error = spa_vdev_split_mirror(spa, zc->zc_string, config, props, exp); + + spa_close(spa, FTAG); + + nvlist_free(config); + nvlist_free(props); + + return (error); +} + +static int +zfs_ioc_vdev_setpath(zfs_cmd_t *zc) +{ + spa_t *spa; + char *path = zc->zc_value; + uint64_t guid = zc->zc_guid; + int error; + + error = spa_open(zc->zc_name, &spa, FTAG); + if (error != 0) + 
return (error); + + error = spa_vdev_setpath(spa, guid, path); + spa_close(spa, FTAG); + return (error); +} + +static int +zfs_ioc_vdev_setfru(zfs_cmd_t *zc) +{ + spa_t *spa; + char *fru = zc->zc_value; + uint64_t guid = zc->zc_guid; + int error; + + error = spa_open(zc->zc_name, &spa, FTAG); + if (error != 0) + return (error); + + error = spa_vdev_setfru(spa, guid, fru); + spa_close(spa, FTAG); + return (error); +} + +/* + * inputs: + * zc_name name of filesystem + * zc_nvlist_dst_size size of buffer for property nvlist + * + * outputs: * zc_objset_stats stats * zc_nvlist_dst property nvlist * zc_nvlist_dst_size size of property nvlist @@ -1202,20 +1588,20 @@ zfs_ioc_objset_stats(zfs_cmd_t *zc) int error; nvlist_t *nv; - if (error = dmu_objset_open(zc->zc_name, - DMU_OST_ANY, DS_MODE_USER | DS_MODE_READONLY, &os)) + if (error = dmu_objset_hold(zc->zc_name, FTAG, &os)) return (error); dmu_objset_fast_stat(os, &zc->zc_objset_stats); if (zc->zc_nvlist_dst != 0 && - (error = dsl_prop_get_all(os, &nv, FALSE)) == 0) { + (error = dsl_prop_get_all(os, &nv)) == 0) { dmu_objset_stats(os, nv); /* * NB: zvol_get_stats() will read the objset contents, * which we aren't supposed to do with a * DS_MODE_USER hold, because it could be * inconsistent. So this is a bit of a workaround... + * XXX reading with out owning */ if (!zc->zc_objset_stats.dds_inconsistent) { if (dmu_objset_type(os) == DMU_OST_ZVOL) @@ -1225,7 +1611,50 @@ zfs_ioc_objset_stats(zfs_cmd_t *zc) nvlist_free(nv); } - dmu_objset_close(os); + dmu_objset_rele(os, FTAG); + return (error); +} + +/* + * inputs: + * zc_name name of filesystem + * zc_nvlist_dst_size size of buffer for property nvlist + * + * outputs: + * zc_nvlist_dst received property nvlist + * zc_nvlist_dst_size size of received property nvlist + * + * Gets received properties (distinct from local properties on or after + * SPA_VERSION_RECVD_PROPS) for callers who want to differentiate received from + * local property values. + */ +static int +zfs_ioc_objset_recvd_props(zfs_cmd_t *zc) +{ + objset_t *os = NULL; + int error; + nvlist_t *nv; + + if (error = dmu_objset_hold(zc->zc_name, FTAG, &os)) + return (error); + + /* + * Without this check, we would return local property values if the + * caller has not already received properties on or after + * SPA_VERSION_RECVD_PROPS. 
+ */ + if (!dsl_prop_get_hasrecvd(os)) { + dmu_objset_rele(os, FTAG); + return (ENOTSUP); + } + + if (zc->zc_nvlist_dst != 0 && + (error = dsl_prop_get_received(os, &nv)) == 0) { + error = put_nvlist(zc, nv); + nvlist_free(nv); + } + + dmu_objset_rele(os, FTAG); return (error); } @@ -1260,8 +1689,8 @@ zfs_ioc_objset_zplprops(zfs_cmd_t *zc) objset_t *os; int err; - if (err = dmu_objset_open(zc->zc_name, - DMU_OST_ANY, DS_MODE_USER | DS_MODE_READONLY, &os)) + /* XXX reading without owning */ + if (err = dmu_objset_hold(zc->zc_name, FTAG, &os)) return (err); dmu_objset_fast_stat(os, &zc->zc_objset_stats); @@ -1286,30 +1715,25 @@ zfs_ioc_objset_zplprops(zfs_cmd_t *zc) } else { err = ENOENT; } - dmu_objset_close(os); + dmu_objset_rele(os, FTAG); return (err); } -static void -zfs_prefetch_datasets(zfs_cmd_t *zc, objset_t *os, char *p) -{ - uint64_t cookie = 0; - int error; - - do { - error = dmu_dir_list_next(os, - sizeof (zc->zc_name) - (p - zc->zc_name), p, - NULL, &cookie); - } while (error == 0 && !INGLOBALZONE(curproc) && - !zone_dataset_visible(zc->zc_name, NULL) && - !dmu_objset_prefetch(zc->zc_name, NULL)); -} - -static void -zfs_prefetch_snapshots(zfs_cmd_t *zc) +static boolean_t +dataset_name_hidden(const char *name) { - dmu_objset_find(zc->zc_name, dmu_objset_prefetch, - NULL, DS_FIND_SNAPSHOTS); + /* + * Skip over datasets that are not visible in this zone, + * internal datasets (which have a $ in their name), and + * temporary datasets (which have a % in their name). + */ + if (strchr(name, '$') != NULL) + return (B_TRUE); + if (strchr(name, '%') != NULL) + return (B_TRUE); + if (!INGLOBALZONE(curproc) && !zone_dataset_visible(name, NULL)) + return (B_TRUE); + return (B_FALSE); } /* @@ -1320,6 +1744,7 @@ zfs_prefetch_snapshots(zfs_cmd_t *zc) * * outputs: * zc_name name of next filesystem + * zc_cookie zap cursor * zc_objset_stats stats * zc_nvlist_dst property nvlist * zc_nvlist_dst_size size of property nvlist @@ -1330,9 +1755,10 @@ zfs_ioc_dataset_list_next(zfs_cmd_t *zc) objset_t *os; int error; char *p; + size_t orig_len = strlen(zc->zc_name); - if (error = dmu_objset_open(zc->zc_name, - DMU_OST_ANY, DS_MODE_USER | DS_MODE_READONLY, &os)) { +top: + if (error = dmu_objset_hold(zc->zc_name, FTAG, &os)) { if (error == ENOENT) error = ESRCH; return (error); @@ -1343,25 +1769,40 @@ zfs_ioc_dataset_list_next(zfs_cmd_t *zc) (void) strlcat(zc->zc_name, "/", sizeof (zc->zc_name)); p = zc->zc_name + strlen(zc->zc_name); - if (zc->zc_cookie == 0) - zfs_prefetch_datasets(zc, os, p); + /* + * Pre-fetch the datasets. dmu_objset_prefetch() always returns 0 + * but is not declared void because its called by dmu_objset_find(). + */ + if (zc->zc_cookie == 0) { + uint64_t cookie = 0; + int len = sizeof (zc->zc_name) - (p - zc->zc_name); + + while (dmu_dir_list_next(os, len, p, NULL, &cookie) == 0) + (void) dmu_objset_prefetch(p, NULL); + } + do { error = dmu_dir_list_next(os, sizeof (zc->zc_name) - (p - zc->zc_name), p, NULL, &zc->zc_cookie); if (error == ENOENT) error = ESRCH; - } while (error == 0 && !INGLOBALZONE(curproc) && - !zone_dataset_visible(zc->zc_name, NULL)); - dmu_objset_close(os); + } while (error == 0 && dataset_name_hidden(zc->zc_name) && + !(zc->zc_iflags & FKIOCTL)); + dmu_objset_rele(os, FTAG); /* - * If it's a hidden dataset (ie. with a '$' in its name), don't - * try to get stats for it. Userland will skip over it. + * If it's an internal dataset (ie. with a '$' in its name), + * don't try to get stats for it, otherwise we'll return ENOENT. 
*/ - if (error == 0 && strchr(zc->zc_name, '$') == NULL) + if (error == 0 && strchr(zc->zc_name, '$') == NULL) { error = zfs_ioc_objset_stats(zc); /* fill in the stats */ - + if (error == ENOENT) { + /* We lost a race with destroy, get the next one. */ + zc->zc_name[orig_len] = '\0'; + goto top; + } + } return (error); } @@ -1383,30 +1824,38 @@ zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) objset_t *os; int error; - error = dmu_objset_open(zc->zc_name, - DMU_OST_ANY, DS_MODE_USER | DS_MODE_READONLY, &os); +top: + if (zc->zc_cookie == 0) + (void) dmu_objset_find(zc->zc_name, dmu_objset_prefetch, + NULL, DS_FIND_SNAPSHOTS); + + error = dmu_objset_hold(zc->zc_name, FTAG, &os); if (error) return (error == ENOENT ? ESRCH : error); - if (zc->zc_cookie == 0) - zfs_prefetch_snapshots(zc); /* * A dataset name of maximum length cannot have any snapshots, * so exit immediately. */ if (strlcat(zc->zc_name, "@", sizeof (zc->zc_name)) >= MAXNAMELEN) { - dmu_objset_close(os); + dmu_objset_rele(os, FTAG); return (ESRCH); } error = dmu_snapshot_list_next(os, sizeof (zc->zc_name) - strlen(zc->zc_name), zc->zc_name + strlen(zc->zc_name), NULL, &zc->zc_cookie, NULL); - dmu_objset_close(os); - if (error == 0) + dmu_objset_rele(os, FTAG); + if (error == 0) { error = zfs_ioc_objset_stats(zc); /* fill in the stats */ - else if (error == ENOENT) + if (error == ENOENT) { + /* We lost a race with destroy, get the next one. */ + *strchr(zc->zc_name, '@') = '\0'; + goto top; + } + } else if (error == ENOENT) { error = ESRCH; + } /* if we failed, undo the @ that we tacked on to zc_name */ if (error) @@ -1414,233 +1863,410 @@ zfs_ioc_snapshot_list_next(zfs_cmd_t *zc) return (error); } -int -zfs_set_prop_nvlist(const char *name, nvlist_t *nvl) +static int +zfs_prop_set_userquota(const char *dsname, nvpair_t *pair) { - nvpair_t *elem; - int error; - uint64_t intval; - char *strval; + const char *propname = nvpair_name(pair); + uint64_t *valary; + unsigned int vallen; + const char *domain; + zfs_userquota_prop_t type; + uint64_t rid; + uint64_t quota; + zfsvfs_t *zfsvfs; + int err; + + if (nvpair_type(pair) == DATA_TYPE_NVLIST) { + nvlist_t *attrs; + VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); + VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, + &pair) == 0); + } + VERIFY(nvpair_value_uint64_array(pair, &valary, &vallen) == 0); + VERIFY(vallen == 3); + type = valary[0]; + rid = valary[1]; + quota = valary[2]; /* - * First validate permission to set all of the properties + * The propname is encoded as + * userquota@<rid>-<domain>. */ - elem = NULL; - while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) { - const char *propname = nvpair_name(elem); - zfs_prop_t prop = zfs_name_to_prop(propname); + domain = strchr(propname, '-') + 1; - if (prop == ZPROP_INVAL) { - /* - * If this is a user-defined property, it must be a - * string, and there is no further validation to do. - */ - if (!zfs_prop_user(propname) || - nvpair_type(elem) != DATA_TYPE_STRING) - return (EINVAL); - - if (error = zfs_secpolicy_write_perms(name, - ZFS_DELEG_PERM_USERPROP, CRED())) - return (error); - continue; - } - - if ((error = zfs_secpolicy_setprop(name, prop, CRED())) != 0) - return (error); - - /* - * Check that this value is valid for this pool version - */ - switch (prop) { - case ZFS_PROP_COMPRESSION: - /* - * If the user specified gzip compression, make sure - * the SPA supports it. We ignore any errors here since - * we'll catch them later. 
- */ - if (nvpair_type(elem) == DATA_TYPE_UINT64 && - nvpair_value_uint64(elem, &intval) == 0) { - if (intval >= ZIO_COMPRESS_GZIP_1 && - intval <= ZIO_COMPRESS_GZIP_9 && - zfs_earlier_version(name, - SPA_VERSION_GZIP_COMPRESSION)) - return (ENOTSUP); + err = zfsvfs_hold(dsname, FTAG, &zfsvfs); + if (err == 0) { + err = zfs_set_userquota(zfsvfs, type, domain, rid, quota); + zfsvfs_rele(zfsvfs, FTAG); + } - /* - * If this is a bootable dataset then - * verify that the compression algorithm - * is supported for booting. We must return - * something other than ENOTSUP since it - * implies a downrev pool version. - */ - if (zfs_is_bootfs(name) && - !BOOTFS_COMPRESS_VALID(intval)) - return (ERANGE); - } - break; + return (err); +} - case ZFS_PROP_COPIES: - if (zfs_earlier_version(name, - SPA_VERSION_DITTO_BLOCKS)) - return (ENOTSUP); - break; +/* + * If the named property is one that has a special function to set its value, + * return 0 on success and a positive error code on failure; otherwise if it is + * not one of the special properties handled by this function, return -1. + * + * XXX: It would be better for callers of the property interface if we handled + * these special cases in dsl_prop.c (in the dsl layer). + */ +static int +zfs_prop_set_special(const char *dsname, zprop_source_t source, + nvpair_t *pair) +{ + const char *propname = nvpair_name(pair); + zfs_prop_t prop = zfs_name_to_prop(propname); + uint64_t intval; + int err; - case ZFS_PROP_SHARESMB: - if (zpl_earlier_version(name, ZPL_VERSION_FUID)) - return (ENOTSUP); - break; + if (prop == ZPROP_INVAL) { + if (zfs_prop_userquota(propname)) + return (zfs_prop_set_userquota(dsname, pair)); + return (-1); + } - case ZFS_PROP_ACLINHERIT: - if (nvpair_type(elem) == DATA_TYPE_UINT64 && - nvpair_value_uint64(elem, &intval) == 0) - if (intval == ZFS_ACL_PASSTHROUGH_X && - zfs_earlier_version(name, - SPA_VERSION_PASSTHROUGH_X)) - return (ENOTSUP); - } + if (nvpair_type(pair) == DATA_TYPE_NVLIST) { + nvlist_t *attrs; + VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); + VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, + &pair) == 0); } - elem = NULL; - while ((elem = nvlist_next_nvpair(nvl, elem)) != NULL) { - const char *propname = nvpair_name(elem); - zfs_prop_t prop = zfs_name_to_prop(propname); + if (zfs_prop_get_type(prop) == PROP_TYPE_STRING) + return (-1); - if (prop == ZPROP_INVAL) { - VERIFY(nvpair_value_string(elem, &strval) == 0); - error = dsl_prop_set(name, propname, 1, - strlen(strval) + 1, strval); - if (error == 0) - continue; - else - return (error); - } + VERIFY(0 == nvpair_value_uint64(pair, &intval)); - switch (prop) { - case ZFS_PROP_QUOTA: - if ((error = nvpair_value_uint64(elem, &intval)) != 0 || - (error = dsl_dir_set_quota(name, intval)) != 0) - return (error); - break; + switch (prop) { + case ZFS_PROP_QUOTA: + err = dsl_dir_set_quota(dsname, source, intval); + break; + case ZFS_PROP_REFQUOTA: + err = dsl_dataset_set_quota(dsname, source, intval); + break; + case ZFS_PROP_RESERVATION: + err = dsl_dir_set_reservation(dsname, source, intval); + break; + case ZFS_PROP_REFRESERVATION: + err = dsl_dataset_set_reservation(dsname, source, intval); + break; + case ZFS_PROP_VOLSIZE: + err = zvol_set_volsize(dsname, ddi_driver_major(zfs_dip), + intval); + break; + case ZFS_PROP_VERSION: + { + zfsvfs_t *zfsvfs; - case ZFS_PROP_REFQUOTA: - if ((error = nvpair_value_uint64(elem, &intval)) != 0 || - (error = dsl_dataset_set_quota(name, intval)) != 0) - return (error); + if ((err = zfsvfs_hold(dsname, FTAG, &zfsvfs)) != 0) break; 
- case ZFS_PROP_RESERVATION: - if ((error = nvpair_value_uint64(elem, &intval)) != 0 || - (error = dsl_dir_set_reservation(name, - intval)) != 0) - return (error); - break; + err = zfs_set_version(zfsvfs, intval); + zfsvfs_rele(zfsvfs, FTAG); - case ZFS_PROP_REFRESERVATION: - if ((error = nvpair_value_uint64(elem, &intval)) != 0 || - (error = dsl_dataset_set_reservation(name, - intval)) != 0) - return (error); - break; + if (err == 0 && intval >= ZPL_VERSION_USERSPACE) { + zfs_cmd_t *zc; - case ZFS_PROP_VOLSIZE: - if ((error = nvpair_value_uint64(elem, &intval)) != 0 || - (error = zvol_set_volsize(name, - ddi_driver_major(zfs_dip), intval)) != 0) - return (error); - break; + zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); + (void) strcpy(zc->zc_name, dsname); + (void) zfs_ioc_userspace_upgrade(zc); + kmem_free(zc, sizeof (zfs_cmd_t)); + } + break; + } - case ZFS_PROP_VOLBLOCKSIZE: - if ((error = nvpair_value_uint64(elem, &intval)) != 0 || - (error = zvol_set_volblocksize(name, intval)) != 0) - return (error); - break; + default: + err = -1; + } - case ZFS_PROP_VERSION: - if ((error = nvpair_value_uint64(elem, &intval)) != 0 || - (error = zfs_set_version(name, intval)) != 0) - return (error); - break; + return (err); +} - default: - if (nvpair_type(elem) == DATA_TYPE_STRING) { - if (zfs_prop_get_type(prop) != - PROP_TYPE_STRING) - return (EINVAL); - VERIFY(nvpair_value_string(elem, &strval) == 0); - if ((error = dsl_prop_set(name, - nvpair_name(elem), 1, strlen(strval) + 1, - strval)) != 0) - return (error); - } else if (nvpair_type(elem) == DATA_TYPE_UINT64) { +/* + * This function is best effort. If it fails to set any of the given properties, + * it continues to set as many as it can and returns the first error + * encountered. If the caller provides a non-NULL errlist, it also gives the + * complete list of names of all the properties it failed to set along with the + * corresponding error numbers. The caller is responsible for freeing the + * returned errlist. + * + * If every property is set successfully, zero is returned and the list pointed + * at by errlist is NULL. 
+ */ +int +zfs_set_prop_nvlist(const char *dsname, zprop_source_t source, nvlist_t *nvl, + nvlist_t **errlist) +{ + nvpair_t *pair; + nvpair_t *propval; + int rv = 0; + uint64_t intval; + char *strval; + nvlist_t *genericnvl; + nvlist_t *errors; + nvlist_t *retrynvl; + + VERIFY(nvlist_alloc(&genericnvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_alloc(&retrynvl, NV_UNIQUE_NAME, KM_SLEEP) == 0); + +retry: + pair = NULL; + while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) { + const char *propname = nvpair_name(pair); + zfs_prop_t prop = zfs_name_to_prop(propname); + int err = 0; + + /* decode the property value */ + propval = pair; + if (nvpair_type(pair) == DATA_TYPE_NVLIST) { + nvlist_t *attrs; + VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); + VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, + &propval) == 0); + } + + /* Validate value type */ + if (prop == ZPROP_INVAL) { + if (zfs_prop_user(propname)) { + if (nvpair_type(propval) != DATA_TYPE_STRING) + err = EINVAL; + } else if (zfs_prop_userquota(propname)) { + if (nvpair_type(propval) != + DATA_TYPE_UINT64_ARRAY) + err = EINVAL; + } + } else { + if (nvpair_type(propval) == DATA_TYPE_STRING) { + if (zfs_prop_get_type(prop) != PROP_TYPE_STRING) + err = EINVAL; + } else if (nvpair_type(propval) == DATA_TYPE_UINT64) { const char *unused; - VERIFY(nvpair_value_uint64(elem, &intval) == 0); + VERIFY(nvpair_value_uint64(propval, + &intval) == 0); switch (zfs_prop_get_type(prop)) { case PROP_TYPE_NUMBER: break; case PROP_TYPE_STRING: - return (EINVAL); + err = EINVAL; + break; case PROP_TYPE_INDEX: if (zfs_prop_index_to_string(prop, intval, &unused) != 0) - return (EINVAL); + err = EINVAL; break; default: cmn_err(CE_PANIC, "unknown property type"); - break; } + } else { + err = EINVAL; + } + } + + /* Validate permissions */ + if (err == 0) + err = zfs_check_settable(dsname, pair, CRED()); + + if (err == 0) { + err = zfs_prop_set_special(dsname, source, pair); + if (err == -1) { + /* + * For better performance we build up a list of + * properties to set in a single transaction. + */ + err = nvlist_add_nvpair(genericnvl, pair); + } else if (err != 0 && nvl != retrynvl) { + /* + * This may be a spurious error caused by + * receiving quota and reservation out of order. + * Try again in a second pass. + */ + err = nvlist_add_nvpair(retrynvl, pair); + } + } + + if (err != 0) + VERIFY(nvlist_add_int32(errors, propname, err) == 0); + } - if ((error = dsl_prop_set(name, propname, - 8, 1, &intval)) != 0) - return (error); + if (nvl != retrynvl && !nvlist_empty(retrynvl)) { + nvl = retrynvl; + goto retry; + } + + if (!nvlist_empty(genericnvl) && + dsl_props_set(dsname, source, genericnvl) != 0) { + /* + * If this fails, we still want to set as many properties as we + * can, so try setting them individually. 
+ */ + pair = NULL; + while ((pair = nvlist_next_nvpair(genericnvl, pair)) != NULL) { + const char *propname = nvpair_name(pair); + int err = 0; + + propval = pair; + if (nvpair_type(pair) == DATA_TYPE_NVLIST) { + nvlist_t *attrs; + VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); + VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, + &propval) == 0); + } + + if (nvpair_type(propval) == DATA_TYPE_STRING) { + VERIFY(nvpair_value_string(propval, + &strval) == 0); + err = dsl_prop_set(dsname, propname, source, 1, + strlen(strval) + 1, strval); } else { - return (EINVAL); + VERIFY(nvpair_value_uint64(propval, + &intval) == 0); + err = dsl_prop_set(dsname, propname, source, 8, + 1, &intval); + } + + if (err != 0) { + VERIFY(nvlist_add_int32(errors, propname, + err) == 0); } - break; } } + nvlist_free(genericnvl); + nvlist_free(retrynvl); + + if ((pair = nvlist_next_nvpair(errors, NULL)) == NULL) { + nvlist_free(errors); + errors = NULL; + } else { + VERIFY(nvpair_value_int32(pair, &rv) == 0); + } + + if (errlist == NULL) + nvlist_free(errors); + else + *errlist = errors; + + return (rv); +} + +/* + * Check that all the properties are valid user properties. + */ +static int +zfs_check_userprops(char *fsname, nvlist_t *nvl) +{ + nvpair_t *pair = NULL; + int error = 0; + + while ((pair = nvlist_next_nvpair(nvl, pair)) != NULL) { + const char *propname = nvpair_name(pair); + char *valstr; + + if (!zfs_prop_user(propname) || + nvpair_type(pair) != DATA_TYPE_STRING) + return (EINVAL); + if (error = zfs_secpolicy_write_perms(fsname, + ZFS_DELEG_PERM_USERPROP, CRED())) + return (error); + + if (strlen(propname) >= ZAP_MAXNAMELEN) + return (ENAMETOOLONG); + + VERIFY(nvpair_value_string(pair, &valstr) == 0); + if (strlen(valstr) >= ZAP_MAXVALUELEN) + return (E2BIG); + } return (0); } +static void +props_skip(nvlist_t *props, nvlist_t *skipped, nvlist_t **newprops) +{ + nvpair_t *pair; + + VERIFY(nvlist_alloc(newprops, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + pair = NULL; + while ((pair = nvlist_next_nvpair(props, pair)) != NULL) { + if (nvlist_exists(skipped, nvpair_name(pair))) + continue; + + VERIFY(nvlist_add_nvpair(*newprops, pair) == 0); + } +} + +static int +clear_received_props(objset_t *os, const char *fs, nvlist_t *props, + nvlist_t *skipped) +{ + int err = 0; + nvlist_t *cleared_props = NULL; + props_skip(props, skipped, &cleared_props); + if (!nvlist_empty(cleared_props)) { + /* + * Acts on local properties until the dataset has received + * properties at least once on or after SPA_VERSION_RECVD_PROPS. + */ + zprop_source_t flags = (ZPROP_SRC_NONE | + (dsl_prop_get_hasrecvd(os) ? ZPROP_SRC_RECEIVED : 0)); + err = zfs_set_prop_nvlist(fs, flags, cleared_props, NULL); + } + nvlist_free(cleared_props); + return (err); +} + /* * inputs: * zc_name name of filesystem - * zc_value name of property to inherit + * zc_value name of property to set * zc_nvlist_src{_size} nvlist of properties to apply - * zc_cookie clear existing local props? + * zc_cookie received properties flag * - * outputs: none + * outputs: + * zc_nvlist_dst{_size} error for each unapplied received property */ static int zfs_ioc_set_prop(zfs_cmd_t *zc) { nvlist_t *nvl; + boolean_t received = zc->zc_cookie; + zprop_source_t source = (received ? 
ZPROP_SRC_RECEIVED : + ZPROP_SRC_LOCAL); + nvlist_t *errors = NULL; int error; if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, - &nvl)) != 0) + zc->zc_iflags, &nvl)) != 0) return (error); - if (zc->zc_cookie) { + if (received) { nvlist_t *origprops; objset_t *os; - if (dmu_objset_open(zc->zc_name, DMU_OST_ANY, - DS_MODE_USER | DS_MODE_READONLY, &os) == 0) { - if (dsl_prop_get_all(os, &origprops, TRUE) == 0) { - clear_props(zc->zc_name, origprops); + if (dmu_objset_hold(zc->zc_name, FTAG, &os) == 0) { + if (dsl_prop_get_received(os, &origprops) == 0) { + (void) clear_received_props(os, + zc->zc_name, origprops, nvl); nvlist_free(origprops); } - dmu_objset_close(os); - } + dsl_prop_set_hasrecvd(os); + dmu_objset_rele(os, FTAG); + } } - error = zfs_set_prop_nvlist(zc->zc_name, nvl); + error = zfs_set_prop_nvlist(zc->zc_name, source, nvl, &errors); + + if (zc->zc_nvlist_dst != NULL && errors != NULL) { + (void) put_nvlist(zc, errors); + } + nvlist_free(errors); nvlist_free(nvl); return (error); } @@ -1649,14 +2275,75 @@ zfs_ioc_set_prop(zfs_cmd_t *zc) * inputs: * zc_name name of filesystem * zc_value name of property to inherit + * zc_cookie revert to received value if TRUE * * outputs: none */ static int zfs_ioc_inherit_prop(zfs_cmd_t *zc) { + const char *propname = zc->zc_value; + zfs_prop_t prop = zfs_name_to_prop(propname); + boolean_t received = zc->zc_cookie; + zprop_source_t source = (received + ? ZPROP_SRC_NONE /* revert to received value, if any */ + : ZPROP_SRC_INHERITED); /* explicitly inherit */ + + if (received) { + nvlist_t *dummy; + nvpair_t *pair; + zprop_type_t type; + int err; + + /* + * zfs_prop_set_special() expects properties in the form of an + * nvpair with type info. + */ + if (prop == ZPROP_INVAL) { + if (!zfs_prop_user(propname)) + return (EINVAL); + + type = PROP_TYPE_STRING; + } else if (prop == ZFS_PROP_VOLSIZE || + prop == ZFS_PROP_VERSION) { + return (EINVAL); + } else { + type = zfs_prop_get_type(prop); + } + + VERIFY(nvlist_alloc(&dummy, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + switch (type) { + case PROP_TYPE_STRING: + VERIFY(0 == nvlist_add_string(dummy, propname, "")); + break; + case PROP_TYPE_NUMBER: + case PROP_TYPE_INDEX: + VERIFY(0 == nvlist_add_uint64(dummy, propname, 0)); + break; + default: + nvlist_free(dummy); + return (EINVAL); + } + + pair = nvlist_next_nvpair(dummy, NULL); + err = zfs_prop_set_special(zc->zc_name, source, pair); + nvlist_free(dummy); + if (err != -1) + return (err); /* special property already handled */ + } else { + /* + * Only check this in the non-received case. We want to allow + * 'inherit -S' to revert non-inheritable properties like quota + * and reservation to the received or default values even though + * they are not considered inheritable. + */ + if (prop != ZPROP_INVAL && !zfs_prop_inheritable(prop)) + return (EINVAL); + } + /* the property name has been validated by zfs_secpolicy_inherit() */ - return (dsl_prop_set(zc->zc_name, zc->zc_value, 0, 0, NULL)); + return (dsl_prop_set(zc->zc_name, zc->zc_value, source, 0, 0, NULL)); } static int @@ -1665,11 +2352,32 @@ zfs_ioc_pool_set_props(zfs_cmd_t *zc) nvlist_t *props; spa_t *spa; int error; + nvpair_t *pair; - if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, - &props))) + if (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, + zc->zc_iflags, &props)) return (error); + /* + * If the only property is the configfile, then just do a spa_lookup() + * to handle the faulted case. 
+ */ + pair = nvlist_next_nvpair(props, NULL); + if (pair != NULL && strcmp(nvpair_name(pair), + zpool_prop_to_name(ZPOOL_PROP_CACHEFILE)) == 0 && + nvlist_next_nvpair(props, pair) == NULL) { + mutex_enter(&spa_namespace_lock); + if ((spa = spa_lookup(zc->zc_name)) != NULL) { + spa_configfile_set(spa, props, B_FALSE); + spa_config_sync(spa, B_FALSE, B_TRUE); + } + mutex_exit(&spa_namespace_lock); + if (spa != NULL) { + nvlist_free(props); + return (0); + } + } + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) { nvlist_free(props); return (error); @@ -1690,20 +2398,27 @@ zfs_ioc_pool_get_props(zfs_cmd_t *zc) int error; nvlist_t *nvp = NULL; - if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) - return (error); - - error = spa_prop_get(spa, &nvp); + if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) { + /* + * If the pool is faulted, there may be properties we can still + * get (such as altroot and cachefile), so attempt to get them + * anyway. + */ + mutex_enter(&spa_namespace_lock); + if ((spa = spa_lookup(zc->zc_name)) != NULL) + error = spa_prop_get(spa, &nvp); + mutex_exit(&spa_namespace_lock); + } else { + error = spa_prop_get(spa, &nvp); + spa_close(spa, FTAG); + } if (error == 0 && zc->zc_nvlist_dst != NULL) error = put_nvlist(zc, nvp); else error = EFAULT; - spa_close(spa, FTAG); - - if (nvp) - nvlist_free(nvp); + nvlist_free(nvp); return (error); } @@ -1719,7 +2434,7 @@ zfs_ioc_iscsi_perm_check(zfs_cmd_t *zc) cred_t *usercred; if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, - &nvp)) != 0) { + zc->zc_iflags, &nvp)) != 0) { return (error); } @@ -1769,7 +2484,7 @@ zfs_ioc_set_fsacl(zfs_cmd_t *zc) nvlist_t *fsaclnv = NULL; if ((error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, - &fsaclnv)) != 0) + zc->zc_iflags, &fsaclnv)) != 0) return (error); /* @@ -1825,30 +2540,6 @@ zfs_ioc_get_fsacl(zfs_cmd_t *zc) return (error); } -/* - * inputs: - * zc_name name of volume - * - * outputs: none - */ -static int -zfs_ioc_create_minor(zfs_cmd_t *zc) -{ - return (zvol_create_minor(zc->zc_name, ddi_driver_major(zfs_dip))); -} - -/* - * inputs: - * zc_name name of volume - * - * outputs: none - */ -static int -zfs_ioc_remove_minor(zfs_cmd_t *zc) -{ - return (zvol_remove_minor(zc->zc_name)); -} - /* * Search the vfs list for a specified resource. Returns a pointer to it * or NULL if no suitable entry is found. The caller of this routine @@ -1906,11 +2597,10 @@ zfs_create_cb(objset_t *os, void *arg, cred_t *cr, dmu_tx_t *tx) * processing. */ static int -zfs_fill_zplprops_impl(objset_t *os, uint64_t default_zplver, +zfs_fill_zplprops_impl(objset_t *os, uint64_t zplver, boolean_t fuids_ok, nvlist_t *createprops, nvlist_t *zplprops, boolean_t *is_ci) { - uint64_t zplver = default_zplver; uint64_t sense = ZFS_PROP_UNDEFINED; uint64_t norm = ZFS_PROP_UNDEFINED; uint64_t u8 = ZFS_PROP_UNDEFINED; @@ -1998,6 +2688,8 @@ zfs_fill_zplprops(const char *dataset, nvlist_t *createprops, ASSERT(cp != NULL); cp[0] = '\0'; + if (zfs_earlier_version(dataset, SPA_VERSION_USERSPACE)) + zplver = ZPL_VERSION_USERSPACE - 1; if (zfs_earlier_version(dataset, SPA_VERSION_FUID)) { zplver = ZPL_VERSION_FUID - 1; fuids_ok = B_FALSE; @@ -2006,13 +2698,12 @@ zfs_fill_zplprops(const char *dataset, nvlist_t *createprops, /* * Open parent object set so we can inherit zplprop values. 
*/ - if ((error = dmu_objset_open(parentname, DMU_OST_ANY, - DS_MODE_USER | DS_MODE_READONLY, &os)) != 0) + if ((error = dmu_objset_hold(parentname, FTAG, &os)) != 0) return (error); error = zfs_fill_zplprops_impl(os, zplver, fuids_ok, createprops, zplprops, is_ci); - dmu_objset_close(os); + dmu_objset_rele(os, FTAG); return (error); } @@ -2073,7 +2764,7 @@ zfs_ioc_create(zfs_cmd_t *zc) if (zc->zc_nvlist_src != NULL && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, - &nvprops)) != 0) + zc->zc_iflags, &nvprops)) != 0) return (error); zct.zct_zplprops = NULL; @@ -2089,21 +2780,18 @@ zfs_ioc_create(zfs_cmd_t *zc) return (EINVAL); } - error = dmu_objset_open(zc->zc_value, type, - DS_MODE_USER | DS_MODE_READONLY, &clone); + error = dmu_objset_hold(zc->zc_value, FTAG, &clone); if (error) { nvlist_free(nvprops); return (error); } - error = dmu_objset_create(zc->zc_name, type, clone, 0, - NULL, NULL); + error = dmu_objset_clone(zc->zc_name, dmu_objset_ds(clone), 0); + dmu_objset_rele(clone, FTAG); if (error) { - dmu_objset_close(clone); nvlist_free(nvprops); return (error); } - dmu_objset_close(clone); } else { boolean_t is_insensitive = B_FALSE; @@ -2160,7 +2848,7 @@ zfs_ioc_create(zfs_cmd_t *zc) return (error); } } - error = dmu_objset_create(zc->zc_name, type, NULL, + error = dmu_objset_create(zc->zc_name, type, is_insensitive ? DS_FLAG_CI_DATASET : 0, cbfunc, &zct); nvlist_free(zct.zct_zplprops); } @@ -2169,41 +2857,24 @@ zfs_ioc_create(zfs_cmd_t *zc) * It would be nice to do this atomically. */ if (error == 0) { - if ((error = zfs_set_prop_nvlist(zc->zc_name, nvprops)) != 0) - (void) dmu_objset_destroy(zc->zc_name); + error = zfs_set_prop_nvlist(zc->zc_name, ZPROP_SRC_LOCAL, + nvprops, NULL); + if (error != 0) + (void) dmu_objset_destroy(zc->zc_name, B_FALSE); } nvlist_free(nvprops); return (error); } -struct snap_prop_arg { - nvlist_t *nvprops; - const char *snapname; -}; - -static int -set_snap_props(char *name, void *arg) -{ - struct snap_prop_arg *snpa = arg; - int len = strlen(name) + strlen(snpa->snapname) + 2; - char *buf = kmem_alloc(len, KM_SLEEP); - int err; - - (void) snprintf(buf, len, "%s@%s", name, snpa->snapname); - err = zfs_set_prop_nvlist(buf, snpa->nvprops); - if (err) - (void) dmu_objset_destroy(buf); - kmem_free(buf, len); - return (err); -} - /* * inputs: * zc_name name of filesystem * zc_value short name of snapshot * zc_cookie recursive flag + * zc_nvlist_src[_size] property list * - * outputs: none + * outputs: + * zc_value short snapname (i.e. part after the '@') */ static int zfs_ioc_snapshot(zfs_cmd_t *zc) @@ -2217,48 +2888,37 @@ zfs_ioc_snapshot(zfs_cmd_t *zc) if (zc->zc_nvlist_src != NULL && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, - &nvprops)) != 0) + zc->zc_iflags, &nvprops)) != 0) return (error); - error = dmu_objset_snapshot(zc->zc_name, zc->zc_value, recursive); + error = zfs_check_userprops(zc->zc_name, nvprops); + if (error) + goto out; - /* - * It would be nice to do this atomically. 
- */ - if (error == 0) { - struct snap_prop_arg snpa; - snpa.nvprops = nvprops; - snpa.snapname = zc->zc_value; - if (recursive) { - error = dmu_objset_find(zc->zc_name, - set_snap_props, &snpa, DS_FIND_CHILDREN); - if (error) { - (void) dmu_snapshots_destroy(zc->zc_name, - zc->zc_value); - } - } else { - error = set_snap_props(zc->zc_name, &snpa); - } + if (!nvlist_empty(nvprops) && + zfs_earlier_version(zc->zc_name, SPA_VERSION_SNAP_PROPS)) { + error = ENOTSUP; + goto out; } + + error = dmu_objset_snapshot(zc->zc_name, zc->zc_value, + nvprops, recursive); + +out: nvlist_free(nvprops); return (error); } int -zfs_unmount_snap(char *name, void *arg) +zfs_unmount_snap(const char *name, void *arg) { vfs_t *vfsp = NULL; if (arg) { char *snapname = arg; - int len = strlen(name) + strlen(snapname) + 2; - char *buf = kmem_alloc(len, KM_SLEEP); - - (void) strcpy(buf, name); - (void) strcat(buf, "@"); - (void) strcat(buf, snapname); - vfsp = zfs_get_vfs(buf); - kmem_free(buf, len); + char *fullname = kmem_asprintf("%s@%s", name, snapname); + vfsp = zfs_get_vfs(fullname); + strfree(fullname); } else if (strchr(name, '@')) { vfsp = zfs_get_vfs(name); } @@ -2283,8 +2943,9 @@ zfs_unmount_snap(char *name, void *arg) /* * inputs: - * zc_name name of filesystem - * zc_value short name of snapshot + * zc_name name of filesystem + * zc_value short name of snapshot + * zc_defer_destroy mark for deferred destroy * * outputs: none */ @@ -2299,26 +2960,32 @@ zfs_ioc_destroy_snaps(zfs_cmd_t *zc) zfs_unmount_snap, zc->zc_value, DS_FIND_CHILDREN); if (err) return (err); - return (dmu_snapshots_destroy(zc->zc_name, zc->zc_value)); + return (dmu_snapshots_destroy(zc->zc_name, zc->zc_value, + zc->zc_defer_destroy)); } /* * inputs: * zc_name name of dataset to destroy * zc_objset_type type of objset + * zc_defer_destroy mark for deferred destroy * * outputs: none */ static int zfs_ioc_destroy(zfs_cmd_t *zc) { + int err; if (strchr(zc->zc_name, '@') && zc->zc_objset_type == DMU_OST_ZFS) { - int err = zfs_unmount_snap(zc->zc_name, NULL); + err = zfs_unmount_snap(zc->zc_name, NULL); if (err) return (err); } - return (dmu_objset_destroy(zc->zc_name)); + err = dmu_objset_destroy(zc->zc_name, zc->zc_defer_destroy); + if (zc->zc_objset_type == DMU_OST_ZVOL && err == 0) + (void) zvol_remove_minor(zc->zc_name); + return (err); } /* @@ -2330,50 +2997,78 @@ zfs_ioc_destroy(zfs_cmd_t *zc) static int zfs_ioc_rollback(zfs_cmd_t *zc) { - objset_t *os; + dsl_dataset_t *ds, *clone; int error; - zfsvfs_t *zfsvfs = NULL; + zfsvfs_t *zfsvfs; + char *clone_name; - /* - * Get the zfsvfs for the receiving objset. There - * won't be one if we're operating on a zvol, if the - * objset doesn't exist yet, or is not mounted. - */ - error = dmu_objset_open(zc->zc_name, DMU_OST_ANY, DS_MODE_USER, &os); + error = dsl_dataset_hold(zc->zc_name, FTAG, &ds); if (error) return (error); - if (dmu_objset_type(os) == DMU_OST_ZFS) { - mutex_enter(&os->os->os_user_ptr_lock); - zfsvfs = dmu_objset_get_user(os); - if (zfsvfs != NULL) - VFS_HOLD(zfsvfs->z_vfs); - mutex_exit(&os->os->os_user_ptr_lock); + /* must not be a snapshot */ + if (dsl_dataset_is_snapshot(ds)) { + dsl_dataset_rele(ds, FTAG); + return (EINVAL); + } + + /* must have a most recent snapshot */ + if (ds->ds_phys->ds_prev_snap_txg < TXG_INITIAL) { + dsl_dataset_rele(ds, FTAG); + return (EINVAL); } - if (zfsvfs != NULL) { - char *osname; - int mode; + /* + * Create clone of most recent snapshot. 
+ */ + clone_name = kmem_asprintf("%s/%%rollback", zc->zc_name); + error = dmu_objset_clone(clone_name, ds->ds_prev, DS_FLAG_INCONSISTENT); + if (error) + goto out; + + error = dsl_dataset_own(clone_name, B_TRUE, FTAG, &clone); + if (error) + goto out; - osname = kmem_alloc(MAXNAMELEN, KM_SLEEP); - error = zfs_suspend_fs(zfsvfs, osname, &mode); + /* + * Do clone swap. + */ + if (getzfsvfs(zc->zc_name, &zfsvfs) == 0) { + error = zfs_suspend_fs(zfsvfs); if (error == 0) { int resume_err; - ASSERT(strcmp(osname, zc->zc_name) == 0); - error = dmu_objset_rollback(os); - resume_err = zfs_resume_fs(zfsvfs, osname, mode); + if (dsl_dataset_tryown(ds, B_FALSE, FTAG)) { + error = dsl_dataset_clone_swap(clone, ds, + B_TRUE); + dsl_dataset_disown(ds, FTAG); + ds = NULL; + } else { + error = EBUSY; + } + resume_err = zfs_resume_fs(zfsvfs, zc->zc_name); error = error ? error : resume_err; - } else { - dmu_objset_close(os); } - kmem_free(osname, MAXNAMELEN); VFS_RELE(zfsvfs->z_vfs); } else { - error = dmu_objset_rollback(os); + if (dsl_dataset_tryown(ds, B_FALSE, FTAG)) { + error = dsl_dataset_clone_swap(clone, ds, B_TRUE); + dsl_dataset_disown(ds, FTAG); + ds = NULL; + } else { + error = EBUSY; + } } - /* Note, the dmu_objset_rollback() releases the objset for us. */ + /* + * Destroy clone (which also closes it). + */ + (void) dsl_dataset_destroy(clone, FTAG, B_FALSE); + +out: + strfree(clone_name); + if (ds) + dsl_dataset_rele(ds, FTAG); return (error); } @@ -2406,28 +3101,267 @@ zfs_ioc_rename(zfs_cmd_t *zc) if (err) return (err); } + if (zc->zc_objset_type == DMU_OST_ZVOL) + (void) zvol_remove_minor(zc->zc_name); return (dmu_objset_rename(zc->zc_name, zc->zc_value, recursive)); } -static void -clear_props(char *dataset, nvlist_t *props) +static int +zfs_check_settable(const char *dsname, nvpair_t *pair, cred_t *cr) +{ + const char *propname = nvpair_name(pair); + boolean_t issnap = (strchr(dsname, '@') != NULL); + zfs_prop_t prop = zfs_name_to_prop(propname); + uint64_t intval; + int err; + + if (prop == ZPROP_INVAL) { + if (zfs_prop_user(propname)) { + if (err = zfs_secpolicy_write_perms(dsname, + ZFS_DELEG_PERM_USERPROP, cr)) + return (err); + return (0); + } + + if (!issnap && zfs_prop_userquota(propname)) { + const char *perm = NULL; + const char *uq_prefix = + zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA]; + const char *gq_prefix = + zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA]; + + if (strncmp(propname, uq_prefix, + strlen(uq_prefix)) == 0) { + perm = ZFS_DELEG_PERM_USERQUOTA; + } else if (strncmp(propname, gq_prefix, + strlen(gq_prefix)) == 0) { + perm = ZFS_DELEG_PERM_GROUPQUOTA; + } else { + /* USERUSED and GROUPUSED are read-only */ + return (EINVAL); + } + + if (err = zfs_secpolicy_write_perms(dsname, perm, cr)) + return (err); + return (0); + } + + return (EINVAL); + } + + if (issnap) + return (EINVAL); + + if (nvpair_type(pair) == DATA_TYPE_NVLIST) { + /* + * dsl_prop_get_all_impl() returns properties in this + * format. + */ + nvlist_t *attrs; + VERIFY(nvpair_value_nvlist(pair, &attrs) == 0); + VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, + &pair) == 0); + } + + /* + * Check that this value is valid for this pool version + */ + switch (prop) { + case ZFS_PROP_COMPRESSION: + /* + * If the user specified gzip compression, make sure + * the SPA supports it. We ignore any errors here since + * we'll catch them later. 
+ */ + if (nvpair_type(pair) == DATA_TYPE_UINT64 && + nvpair_value_uint64(pair, &intval) == 0) { + if (intval >= ZIO_COMPRESS_GZIP_1 && + intval <= ZIO_COMPRESS_GZIP_9 && + zfs_earlier_version(dsname, + SPA_VERSION_GZIP_COMPRESSION)) { + return (ENOTSUP); + } + + if (intval == ZIO_COMPRESS_ZLE && + zfs_earlier_version(dsname, + SPA_VERSION_ZLE_COMPRESSION)) + return (ENOTSUP); + + /* + * If this is a bootable dataset then + * verify that the compression algorithm + * is supported for booting. We must return + * something other than ENOTSUP since it + * implies a downrev pool version. + */ + if (zfs_is_bootfs(dsname) && + !BOOTFS_COMPRESS_VALID(intval)) { + return (ERANGE); + } + } + break; + + case ZFS_PROP_COPIES: + if (zfs_earlier_version(dsname, SPA_VERSION_DITTO_BLOCKS)) + return (ENOTSUP); + break; + + case ZFS_PROP_DEDUP: + if (zfs_earlier_version(dsname, SPA_VERSION_DEDUP)) + return (ENOTSUP); + break; + + case ZFS_PROP_SHARESMB: + if (zpl_earlier_version(dsname, ZPL_VERSION_FUID)) + return (ENOTSUP); + break; + + case ZFS_PROP_ACLINHERIT: + if (nvpair_type(pair) == DATA_TYPE_UINT64 && + nvpair_value_uint64(pair, &intval) == 0) { + if (intval == ZFS_ACL_PASSTHROUGH_X && + zfs_earlier_version(dsname, + SPA_VERSION_PASSTHROUGH_X)) + return (ENOTSUP); + } + break; + } + + return (zfs_secpolicy_setprop(dsname, prop, pair, CRED())); +} + +/* + * Removes properties from the given props list that fail permission checks + * needed to clear them and to restore them in case of a receive error. For each + * property, make sure we have both set and inherit permissions. + * + * Returns the first error encountered if any permission checks fail. If the + * caller provides a non-NULL errlist, it also gives the complete list of names + * of all the properties that failed a permission check along with the + * corresponding error numbers. The caller is responsible for freeing the + * returned errlist. + * + * If every property checks out successfully, zero is returned and the list + * pointed at by errlist is NULL. 
+ */ +static int +zfs_check_clearable(char *dataset, nvlist_t *props, nvlist_t **errlist) { zfs_cmd_t *zc; - nvpair_t *prop; + nvpair_t *pair, *next_pair; + nvlist_t *errors; + int err, rv = 0; if (props == NULL) - return; + return (0); + + VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0); + zc = kmem_alloc(sizeof (zfs_cmd_t), KM_SLEEP); (void) strcpy(zc->zc_name, dataset); - for (prop = nvlist_next_nvpair(props, NULL); prop; - prop = nvlist_next_nvpair(props, prop)) { - (void) strcpy(zc->zc_value, nvpair_name(prop)); - if (zfs_secpolicy_inherit(zc, CRED()) == 0) - (void) zfs_ioc_inherit_prop(zc); + pair = nvlist_next_nvpair(props, NULL); + while (pair != NULL) { + next_pair = nvlist_next_nvpair(props, pair); + + (void) strcpy(zc->zc_value, nvpair_name(pair)); + if ((err = zfs_check_settable(dataset, pair, CRED())) != 0 || + (err = zfs_secpolicy_inherit(zc, CRED())) != 0) { + VERIFY(nvlist_remove_nvpair(props, pair) == 0); + VERIFY(nvlist_add_int32(errors, + zc->zc_value, err) == 0); + } + pair = next_pair; + } + kmem_free(zc, sizeof (zfs_cmd_t)); + + if ((pair = nvlist_next_nvpair(errors, NULL)) == NULL) { + nvlist_free(errors); + errors = NULL; + } else { + VERIFY(nvpair_value_int32(pair, &rv) == 0); + } + + if (errlist == NULL) + nvlist_free(errors); + else + *errlist = errors; + + return (rv); +} + +static boolean_t +propval_equals(nvpair_t *p1, nvpair_t *p2) +{ + if (nvpair_type(p1) == DATA_TYPE_NVLIST) { + /* dsl_prop_get_all_impl() format */ + nvlist_t *attrs; + VERIFY(nvpair_value_nvlist(p1, &attrs) == 0); + VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, + &p1) == 0); + } + + if (nvpair_type(p2) == DATA_TYPE_NVLIST) { + nvlist_t *attrs; + VERIFY(nvpair_value_nvlist(p2, &attrs) == 0); + VERIFY(nvlist_lookup_nvpair(attrs, ZPROP_VALUE, + &p2) == 0); + } + + if (nvpair_type(p1) != nvpair_type(p2)) + return (B_FALSE); + + if (nvpair_type(p1) == DATA_TYPE_STRING) { + char *valstr1, *valstr2; + + VERIFY(nvpair_value_string(p1, (char **)&valstr1) == 0); + VERIFY(nvpair_value_string(p2, (char **)&valstr2) == 0); + return (strcmp(valstr1, valstr2) == 0); + } else { + uint64_t intval1, intval2; + + VERIFY(nvpair_value_uint64(p1, &intval1) == 0); + VERIFY(nvpair_value_uint64(p2, &intval2) == 0); + return (intval1 == intval2); + } +} + +/* + * Remove properties from props if they are not going to change (as determined + * by comparison with origprops). Remove them from origprops as well, since we + * do not need to clear or restore properties that won't change. 
+ */ +static void +props_reduce(nvlist_t *props, nvlist_t *origprops) +{ + nvpair_t *pair, *next_pair; + + if (origprops == NULL) + return; /* all props need to be received */ + + pair = nvlist_next_nvpair(props, NULL); + while (pair != NULL) { + const char *propname = nvpair_name(pair); + nvpair_t *match; + + next_pair = nvlist_next_nvpair(props, pair); + + if ((nvlist_lookup_nvpair(origprops, propname, + &match) != 0) || !propval_equals(pair, match)) + goto next; /* need to set received value */ + + /* don't clear the existing received value */ + (void) nvlist_remove_nvpair(origprops, match); + /* don't bother receiving the property */ + (void) nvlist_remove_nvpair(props, pair); +next: + pair = next_pair; } - kmem_free(zc, sizeof (zfs_cmd_t)); } +#ifdef DEBUG +static boolean_t zfs_ioc_recv_inject_err; +#endif + /* * inputs: * zc_name name of containing filesystem @@ -2440,6 +3374,8 @@ clear_props(char *dataset, nvlist_t *props) * * outputs: * zc_cookie number of bytes read + * zc_nvlist_dst{_size} error for each unapplied received property + * zc_obj zprop_errflags_t */ static int zfs_ioc_recv(zfs_cmd_t *zc) @@ -2447,15 +3383,18 @@ zfs_ioc_recv(zfs_cmd_t *zc) file_t *fp; objset_t *os; dmu_recv_cookie_t drc; - zfsvfs_t *zfsvfs = NULL; boolean_t force = (boolean_t)zc->zc_guid; - int error, fd; + int fd; + int error = 0; + int props_error = 0; + nvlist_t *errors; offset_t off; - nvlist_t *props = NULL; - nvlist_t *origprops = NULL; + nvlist_t *props = NULL; /* sent properties */ + nvlist_t *origprops = NULL; /* existing properties */ objset_t *origin = NULL; char *tosnap; char tofs[ZFS_MAXNAMELEN]; + boolean_t first_recvd_props = B_FALSE; if (dataset_namecheck(zc->zc_value, NULL, NULL) != 0 || strchr(zc->zc_value, '@') == NULL || @@ -2464,12 +3403,11 @@ zfs_ioc_recv(zfs_cmd_t *zc) (void) strcpy(tofs, zc->zc_value); tosnap = strchr(tofs, '@'); - *tosnap = '\0'; - tosnap++; + *tosnap++ = '\0'; if (zc->zc_nvlist_src != NULL && (error = get_nvlist(zc->zc_nvlist_src, zc->zc_nvlist_src_size, - &props)) != 0) + zc->zc_iflags, &props)) != 0) return (error); fd = zc->zc_cookie; @@ -2479,105 +3417,182 @@ zfs_ioc_recv(zfs_cmd_t *zc) return (EBADF); } - if (dmu_objset_open(tofs, DMU_OST_ANY, - DS_MODE_USER | DS_MODE_READONLY, &os) == 0) { - /* - * Try to get the zfsvfs for the receiving objset. - * There won't be one if we're operating on a zvol, - * if the objset doesn't exist yet, or is not mounted. - */ - mutex_enter(&os->os->os_user_ptr_lock); - if (zfsvfs = dmu_objset_get_user(os)) { - if (!mutex_tryenter(&zfsvfs->z_online_recv_lock)) { - mutex_exit(&os->os->os_user_ptr_lock); - dmu_objset_close(os); - zfsvfs = NULL; - error = EBUSY; - goto out; - } - VFS_HOLD(zfsvfs->z_vfs); + VERIFY(nvlist_alloc(&errors, NV_UNIQUE_NAME, KM_SLEEP) == 0); + + if (props && dmu_objset_hold(tofs, FTAG, &os) == 0) { + if ((spa_version(os->os_spa) >= SPA_VERSION_RECVD_PROPS) && + !dsl_prop_get_hasrecvd(os)) { + first_recvd_props = B_TRUE; } - mutex_exit(&os->os->os_user_ptr_lock); /* - * If new properties are supplied, they are to completely - * replace the existing ones, so stash away the existing ones. + * If new received properties are supplied, they are to + * completely replace the existing received properties, so stash + * away the existing ones. */ - if (props) - (void) dsl_prop_get_all(os, &origprops, TRUE); + if (dsl_prop_get_received(os, &origprops) == 0) { + nvlist_t *errlist = NULL; + /* + * Don't bother writing a property if its value won't + * change (and avoid the unnecessary security checks). 
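+ * (props_reduce() is what strips those unchanged
+ * entries from both lists.)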
+ * + * The first receive after SPA_VERSION_RECVD_PROPS is a + * special case where we blow away all local properties + * regardless. + */ + if (!first_recvd_props) + props_reduce(props, origprops); + if (zfs_check_clearable(tofs, origprops, + &errlist) != 0) + (void) nvlist_merge(errors, errlist, 0); + nvlist_free(errlist); + } - dmu_objset_close(os); + dmu_objset_rele(os, FTAG); } if (zc->zc_string[0]) { - error = dmu_objset_open(zc->zc_string, DMU_OST_ANY, - DS_MODE_USER | DS_MODE_READONLY, &origin); + error = dmu_objset_hold(zc->zc_string, FTAG, &origin); if (error) goto out; } - error = dmu_recv_begin(tofs, tosnap, &zc->zc_begin_record, - force, origin, zfsvfs != NULL, &drc); + error = dmu_recv_begin(tofs, tosnap, zc->zc_top_ds, + &zc->zc_begin_record, force, origin, &drc); if (origin) - dmu_objset_close(origin); + dmu_objset_rele(origin, FTAG); if (error) goto out; /* - * Reset properties. We do this before we receive the stream - * so that the properties are applied to the new data. + * Set properties before we receive the stream so that they are applied + * to the new data. Note that we must call dmu_recv_stream() if + * dmu_recv_begin() succeeds. */ if (props) { - clear_props(tofs, origprops); + nvlist_t *errlist; + + if (dmu_objset_from_ds(drc.drc_logical_ds, &os) == 0) { + if (drc.drc_newfs) { + if (spa_version(os->os_spa) >= + SPA_VERSION_RECVD_PROPS) + first_recvd_props = B_TRUE; + } else if (origprops != NULL) { + if (clear_received_props(os, tofs, origprops, + first_recvd_props ? NULL : props) != 0) + zc->zc_obj |= ZPROP_ERR_NOCLEAR; + } else { + zc->zc_obj |= ZPROP_ERR_NOCLEAR; + } + dsl_prop_set_hasrecvd(os); + } else if (!drc.drc_newfs) { + zc->zc_obj |= ZPROP_ERR_NOCLEAR; + } + + (void) zfs_set_prop_nvlist(tofs, ZPROP_SRC_RECEIVED, + props, &errlist); + (void) nvlist_merge(errors, errlist, 0); + nvlist_free(errlist); + } + + if (fit_error_list(zc, &errors) != 0 || put_nvlist(zc, errors) != 0) { /* - * XXX - Note, this is all-or-nothing; should be best-effort. + * Caller made zc->zc_nvlist_dst less than the minimum expected + * size or supplied an invalid address. */ - (void) zfs_set_prop_nvlist(tofs, props); + props_error = EINVAL; } off = fp->f_offset; error = dmu_recv_stream(&drc, fp->f_vnode, &off); - if (error == 0 && zfsvfs) { - char *osname; - int mode; + if (error == 0) { + zfsvfs_t *zfsvfs = NULL; - /* online recv */ - osname = kmem_alloc(MAXNAMELEN, KM_SLEEP); - error = zfs_suspend_fs(zfsvfs, osname, &mode); - if (error == 0) { - int resume_err; + if (getzfsvfs(tofs, &zfsvfs) == 0) { + /* online recv */ + int end_err; - error = dmu_recv_end(&drc); - resume_err = zfs_resume_fs(zfsvfs, osname, mode); - error = error ? error : resume_err; + error = zfs_suspend_fs(zfsvfs); + /* + * If the suspend fails, then the recv_end will + * likely also fail, and clean up after itself. + */ + end_err = dmu_recv_end(&drc); + if (error == 0) { + int resume_err = + zfs_resume_fs(zfsvfs, tofs); + error = error ? error : resume_err; + } + error = error ? error : end_err; + VFS_RELE(zfsvfs->z_vfs); } else { - dmu_recv_abort_cleanup(&drc); + error = dmu_recv_end(&drc); } - kmem_free(osname, MAXNAMELEN); - } else if (error == 0) { - error = dmu_recv_end(&drc); } zc->zc_cookie = off - fp->f_offset; if (VOP_SEEK(fp->f_vnode, fp->f_offset, &off, NULL) == 0) fp->f_offset = off; +#ifdef DEBUG + if (zfs_ioc_recv_inject_err) { + zfs_ioc_recv_inject_err = B_FALSE; + error = 1; + } +#endif /* * On error, restore the original props. 
*/ if (error && props) { - clear_props(tofs, props); - (void) zfs_set_prop_nvlist(tofs, origprops); + if (dmu_objset_hold(tofs, FTAG, &os) == 0) { + if (clear_received_props(os, tofs, props, NULL) != 0) { + /* + * We failed to clear the received properties. + * Since we may have left a $recvd value on the + * system, we can't clear the $hasrecvd flag. + */ + zc->zc_obj |= ZPROP_ERR_NORESTORE; + } else if (first_recvd_props) { + dsl_prop_unset_hasrecvd(os); + } + dmu_objset_rele(os, FTAG); + } else if (!drc.drc_newfs) { + /* We failed to clear the received properties. */ + zc->zc_obj |= ZPROP_ERR_NORESTORE; + } + + if (origprops == NULL && !drc.drc_newfs) { + /* We failed to stash the original properties. */ + zc->zc_obj |= ZPROP_ERR_NORESTORE; + } + + /* + * dsl_props_set() will not convert RECEIVED to LOCAL on or + * after SPA_VERSION_RECVD_PROPS, so we need to specify LOCAL + * explictly if we're restoring local properties cleared in the + * first new-style receive. + */ + if (origprops != NULL && + zfs_set_prop_nvlist(tofs, (first_recvd_props ? + ZPROP_SRC_LOCAL : ZPROP_SRC_RECEIVED), + origprops, NULL) != 0) { + /* + * We stashed the original properties but failed to + * restore them. + */ + zc->zc_obj |= ZPROP_ERR_NORESTORE; + } } out: - if (zfsvfs) { - mutex_exit(&zfsvfs->z_online_recv_lock); - VFS_RELE(zfsvfs->z_vfs); - } nvlist_free(props); nvlist_free(origprops); + nvlist_free(errors); releasef(fd); + + if (error == 0) + error = props_error; + return (error); } @@ -2599,8 +3614,7 @@ zfs_ioc_send(zfs_cmd_t *zc) int error; offset_t off; - error = dmu_objset_open(zc->zc_name, DMU_OST_ANY, - DS_MODE_USER | DS_MODE_READONLY, &tosnap); + error = dmu_objset_hold(zc->zc_name, FTAG, &tosnap); if (error) return (error); @@ -2614,20 +3628,19 @@ zfs_ioc_send(zfs_cmd_t *zc) if (cp) *(cp+1) = 0; (void) strncat(buf, zc->zc_value, MAXPATHLEN); - error = dmu_objset_open(buf, DMU_OST_ANY, - DS_MODE_USER | DS_MODE_READONLY, &fromsnap); + error = dmu_objset_hold(buf, FTAG, &fromsnap); kmem_free(buf, MAXPATHLEN); if (error) { - dmu_objset_close(tosnap); + dmu_objset_rele(tosnap, FTAG); return (error); } } fp = getf(zc->zc_cookie); if (fp == NULL) { - dmu_objset_close(tosnap); + dmu_objset_rele(tosnap, FTAG); if (fromsnap) - dmu_objset_close(fromsnap); + dmu_objset_rele(fromsnap, FTAG); return (EBADF); } @@ -2638,8 +3651,8 @@ zfs_ioc_send(zfs_cmd_t *zc) fp->f_offset = off; releasef(zc->zc_cookie); if (fromsnap) - dmu_objset_close(fromsnap); - dmu_objset_close(tosnap); + dmu_objset_rele(fromsnap, FTAG); + dmu_objset_rele(tosnap, FTAG); return (error); } @@ -2715,16 +3728,38 @@ zfs_ioc_clear(zfs_cmd_t *zc) mutex_exit(&spa_namespace_lock); return (EIO); } - if (spa->spa_log_state == SPA_LOG_MISSING) { + if (spa_get_log_state(spa) == SPA_LOG_MISSING) { /* we need to let spa_open/spa_load clear the chains */ - spa->spa_log_state = SPA_LOG_CLEAR; + spa_set_log_state(spa, SPA_LOG_CLEAR); } + spa->spa_last_open_failed = 0; mutex_exit(&spa_namespace_lock); - if ((error = spa_open(zc->zc_name, &spa, FTAG)) != 0) + if (zc->zc_cookie & ZPOOL_NO_REWIND) { + error = spa_open(zc->zc_name, &spa, FTAG); + } else { + nvlist_t *policy; + nvlist_t *config = NULL; + + if (zc->zc_nvlist_src == NULL) + return (EINVAL); + + if ((error = get_nvlist(zc->zc_nvlist_src, + zc->zc_nvlist_src_size, zc->zc_iflags, &policy)) == 0) { + error = spa_open_rewind(zc->zc_name, &spa, FTAG, + policy, &config); + if (config != NULL) { + (void) put_nvlist(zc, config); + nvlist_free(config); + } + nvlist_free(policy); + } + } + + if (error) 
return (error); - spa_vdev_state_enter(spa); + spa_vdev_state_enter(spa, SCL_NONE); if (zc->zc_guid == 0) { vd = NULL; @@ -2744,11 +3779,12 @@ zfs_ioc_clear(zfs_cmd_t *zc) /* * Resume any suspended I/Os. */ - zio_resume(spa); + if (zio_resume(spa) != 0) + error = EIO; spa_close(spa, FTAG); - return (0); + return (error); } /* @@ -2756,7 +3792,8 @@ zfs_ioc_clear(zfs_cmd_t *zc) * zc_name name of filesystem * zc_value name of origin snapshot * - * outputs: none + * outputs: + * zc_string name of conflicting snapshot, if there is one */ static int zfs_ioc_promote(zfs_cmd_t *zc) @@ -2772,7 +3809,118 @@ zfs_ioc_promote(zfs_cmd_t *zc) *cp = '\0'; (void) dmu_objset_find(zc->zc_value, zfs_unmount_snap, NULL, DS_FIND_SNAPSHOTS); - return (dsl_dataset_promote(zc->zc_name)); + return (dsl_dataset_promote(zc->zc_name, zc->zc_string)); +} + +/* + * Retrieve a single {user|group}{used|quota}@... property. + * + * inputs: + * zc_name name of filesystem + * zc_objset_type zfs_userquota_prop_t + * zc_value domain name (eg. "S-1-234-567-89") + * zc_guid RID/UID/GID + * + * outputs: + * zc_cookie property value + */ +static int +zfs_ioc_userspace_one(zfs_cmd_t *zc) +{ + zfsvfs_t *zfsvfs; + int error; + + if (zc->zc_objset_type >= ZFS_NUM_USERQUOTA_PROPS) + return (EINVAL); + + error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs); + if (error) + return (error); + + error = zfs_userspace_one(zfsvfs, + zc->zc_objset_type, zc->zc_value, zc->zc_guid, &zc->zc_cookie); + zfsvfs_rele(zfsvfs, FTAG); + + return (error); +} + +/* + * inputs: + * zc_name name of filesystem + * zc_cookie zap cursor + * zc_objset_type zfs_userquota_prop_t + * zc_nvlist_dst[_size] buffer to fill (not really an nvlist) + * + * outputs: + * zc_nvlist_dst[_size] data buffer (array of zfs_useracct_t) + * zc_cookie zap cursor + */ +static int +zfs_ioc_userspace_many(zfs_cmd_t *zc) +{ + zfsvfs_t *zfsvfs; + int error; + + error = zfsvfs_hold(zc->zc_name, FTAG, &zfsvfs); + if (error) + return (error); + + int bufsize = zc->zc_nvlist_dst_size; + void *buf = kmem_alloc(bufsize, KM_SLEEP); + + error = zfs_userspace_many(zfsvfs, zc->zc_objset_type, &zc->zc_cookie, + buf, &zc->zc_nvlist_dst_size); + + if (error == 0) { + error = xcopyout(buf, + (void *)(uintptr_t)zc->zc_nvlist_dst, + zc->zc_nvlist_dst_size); + } + kmem_free(buf, bufsize); + zfsvfs_rele(zfsvfs, FTAG); + + return (error); +} + +/* + * inputs: + * zc_name name of filesystem + * + * outputs: + * none + */ +static int +zfs_ioc_userspace_upgrade(zfs_cmd_t *zc) +{ + objset_t *os; + int error = 0; + zfsvfs_t *zfsvfs; + + if (getzfsvfs(zc->zc_name, &zfsvfs) == 0) { + if (!dmu_objset_userused_enabled(zfsvfs->z_os)) { + /* + * If userused is not enabled, it may be because the + * objset needs to be closed & reopened (to grow the + * objset_phys_t). Suspend/resume the fs will do that. + */ + error = zfs_suspend_fs(zfsvfs); + if (error == 0) + error = zfs_resume_fs(zfsvfs, zc->zc_name); + } + if (error == 0) + error = dmu_objset_userspace_upgrade(zfsvfs->z_os); + VFS_RELE(zfsvfs->z_vfs); + } else { + /* XXX kind of reading contents without owning */ + error = dmu_objset_hold(zc->zc_name, FTAG, &os); + if (error) + return (error); + + error = dmu_objset_userspace_upgrade(os); + dmu_objset_rele(os, FTAG); + } + + return (error); } /* @@ -2888,7 +4036,7 @@ zfs_ioc_share(zfs_cmd_t *zc) if (error = zsmbexport_fs((void *) (uintptr_t)zc->zc_share.z_exportdata, zc->zc_share.z_sharetype == ZFS_SHARE_SMB ? 
- B_TRUE : B_FALSE)) { + B_TRUE: B_FALSE)) { return (error); } break; @@ -2909,64 +4057,350 @@ zfs_ioc_share(zfs_cmd_t *zc) } +ace_t full_access[] = { + {(uid_t)-1, ACE_ALL_PERMS, ACE_EVERYONE, 0} +}; + +/* + * Remove all ACL files in shares dir + */ +static int +zfs_smb_acl_purge(znode_t *dzp) +{ + zap_cursor_t zc; + zap_attribute_t zap; + zfsvfs_t *zfsvfs = dzp->z_zfsvfs; + int error; + + for (zap_cursor_init(&zc, zfsvfs->z_os, dzp->z_id); + (error = zap_cursor_retrieve(&zc, &zap)) == 0; + zap_cursor_advance(&zc)) { + if ((error = VOP_REMOVE(ZTOV(dzp), zap.za_name, kcred, + NULL, 0)) != 0) + break; + } + zap_cursor_fini(&zc); + return (error); +} + +static int +zfs_ioc_smb_acl(zfs_cmd_t *zc) +{ + vnode_t *vp; + znode_t *dzp; + vnode_t *resourcevp = NULL; + znode_t *sharedir; + zfsvfs_t *zfsvfs; + nvlist_t *nvlist; + char *src, *target; + vattr_t vattr; + vsecattr_t vsec; + int error = 0; + + if ((error = lookupname(zc->zc_value, UIO_SYSSPACE, + NO_FOLLOW, NULL, &vp)) != 0) + return (error); + + /* Now make sure mntpnt and dataset are ZFS */ + + if (vp->v_vfsp->vfs_fstype != zfsfstype || + (strcmp((char *)refstr_value(vp->v_vfsp->vfs_resource), + zc->zc_name) != 0)) { + VN_RELE(vp); + return (EINVAL); + } + + dzp = VTOZ(vp); + zfsvfs = dzp->z_zfsvfs; + ZFS_ENTER(zfsvfs); + + /* + * Create share dir if its missing. + */ + mutex_enter(&zfsvfs->z_lock); + if (zfsvfs->z_shares_dir == 0) { + dmu_tx_t *tx; + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, TRUE, + ZFS_SHARES_DIR); + dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + } else { + error = zfs_create_share_dir(zfsvfs, tx); + dmu_tx_commit(tx); + } + if (error) { + mutex_exit(&zfsvfs->z_lock); + VN_RELE(vp); + ZFS_EXIT(zfsvfs); + return (error); + } + } + mutex_exit(&zfsvfs->z_lock); + + ASSERT(zfsvfs->z_shares_dir); + if ((error = zfs_zget(zfsvfs, zfsvfs->z_shares_dir, &sharedir)) != 0) { + VN_RELE(vp); + ZFS_EXIT(zfsvfs); + return (error); + } + + switch (zc->zc_cookie) { + case ZFS_SMB_ACL_ADD: + vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE; + vattr.va_type = VREG; + vattr.va_mode = S_IFREG|0777; + vattr.va_uid = 0; + vattr.va_gid = 0; + + vsec.vsa_mask = VSA_ACE; + vsec.vsa_aclentp = &full_access; + vsec.vsa_aclentsz = sizeof (full_access); + vsec.vsa_aclcnt = 1; + + error = VOP_CREATE(ZTOV(sharedir), zc->zc_string, + &vattr, EXCL, 0, &resourcevp, kcred, 0, NULL, &vsec); + if (resourcevp) + VN_RELE(resourcevp); + break; + + case ZFS_SMB_ACL_REMOVE: + error = VOP_REMOVE(ZTOV(sharedir), zc->zc_string, kcred, + NULL, 0); + break; + + case ZFS_SMB_ACL_RENAME: + if ((error = get_nvlist(zc->zc_nvlist_src, + zc->zc_nvlist_src_size, zc->zc_iflags, &nvlist)) != 0) { + VN_RELE(vp); + ZFS_EXIT(zfsvfs); + return (error); + } + if (nvlist_lookup_string(nvlist, ZFS_SMB_ACL_SRC, &src) || + nvlist_lookup_string(nvlist, ZFS_SMB_ACL_TARGET, + &target)) { + VN_RELE(vp); + VN_RELE(ZTOV(sharedir)); + ZFS_EXIT(zfsvfs); + nvlist_free(nvlist); + return (error); + } + error = VOP_RENAME(ZTOV(sharedir), src, ZTOV(sharedir), target, + kcred, NULL, 0); + nvlist_free(nvlist); + break; + + case ZFS_SMB_ACL_PURGE: + error = zfs_smb_acl_purge(sharedir); + break; + + default: + error = EINVAL; + break; + } + + VN_RELE(vp); + VN_RELE(ZTOV(sharedir)); + + ZFS_EXIT(zfsvfs); + + return (error); +} + +/* + * inputs: + * zc_name name of filesystem + * zc_value short name of snap + * zc_string user-supplied tag for this reference + * zc_cookie recursive flag + * 
zc_temphold set if hold is temporary + * + * outputs: none + */ +static int +zfs_ioc_hold(zfs_cmd_t *zc) +{ + boolean_t recursive = zc->zc_cookie; + + if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0) + return (EINVAL); + + return (dsl_dataset_user_hold(zc->zc_name, zc->zc_value, + zc->zc_string, recursive, zc->zc_temphold)); +} + +/* + * inputs: + * zc_name name of dataset from which we're releasing a user reference + * zc_value short name of snap + * zc_string user-supplied tag for this reference + * zc_cookie recursive flag + * + * outputs: none + */ +static int +zfs_ioc_release(zfs_cmd_t *zc) +{ + boolean_t recursive = zc->zc_cookie; + + if (snapshot_namecheck(zc->zc_value, NULL, NULL) != 0) + return (EINVAL); + + return (dsl_dataset_user_release(zc->zc_name, zc->zc_value, + zc->zc_string, recursive)); +} + +/* + * inputs: + * zc_name name of filesystem + * + * outputs: + * zc_nvlist_src{_size} nvlist of snapshot holds + */ +static int +zfs_ioc_get_holds(zfs_cmd_t *zc) +{ + nvlist_t *nvp; + int error; + + if ((error = dsl_dataset_get_holds(zc->zc_name, &nvp)) == 0) { + error = put_nvlist(zc, nvp); + nvlist_free(nvp); + } + + return (error); +} + /* * pool create, destroy, and export don't log the history as part of * zfsdev_ioctl, but rather zfs_ioc_pool_create, and zfs_ioc_pool_export * do the logging of those commands. */ static zfs_ioc_vec_t zfs_ioc_vec[] = { - { zfs_ioc_pool_create, zfs_secpolicy_config, POOL_NAME, B_FALSE }, - { zfs_ioc_pool_destroy, zfs_secpolicy_config, POOL_NAME, B_FALSE }, - { zfs_ioc_pool_import, zfs_secpolicy_config, POOL_NAME, B_TRUE }, - { zfs_ioc_pool_export, zfs_secpolicy_config, POOL_NAME, B_FALSE }, - { zfs_ioc_pool_configs, zfs_secpolicy_none, NO_NAME, B_FALSE }, - { zfs_ioc_pool_stats, zfs_secpolicy_read, POOL_NAME, B_FALSE }, - { zfs_ioc_pool_tryimport, zfs_secpolicy_config, NO_NAME, B_FALSE }, - { zfs_ioc_pool_scrub, zfs_secpolicy_config, POOL_NAME, B_TRUE }, - { zfs_ioc_pool_freeze, zfs_secpolicy_config, NO_NAME, B_FALSE }, - { zfs_ioc_pool_upgrade, zfs_secpolicy_config, POOL_NAME, B_TRUE }, - { zfs_ioc_pool_get_history, zfs_secpolicy_config, POOL_NAME, B_FALSE }, - { zfs_ioc_vdev_add, zfs_secpolicy_config, POOL_NAME, B_TRUE }, - { zfs_ioc_vdev_remove, zfs_secpolicy_config, POOL_NAME, B_TRUE }, - { zfs_ioc_vdev_set_state, zfs_secpolicy_config, POOL_NAME, B_TRUE }, - { zfs_ioc_vdev_attach, zfs_secpolicy_config, POOL_NAME, B_TRUE }, - { zfs_ioc_vdev_detach, zfs_secpolicy_config, POOL_NAME, B_TRUE }, - { zfs_ioc_vdev_setpath, zfs_secpolicy_config, POOL_NAME, B_FALSE }, - { zfs_ioc_objset_stats, zfs_secpolicy_read, DATASET_NAME, B_FALSE }, - { zfs_ioc_objset_zplprops, zfs_secpolicy_read, DATASET_NAME, B_FALSE }, - { zfs_ioc_dataset_list_next, zfs_secpolicy_read, - DATASET_NAME, B_FALSE }, - { zfs_ioc_snapshot_list_next, zfs_secpolicy_read, - DATASET_NAME, B_FALSE }, - { zfs_ioc_set_prop, zfs_secpolicy_none, DATASET_NAME, B_TRUE }, - { zfs_ioc_create_minor, zfs_secpolicy_minor, DATASET_NAME, B_FALSE }, - { zfs_ioc_remove_minor, zfs_secpolicy_minor, DATASET_NAME, B_FALSE }, - { zfs_ioc_create, zfs_secpolicy_create, DATASET_NAME, B_TRUE }, - { zfs_ioc_destroy, zfs_secpolicy_destroy, DATASET_NAME, B_TRUE }, - { zfs_ioc_rollback, zfs_secpolicy_rollback, DATASET_NAME, B_TRUE }, - { zfs_ioc_rename, zfs_secpolicy_rename, DATASET_NAME, B_TRUE }, - { zfs_ioc_recv, zfs_secpolicy_receive, DATASET_NAME, B_TRUE }, - { zfs_ioc_send, zfs_secpolicy_send, DATASET_NAME, B_TRUE }, - { zfs_ioc_inject_fault, zfs_secpolicy_inject, NO_NAME, B_FALSE }, - { 
zfs_ioc_clear_fault, zfs_secpolicy_inject, NO_NAME, B_FALSE }, - { zfs_ioc_inject_list_next, zfs_secpolicy_inject, NO_NAME, B_FALSE }, - { zfs_ioc_error_log, zfs_secpolicy_inject, POOL_NAME, B_FALSE }, - { zfs_ioc_clear, zfs_secpolicy_config, POOL_NAME, B_TRUE }, - { zfs_ioc_promote, zfs_secpolicy_promote, DATASET_NAME, B_TRUE }, - { zfs_ioc_destroy_snaps, zfs_secpolicy_destroy, DATASET_NAME, B_TRUE }, - { zfs_ioc_snapshot, zfs_secpolicy_snapshot, DATASET_NAME, B_TRUE }, - { zfs_ioc_dsobj_to_dsname, zfs_secpolicy_config, POOL_NAME, B_FALSE }, - { zfs_ioc_obj_to_path, zfs_secpolicy_config, NO_NAME, B_FALSE }, - { zfs_ioc_pool_set_props, zfs_secpolicy_config, POOL_NAME, B_TRUE }, - { zfs_ioc_pool_get_props, zfs_secpolicy_read, POOL_NAME, B_FALSE }, - { zfs_ioc_set_fsacl, zfs_secpolicy_fsacl, DATASET_NAME, B_TRUE }, - { zfs_ioc_get_fsacl, zfs_secpolicy_read, DATASET_NAME, B_FALSE }, - { zfs_ioc_iscsi_perm_check, zfs_secpolicy_iscsi, - DATASET_NAME, B_FALSE }, - { zfs_ioc_share, zfs_secpolicy_share, DATASET_NAME, B_FALSE }, - { zfs_ioc_inherit_prop, zfs_secpolicy_inherit, DATASET_NAME, B_TRUE }, + { zfs_ioc_pool_create, zfs_secpolicy_config, POOL_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_pool_destroy, zfs_secpolicy_config, POOL_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_pool_import, zfs_secpolicy_config, POOL_NAME, B_TRUE, + B_FALSE }, + { zfs_ioc_pool_export, zfs_secpolicy_config, POOL_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_pool_configs, zfs_secpolicy_none, NO_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_pool_stats, zfs_secpolicy_read, POOL_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_pool_tryimport, zfs_secpolicy_config, NO_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_pool_scrub, zfs_secpolicy_config, POOL_NAME, B_TRUE, + B_TRUE }, + { zfs_ioc_pool_freeze, zfs_secpolicy_config, NO_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_pool_upgrade, zfs_secpolicy_config, POOL_NAME, B_TRUE, + B_TRUE }, + { zfs_ioc_pool_get_history, zfs_secpolicy_config, POOL_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_vdev_add, zfs_secpolicy_config, POOL_NAME, B_TRUE, + B_TRUE }, + { zfs_ioc_vdev_remove, zfs_secpolicy_config, POOL_NAME, B_TRUE, + B_TRUE }, + { zfs_ioc_vdev_set_state, zfs_secpolicy_config, POOL_NAME, B_TRUE, + B_FALSE }, + { zfs_ioc_vdev_attach, zfs_secpolicy_config, POOL_NAME, B_TRUE, + B_TRUE }, + { zfs_ioc_vdev_detach, zfs_secpolicy_config, POOL_NAME, B_TRUE, + B_TRUE }, + { zfs_ioc_vdev_setpath, zfs_secpolicy_config, POOL_NAME, B_FALSE, + B_TRUE }, + { zfs_ioc_vdev_setfru, zfs_secpolicy_config, POOL_NAME, B_FALSE, + B_TRUE }, + { zfs_ioc_objset_stats, zfs_secpolicy_read, DATASET_NAME, B_FALSE, + B_TRUE }, + { zfs_ioc_objset_zplprops, zfs_secpolicy_read, DATASET_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_dataset_list_next, zfs_secpolicy_read, DATASET_NAME, B_FALSE, + B_TRUE }, + { zfs_ioc_snapshot_list_next, zfs_secpolicy_read, DATASET_NAME, B_FALSE, + B_TRUE }, + { zfs_ioc_set_prop, zfs_secpolicy_none, DATASET_NAME, B_TRUE, B_TRUE }, + { zfs_ioc_create, zfs_secpolicy_create, DATASET_NAME, B_TRUE, B_TRUE }, + { zfs_ioc_destroy, zfs_secpolicy_destroy, DATASET_NAME, B_TRUE, + B_TRUE}, + { zfs_ioc_rollback, zfs_secpolicy_rollback, DATASET_NAME, B_TRUE, + B_TRUE }, + { zfs_ioc_rename, zfs_secpolicy_rename, DATASET_NAME, B_TRUE, B_TRUE }, + { zfs_ioc_recv, zfs_secpolicy_receive, DATASET_NAME, B_TRUE, B_TRUE }, + { zfs_ioc_send, zfs_secpolicy_send, DATASET_NAME, B_TRUE, B_FALSE }, + { zfs_ioc_inject_fault, zfs_secpolicy_inject, NO_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_clear_fault, zfs_secpolicy_inject, NO_NAME, B_FALSE, + B_FALSE }, + 
{ zfs_ioc_inject_list_next, zfs_secpolicy_inject, NO_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_error_log, zfs_secpolicy_inject, POOL_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_clear, zfs_secpolicy_config, POOL_NAME, B_TRUE, B_FALSE }, + { zfs_ioc_promote, zfs_secpolicy_promote, DATASET_NAME, B_TRUE, + B_TRUE }, + { zfs_ioc_destroy_snaps, zfs_secpolicy_destroy_snaps, DATASET_NAME, + B_TRUE, B_TRUE }, + { zfs_ioc_snapshot, zfs_secpolicy_snapshot, DATASET_NAME, B_TRUE, + B_TRUE }, + { zfs_ioc_dsobj_to_dsname, zfs_secpolicy_config, POOL_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_obj_to_path, zfs_secpolicy_config, DATASET_NAME, B_FALSE, + B_TRUE }, + { zfs_ioc_pool_set_props, zfs_secpolicy_config, POOL_NAME, B_TRUE, + B_TRUE }, + { zfs_ioc_pool_get_props, zfs_secpolicy_read, POOL_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_set_fsacl, zfs_secpolicy_fsacl, DATASET_NAME, B_TRUE, + B_TRUE }, + { zfs_ioc_get_fsacl, zfs_secpolicy_read, DATASET_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_iscsi_perm_check, zfs_secpolicy_iscsi, DATASET_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_share, zfs_secpolicy_share, DATASET_NAME, B_FALSE, B_FALSE }, + { zfs_ioc_inherit_prop, zfs_secpolicy_inherit, DATASET_NAME, B_TRUE, + B_TRUE }, + { zfs_ioc_smb_acl, zfs_secpolicy_smb_acl, DATASET_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_userspace_one, zfs_secpolicy_userspace_one, + DATASET_NAME, B_FALSE, B_FALSE }, + { zfs_ioc_userspace_many, zfs_secpolicy_userspace_many, + DATASET_NAME, B_FALSE, B_FALSE }, + { zfs_ioc_userspace_upgrade, zfs_secpolicy_userspace_upgrade, + DATASET_NAME, B_FALSE, B_TRUE }, + { zfs_ioc_hold, zfs_secpolicy_hold, DATASET_NAME, B_TRUE, B_TRUE }, + { zfs_ioc_release, zfs_secpolicy_release, DATASET_NAME, B_TRUE, + B_TRUE }, + { zfs_ioc_get_holds, zfs_secpolicy_read, DATASET_NAME, B_FALSE, + B_TRUE }, + { zfs_ioc_objset_recvd_props, zfs_secpolicy_read, DATASET_NAME, B_FALSE, + B_FALSE }, + { zfs_ioc_vdev_split, zfs_secpolicy_config, POOL_NAME, B_TRUE, + B_TRUE } }; +int +pool_status_check(const char *name, zfs_ioc_namecheck_t type) +{ + spa_t *spa; + int error; + + ASSERT(type == POOL_NAME || type == DATASET_NAME); + + error = spa_open(name, &spa, FTAG); + if (error == 0) { + if (spa_suspended(spa)) + error = EAGAIN; + spa_close(spa, FTAG); + } + return (error); +} + static int zfsdev_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) { @@ -2985,9 +4419,9 @@ zfsdev_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) zc = kmem_zalloc(sizeof (zfs_cmd_t), KM_SLEEP); - error = xcopyin((void *)arg, zc, sizeof (zfs_cmd_t)); + error = ddi_copyin((void *)arg, zc, sizeof (zfs_cmd_t), flag); - if (error == 0) + if ((error == 0) && !(flag & FKIOCTL)) error = zfs_ioc_vec[vec].zvec_secpolicy(zc, cr); /* @@ -2996,15 +4430,22 @@ zfsdev_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) */ if (error == 0) { zc->zc_name[sizeof (zc->zc_name) - 1] = '\0'; + zc->zc_iflags = flag & FKIOCTL; switch (zfs_ioc_vec[vec].zvec_namecheck) { case POOL_NAME: if (pool_namecheck(zc->zc_name, NULL, NULL) != 0) error = EINVAL; + if (zfs_ioc_vec[vec].zvec_pool_check) + error = pool_status_check(zc->zc_name, + zfs_ioc_vec[vec].zvec_namecheck); break; case DATASET_NAME: if (dataset_namecheck(zc->zc_name, NULL, NULL) != 0) error = EINVAL; + if (zfs_ioc_vec[vec].zvec_pool_check) + error = pool_status_check(zc->zc_name, + zfs_ioc_vec[vec].zvec_namecheck); break; case NO_NAME: @@ -3015,10 +4456,10 @@ zfsdev_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) if (error == 0) 
error = zfs_ioc_vec[vec].zvec_func(zc); - rc = xcopyout(zc, (void *)arg, sizeof (zfs_cmd_t)); + rc = ddi_copyout(zc, (void *)arg, sizeof (zfs_cmd_t), flag); if (error == 0) { error = rc; - if (zfs_ioc_vec[vec].zvec_his_log == B_TRUE) + if (zfs_ioc_vec[vec].zvec_his_log) zfs_log_history(zc); } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_log.c b/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_log.c index 11cd4c264b573..b4e74dad1f44a 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_log.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_log.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -45,13 +45,25 @@ #include #include #include +#include /* - * All the functions in this file are used to construct the log entries - * to record transactions. They allocate * an intent log transaction - * structure (itx_t) and save within it all the information necessary to - * possibly replay the transaction. The itx is then assigned a sequence - * number and inserted in the in-memory list anchored in the zilog. + * These zfs_log_* functions must be called within a dmu tx, in one + * of 2 contexts depending on zilog->z_replay: + * + * Non replay mode + * --------------- + * We need to record the transaction so that if it is committed to + * the Intent Log then it can be replayed. An intent log transaction + * structure (itx_t) is allocated and all the information necessary to + * possibly replay the transaction is saved in it. The itx is then assigned + * a sequence number and inserted in the in-memory list anchored in the zilog. + * + * Replay mode + * ----------- + * We need to mark the intent log record as replayed in the log header. + * This is done in the same transaction as the replay so that they + * commit atomically. */ int @@ -155,6 +167,9 @@ zfs_log_xvattr(lr_attr_t *lrattr, xvattr_t *xvap) ZFS_TIME_ENCODE(&xoap->xoa_createtime, crtime); if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) bcopy(xoap->xoa_av_scanstamp, scanstamp, AV_SCANSTAMP_SZ); + if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) + *attrs |= (xoap->xoa_reparse == 0) ? 
0 : + XAT0_REPARSE; } static void * @@ -228,7 +243,7 @@ zfs_log_create(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, size_t namesize = strlen(name) + 1; size_t fuidsz = 0; - if (zilog == NULL) + if (zil_replaying(zilog, tx)) return; /* @@ -331,7 +346,7 @@ zfs_log_remove(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, lr_remove_t *lr; size_t namesize = strlen(name) + 1; - if (zilog == NULL) + if (zil_replaying(zilog, tx)) return; itx = zil_itx_create(txtype, sizeof (*lr) + namesize); @@ -355,7 +370,7 @@ zfs_log_link(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, lr_link_t *lr; size_t namesize = strlen(name) + 1; - if (zilog == NULL) + if (zil_replaying(zilog, tx)) return; itx = zil_itx_create(txtype, sizeof (*lr) + namesize); @@ -382,7 +397,7 @@ zfs_log_symlink(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, size_t namesize = strlen(name) + 1; size_t linksize = strlen(link) + 1; - if (zilog == NULL) + if (zil_replaying(zilog, tx)) return; itx = zil_itx_create(txtype, sizeof (*lr) + namesize + linksize); @@ -416,7 +431,7 @@ zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, size_t snamesize = strlen(sname) + 1; size_t dnamesize = strlen(dname) + 1; - if (zilog == NULL) + if (zil_replaying(zilog, tx)) return; itx = zil_itx_create(txtype, sizeof (*lr) + snamesize + dnamesize); @@ -437,9 +452,6 @@ zfs_log_rename(zilog_t *zilog, dmu_tx_t *tx, uint64_t txtype, */ ssize_t zfs_immediate_write_sz = 32768; -#define ZIL_MAX_LOG_DATA (SPA_MAXBLOCKSIZE - sizeof (zil_trailer_t) - \ - sizeof (lr_write_t)) - void zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, znode_t *zp, offset_t off, ssize_t resid, int ioflag) @@ -447,35 +459,17 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, itx_wr_state_t write_state; boolean_t slogging; uintptr_t fsync_cnt; + ssize_t immediate_write_sz; - if (zilog == NULL || zp->z_unlinked) + if (zil_replaying(zilog, tx) || zp->z_unlinked) return; - /* - * Writes are handled in three different ways: - * - * WR_INDIRECT: - * In this mode, if we need to commit the write later, then the block - * is immediately written into the file system (using dmu_sync), - * and a pointer to the block is put into the log record. - * When the txg commits the block is linked in. - * This saves additionally writing the data into the log record. - * There are a few requirements for this to occur: - * - write is greater than zfs_immediate_write_sz - * - not using slogs (as slogs are assumed to always be faster - * than writing into the main pool) - * - the write occupies only one block - * WR_COPIED: - * If we know we'll immediately be committing the - * transaction (FSYNC or FDSYNC), the we allocate a larger - * log record here for the data and copy the data in. - * WR_NEED_COPY: - * Otherwise we don't allocate a buffer, and *if* we need to - * flush the write later then a buffer is allocated and - * we retrieve the data using the dmu. - */ - slogging = spa_has_slogs(zilog->zl_spa); - if (resid > zfs_immediate_write_sz && !slogging && resid <= zp->z_blksz) + immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT) + ? 0 : zfs_immediate_write_sz; + + slogging = spa_has_slogs(zilog->zl_spa) && + (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY); + if (resid > immediate_write_sz && !slogging && resid <= zp->z_blksz) write_state = WR_INDIRECT; else if (ioflag & (FSYNC | FDSYNC)) write_state = WR_COPIED; @@ -503,9 +497,8 @@ zfs_log_write(zilog_t *zilog, dmu_tx_t *tx, int txtype, (write_state == WR_COPIED ? 
len : 0)); lr = (lr_write_t *)&itx->itx_lr; if (write_state == WR_COPIED && dmu_read(zp->z_zfsvfs->z_os, - zp->z_id, off, len, lr + 1) != 0) { - kmem_free(itx, offsetof(itx_t, itx_lr) + - itx->itx_lr.lrc_reclen); + zp->z_id, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) { + zil_itx_destroy(itx); itx = zil_itx_create(txtype, sizeof (*lr)); lr = (lr_write_t *)&itx->itx_lr; write_state = WR_NEED_COPY; @@ -546,7 +539,7 @@ zfs_log_truncate(zilog_t *zilog, dmu_tx_t *tx, int txtype, uint64_t seq; lr_truncate_t *lr; - if (zilog == NULL || zp->z_unlinked) + if (zil_replaying(zilog, tx) || zp->z_unlinked) return; itx = zil_itx_create(txtype, sizeof (*lr)); @@ -574,8 +567,7 @@ zfs_log_setattr(zilog_t *zilog, dmu_tx_t *tx, int txtype, size_t recsize = sizeof (lr_setattr_t); void *start; - - if (zilog == NULL || zp->z_unlinked) + if (zil_replaying(zilog, tx) || zp->z_unlinked) return; /* @@ -641,7 +633,7 @@ zfs_log_acl(zilog_t *zilog, dmu_tx_t *tx, znode_t *zp, size_t txsize; size_t aclbytes = vsecp->vsa_aclentsz; - if (zilog == NULL || zp->z_unlinked) + if (zil_replaying(zilog, tx) || zp->z_unlinked) return; txtype = (zp->z_zfsvfs->z_version < ZPL_VERSION_FUID) ? diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_replay.c b/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_replay.c index 85b79703a7807..39daf968b2310 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_replay.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_replay.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -129,6 +127,8 @@ zfs_replay_xvattr(lr_attr_t *lrattr, xvattr_t *xvap) ZFS_TIME_DECODE(&xoap->xoa_createtime, crtime); if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) bcopy(scanstamp, xoap->xoa_av_scanstamp, AV_SCANSTAMP_SZ); + if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) + xoap->xoa_reparse = ((*attrs & XAT0_REPARSE) != 0); } static int @@ -275,9 +275,9 @@ zfs_replay_create_acl(zfsvfs_t *zfsvfs, uint64_t txtype; int error; + txtype = (lr->lr_common.lrc_txtype & ~TX_CI); if (byteswap) { byteswap_uint64_array(lracl, sizeof (*lracl)); - txtype = (int)lr->lr_common.lrc_txtype; if (txtype == TX_CREATE_ACL_ATTR || txtype == TX_MKDIR_ACL_ATTR) { lrattr = (lr_attr_t *)(caddr_t)(lracl + 1); @@ -318,7 +318,7 @@ zfs_replay_create_acl(zfsvfs_t *zfsvfs, if (lr->lr_common.lrc_txtype & TX_CI) vflg |= FIGNORECASE; - switch ((int)lr->lr_common.lrc_txtype) { + switch (txtype) { case TX_CREATE_ACL: aclstart = (caddr_t)(lracl + 1); fuidstart = (caddr_t)aclstart + @@ -391,7 +391,8 @@ zfs_replay_create_acl(zfsvfs_t *zfsvfs, VN_RELE(ZTOV(dzp)); - zfs_fuid_info_free(zfsvfs->z_fuid_replay); + if (zfsvfs->z_fuid_replay) + zfs_fuid_info_free(zfsvfs->z_fuid_replay); zfsvfs->z_fuid_replay = NULL; return (error); @@ -413,9 +414,9 @@ zfs_replay_create(zfsvfs_t *zfsvfs, lr_create_t *lr, boolean_t byteswap) uint64_t txtype; int error; + txtype = (lr->lr_common.lrc_txtype & ~TX_CI); if (byteswap) { byteswap_uint64_array(lr, sizeof (*lr)); - txtype = (int)lr->lr_common.lrc_txtype; if (txtype == TX_CREATE_ATTR || txtype == TX_MKDIR_ATTR) zfs_replay_swap_attrs((lr_attr_t *)(lr + 1)); } @@ -460,7 +461,7 @@ zfs_replay_create(zfsvfs_t *zfsvfs, lr_create_t *lr, boolean_t byteswap) lr->lr_uid, lr->lr_gid); } - switch ((int)lr->lr_common.lrc_txtype) { + switch (txtype) { case TX_CREATE_ATTR: lrattr = (lr_attr_t *)(caddr_t)(lr + 1); 
xvatlen = ZIL_XVAT_SIZE(lrattr->lr_attr_masksize); @@ -498,7 +499,6 @@ zfs_replay_create(zfsvfs_t *zfsvfs, lr_create_t *lr, boolean_t byteswap) &vp, kcred, NULL, vflg, NULL); break; case TX_MKXATTR: - name = (char *)(lr + 1); error = zfs_make_xattrdir(dzp, &xva.xva_vattr, &vp, kcred); break; case TX_SYMLINK: @@ -625,6 +625,7 @@ zfs_replay_write(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap) znode_t *zp; int error; ssize_t resid; + uint64_t orig_eof, eod, offset, length; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); @@ -640,8 +641,64 @@ zfs_replay_write(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap) return (error); } - error = vn_rdwr(UIO_WRITE, ZTOV(zp), data, lr->lr_length, - lr->lr_offset, UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid); + offset = lr->lr_offset; + length = lr->lr_length; + eod = offset + length; /* end of data for this write */ + + orig_eof = zp->z_phys->zp_size; + + /* If it's a dmu_sync() block, write the whole block */ + if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { + uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr); + if (length < blocksize) { + offset -= offset % blocksize; + length = blocksize; + } + } + + error = vn_rdwr(UIO_WRITE, ZTOV(zp), data, length, offset, + UIO_SYSSPACE, 0, RLIM64_INFINITY, kcred, &resid); + + /* + * This may be a write from a dmu_sync() for a whole block, + * and may extend beyond the current end of the file. + * We can't just replay what was written for this TX_WRITE as + * a future TX_WRITE2 may extend the eof and the data for that + * write needs to be there. So we write the whole block and + * reduce the eof. + */ + if (orig_eof < zp->z_phys->zp_size) /* file length grew ? */ + zp->z_phys->zp_size = eod; + + VN_RELE(ZTOV(zp)); + + return (error); +} + +/* + * TX_WRITE2 are only generated when dmu_sync() returns EALREADY + * meaning the pool block is already being synced. So now that we always write + * out full blocks, all we have to do is expand the eof if + * the file is grown. + */ +static int +zfs_replay_write2(zfsvfs_t *zfsvfs, lr_write_t *lr, boolean_t byteswap) +{ + znode_t *zp; + int error; + uint64_t end; + + if (byteswap) + byteswap_uint64_array(lr, sizeof (*lr)); + + if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) + return (error); + + end = lr->lr_offset + lr->lr_length; + if (end > zp->z_phys->zp_size) { + ASSERT3U(end - zp->z_phys->zp_size, <, zp->z_blksz); + zp->z_phys->zp_size = end; + } VN_RELE(ZTOV(zp)); @@ -658,16 +715,8 @@ zfs_replay_truncate(zfsvfs_t *zfsvfs, lr_truncate_t *lr, boolean_t byteswap) if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); - if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) { - /* - * As we can log truncates out of order, it's possible the - * file has been removed. In this case just drop the truncate - * and return success. - */ - if (error == ENOENT) - error = 0; + if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) return (error); - } bzero(&fl, sizeof (fl)); fl.l_type = F_WRLCK; @@ -701,16 +750,8 @@ zfs_replay_setattr(zfsvfs_t *zfsvfs, lr_setattr_t *lr, boolean_t byteswap) zfs_replay_swap_attrs((lr_attr_t *)(lr + 1)); } - if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) { - /* - * As we can log setattrs out of order, it's possible the - * file has been removed. In this case just drop the setattr - * and return success. 
- */ - if (error == ENOENT) - error = 0; + if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) return (error); - } zfs_init_vattr(vap, lr->lr_mask, lr->lr_mode, lr->lr_uid, lr->lr_gid, 0, lr->lr_foid); @@ -756,16 +797,8 @@ zfs_replay_acl_v0(zfsvfs_t *zfsvfs, lr_acl_v0_t *lr, boolean_t byteswap) zfs_oldace_byteswap(ace, lr->lr_aclcnt); } - if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) { - /* - * As we can log acls out of order, it's possible the - * file has been removed. In this case just drop the acl - * and return success. - */ - if (error == ENOENT) - error = 0; + if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) return (error); - } bzero(&vsa, sizeof (vsa)); vsa.vsa_mask = VSA_ACE | VSA_ACECNT; @@ -813,16 +846,8 @@ zfs_replay_acl(zfsvfs_t *zfsvfs, lr_acl_t *lr, boolean_t byteswap) } } - if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) { - /* - * As we can log acls out of order, it's possible the - * file has been removed. In this case just drop the acl - * and return success. - */ - if (error == ENOENT) - error = 0; + if ((error = zfs_zget(zfsvfs, lr->lr_foid, &zp)) != 0) return (error); - } bzero(&vsa, sizeof (vsa)); vsa.vsa_mask = VSA_ACE | VSA_ACECNT | VSA_ACE_ACLFLAGS; @@ -875,4 +900,5 @@ zil_replay_func_t *zfs_replay_vector[TX_MAX_TYPE] = { zfs_replay_create_acl, /* TX_MKDIR_ACL */ zfs_replay_create, /* TX_MKDIR_ATTR */ zfs_replay_create_acl, /* TX_MKDIR_ACL_ATTR */ + zfs_replay_write2, /* TX_WRITE2 */ }; diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_rlock.c b/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_rlock.c index f0a75b5fa0d71..4de8d8a2dfed9 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_rlock.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_rlock.c @@ -19,12 +19,10 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ -#pragma ident "%Z%%M% %I% %E% SMI" - /* * This file contains the code to implement file range locking in * ZFS, although there isn't much specific to ZFS (all that comes to mind @@ -431,6 +429,8 @@ zfs_range_lock(znode_t *zp, uint64_t off, uint64_t len, rl_type_t type) new = kmem_alloc(sizeof (rl_t), KM_SLEEP); new->r_zp = zp; new->r_off = off; + if (len + off < off) /* overflow */ + len = UINT64_MAX - off; new->r_len = len; new->r_cnt = 1; /* assume it's going to be in the tree */ new->r_type = type; diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_vfsops.c b/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_vfsops.c index 06b4dee4620bb..6759a812edefc 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_vfsops.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_vfsops.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -67,6 +67,8 @@ static major_t zfs_major; static minor_t zfs_minor; static kmutex_t zfs_dev_mtx; +extern int sys_shutdown; + static int zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr); static int zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr); static int zfs_mountroot(vfs_t *vfsp, enum whymountroot); @@ -145,12 +147,24 @@ zfs_sync(vfs_t *vfsp, short flag, cred_t *cr) * Sync a specific filesystem. 
*/ zfsvfs_t *zfsvfs = vfsp->vfs_data; + dsl_pool_t *dp; ZFS_ENTER(zfsvfs); + dp = dmu_objset_pool(zfsvfs->z_os); + + /* + * If the system is shutting down, then skip any + * filesystems which may exist on a suspended pool. + */ + if (sys_shutdown && spa_suspended(dp->dp_spa)) { + ZFS_EXIT(zfsvfs); + return (0); + } + if (zfsvfs->z_log != NULL) zil_commit(zfsvfs->z_log, UINT64_MAX, 0); else - txg_wait_synced(dmu_objset_pool(zfsvfs->z_os), 0); + txg_wait_synced(dp, 0); ZFS_EXIT(zfsvfs); } else { /* @@ -554,6 +568,371 @@ zfs_register_callbacks(vfs_t *vfsp) } +static void +uidacct(objset_t *os, boolean_t isgroup, uint64_t fuid, + int64_t delta, dmu_tx_t *tx) +{ + uint64_t used = 0; + char buf[32]; + int err; + uint64_t obj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT; + + if (delta == 0) + return; + + (void) snprintf(buf, sizeof (buf), "%llx", (longlong_t)fuid); + err = zap_lookup(os, obj, buf, 8, 1, &used); + ASSERT(err == 0 || err == ENOENT); + /* no underflow/overflow */ + ASSERT(delta > 0 || used >= -delta); + ASSERT(delta < 0 || used + delta > used); + used += delta; + if (used == 0) + err = zap_remove(os, obj, buf, tx); + else + err = zap_update(os, obj, buf, 8, 1, &used, tx); + ASSERT(err == 0); +} + +static int +zfs_space_delta_cb(dmu_object_type_t bonustype, void *bonus, + uint64_t *userp, uint64_t *groupp) +{ + znode_phys_t *znp = bonus; + + if (bonustype != DMU_OT_ZNODE) + return (ENOENT); + + *userp = znp->zp_uid; + *groupp = znp->zp_gid; + return (0); +} + +static void +fuidstr_to_sid(zfsvfs_t *zfsvfs, const char *fuidstr, + char *domainbuf, int buflen, uid_t *ridp) +{ + uint64_t fuid; + const char *domain; + + fuid = strtonum(fuidstr, NULL); + + domain = zfs_fuid_find_by_idx(zfsvfs, FUID_INDEX(fuid)); + if (domain) + (void) strlcpy(domainbuf, domain, buflen); + else + domainbuf[0] = '\0'; + *ridp = FUID_RID(fuid); +} + +static uint64_t +zfs_userquota_prop_to_obj(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type) +{ + switch (type) { + case ZFS_PROP_USERUSED: + return (DMU_USERUSED_OBJECT); + case ZFS_PROP_GROUPUSED: + return (DMU_GROUPUSED_OBJECT); + case ZFS_PROP_USERQUOTA: + return (zfsvfs->z_userquota_obj); + case ZFS_PROP_GROUPQUOTA: + return (zfsvfs->z_groupquota_obj); + } + return (0); +} + +int +zfs_userspace_many(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, + uint64_t *cookiep, void *vbuf, uint64_t *bufsizep) +{ + int error; + zap_cursor_t zc; + zap_attribute_t za; + zfs_useracct_t *buf = vbuf; + uint64_t obj; + + if (!dmu_objset_userspace_present(zfsvfs->z_os)) + return (ENOTSUP); + + obj = zfs_userquota_prop_to_obj(zfsvfs, type); + if (obj == 0) { + *bufsizep = 0; + return (0); + } + + for (zap_cursor_init_serialized(&zc, zfsvfs->z_os, obj, *cookiep); + (error = zap_cursor_retrieve(&zc, &za)) == 0; + zap_cursor_advance(&zc)) { + if ((uintptr_t)buf - (uintptr_t)vbuf + sizeof (zfs_useracct_t) > + *bufsizep) + break; + + fuidstr_to_sid(zfsvfs, za.za_name, + buf->zu_domain, sizeof (buf->zu_domain), &buf->zu_rid); + + buf->zu_space = za.za_first_integer; + buf++; + } + if (error == ENOENT) + error = 0; + + ASSERT3U((uintptr_t)buf - (uintptr_t)vbuf, <=, *bufsizep); + *bufsizep = (uintptr_t)buf - (uintptr_t)vbuf; + *cookiep = zap_cursor_serialize(&zc); + zap_cursor_fini(&zc); + return (error); +} + +/* + * buf must be big enough (eg, 32 bytes) + */ +static int +id_to_fuidstr(zfsvfs_t *zfsvfs, const char *domain, uid_t rid, + char *buf, boolean_t addok) +{ + uint64_t fuid; + int domainid = 0; + + if (domain && domain[0]) { + domainid = zfs_fuid_find_by_domain(zfsvfs, 
domain, NULL, addok); + if (domainid == -1) + return (ENOENT); + } + fuid = FUID_ENCODE(domainid, rid); + (void) sprintf(buf, "%llx", (longlong_t)fuid); + return (0); +} + +int +zfs_userspace_one(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, + const char *domain, uint64_t rid, uint64_t *valp) +{ + char buf[32]; + int err; + uint64_t obj; + + *valp = 0; + + if (!dmu_objset_userspace_present(zfsvfs->z_os)) + return (ENOTSUP); + + obj = zfs_userquota_prop_to_obj(zfsvfs, type); + if (obj == 0) + return (0); + + err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_FALSE); + if (err) + return (err); + + err = zap_lookup(zfsvfs->z_os, obj, buf, 8, 1, valp); + if (err == ENOENT) + err = 0; + return (err); +} + +int +zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type, + const char *domain, uint64_t rid, uint64_t quota) +{ + char buf[32]; + int err; + dmu_tx_t *tx; + uint64_t *objp; + boolean_t fuid_dirtied; + + if (type != ZFS_PROP_USERQUOTA && type != ZFS_PROP_GROUPQUOTA) + return (EINVAL); + + if (zfsvfs->z_version < ZPL_VERSION_USERSPACE) + return (ENOTSUP); + + objp = (type == ZFS_PROP_USERQUOTA) ? &zfsvfs->z_userquota_obj : + &zfsvfs->z_groupquota_obj; + + err = id_to_fuidstr(zfsvfs, domain, rid, buf, B_TRUE); + if (err) + return (err); + fuid_dirtied = zfsvfs->z_fuid_dirty; + + tx = dmu_tx_create(zfsvfs->z_os); + dmu_tx_hold_zap(tx, *objp ? *objp : DMU_NEW_OBJECT, B_TRUE, NULL); + if (*objp == 0) { + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_TRUE, + zfs_userquota_prop_prefixes[type]); + } + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + err = dmu_tx_assign(tx, TXG_WAIT); + if (err) { + dmu_tx_abort(tx); + return (err); + } + + mutex_enter(&zfsvfs->z_lock); + if (*objp == 0) { + *objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA, + DMU_OT_NONE, 0, tx); + VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, + zfs_userquota_prop_prefixes[type], 8, 1, objp, tx)); + } + mutex_exit(&zfsvfs->z_lock); + + if (quota == 0) { + err = zap_remove(zfsvfs->z_os, *objp, buf, tx); + if (err == ENOENT) + err = 0; + } else { + err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, &quota, tx); + } + ASSERT(err == 0); + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + dmu_tx_commit(tx); + return (err); +} + +boolean_t +zfs_usergroup_overquota(zfsvfs_t *zfsvfs, boolean_t isgroup, uint64_t fuid) +{ + char buf[32]; + uint64_t used, quota, usedobj, quotaobj; + int err; + + usedobj = isgroup ? DMU_GROUPUSED_OBJECT : DMU_USERUSED_OBJECT; + quotaobj = isgroup ? zfsvfs->z_groupquota_obj : zfsvfs->z_userquota_obj; + + if (quotaobj == 0 || zfsvfs->z_replay) + return (B_FALSE); + + (void) sprintf(buf, "%llx", (longlong_t)fuid); + err = zap_lookup(zfsvfs->z_os, quotaobj, buf, 8, 1, &quota); + if (err != 0) + return (B_FALSE); + + err = zap_lookup(zfsvfs->z_os, usedobj, buf, 8, 1, &used); + if (err != 0) + return (B_FALSE); + return (used >= quota); +} + +int +zfsvfs_create(const char *osname, zfsvfs_t **zfvp) +{ + objset_t *os; + zfsvfs_t *zfsvfs; + uint64_t zval; + int i, error; + + zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); + + /* + * We claim to always be readonly so we can open snapshots; + * other ZPL code will prevent us from writing to snapshots. + */ + error = dmu_objset_own(osname, DMU_OST_ZFS, B_TRUE, zfsvfs, &os); + if (error) { + kmem_free(zfsvfs, sizeof (zfsvfs_t)); + return (error); + } + + /* + * Initialize the zfs-specific filesystem structure. + * Should probably make this a kmem cache, shuffle fields, + * and just bzero up to z_hold_mtx[].
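+ * (Note: kmem_zalloc() already returns zeroed memory, so any field
+ * that is not explicitly initialized below starts out as 0/NULL.)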
+ */ + zfsvfs->z_vfs = NULL; + zfsvfs->z_parent = zfsvfs; + zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE; + zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; + zfsvfs->z_os = os; + + error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version); + if (error) { + goto out; + } else if (zfsvfs->z_version > ZPL_VERSION) { + (void) printf("Mismatched versions: File system " + "is version %llu on-disk format, which is " + "incompatible with this software version %lld!", + (u_longlong_t)zfsvfs->z_version, ZPL_VERSION); + error = ENOTSUP; + goto out; + } + + if ((error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &zval)) != 0) + goto out; + zfsvfs->z_norm = (int)zval; + + if ((error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &zval)) != 0) + goto out; + zfsvfs->z_utf8 = (zval != 0); + + if ((error = zfs_get_zplprop(os, ZFS_PROP_CASE, &zval)) != 0) + goto out; + zfsvfs->z_case = (uint_t)zval; + + /* + * Fold case on file systems that are always or sometimes case + * insensitive. + */ + if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE || + zfsvfs->z_case == ZFS_CASE_MIXED) + zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; + + zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, + &zfsvfs->z_root); + if (error) + goto out; + ASSERT(zfsvfs->z_root != 0); + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, + &zfsvfs->z_unlinkedobj); + if (error) + goto out; + + error = zap_lookup(os, MASTER_NODE_OBJ, + zfs_userquota_prop_prefixes[ZFS_PROP_USERQUOTA], + 8, 1, &zfsvfs->z_userquota_obj); + if (error && error != ENOENT) + goto out; + + error = zap_lookup(os, MASTER_NODE_OBJ, + zfs_userquota_prop_prefixes[ZFS_PROP_GROUPQUOTA], + 8, 1, &zfsvfs->z_groupquota_obj); + if (error && error != ENOENT) + goto out; + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, + &zfsvfs->z_fuid_obj); + if (error && error != ENOENT) + goto out; + + error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_SHARES_DIR, 8, 1, + &zfsvfs->z_shares_dir); + if (error && error != ENOENT) + goto out; + + mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL); + list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), + offsetof(znode_t, z_link_node)); + rrw_init(&zfsvfs->z_teardown_lock); + rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL); + rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); + for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) + mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); + + *zfvp = zfsvfs; + return (0); + +out: + dmu_objset_disown(os, zfsvfs); + *zfvp = NULL; + kmem_free(zfsvfs, sizeof (zfsvfs_t)); + return (error); +} + static int zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) { @@ -566,9 +945,15 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) /* * Set the objset user_ptr to track its zfsvfs. */ - mutex_enter(&zfsvfs->z_os->os->os_user_ptr_lock); + mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); dmu_objset_set_user(zfsvfs->z_os, zfsvfs); - mutex_exit(&zfsvfs->z_os->os->os_user_ptr_lock); + mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); + + zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data); + if (zil_disable) { + zil_destroy(zfsvfs->z_log, B_FALSE); + zfsvfs->z_log = NULL; + } /* * If we are not mounting (ie: online recv), then we don't @@ -583,68 +968,106 @@ zfsvfs_setup(zfsvfs_t *zfsvfs, boolean_t mounting) * allow replays to succeed. 
*/ readonly = zfsvfs->z_vfs->vfs_flag & VFS_RDONLY; - zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; - - /* - * Parse and replay the intent log. - */ - zil_replay(zfsvfs->z_os, zfsvfs, &zfsvfs->z_assign, - zfs_replay_vector, zfs_unlinked_drain); + if (readonly != 0) + zfsvfs->z_vfs->vfs_flag &= ~VFS_RDONLY; + else + zfs_unlinked_drain(zfsvfs); - zfs_unlinked_drain(zfsvfs); + if (zfsvfs->z_log) { + /* + * Parse and replay the intent log. + * + * Because of ziltest, this must be done after + * zfs_unlinked_drain(). (Further note: ziltest + * doesn't use readonly mounts, where + * zfs_unlinked_drain() isn't called.) This is because + * ziltest causes spa_sync() to think it's committed, + * but actually it is not, so the intent log contains + * many txg's worth of changes. + * + * In particular, if object N is in the unlinked set in + * the last txg to actually sync, then it could be + * actually freed in a later txg and then reallocated + * in a yet later txg. This would write a "create + * object N" record to the intent log. Normally, this + * would be fine because the spa_sync() would have + * written out the fact that object N is free, before + * we could write the "create object N" intent log + * record. + * + * But when we are in ziltest mode, we advance the "open + * txg" without actually spa_sync()-ing the changes to + * disk. So we would see that object N is still + * allocated and in the unlinked set, and there is an + * intent log record saying to allocate it. + */ + zfsvfs->z_replay = B_TRUE; + zil_replay(zfsvfs->z_os, zfsvfs, zfs_replay_vector); + zfsvfs->z_replay = B_FALSE; + } zfsvfs->z_vfs->vfs_flag |= readonly; /* restore readonly bit */ } - if (!zil_disable) - zfsvfs->z_log = zil_open(zfsvfs->z_os, zfs_get_data); - return (0); } -static void -zfs_freezfsvfs(zfsvfs_t *zfsvfs) +void +zfsvfs_free(zfsvfs_t *zfsvfs) { + int i; + extern krwlock_t zfsvfs_lock; /* in zfs_znode.c */ + + /* + * This is a barrier to prevent the filesystem from going away in + * zfs_znode_move() until we can safely ensure that the filesystem is + * not unmounted. We consider the filesystem valid before the barrier + * and invalid after the barrier. + */ + rw_enter(&zfsvfs_lock, RW_READER); + rw_exit(&zfsvfs_lock); + + zfs_fuid_destroy(zfsvfs); + mutex_destroy(&zfsvfs->z_znodes_lock); - mutex_destroy(&zfsvfs->z_online_recv_lock); + mutex_destroy(&zfsvfs->z_lock); list_destroy(&zfsvfs->z_all_znodes); rrw_destroy(&zfsvfs->z_teardown_lock); rw_destroy(&zfsvfs->z_teardown_inactive_lock); rw_destroy(&zfsvfs->z_fuid_lock); + for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) + mutex_destroy(&zfsvfs->z_hold_mtx[i]); kmem_free(zfsvfs, sizeof (zfsvfs_t)); } +static void +zfs_set_fuid_feature(zfsvfs_t *zfsvfs) +{ + zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); + if (zfsvfs->z_use_fuids && zfsvfs->z_vfs) { + vfs_set_feature(zfsvfs->z_vfs, VFSFT_XVATTR); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_SYSATTR_VIEWS); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACEMASKONACCESS); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACLONCREATE); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_ACCESS_FILTER); + vfs_set_feature(zfsvfs->z_vfs, VFSFT_REPARSE); + } +} + static int zfs_domount(vfs_t *vfsp, char *osname) { dev_t mount_dev; - uint64_t recordsize, readonly; + uint64_t recordsize, fsid_guid; int error = 0; - int mode; zfsvfs_t *zfsvfs; - znode_t *zp = NULL; ASSERT(vfsp); ASSERT(osname); - /* - * Initialize the zfs-specific filesystem structure. - * Should probably make this a kmem cache, shuffle fields, - * and just bzero up to z_hold_mtx[]. 
- */ - zfsvfs = kmem_zalloc(sizeof (zfsvfs_t), KM_SLEEP); + error = zfsvfs_create(osname, &zfsvfs); + if (error) + return (error); zfsvfs->z_vfs = vfsp; - zfsvfs->z_parent = zfsvfs; - zfsvfs->z_assign = TXG_NOWAIT; - zfsvfs->z_max_blksz = SPA_MAXBLOCKSIZE; - zfsvfs->z_show_ctldir = ZFS_SNAPDIR_VISIBLE; - - mutex_init(&zfsvfs->z_znodes_lock, NULL, MUTEX_DEFAULT, NULL); - mutex_init(&zfsvfs->z_online_recv_lock, NULL, MUTEX_DEFAULT, NULL); - list_create(&zfsvfs->z_all_znodes, sizeof (znode_t), - offsetof(znode_t, z_link_node)); - rrw_init(&zfsvfs->z_teardown_lock); - rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL); - rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL); /* Initialize the generic filesystem structure. */ vfsp->vfs_bcount = 0; @@ -666,39 +1089,24 @@ zfs_domount(vfs_t *vfsp, char *osname) vfsp->vfs_flag |= VFS_NOTRUNC; vfsp->vfs_data = zfsvfs; - if (error = dsl_prop_get_integer(osname, "readonly", &readonly, NULL)) - goto out; - - mode = DS_MODE_OWNER; - if (readonly) - mode |= DS_MODE_READONLY; - - error = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os); - if (error == EROFS) { - mode = DS_MODE_OWNER | DS_MODE_READONLY; - error = dmu_objset_open(osname, DMU_OST_ZFS, mode, - &zfsvfs->z_os); - } - - if (error) - goto out; - - if (error = zfs_init_fs(zfsvfs, &zp)) - goto out; - - /* The call to zfs_init_fs leaves the vnode held, release it here. */ - VN_RELE(ZTOV(zp)); + /* + * The fsid is 64 bits, composed of an 8-bit fs type, which + * separates our fsid from any other filesystem types, and a + * 56-bit objset unique ID. The objset unique ID is unique to + * all objsets open on this system, provided by unique_create(). + * The 8-bit fs type must be put in the low bits of fsid[1] + * because that's where other Solaris filesystems put it. + */ + fsid_guid = dmu_objset_fsid_guid(zfsvfs->z_os); + ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0); + vfsp->vfs_fsid.val[0] = fsid_guid; + vfsp->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) | + zfsfstype & 0xFF; /* * Set features for file system. 
*/ - zfsvfs->z_use_fuids = USE_FUIDS(zfsvfs->z_version, zfsvfs->z_os); - if (zfsvfs->z_use_fuids) { - vfs_set_feature(vfsp, VFSFT_XVATTR); - vfs_set_feature(vfsp, VFSFT_SYSATTR_VIEWS); - vfs_set_feature(vfsp, VFSFT_ACEMASKONACCESS); - vfs_set_feature(vfsp, VFSFT_ACLONCREATE); - } + zfs_set_fuid_feature(zfsvfs); if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE) { vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); @@ -707,17 +1115,21 @@ zfs_domount(vfs_t *vfsp, char *osname) vfs_set_feature(vfsp, VFSFT_DIRENTFLAGS); vfs_set_feature(vfsp, VFSFT_CASEINSENSITIVE); } + vfs_set_feature(vfsp, VFSFT_ZEROCOPY_SUPPORTED); if (dmu_objset_is_snapshot(zfsvfs->z_os)) { uint64_t pval; - ASSERT(mode & DS_MODE_READONLY); atime_changed_cb(zfsvfs, B_FALSE); readonly_changed_cb(zfsvfs, B_TRUE); if (error = dsl_prop_get_integer(osname, "xattr", &pval, NULL)) goto out; xattr_changed_cb(zfsvfs, pval); zfsvfs->z_issnap = B_TRUE; + + mutex_enter(&zfsvfs->z_os->os_user_ptr_lock); + dmu_objset_set_user(zfsvfs->z_os, zfsvfs); + mutex_exit(&zfsvfs->z_os->os_user_ptr_lock); } else { error = zfsvfs_setup(zfsvfs, B_TRUE); } @@ -726,9 +1138,8 @@ zfs_domount(vfs_t *vfsp, char *osname) zfsctl_create(zfsvfs); out: if (error) { - if (zfsvfs->z_os) - dmu_objset_close(zfsvfs->z_os); - zfs_freezfsvfs(zfsvfs); + dmu_objset_disown(zfsvfs->z_os, zfsvfs); + zfsvfs_free(zfsvfs); } else { atomic_add_32(&zfs_active_fs_count, 1); } @@ -837,6 +1248,139 @@ zfs_parse_bootfs(char *bpath, char *outpath) return (error); } +/* + * zfs_check_global_label: + * Check that the hex label string is appropriate for the dataset + * being mounted into the global_zone proper. + * + * Return an error if the hex label string is not default or + * admin_low/admin_high. For admin_low labels, the corresponding + * dataset must be readonly. + */ +int +zfs_check_global_label(const char *dsname, const char *hexsl) +{ + if (strcasecmp(hexsl, ZFS_MLSLABEL_DEFAULT) == 0) + return (0); + if (strcasecmp(hexsl, ADMIN_HIGH) == 0) + return (0); + if (strcasecmp(hexsl, ADMIN_LOW) == 0) { + /* must be readonly */ + uint64_t rdonly; + + if (dsl_prop_get_integer(dsname, + zfs_prop_to_name(ZFS_PROP_READONLY), &rdonly, NULL)) + return (EACCES); + return (rdonly ? 0 : EACCES); + } + return (EACCES); +} + +/* + * zfs_mount_label_policy: + * Determine whether the mount is allowed according to MAC check. + * by comparing (where appropriate) label of the dataset against + * the label of the zone being mounted into. If the dataset has + * no label, create one. + * + * Returns: + * 0 : access allowed + * >0 : error code, such as EACCES + */ +static int +zfs_mount_label_policy(vfs_t *vfsp, char *osname) +{ + int error, retv; + zone_t *mntzone = NULL; + ts_label_t *mnt_tsl; + bslabel_t *mnt_sl; + bslabel_t ds_sl; + char ds_hexsl[MAXNAMELEN]; + + retv = EACCES; /* assume the worst */ + + /* + * Start by getting the dataset label if it exists. + */ + error = dsl_prop_get(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL), + 1, sizeof (ds_hexsl), &ds_hexsl, NULL); + if (error) + return (EACCES); + + /* + * If labeling is NOT enabled, then disallow the mount of datasets + * which have a non-default label already. No other label checks + * are needed. + */ + if (!is_system_labeled()) { + if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) + return (0); + return (EACCES); + } + + /* + * Get the label of the mountpoint. If mounting into the global + * zone (i.e. 
mountpoint is not within an active zone and the + * zoned property is off), the label must be default or + * admin_low/admin_high only; no other checks are needed. + */ + mntzone = zone_find_by_any_path(refstr_value(vfsp->vfs_mntpt), B_FALSE); + if (mntzone->zone_id == GLOBAL_ZONEID) { + uint64_t zoned; + + zone_rele(mntzone); + + if (dsl_prop_get_integer(osname, + zfs_prop_to_name(ZFS_PROP_ZONED), &zoned, NULL)) + return (EACCES); + if (!zoned) + return (zfs_check_global_label(osname, ds_hexsl)); + else + /* + * This is the case of a zone dataset being mounted + * initially, before the zone has been fully created; + * allow this mount into global zone. + */ + return (0); + } + + mnt_tsl = mntzone->zone_slabel; + ASSERT(mnt_tsl != NULL); + label_hold(mnt_tsl); + mnt_sl = label2bslabel(mnt_tsl); + + if (strcasecmp(ds_hexsl, ZFS_MLSLABEL_DEFAULT) == 0) { + /* + * The dataset doesn't have a real label, so fabricate one. + */ + char *str = NULL; + + if (l_to_str_internal(mnt_sl, &str) == 0 && + dsl_prop_set(osname, zfs_prop_to_name(ZFS_PROP_MLSLABEL), + ZPROP_SRC_LOCAL, 1, strlen(str) + 1, str) == 0) + retv = 0; + if (str != NULL) + kmem_free(str, strlen(str) + 1); + } else if (hexstr_to_label(ds_hexsl, &ds_sl) == 0) { + /* + * Now compare labels to complete the MAC check. If the + * labels are equal then allow access. If the mountpoint + * label dominates the dataset label, allow readonly access. + * Otherwise, access is denied. + */ + if (blequal(mnt_sl, &ds_sl)) + retv = 0; + else if (bldominates(mnt_sl, &ds_sl)) { + vfs_setmntopt(vfsp, MNTOPT_RO, NULL, 0); + retv = 0; + } + } + + label_rele(mnt_tsl); + zone_rele(mntzone); + return (retv); +} + static int zfs_mountroot(vfs_t *vfsp, enum whymountroot why) { @@ -1026,6 +1570,10 @@ zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) goto out; } + error = zfs_mount_label_policy(vfsp, osname); + if (error) + goto out; + /* * When doing a remount, we simply refresh our temporary properties * according to those options set in the current VFS options. @@ -1039,6 +1587,13 @@ zfs_mount(vfs_t *vfsp, vnode_t *mvp, struct mounta *uap, cred_t *cr) error = zfs_domount(vfsp, osname); + /* + * Add an extra VFS_HOLD on our parent vfs so that it can't + * disappear due to a forced unmount. + */ + if (error == 0 && ((zfsvfs_t *)vfsp->vfs_data)->z_issnap) + VFS_HOLD(mvp->v_vfsp); + out: pn_free(&spn); return (error); @@ -1288,14 +1843,14 @@ zfs_umount(vfs_t *vfsp, int fflag, cred_t *cr) /* * Unset the objset user_ptr. */ - mutex_enter(&os->os->os_user_ptr_lock); + mutex_enter(&os->os_user_ptr_lock); dmu_objset_set_user(os, NULL); - mutex_exit(&os->os->os_user_ptr_lock); + mutex_exit(&os->os_user_ptr_lock); /* * Finally release the objset */ - dmu_objset_close(os); + dmu_objset_disown(os, zfsvfs); } /* @@ -1398,16 +1953,13 @@ zfs_vget(vfs_t *vfsp, vnode_t **vpp, fid_t *fidp) * 'z_teardown_inactive_lock' write held. */ int -zfs_suspend_fs(zfsvfs_t *zfsvfs, char *name, int *mode) +zfs_suspend_fs(zfsvfs_t *zfsvfs) { int error; if ((error = zfsvfs_teardown(zfsvfs, B_FALSE)) != 0) return (error); - - *mode = zfsvfs->z_os->os_mode; - dmu_objset_name(zfsvfs->z_os, name); - dmu_objset_close(zfsvfs->z_os); + dmu_objset_disown(zfsvfs->z_os, zfsvfs); return (0); } @@ -1416,14 +1968,15 @@ zfs_suspend_fs(zfsvfs_t *zfsvfs, char *name, int *mode) * Reopen zfsvfs_t::z_os and release VOPs. 
*/ int -zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname, int mode) +zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname) { int err; ASSERT(RRW_WRITE_HELD(&zfsvfs->z_teardown_lock)); ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock)); - err = dmu_objset_open(osname, DMU_OST_ZFS, mode, &zfsvfs->z_os); + err = dmu_objset_own(osname, DMU_OST_ZFS, B_FALSE, zfsvfs, + &zfsvfs->z_os); if (err) { zfsvfs->z_os = NULL; } else { @@ -1465,13 +2018,15 @@ static void zfs_freevfs(vfs_t *vfsp) { zfsvfs_t *zfsvfs = vfsp->vfs_data; - int i; - for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) - mutex_destroy(&zfsvfs->z_hold_mtx[i]); + /* + * If this is a snapshot, we have an extra VFS_HOLD on our parent + * from zfs_mount(). Release it here. + */ + if (zfsvfs->z_issnap) + VFS_RELE(zfsvfs->z_parent->z_vfs); - zfs_fuid_destroy(zfsvfs); - zfs_freezfsvfs(zfsvfs); + zfsvfs_free(zfsvfs); atomic_add_32(&zfs_active_fs_count, -1); } @@ -1530,6 +2085,8 @@ zfs_init(void) * Initialize znode cache, vnode ops, etc... */ zfs_znode_init(); + + dmu_objset_register_type(DMU_OST_ZFS, zfs_space_delta_cb); } void @@ -1546,54 +2103,46 @@ zfs_busy(void) } int -zfs_set_version(const char *name, uint64_t newvers) +zfs_set_version(zfsvfs_t *zfsvfs, uint64_t newvers) { int error; - objset_t *os; + objset_t *os = zfsvfs->z_os; dmu_tx_t *tx; - uint64_t curvers; - - /* - * XXX for now, require that the filesystem be unmounted. Would - * be nice to find the zfsvfs_t and just update that if - * possible. - */ if (newvers < ZPL_VERSION_INITIAL || newvers > ZPL_VERSION) return (EINVAL); - error = dmu_objset_open(name, DMU_OST_ZFS, DS_MODE_OWNER, &os); - if (error) - return (error); - - error = zap_lookup(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, - 8, 1, &curvers); - if (error) - goto out; - if (newvers < curvers) { - error = EINVAL; - goto out; - } + if (newvers < zfsvfs->z_version) + return (EINVAL); tx = dmu_tx_create(os); - dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, 0, ZPL_VERSION_STR); + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, B_FALSE, ZPL_VERSION_STR); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); - goto out; + return (error); + } + error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, + 8, 1, &newvers, tx); + + if (error) { + dmu_tx_commit(tx); + return (error); } - error = zap_update(os, MASTER_NODE_OBJ, ZPL_VERSION_STR, 8, 1, - &newvers, tx); spa_history_internal_log(LOG_DS_UPGRADE, dmu_objset_spa(os), tx, CRED(), - "oldver=%llu newver=%llu dataset = %llu", curvers, newvers, - dmu_objset_id(os)); + "oldver=%llu newver=%llu dataset = %llu", + zfsvfs->z_version, newvers, dmu_objset_id(os)); + dmu_tx_commit(tx); -out: - dmu_objset_close(os); - return (error); + zfsvfs->z_version = newvers; + + if (zfsvfs->z_version >= ZPL_VERSION_FUID) + zfs_set_fuid_feature(zfsvfs); + + return (0); } /* diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_vnops.c b/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_vnops.c index 8e0037e37da52..6883db5cf9a6b 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_vnops.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_vnops.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -73,7 +73,7 @@ #include #include #include -#include +#include #include /* @@ -101,13 +101,12 @@ * pushing cached pages (which acquires range locks) and syncing out * cached atime changes. 
Third, zfs_zinactive() may require a new tx, * which could deadlock the system if you were already holding one. + * If you must call VN_RELE() within a tx then use VN_RELE_ASYNC(). * * (3) All range locks must be grabbed before calling dmu_tx_assign(), * as they can span dmu_tx_assign() calls. * - * (4) Always pass zfsvfs->z_assign as the second argument to dmu_tx_assign(). - * In normal operation, this will be TXG_NOWAIT. During ZIL replay, - * it will be a specific txg. Either way, dmu_tx_assign() never blocks. + * (4) Always pass TXG_NOWAIT as the second argument to dmu_tx_assign(). * This is critical because we don't want to block while holding locks. * Note, in particular, that if a lock is sometimes acquired before * the tx assigns, and sometimes after (e.g. z_lock), then failing to @@ -124,6 +123,8 @@ * (5) If the operation succeeded, generate the intent log entry for it * before dropping locks. This ensures that the ordering of events * in the intent log matches the order in which they actually occurred. + * During ZIL replay the zfs_log_* functions will update the sequence + * number to indicate the zil transaction has replayed. * * (6) At the end of each vnode op, the DMU tx must always commit, * regardless of whether there were any errors. @@ -139,12 +140,12 @@ * rw_enter(...); // grab any other locks you need * tx = dmu_tx_create(...); // get DMU tx * dmu_tx_hold_*(); // hold each object you might modify - * error = dmu_tx_assign(tx, zfsvfs->z_assign); // try to assign + * error = dmu_tx_assign(tx, TXG_NOWAIT); // try to assign * if (error) { * rw_exit(...); // drop locks * zfs_dirent_unlock(dl); // unlock directory entry * VN_RELE(...); // release held vnodes - * if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + * if (error == ERESTART) { * dmu_tx_wait(tx); * dmu_tx_abort(tx); * goto top; @@ -207,6 +208,12 @@ zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; + /* + * Clean up any locks held by this process on the vp. + */ + cleanlocks(vp, ddi_get_pid(), 0); + cleanshares(vp, ddi_get_pid()); + ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); @@ -214,12 +221,6 @@ zfs_close(vnode_t *vp, int flag, int count, offset_t offset, cred_t *cr, if ((flag & (FSYNC | FDSYNC)) && (count == 1)) atomic_dec_32(&zp->z_sync_cnt); - /* - * Clean up any locks held by this process on the vp. - */ - cleanlocks(vp, ddi_get_pid(), 0); - cleanshares(vp, ddi_get_pid()); - if (!zfs_has_ctldir(zp) && zp->z_zfsvfs->z_vscan && ZTOV(zp)->v_type == VREG && !(zp->z_phys->zp_flags & ZFS_AV_QUARANTINED) && @@ -348,56 +349,29 @@ zfs_unmap_page(page_t *pp, caddr_t addr) * * On Write: If we find a memory mapped page, we write to *both* * the page and the dmu buffer. - * - * NOTE: We will always "break up" the IO into PAGESIZE uiomoves when - * the file is memory mapped. 
*/ -static int -mappedwrite(vnode_t *vp, int nbytes, uio_t *uio, dmu_tx_t *tx) +static void +update_pages(vnode_t *vp, int64_t start, int len, objset_t *os, uint64_t oid) { - znode_t *zp = VTOZ(vp); - zfsvfs_t *zfsvfs = zp->z_zfsvfs; - int64_t start, off; - int len = nbytes; - int error = 0; + int64_t off; - start = uio->uio_loffset; off = start & PAGEOFFSET; for (start &= PAGEMASK; len > 0; start += PAGESIZE) { page_t *pp; - uint64_t bytes = MIN(PAGESIZE - off, len); - uint64_t woff = uio->uio_loffset; + uint64_t nbytes = MIN(PAGESIZE - off, len); - /* - * We don't want a new page to "appear" in the middle of - * the file update (because it may not get the write - * update data), so we grab a lock to block - * zfs_getpage(). - */ - rw_enter(&zp->z_map_lock, RW_WRITER); if (pp = page_lookup(vp, start, SE_SHARED)) { caddr_t va; - rw_exit(&zp->z_map_lock); va = zfs_map_page(pp, S_WRITE); - error = uiomove(va+off, bytes, UIO_WRITE, uio); - if (error == 0) { - dmu_write(zfsvfs->z_os, zp->z_id, - woff, bytes, va+off, tx); - } + (void) dmu_read(os, oid, start+off, nbytes, va+off, + DMU_READ_PREFETCH); zfs_unmap_page(pp, va); page_unlock(pp); - } else { - error = dmu_write_uio(zfsvfs->z_os, zp->z_id, - uio, bytes, tx); - rw_exit(&zp->z_map_lock); } - len -= bytes; + len -= nbytes; off = 0; - if (error) - break; } - return (error); } /* @@ -473,6 +447,7 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) ssize_t n, nbytes; int error; rl_t *rl; + xuio_t *xuio = NULL; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); @@ -533,6 +508,35 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) ASSERT(uio->uio_loffset < zp->z_phys->zp_size); n = MIN(uio->uio_resid, zp->z_phys->zp_size - uio->uio_loffset); + if ((uio->uio_extflg == UIO_XUIO) && + (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) { + int nblk; + int blksz = zp->z_blksz; + uint64_t offset = uio->uio_loffset; + + xuio = (xuio_t *)uio; + if ((ISP2(blksz))) { + nblk = (P2ROUNDUP(offset + n, blksz) - P2ALIGN(offset, + blksz)) / blksz; + } else { + ASSERT(offset + n <= blksz); + nblk = 1; + } + (void) dmu_xuio_init(xuio, nblk); + + if (vn_has_cached_data(vp)) { + /* + * For simplicity, we always allocate a full buffer + * even if we only expect to read a portion of a block. + */ + while (--nblk >= 0) { + (void) dmu_xuio_add(xuio, + dmu_request_arcbuf(zp->z_dbuf, blksz), + 0, blksz); + } + } + } + while (n > 0) { nbytes = MIN(n, zfs_read_chunk_size - P2PHASE(uio->uio_loffset, zfs_read_chunk_size)); @@ -550,7 +554,6 @@ zfs_read(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) n -= nbytes; } - out: zfs_range_unlock(rl); @@ -595,6 +598,13 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) int max_blksz = zfsvfs->z_max_blksz; uint64_t pflags; int error; + arc_buf_t *abuf; + iovec_t *aiov; + xuio_t *xuio = NULL; + int i_iov = 0; + int iovcnt = uio->uio_iovcnt; + iovec_t *iovp = uio->uio_iov; + int write_eof; /* * Fasttrack empty write @@ -622,45 +632,60 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) zilog = zfsvfs->z_log; + /* + * Validate file offset + */ + woff = ioflag & FAPPEND ? zp->z_phys->zp_size : uio->uio_loffset; + if (woff < 0) { + ZFS_EXIT(zfsvfs); + return (EINVAL); + } + + /* + * Check for mandatory locks before calling zfs_range_lock() + * in order to prevent a deadlock with locks set via fcntl(). 
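+	 * Otherwise we could take the range lock first while the holder of
+	 * a conflicting fcntl() lock is itself blocked in zfs_read() or
+	 * zfs_write() waiting on that same range lock, leaving neither
+	 * thread able to make progress.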
+ */ + if (MANDMODE((mode_t)zp->z_phys->zp_mode) && + (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) { + ZFS_EXIT(zfsvfs); + return (error); + } + /* * Pre-fault the pages to ensure slow (eg NFS) pages * don't hold up txg. + * Skip this if uio contains loaned arc_buf. */ - uio_prefaultpages(n, uio); + if ((uio->uio_extflg == UIO_XUIO) && + (((xuio_t *)uio)->xu_type == UIOTYPE_ZEROCOPY)) + xuio = (xuio_t *)uio; + else + uio_prefaultpages(n, uio); /* * If in append mode, set the io offset pointer to eof. */ if (ioflag & FAPPEND) { /* - * Range lock for a file append: - * The value for the start of range will be determined by - * zfs_range_lock() (to guarantee append semantics). - * If this write will cause the block size to increase, - * zfs_range_lock() will lock the entire file, so we must - * later reduce the range after we grow the block size. + * Obtain an appending range lock to guarantee file append + * semantics. We reset the write offset once we have the lock. */ rl = zfs_range_lock(zp, 0, n, RL_APPEND); + woff = rl->r_off; if (rl->r_len == UINT64_MAX) { - /* overlocked, zp_size can't change */ - woff = uio->uio_loffset = zp->z_phys->zp_size; - } else { - woff = uio->uio_loffset = rl->r_off; + /* + * We overlocked the file because this write will cause + * the file block size to increase. + * Note that zp_size cannot change with this lock held. + */ + woff = zp->z_phys->zp_size; } + uio->uio_loffset = woff; } else { - woff = uio->uio_loffset; - /* - * Validate file offset - */ - if (woff < 0) { - ZFS_EXIT(zfsvfs); - return (EINVAL); - } - /* - * If we need to grow the block size then zfs_range_lock() - * will lock a wider range than we request here. - * Later after growing the block size we reduce the range. + * Note that if the file block size will change as a result of + * this write, then this range lock will lock the entire file + * so that we can re-write the block safely. */ rl = zfs_range_lock(zp, woff, n, RL_WRITER); } @@ -674,15 +699,9 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) if ((woff + n) > limit || woff > (limit - n)) n = limit - woff; - /* - * Check for mandatory locks - */ - if (MANDMODE((mode_t)zp->z_phys->zp_mode) && - (error = chklock(vp, FWRITE, woff, n, uio->uio_fmode, ct)) != 0) { - zfs_range_unlock(rl); - ZFS_EXIT(zfsvfs); - return (error); - } + /* Will this write extend the file length? */ + write_eof = (woff + n > zp->z_phys->zp_size); + end_size = MAX(zp->z_phys->zp_size, woff + n); /* @@ -691,22 +710,70 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) * and allows us to do more fine-grained space accounting. */ while (n > 0) { + abuf = NULL; + woff = uio->uio_loffset; +again: + if (zfs_usergroup_overquota(zfsvfs, + B_FALSE, zp->z_phys->zp_uid) || + zfs_usergroup_overquota(zfsvfs, + B_TRUE, zp->z_phys->zp_gid)) { + if (abuf != NULL) + dmu_return_arcbuf(abuf); + error = EDQUOT; + break; + } + + if (xuio && abuf == NULL) { + ASSERT(i_iov < iovcnt); + aiov = &iovp[i_iov]; + abuf = dmu_xuio_arcbuf(xuio, i_iov); + dmu_xuio_clear(xuio, i_iov); + DTRACE_PROBE3(zfs_cp_write, int, i_iov, + iovec_t *, aiov, arc_buf_t *, abuf); + ASSERT((aiov->iov_base == abuf->b_data) || + ((char *)aiov->iov_base - (char *)abuf->b_data + + aiov->iov_len == arc_buf_size(abuf))); + i_iov++; + } else if (abuf == NULL && n >= max_blksz && + woff >= zp->z_phys->zp_size && + P2PHASE(woff, max_blksz) == 0 && + zp->z_blksz == max_blksz) { + /* + * This write covers a full block. 
"Borrow" a buffer + * from the dmu so that we can fill it before we enter + * a transaction. This avoids the possibility of + * holding up the transaction if the data copy hangs + * up on a pagefault (e.g., from an NFS server mapping). + */ + size_t cbytes; + + abuf = dmu_request_arcbuf(zp->z_dbuf, max_blksz); + ASSERT(abuf != NULL); + ASSERT(arc_buf_size(abuf) == max_blksz); + if (error = uiocopy(abuf->b_data, max_blksz, + UIO_WRITE, uio, &cbytes)) { + dmu_return_arcbuf(abuf); + break; + } + ASSERT(cbytes == max_blksz); + } + /* * Start a transaction. */ - woff = uio->uio_loffset; tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_bonus(tx, zp->z_id); dmu_tx_hold_write(tx, zp->z_id, woff, MIN(n, max_blksz)); - error = dmu_tx_assign(tx, zfsvfs->z_assign); + error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { - if (error == ERESTART && - zfsvfs->z_assign == TXG_NOWAIT) { + if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); - continue; + goto again; } dmu_tx_abort(tx); + if (abuf != NULL) + dmu_return_arcbuf(abuf); break; } @@ -734,18 +801,39 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) * Perhaps we should use SPA_MAXBLOCKSIZE chunks? */ nbytes = MIN(n, max_blksz - P2PHASE(woff, max_blksz)); - rw_enter(&zp->z_map_lock, RW_READER); - tx_bytes = uio->uio_resid; - if (vn_has_cached_data(vp)) { - rw_exit(&zp->z_map_lock); - error = mappedwrite(vp, nbytes, uio, tx); + if (abuf == NULL) { + tx_bytes = uio->uio_resid; + error = dmu_write_uio(zfsvfs->z_os, zp->z_id, uio, + nbytes, tx); + tx_bytes -= uio->uio_resid; } else { - error = dmu_write_uio(zfsvfs->z_os, zp->z_id, - uio, nbytes, tx); - rw_exit(&zp->z_map_lock); + tx_bytes = nbytes; + ASSERT(xuio == NULL || tx_bytes == aiov->iov_len); + /* + * If this is not a full block write, but we are + * extending the file past EOF and this data starts + * block-aligned, use assign_arcbuf(). Otherwise, + * write via dmu_write(). + */ + if (tx_bytes < max_blksz && (!write_eof || + aiov->iov_base != abuf->b_data)) { + ASSERT(xuio); + dmu_write(zfsvfs->z_os, zp->z_id, woff, + aiov->iov_len, aiov->iov_base, tx); + dmu_return_arcbuf(abuf); + xuio_stat_wbuf_copied(); + } else { + ASSERT(xuio || tx_bytes == max_blksz); + dmu_assign_arcbuf(zp->z_dbuf, woff, abuf, tx); + } + ASSERT(tx_bytes <= uio->uio_resid); + uioskip(uio, tx_bytes); + } + if (tx_bytes && vn_has_cached_data(vp)) { + update_pages(vp, woff, + tx_bytes, zfsvfs->z_os, zp->z_id); } - tx_bytes -= uio->uio_resid; /* * If we made no progress, we're done. If we made even @@ -807,7 +895,7 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) * If we're in replay mode, or we made no progress, return error. * Otherwise, it's at least a partial write, so it's successful. */ - if (zfsvfs->z_assign >= TXG_INITIAL || uio->uio_resid == start_resid) { + if (zfsvfs->z_replay || uio->uio_resid == start_resid) { ZFS_EXIT(zfsvfs); return (error); } @@ -820,19 +908,32 @@ zfs_write(vnode_t *vp, uio_t *uio, int ioflag, cred_t *cr, caller_context_t *ct) } void -zfs_get_done(dmu_buf_t *db, void *vzgd) +zfs_get_done(zgd_t *zgd, int error) { - zgd_t *zgd = (zgd_t *)vzgd; - rl_t *rl = zgd->zgd_rl; - vnode_t *vp = ZTOV(rl->r_zp); + znode_t *zp = zgd->zgd_private; + objset_t *os = zp->z_zfsvfs->z_os; + + if (zgd->zgd_db) + dmu_buf_rele(zgd->zgd_db, zgd); + + zfs_range_unlock(zgd->zgd_rl); + + /* + * Release the vnode asynchronously as we currently have the + * txg stopped from syncing. 
+ */ + VN_RELE_ASYNC(ZTOV(zp), dsl_pool_vnrele_taskq(dmu_objset_pool(os))); + + if (error == 0 && zgd->zgd_bp) + zil_add_block(zgd->zgd_zilog, zgd->zgd_bp); - dmu_buf_rele(db, vzgd); - zfs_range_unlock(rl); - VN_RELE(vp); - zil_add_block(zgd->zgd_zilog, zgd->zgd_bp); kmem_free(zgd, sizeof (zgd_t)); } +#ifdef DEBUG +static int zil_fault_io = 0; +#endif + /* * Get data to generate a TX_WRITE intent log record. */ @@ -842,26 +943,36 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) zfsvfs_t *zfsvfs = arg; objset_t *os = zfsvfs->z_os; znode_t *zp; - uint64_t off = lr->lr_offset; + uint64_t object = lr->lr_foid; + uint64_t offset = lr->lr_offset; + uint64_t size = lr->lr_length; + blkptr_t *bp = &lr->lr_blkptr; dmu_buf_t *db; - rl_t *rl; zgd_t *zgd; - int dlen = lr->lr_length; /* length of user data */ int error = 0; - ASSERT(zio); - ASSERT(dlen != 0); + ASSERT(zio != NULL); + ASSERT(size != 0); /* * Nothing to do if the file has been removed */ - if (zfs_zget(zfsvfs, lr->lr_foid, &zp) != 0) + if (zfs_zget(zfsvfs, object, &zp) != 0) return (ENOENT); if (zp->z_unlinked) { - VN_RELE(ZTOV(zp)); + /* + * Release the vnode asynchronously as we currently have the + * txg stopped from syncing. + */ + VN_RELE_ASYNC(ZTOV(zp), + dsl_pool_vnrele_taskq(dmu_objset_pool(os))); return (ENOENT); } + zgd = (zgd_t *)kmem_zalloc(sizeof (zgd_t), KM_SLEEP); + zgd->zgd_zilog = zfsvfs->z_log; + zgd->zgd_private = zp; + /* * Write records come in two flavors: immediate and indirect. * For small writes it's cheaper to store the data with the @@ -870,16 +981,16 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) * we don't have to write the data twice. */ if (buf != NULL) { /* immediate write */ - rl = zfs_range_lock(zp, off, dlen, RL_READER); + zgd->zgd_rl = zfs_range_lock(zp, offset, size, RL_READER); /* test for truncation needs to be done while range locked */ - if (off >= zp->z_phys->zp_size) { + if (offset >= zp->z_phys->zp_size) { error = ENOENT; - goto out; + } else { + error = dmu_read(os, object, offset, size, buf, + DMU_READ_NO_PREFETCH); } - VERIFY(0 == dmu_read(os, lr->lr_foid, off, dlen, buf)); + ASSERT(error == 0 || error == ENOENT); } else { /* indirect write */ - uint64_t boff; /* block starting offset */ - /* * Have to lock the whole block to ensure when it's * written out and it's checksum is being calculated @@ -887,50 +998,58 @@ zfs_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) * blocksize after we get the lock in case it's changed! */ for (;;) { - if (ISP2(zp->z_blksz)) { - boff = P2ALIGN_TYPED(off, zp->z_blksz, - uint64_t); - } else { - boff = 0; - } - dlen = zp->z_blksz; - rl = zfs_range_lock(zp, boff, dlen, RL_READER); - if (zp->z_blksz == dlen) + uint64_t blkoff; + size = zp->z_blksz; + blkoff = ISP2(size) ? 
P2PHASE(offset, size) : offset; + offset -= blkoff; + zgd->zgd_rl = zfs_range_lock(zp, offset, size, + RL_READER); + if (zp->z_blksz == size) break; - zfs_range_unlock(rl); + offset += blkoff; + zfs_range_unlock(zgd->zgd_rl); } /* test for truncation needs to be done while range locked */ - if (off >= zp->z_phys->zp_size) { + if (lr->lr_offset >= zp->z_phys->zp_size) error = ENOENT; - goto out; +#ifdef DEBUG + if (zil_fault_io) { + error = EIO; + zil_fault_io = 0; } - zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP); - zgd->zgd_rl = rl; - zgd->zgd_zilog = zfsvfs->z_log; - zgd->zgd_bp = &lr->lr_blkptr; - VERIFY(0 == dmu_buf_hold(os, lr->lr_foid, boff, zgd, &db)); - ASSERT(boff == db->db_offset); - lr->lr_blkoff = off - boff; - error = dmu_sync(zio, db, &lr->lr_blkptr, - lr->lr_common.lrc_txg, zfs_get_done, zgd); - ASSERT((error && error != EINPROGRESS) || - lr->lr_length <= zp->z_blksz); +#endif if (error == 0) - zil_add_block(zfsvfs->z_log, &lr->lr_blkptr); - /* - * If we get EINPROGRESS, then we need to wait for a - * write IO initiated by dmu_sync() to complete before - * we can release this dbuf. We will finish everything - * up in the zfs_get_done() callback. - */ - if (error == EINPROGRESS) - return (0); - dmu_buf_rele(db, zgd); - kmem_free(zgd, sizeof (zgd_t)); + error = dmu_buf_hold(os, object, offset, zgd, &db); + + if (error == 0) { + zgd->zgd_db = db; + zgd->zgd_bp = bp; + + ASSERT(db->db_offset == offset); + ASSERT(db->db_size == size); + + error = dmu_sync(zio, lr->lr_common.lrc_txg, + zfs_get_done, zgd); + ASSERT(error || lr->lr_length <= zp->z_blksz); + + /* + * On success, we need to wait for the write I/O + * initiated by dmu_sync() to complete before we can + * release this dbuf. We will finish everything up + * in the zfs_get_done() callback. + */ + if (error == 0) + return (0); + + if (error == EALREADY) { + lr->lr_common.lrc_txtype = TX_WRITE2; + error = 0; + } + } } -out: - zfs_range_unlock(rl); - VN_RELE(ZTOV(zp)); + + zfs_get_done(zgd, error); + return (error); } @@ -955,6 +1074,27 @@ zfs_access(vnode_t *vp, int mode, int flag, cred_t *cr, return (error); } +/* + * If vnode is for a device return a specfs vnode instead. + */ +static int +specvp_check(vnode_t **vpp, cred_t *cr) +{ + int error = 0; + + if (IS_DEVVP(*vpp)) { + struct vnode *svp; + + svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); + VN_RELE(*vpp); + if (svp == NULL) + error = ENOSYS; + *vpp = svp; + } + return (error); +} + + /* * Lookup an entry in a directory, or an extended attribute directory. * If it exists, return a held vnode reference for it. @@ -985,7 +1125,46 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, { znode_t *zdp = VTOZ(dvp); zfsvfs_t *zfsvfs = zdp->z_zfsvfs; - int error; + int error = 0; + + /* fast path */ + if (!(flags & (LOOKUP_XATTR | FIGNORECASE))) { + + if (dvp->v_type != VDIR) { + return (ENOTDIR); + } else if (zdp->z_dbuf == NULL) { + return (EIO); + } + + if (nm[0] == 0 || (nm[0] == '.' 
&& nm[1] == '\0')) { + error = zfs_fastaccesschk_execute(zdp, cr); + if (!error) { + *vpp = dvp; + VN_HOLD(*vpp); + return (0); + } + return (error); + } else { + vnode_t *tvp = dnlc_lookup(dvp, nm); + + if (tvp) { + error = zfs_fastaccesschk_execute(zdp, cr); + if (error) { + VN_RELE(tvp); + return (error); + } + if (tvp == DNLC_NO_VNODE) { + VN_RELE(tvp); + return (ENOENT); + } else { + *vpp = tvp; + return (specvp_check(vpp, cr)); + } + } + } + } + + DTRACE_PROBE2(zfs__fastpath__lookup__miss, vnode_t *, dvp, char *, nm); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zdp); @@ -1050,21 +1229,8 @@ zfs_lookup(vnode_t *dvp, char *nm, vnode_t **vpp, struct pathname *pnp, } error = zfs_dirlook(zdp, nm, vpp, flags, direntflags, realpnp); - if (error == 0) { - /* - * Convert device special files - */ - if (IS_DEVVP(*vpp)) { - vnode_t *svp; - - svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); - VN_RELE(*vpp); - if (svp == NULL) - error = ENOSYS; - else - *vpp = svp; - } - } + if (error == 0) + error = specvp_check(vpp, cr); ZFS_EXIT(zfsvfs); return (error); @@ -1108,11 +1274,11 @@ zfs_create(vnode_t *dvp, char *name, vattr_t *vap, vcexcl_t excl, zfs_dirlock_t *dl; dmu_tx_t *tx; int error; - zfs_acl_t *aclp = NULL; - zfs_fuid_info_t *fuidp = NULL; ksid_t *ksid; uid_t uid; gid_t gid = crgetgid(cr); + zfs_acl_ids_t acl_ids; + boolean_t fuid_dirtied; /* * If we have an ephemeral id, ACL, or XVATTR then @@ -1175,21 +1341,9 @@ zfs_create(vnode_t *dvp, char *name, vattr_t *vap, vcexcl_t excl, if (strcmp(name, "..") == 0) error = EISDIR; ZFS_EXIT(zfsvfs); - if (aclp) - zfs_acl_free(aclp); - return (error); - } - } - if (vsecp && aclp == NULL) { - error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, &aclp); - if (error) { - ZFS_EXIT(zfsvfs); - if (dl) - zfs_dirent_unlock(dl); return (error); } } - if (zp == NULL) { uint64_t txtype; @@ -1211,52 +1365,52 @@ zfs_create(vnode_t *dvp, char *name, vattr_t *vap, vcexcl_t excl, goto out; } + if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp, + &acl_ids)) != 0) + goto out; + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { + zfs_acl_ids_free(&acl_ids); + error = EDQUOT; + goto out; + } + tx = dmu_tx_create(os); dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - if ((aclp && aclp->z_has_fuids) || IS_EPHEMERAL(uid) || - IS_EPHEMERAL(gid)) { - if (zfsvfs->z_fuid_obj == 0) { - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, - FALSE, NULL); - } else { - dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); - dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - } - } + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); dmu_tx_hold_bonus(tx, dzp->z_id); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); - if ((dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) || aclp) { + if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) { dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE); } - error = dmu_tx_assign(tx, zfsvfs->z_assign); + error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { + zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); - if (error == ERESTART && - zfsvfs->z_assign == TXG_NOWAIT) { + if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); - if (aclp) - zfs_acl_free(aclp); return (error); } - zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, aclp, &fuidp); + zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids); + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + (void) zfs_link_create(dl, zp, 
tx, ZNEW); + txtype = zfs_log_create_txtype(Z_FILE, vsecp, vap); if (flag & FIGNORECASE) txtype |= TX_CI; zfs_log_create(zilog, tx, txtype, dzp, zp, name, - vsecp, fuidp, vap); - if (fuidp) - zfs_fuid_info_free(fuidp); + vsecp, acl_ids.z_fuidp, vap); + zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); } else { int aflags = (flag & FAPPEND) ? V_APPEND : 0; @@ -1313,22 +1467,8 @@ zfs_create(vnode_t *dvp, char *name, vattr_t *vap, vcexcl_t excl, VN_RELE(ZTOV(zp)); } else { *vpp = ZTOV(zp); - /* - * If vnode is for a device return a specfs vnode instead. - */ - if (IS_DEVVP(*vpp)) { - struct vnode *svp; - - svp = specvp(*vpp, (*vpp)->v_rdev, (*vpp)->v_type, cr); - VN_RELE(*vpp); - if (svp == NULL) { - error = ENOSYS; - } - *vpp = svp; - } + error = specvp_check(vpp, cr); } - if (aclp) - zfs_acl_free(aclp); ZFS_EXIT(zfsvfs); return (error); @@ -1449,11 +1589,11 @@ zfs_remove(vnode_t *dvp, char *name, cred_t *cr, caller_context_t *ct, /* charge as an update -- would be nice not to charge at all */ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); - error = dmu_tx_assign(tx, zfsvfs->z_assign); + error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); VN_RELE(vp); - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; @@ -1563,12 +1703,12 @@ zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr, uint64_t txtype; dmu_tx_t *tx; int error; - zfs_acl_t *aclp = NULL; - zfs_fuid_info_t *fuidp = NULL; int zf = ZNEW; ksid_t *ksid; uid_t uid; gid_t gid = crgetgid(cr); + zfs_acl_ids_t acl_ids; + boolean_t fuid_dirtied; ASSERT(vap->va_type == VDIR); @@ -1629,59 +1769,52 @@ zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr, return (error); } - if (vsecp && aclp == NULL) { - error = zfs_vsec_2_aclp(zfsvfs, vap->va_type, vsecp, &aclp); - if (error) { - zfs_dirent_unlock(dl); - ZFS_EXIT(zfsvfs); - return (error); - } + if ((error = zfs_acl_ids_create(dzp, 0, vap, cr, vsecp, + &acl_ids)) != 0) { + zfs_dirent_unlock(dl); + ZFS_EXIT(zfsvfs); + return (error); + } + if (zfs_acl_ids_overquota(zfsvfs, &acl_ids)) { + zfs_acl_ids_free(&acl_ids); + zfs_dirent_unlock(dl); + ZFS_EXIT(zfsvfs); + return (EDQUOT); } + /* * Add a new entry to the directory. */ tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, dirname); dmu_tx_hold_zap(tx, DMU_NEW_OBJECT, FALSE, NULL); - if ((aclp && aclp->z_has_fuids) || IS_EPHEMERAL(uid) || - IS_EPHEMERAL(gid)) { - if (zfsvfs->z_fuid_obj == 0) { - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); - } else { - dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); - dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - } - } - if ((dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) || aclp) + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE); - error = dmu_tx_assign(tx, zfsvfs->z_assign); + error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { + zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; } dmu_tx_abort(tx); ZFS_EXIT(zfsvfs); - if (aclp) - zfs_acl_free(aclp); return (error); } /* * Create new node. 
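+	 * The znode is created with the ACL ids computed above; if the
+	 * FUID tables were dirtied, they are synced in the same
+	 * transaction.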
*/ - zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, aclp, &fuidp); - - if (aclp) - zfs_acl_free(aclp); + zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids); + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); /* * Now put new name in parent dir. */ @@ -1692,10 +1825,10 @@ zfs_mkdir(vnode_t *dvp, char *dirname, vattr_t *vap, vnode_t **vpp, cred_t *cr, txtype = zfs_log_create_txtype(Z_DIR, vsecp, vap); if (flags & FIGNORECASE) txtype |= TX_CI; - zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, fuidp, vap); + zfs_log_create(zilog, tx, txtype, dzp, zp, dirname, vsecp, + acl_ids.z_fuidp, vap); - if (fuidp) - zfs_fuid_info_free(fuidp); + zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); zfs_dirent_unlock(dl); @@ -1789,13 +1922,13 @@ zfs_rmdir(vnode_t *dvp, char *name, vnode_t *cwd, cred_t *cr, dmu_tx_hold_zap(tx, dzp->z_id, FALSE, name); dmu_tx_hold_bonus(tx, zp->z_id); dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); - error = dmu_tx_assign(tx, zfsvfs->z_assign); + error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { rw_exit(&zp->z_parent_lock); rw_exit(&zp->z_name_lock); zfs_dirent_unlock(dl); VN_RELE(vp); - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; @@ -2004,6 +2137,21 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, } } + if (flags & V_RDDIR_ACCFILTER) { + /* + * If we have no access at all, don't include + * this entry in the returned information + */ + znode_t *ezp; + if (zfs_zget(zp->z_zfsvfs, objnum, &ezp) != 0) + goto skip_entry; + if (!zfs_has_access(ezp, cr)) { + VN_RELE(ZTOV(ezp)); + goto skip_entry; + } + VN_RELE(ZTOV(ezp)); + } + if (flags & V_RDDIR_ENTFLAGS) reclen = EDIRENT_RECLEN(strlen(zap.za_name)); else @@ -2055,6 +2203,7 @@ zfs_readdir(vnode_t *vp, uio_t *uio, cred_t *cr, int *eofp, if (prefetch) dmu_prefetch(os, objnum, 0, 0); + skip_entry: /* * Move to the next entry, fill in the previous offset. */ @@ -2155,8 +2304,6 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, ZFS_VERIFY_ZP(zp); pzp = zp->z_phys; - mutex_enter(&zp->z_lock); - /* * If ACL is trivial don't bother looking for ACE_READ_ATTRIBUTES. * Also, if we are the owner don't bother, since owner should @@ -2166,7 +2313,6 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, (pzp->zp_uid != crgetuid(cr))) { if (error = zfs_zaccess(zp, ACE_READ_ATTRIBUTES, 0, skipaclchk, cr)) { - mutex_exit(&zp->z_lock); ZFS_EXIT(zfsvfs); return (error); } @@ -2177,6 +2323,7 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, * than to determine whether we were asked the question. 
*/ + mutex_enter(&zp->z_lock); vap->va_type = vp->v_type; vap->va_mode = pzp->zp_mode & MODEMASK; zfs_fuid_map_ids(zp, cr, &vap->va_uid, &vap->va_gid); @@ -2292,6 +2439,12 @@ zfs_getattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, ZFS_TIME_DECODE(&xoap->xoa_createtime, pzp->zp_crtime); XVA_SET_RTN(xvap, XAT_CREATETIME); } + + if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { + xoap->xoa_reparse = + ((pzp->zp_flags & ZFS_REPARSE) != 0); + XVA_SET_RTN(xvap, XAT_REPARSE); + } } ZFS_TIME_DECODE(&vap->va_atime, pzp->zp_atime); @@ -2342,10 +2495,12 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, zilog_t *zilog; dmu_tx_t *tx; vattr_t oldva; + xvattr_t tmpxvattr; uint_t mask = vap->va_mask; uint_t saved_mask; int trim_mask = 0; uint64_t new_mode; + uint64_t new_uid, new_gid; znode_t *attrzp; int need_policy = FALSE; int err; @@ -2354,6 +2509,7 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, xoptattr_t *xoap; zfs_acl_t *aclp = NULL; boolean_t skipaclchk = (flags & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; + boolean_t fuid_dirtied = B_FALSE; if (mask == 0) return (0); @@ -2396,6 +2552,8 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, */ xoap = xva_getxoptattr(xvap); + xva_init(&tmpxvattr); + /* * Immutable files can only alter immutable bit and atime */ @@ -2428,6 +2586,7 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, top: attrzp = NULL; + /* Can this be moved to before the top label? */ if (zfsvfs->z_vfs->vfs_flag & VFS_RDONLY) { ZFS_EXIT(zfsvfs); return (EROFS); @@ -2518,45 +2677,101 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, oldva.va_mode = pzp->zp_mode; zfs_fuid_map_ids(zp, cr, &oldva.va_uid, &oldva.va_gid); if (mask & AT_XVATTR) { - if ((need_policy == FALSE) && - (XVA_ISSET_REQ(xvap, XAT_APPENDONLY) && - xoap->xoa_appendonly != - ((pzp->zp_flags & ZFS_APPENDONLY) != 0)) || - (XVA_ISSET_REQ(xvap, XAT_NOUNLINK) && - xoap->xoa_nounlink != - ((pzp->zp_flags & ZFS_NOUNLINK) != 0)) || - (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE) && - xoap->xoa_immutable != - ((pzp->zp_flags & ZFS_IMMUTABLE) != 0)) || - (XVA_ISSET_REQ(xvap, XAT_NODUMP) && - xoap->xoa_nodump != - ((pzp->zp_flags & ZFS_NODUMP) != 0)) || - (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED) && - xoap->xoa_av_modified != - ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0)) || - ((XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED) && - ((vp->v_type != VREG && xoap->xoa_av_quarantined) || - xoap->xoa_av_quarantined != - ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0)))) || - (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) || - (XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { - need_policy = TRUE; + /* + * Update xvattr mask to include only those attributes + * that are actually changing. + * + * the bits will be restored prior to actually setting + * the attributes so the caller thinks they were set. 
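+		 * Bits that turn out to be unchanged are cleared from the
+		 * request and remembered in tmpxvattr, then restored in the
+		 * "restore trimmed off masks" block below so the returned
+		 * mask is complete.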
+ */ + if (XVA_ISSET_REQ(xvap, XAT_APPENDONLY)) { + if (xoap->xoa_appendonly != + ((pzp->zp_flags & ZFS_APPENDONLY) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_APPENDONLY); + XVA_SET_REQ(&tmpxvattr, XAT_APPENDONLY); + } } - } - - mutex_exit(&zp->z_lock); - if (mask & AT_MODE) { - if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) { - err = secpolicy_setid_setsticky_clear(vp, vap, - &oldva, cr); - if (err) { - ZFS_EXIT(zfsvfs); - return (err); + if (XVA_ISSET_REQ(xvap, XAT_NOUNLINK)) { + if (xoap->xoa_nounlink != + ((pzp->zp_flags & ZFS_NOUNLINK) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_NOUNLINK); + XVA_SET_REQ(&tmpxvattr, XAT_NOUNLINK); } - trim_mask |= AT_MODE; - } else { - need_policy = TRUE; + } + + if (XVA_ISSET_REQ(xvap, XAT_IMMUTABLE)) { + if (xoap->xoa_immutable != + ((pzp->zp_flags & ZFS_IMMUTABLE) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_IMMUTABLE); + XVA_SET_REQ(&tmpxvattr, XAT_IMMUTABLE); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_NODUMP)) { + if (xoap->xoa_nodump != + ((pzp->zp_flags & ZFS_NODUMP) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_NODUMP); + XVA_SET_REQ(&tmpxvattr, XAT_NODUMP); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_MODIFIED)) { + if (xoap->xoa_av_modified != + ((pzp->zp_flags & ZFS_AV_MODIFIED) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_AV_MODIFIED); + XVA_SET_REQ(&tmpxvattr, XAT_AV_MODIFIED); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_AV_QUARANTINED)) { + if ((vp->v_type != VREG && + xoap->xoa_av_quarantined) || + xoap->xoa_av_quarantined != + ((pzp->zp_flags & ZFS_AV_QUARANTINED) != 0)) { + need_policy = TRUE; + } else { + XVA_CLR_REQ(xvap, XAT_AV_QUARANTINED); + XVA_SET_REQ(&tmpxvattr, XAT_AV_QUARANTINED); + } + } + + if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { + mutex_exit(&zp->z_lock); + ZFS_EXIT(zfsvfs); + return (EPERM); + } + + if (need_policy == FALSE && + (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP) || + XVA_ISSET_REQ(xvap, XAT_OPAQUE))) { + need_policy = TRUE; + } + } + + mutex_exit(&zp->z_lock); + + if (mask & AT_MODE) { + if (zfs_zaccess(zp, ACE_WRITE_ACL, 0, skipaclchk, cr) == 0) { + err = secpolicy_setid_setsticky_clear(vp, vap, + &oldva, cr); + if (err) { + ZFS_EXIT(zfsvfs); + return (err); + } + trim_mask |= AT_MODE; + } else { + need_policy = TRUE; } } @@ -2592,30 +2807,14 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_bonus(tx, zp->z_id); - if (((mask & AT_UID) && IS_EPHEMERAL(vap->va_uid)) || - ((mask & AT_GID) && IS_EPHEMERAL(vap->va_gid))) { - if (zfsvfs->z_fuid_obj == 0) { - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); - } else { - dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); - dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - } - } if (mask & AT_MODE) { uint64_t pmode = pzp->zp_mode; new_mode = (pmode & S_IFMT) | (vap->va_mode & ~S_IFMT); - if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)) { - dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); - return (err); - } + if (err = zfs_acl_chmod_setattr(zp, &aclp, new_mode)) + goto out; if (pzp->zp_acl.z_acl_extern_obj) { /* Are we upgrading ACL from old V0 format to new V1 */ if (zfsvfs->z_version <= ZPL_VERSION_FUID && @@ -2637,36 +2836,53 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, } } - if ((mask & (AT_UID | AT_GID)) && pzp->zp_xattr != 0) { - err = 
zfs_zget(zp->z_zfsvfs, pzp->zp_xattr, &attrzp); - if (err) { - dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); - if (aclp) - zfs_acl_free(aclp); - return (err); + if (mask & (AT_UID | AT_GID)) { + if (pzp->zp_xattr) { + err = zfs_zget(zp->z_zfsvfs, pzp->zp_xattr, &attrzp); + if (err) + goto out; + dmu_tx_hold_bonus(tx, attrzp->z_id); + } + if (mask & AT_UID) { + new_uid = zfs_fuid_create(zfsvfs, + (uint64_t)vap->va_uid, cr, ZFS_OWNER, &fuidp); + if (new_uid != pzp->zp_uid && + zfs_usergroup_overquota(zfsvfs, B_FALSE, new_uid)) { + err = EDQUOT; + goto out; + } } - dmu_tx_hold_bonus(tx, attrzp->z_id); - } - - err = dmu_tx_assign(tx, zfsvfs->z_assign); - if (err) { - if (attrzp) - VN_RELE(ZTOV(attrzp)); - if (aclp) { - zfs_acl_free(aclp); - aclp = NULL; + if (mask & AT_GID) { + new_gid = zfs_fuid_create(zfsvfs, (uint64_t)vap->va_gid, + cr, ZFS_GROUP, &fuidp); + if (new_gid != pzp->zp_gid && + zfs_usergroup_overquota(zfsvfs, B_TRUE, new_gid)) { + err = EDQUOT; + goto out; + } } + fuid_dirtied = zfsvfs->z_fuid_dirty; + if (fuid_dirtied) { + if (zfsvfs->z_fuid_obj == 0) { + dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); + dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, + FUID_SIZE_ESTIMATE(zfsvfs)); + dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, + FALSE, NULL); + } else { + dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); + dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, + FUID_SIZE_ESTIMATE(zfsvfs)); + } + } + } - if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + err = dmu_tx_assign(tx, TXG_NOWAIT); + if (err) { + if (err == ERESTART) dmu_tx_wait(tx); - dmu_tx_abort(tx); - goto top; - } - dmu_tx_abort(tx); - ZFS_EXIT(zfsvfs); - return (err); + goto out; } dmu_buf_will_dirty(zp->z_dbuf, tx); @@ -2684,8 +2900,10 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, if (mask & AT_MODE) { mutex_enter(&zp->z_acl_lock); zp->z_phys->zp_mode = new_mode; - err = zfs_aclset_common(zp, aclp, cr, &fuidp, tx); + err = zfs_aclset_common(zp, aclp, cr, tx); ASSERT3U(err, ==, 0); + zp->z_acl_cached = aclp; + aclp = NULL; mutex_exit(&zp->z_acl_lock); } @@ -2693,25 +2911,17 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, mutex_enter(&attrzp->z_lock); if (mask & AT_UID) { - pzp->zp_uid = zfs_fuid_create(zfsvfs, - vap->va_uid, cr, ZFS_OWNER, tx, &fuidp); - if (attrzp) { - attrzp->z_phys->zp_uid = zfs_fuid_create(zfsvfs, - vap->va_uid, cr, ZFS_OWNER, tx, &fuidp); - } + pzp->zp_uid = new_uid; + if (attrzp) + attrzp->z_phys->zp_uid = new_uid; } if (mask & AT_GID) { - pzp->zp_gid = zfs_fuid_create(zfsvfs, vap->va_gid, - cr, ZFS_GROUP, tx, &fuidp); + pzp->zp_gid = new_gid; if (attrzp) - attrzp->z_phys->zp_gid = zfs_fuid_create(zfsvfs, - vap->va_gid, cr, ZFS_GROUP, tx, &fuidp); + attrzp->z_phys->zp_gid = new_gid; } - if (aclp) - zfs_acl_free(aclp); - if (attrzp) mutex_exit(&attrzp->z_lock); @@ -2732,6 +2942,31 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, */ if (xoap && (mask & AT_XVATTR)) { + + /* + * restore trimmed off masks + * so that return masks can be set for caller. 
+ */ + + if (XVA_ISSET_REQ(&tmpxvattr, XAT_APPENDONLY)) { + XVA_SET_REQ(xvap, XAT_APPENDONLY); + } + if (XVA_ISSET_REQ(&tmpxvattr, XAT_NOUNLINK)) { + XVA_SET_REQ(xvap, XAT_NOUNLINK); + } + if (XVA_ISSET_REQ(&tmpxvattr, XAT_IMMUTABLE)) { + XVA_SET_REQ(xvap, XAT_IMMUTABLE); + } + if (XVA_ISSET_REQ(&tmpxvattr, XAT_NODUMP)) { + XVA_SET_REQ(xvap, XAT_NODUMP); + } + if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_MODIFIED)) { + XVA_SET_REQ(xvap, XAT_AV_MODIFIED); + } + if (XVA_ISSET_REQ(&tmpxvattr, XAT_AV_QUARANTINED)) { + XVA_SET_REQ(xvap, XAT_AV_QUARANTINED); + } + if (XVA_ISSET_REQ(xvap, XAT_AV_SCANSTAMP)) { size_t len; dmu_object_info_t doi; @@ -2748,17 +2983,33 @@ zfs_setattr(vnode_t *vp, vattr_t *vap, int flags, cred_t *cr, zfs_xvattr_set(zp, xvap); } + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); + if (mask != 0) zfs_log_setattr(zilog, tx, TX_SETATTR, zp, vap, mask, fuidp); - if (fuidp) - zfs_fuid_info_free(fuidp); mutex_exit(&zp->z_lock); +out: if (attrzp) VN_RELE(ZTOV(attrzp)); - dmu_tx_commit(tx); + if (aclp) + zfs_acl_free(aclp); + + if (fuidp) { + zfs_fuid_info_free(fuidp); + fuidp = NULL; + } + + if (err) + dmu_tx_abort(tx); + else + dmu_tx_commit(tx); + + if (err == ERESTART) + goto top; ZFS_EXIT(zfsvfs); return (err); @@ -2998,6 +3249,15 @@ zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr, } } + /* + * If the source and destination directories are the same, we should + * grab the z_name_lock of that directory only once. + */ + if (sdzp == tdzp) { + zflg |= ZHAVELOCK; + rw_enter(&sdzp->z_name_lock, RW_READER); + } + if (cmp < 0) { serr = zfs_dirent_lock(&sdl, sdzp, snm, &szp, ZEXISTS | zflg, NULL, NULL); @@ -3020,6 +3280,10 @@ zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr, if (tzp) VN_RELE(ZTOV(tzp)); } + + if (sdzp == tdzp) + rw_exit(&sdzp->z_name_lock); + if (strcmp(snm, "..") == 0) serr = EINVAL; ZFS_EXIT(zfsvfs); @@ -3028,6 +3292,10 @@ zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr, if (terr) { zfs_dirent_unlock(sdl); VN_RELE(ZTOV(szp)); + + if (sdzp == tdzp) + rw_exit(&sdzp->z_name_lock); + if (strcmp(tnm, "..") == 0) terr = EINVAL; ZFS_EXIT(zfsvfs); @@ -3104,16 +3372,20 @@ zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr, if (tzp) dmu_tx_hold_bonus(tx, tzp->z_id); /* parent changes */ dmu_tx_hold_zap(tx, zfsvfs->z_unlinkedobj, FALSE, NULL); - error = dmu_tx_assign(tx, zfsvfs->z_assign); + error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { if (zl != NULL) zfs_rename_unlock(&zl); zfs_dirent_unlock(sdl); zfs_dirent_unlock(tdl); + + if (sdzp == tdzp) + rw_exit(&sdzp->z_name_lock); + VN_RELE(ZTOV(szp)); if (tzp) VN_RELE(ZTOV(tzp)); - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; @@ -3151,6 +3423,10 @@ zfs_rename(vnode_t *sdvp, char *snm, vnode_t *tdvp, char *tnm, cred_t *cr, zfs_dirent_unlock(sdl); zfs_dirent_unlock(tdl); + if (sdzp == tdzp) + rw_exit(&sdzp->z_name_lock); + + VN_RELE(ZTOV(szp)); if (tzp) VN_RELE(ZTOV(tzp)); @@ -3189,7 +3465,8 @@ zfs_symlink(vnode_t *dvp, char *name, vattr_t *vap, char *link, cred_t *cr, int len = strlen(link); int error; int zflg = ZNEW; - zfs_fuid_info_t *fuidp = NULL; + zfs_acl_ids_t acl_ids; + boolean_t fuid_dirtied; ASSERT(vap->va_type == VLNK); @@ -3224,28 +3501,27 @@ zfs_symlink(vnode_t *dvp, char *name, vattr_t *vap, char *link, cred_t *cr, return (error); } + VERIFY(0 == zfs_acl_ids_create(dzp, 0, vap, cr, NULL, &acl_ids)); + if (zfs_acl_ids_overquota(zfsvfs, 
&acl_ids)) { + zfs_acl_ids_free(&acl_ids); + zfs_dirent_unlock(dl); + ZFS_EXIT(zfsvfs); + return (EDQUOT); + } tx = dmu_tx_create(zfsvfs->z_os); + fuid_dirtied = zfsvfs->z_fuid_dirty; dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, MAX(1, len)); dmu_tx_hold_bonus(tx, dzp->z_id); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); - if (dzp->z_phys->zp_flags & ZFS_INHERIT_ACE) + if (acl_ids.z_aclp->z_acl_bytes > ZFS_ACE_SPACE) dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, SPA_MAXBLOCKSIZE); - if (IS_EPHEMERAL(crgetuid(cr)) || IS_EPHEMERAL(crgetgid(cr))) { - if (zfsvfs->z_fuid_obj == 0) { - dmu_tx_hold_bonus(tx, DMU_NEW_OBJECT); - dmu_tx_hold_write(tx, DMU_NEW_OBJECT, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - dmu_tx_hold_zap(tx, MASTER_NODE_OBJ, FALSE, NULL); - } else { - dmu_tx_hold_bonus(tx, zfsvfs->z_fuid_obj); - dmu_tx_hold_write(tx, zfsvfs->z_fuid_obj, 0, - FUID_SIZE_ESTIMATE(zfsvfs)); - } - } - error = dmu_tx_assign(tx, zfsvfs->z_assign); + if (fuid_dirtied) + zfs_fuid_txhold(zfsvfs, tx); + error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { + zfs_acl_ids_free(&acl_ids); zfs_dirent_unlock(dl); - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; @@ -3263,13 +3539,16 @@ zfs_symlink(vnode_t *dvp, char *name, vattr_t *vap, char *link, cred_t *cr, * otherwise, store it just like any other file data. */ if (sizeof (znode_phys_t) + len <= dmu_bonus_max()) { - zfs_mknode(dzp, vap, tx, cr, 0, &zp, len, NULL, &fuidp); + zfs_mknode(dzp, vap, tx, cr, 0, &zp, len, &acl_ids); if (len != 0) bcopy(link, zp->z_phys + 1, len); } else { dmu_buf_t *dbp; - zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, NULL, &fuidp); + zfs_mknode(dzp, vap, tx, cr, 0, &zp, 0, &acl_ids); + + if (fuid_dirtied) + zfs_fuid_sync(zfsvfs, tx); /* * Nothing can access the znode yet so no locking needed * for growing the znode's blocksize. @@ -3290,15 +3569,14 @@ zfs_symlink(vnode_t *dvp, char *name, vattr_t *vap, char *link, cred_t *cr, * Insert the new object into the directory. */ (void) zfs_link_create(dl, zp, tx, ZNEW); -out: if (error == 0) { uint64_t txtype = TX_SYMLINK; if (flags & FIGNORECASE) txtype |= TX_CI; zfs_log_symlink(zilog, tx, txtype, dzp, zp, name, link); } - if (fuidp) - zfs_fuid_info_free(fuidp); + + zfs_acl_ids_free(&acl_ids); dmu_tx_commit(tx); @@ -3462,10 +3740,10 @@ zfs_link(vnode_t *tdvp, vnode_t *svp, char *name, cred_t *cr, tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_bonus(tx, szp->z_id); dmu_tx_hold_zap(tx, dzp->z_id, TRUE, name); - error = dmu_tx_assign(tx, zfsvfs->z_assign); + error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { zfs_dirent_unlock(dl); - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; @@ -3534,9 +3812,7 @@ zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; - zilog_t *zilog = zfsvfs->z_log; dmu_tx_t *tx; - rl_t *rl; u_offset_t off, koff; size_t len, klen; uint64_t filesz; @@ -3547,30 +3823,22 @@ zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp, len = PAGESIZE; /* * If our blocksize is bigger than the page size, try to kluster - * muiltiple pages so that we write a full block (thus avoiding + * multiple pages so that we write a full block (thus avoiding * a read-modify-write). */ if (off < filesz && zp->z_blksz > PAGESIZE) { - if (!ISP2(zp->z_blksz)) { - /* Only one block in the file. 
 */
-			klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE);
-			koff = 0;
-		} else {
-			klen = zp->z_blksz;
-			koff = P2ALIGN(off, (u_offset_t)klen);
-		}
+		klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE);
+		koff = ISP2(klen) ? P2ALIGN(off, (u_offset_t)klen) : 0;
 		ASSERT(koff <= filesz);
 		if (koff + klen > filesz)
 			klen = P2ROUNDUP(filesz - koff, (uint64_t)PAGESIZE);
 		pp = pvn_write_kluster(vp, pp, &off, &len, koff, klen, flags);
 	}
 	ASSERT3U(btop(len), ==, btopr(len));
-top:
-	rl = zfs_range_lock(zp, off, len, RL_WRITER);
+
 	/*
 	 * Can't push pages past end-of-file.
 	 */
-	filesz = zp->z_phys->zp_size;
 	if (off >= filesz) {
 		/* ignore all pages */
 		err = 0;
@@ -3586,16 +3854,20 @@ zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
 		len = filesz - off;
 	}
 
+	if (zfs_usergroup_overquota(zfsvfs, B_FALSE, zp->z_phys->zp_uid) ||
+	    zfs_usergroup_overquota(zfsvfs, B_TRUE, zp->z_phys->zp_gid)) {
+		err = EDQUOT;
+		goto out;
+	}
+top:
 	tx = dmu_tx_create(zfsvfs->z_os);
 	dmu_tx_hold_write(tx, zp->z_id, off, len);
 	dmu_tx_hold_bonus(tx, zp->z_id);
-	err = dmu_tx_assign(tx, zfsvfs->z_assign);
+	err = dmu_tx_assign(tx, TXG_NOWAIT);
 	if (err != 0) {
-		if (err == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) {
-			zfs_range_unlock(rl);
+		if (err == ERESTART) {
 			dmu_tx_wait(tx);
 			dmu_tx_abort(tx);
-			err = 0;
 			goto top;
 		}
 		dmu_tx_abort(tx);
@@ -3613,12 +3885,11 @@ zfs_putapage(vnode_t *vp, page_t *pp, u_offset_t *offp,
 	if (err == 0) {
 		zfs_time_stamper(zp, CONTENT_MODIFIED, tx);
-		zfs_log_write(zilog, tx, TX_WRITE, zp, off, len, 0);
-		dmu_tx_commit(tx);
+		zfs_log_write(zfsvfs->z_log, tx, TX_WRITE, zp, off, len, 0);
 	}
+	dmu_tx_commit(tx);
 
 out:
-	zfs_range_unlock(rl);
 	pvn_write_done(pp, (err ? B_ERROR : 0) | flags);
 	if (offp)
 		*offp = off;
@@ -3655,31 +3926,50 @@ zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr,
 	page_t		*pp;
 	size_t		io_len;
 	u_offset_t	io_off;
-	uint64_t	filesz;
+	uint_t		blksz;
+	rl_t		*rl;
 	int		error = 0;
 
 	ZFS_ENTER(zfsvfs);
 	ZFS_VERIFY_ZP(zp);
 
-	if (len == 0) {
+	/*
+	 * Align this request to the file block size in case we kluster.
+	 * XXX - this can result in pretty aggressive locking, which can
+	 * impact simultaneous read/write access. One option might be
+	 * to break up long requests (len == 0) into block-by-block
+	 * operations to get narrower locking.
+	 */
+	blksz = zp->z_blksz;
+	if (ISP2(blksz))
+		io_off = P2ALIGN_TYPED(off, blksz, u_offset_t);
+	else
+		io_off = 0;
+	if (len > 0 && ISP2(blksz))
+		io_len = P2ROUNDUP_TYPED(len + (off - io_off), blksz, size_t);
+	else
+		io_len = 0;
+
+	if (io_len == 0) {
 		/*
-		 * Search the entire vp list for pages >= off.
+		 * Search the entire vp list for pages >= io_off.
 		 */
-		error = pvn_vplist_dirty(vp, (u_offset_t)off, zfs_putapage,
-		    flags, cr);
+		rl = zfs_range_lock(zp, io_off, UINT64_MAX, RL_WRITER);
+		error = pvn_vplist_dirty(vp, io_off, zfs_putapage, flags, cr);
 		goto out;
 	}
+	rl = zfs_range_lock(zp, io_off, io_len, RL_WRITER);
 
-	filesz = zp->z_phys->zp_size; /* get consistent copy of zp_size */
-	if (off > filesz) {
+	if (off > zp->z_phys->zp_size) {
 		/* past end of file */
+		zfs_range_unlock(rl);
 		ZFS_EXIT(zfsvfs);
 		return (0);
 	}
-	len = MIN(len, filesz - off);
+	len = MIN(io_len, P2ROUNDUP(zp->z_phys->zp_size, PAGESIZE) - io_off);
 
-	for (io_off = off; io_off < off + len; io_off += io_len) {
+	for (off = io_off; io_off < off + len; io_off += io_len) {
 		if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
 			pp = page_lookup(vp, io_off, (flags & (B_INVAL | B_FREE)) ?
SE_EXCL : SE_SHARED); @@ -3702,6 +3992,7 @@ zfs_putpage(vnode_t *vp, offset_t off, size_t len, int flags, cred_t *cr, } } out: + zfs_range_unlock(rl); if ((flags & B_ASYNC) == 0) zil_commit(zfsvfs->z_log, UINT64_MAX, zp->z_id); ZFS_EXIT(zfsvfs); @@ -3728,7 +4019,10 @@ zfs_inactive(vnode_t *vp, cred_t *cr, caller_context_t *ct) } mutex_enter(&zp->z_lock); - vp->v_count = 0; /* count arrives as 1 */ + mutex_enter(&vp->v_lock); + ASSERT(vp->v_count == 1); + vp->v_count = 0; + mutex_exit(&vp->v_lock); mutex_exit(&zp->z_lock); rw_exit(&zfsvfs->z_teardown_inactive_lock); zfs_znode_free(zp); @@ -3795,7 +4089,6 @@ zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset, { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; - int error; ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); @@ -3810,15 +4103,16 @@ zfs_frlock(vnode_t *vp, int cmd, flock64_t *bfp, int flag, offset_t offset, ZFS_EXIT(zfsvfs); return (EAGAIN); } - error = fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct); ZFS_EXIT(zfsvfs); - return (error); + return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct)); } /* * If we can't find a page in the cache, we will create a new page * and fill it with file data. For efficiency, we may try to fill - * multiple pages at once (klustering). + * multiple pages at once (klustering) to fill up the supplied page + * list. Note that the pages to be filled are held with an exclusive + * lock to prevent access by other threads while they are being filled. */ static int zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg, @@ -3827,57 +4121,28 @@ zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg, znode_t *zp = VTOZ(vp); page_t *pp, *cur_pp; objset_t *os = zp->z_zfsvfs->z_os; - caddr_t va; u_offset_t io_off, total; - uint64_t oid = zp->z_id; size_t io_len; - uint64_t filesz; int err; - /* - * If we are only asking for a single page don't bother klustering. - */ - filesz = zp->z_phys->zp_size; /* get consistent copy of zp_size */ - if (off >= filesz) - return (EFAULT); if (plsz == PAGESIZE || zp->z_blksz <= PAGESIZE) { + /* + * We only have a single page, don't bother klustering + */ io_off = off; io_len = PAGESIZE; - pp = page_create_va(vp, io_off, io_len, PG_WAIT, seg, addr); + pp = page_create_va(vp, io_off, io_len, + PG_EXCL | PG_WAIT, seg, addr); } else { /* - * Try to fill a kluster of pages (a blocks worth). + * Try to find enough pages to fill the page list */ - size_t klen; - u_offset_t koff; - - if (!ISP2(zp->z_blksz)) { - /* Only one block in the file. */ - klen = P2ROUNDUP((ulong_t)zp->z_blksz, PAGESIZE); - koff = 0; - } else { - /* - * It would be ideal to align our offset to the - * blocksize but doing so has resulted in some - * strange application crashes. For now, we - * leave the offset as is and only adjust the - * length if we are off the end of the file. - */ - koff = off; - klen = plsz; - } - ASSERT(koff <= filesz); - if (koff + klen > filesz) - klen = P2ROUNDUP(filesz, (uint64_t)PAGESIZE) - koff; - ASSERT3U(off, >=, koff); - ASSERT3U(off, <, koff + klen); pp = pvn_read_kluster(vp, off, seg, addr, &io_off, - &io_len, koff, klen, 0); + &io_len, off, plsz, 0); } if (pp == NULL) { /* - * Some other thread entered the page before us. - * Return to zfs_getpage to retry the lookup. + * The page already exists, nothing to do here. 
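/*
 * Illustrative sketch (not part of the original change): the rewritten
 * zfs_putpage() above widens the request [off, off + len) outward to the
 * file block size before taking the range lock, so zfs_putapage() can
 * kluster whole blocks. The stand-alone program below only mirrors that
 * arithmetic; MY_P2ALIGN/MY_P2ROUNDUP/MY_ISP2 are local stand-ins for the
 * kernel macros and the sample numbers are made up.
 */
#include <stdio.h>
#include <stdint.h>

#define	MY_ISP2(x)		(((x) & ((x) - 1)) == 0)
#define	MY_P2ALIGN(x, a)	((x) & -(uint64_t)(a))		/* round down */
#define	MY_P2ROUNDUP(x, a)	((((x) - 1) | ((a) - 1)) + 1)	/* round up */

int
main(void)
{
	uint64_t off = 150000;		/* byte offset of the dirty range */
	uint64_t len = 10000;		/* length of the dirty range */
	uint64_t blksz = 131072;	/* file block size (power of two) */
	uint64_t io_off, io_len;

	if (MY_ISP2(blksz)) {
		io_off = MY_P2ALIGN(off, blksz);
		io_len = MY_P2ROUNDUP(len + (off - io_off), blksz);
	} else {
		io_off = 0;	/* non-power-of-2 block: lock from offset 0 */
		io_len = 0;	/* 0 means "walk the whole vnode list" */
	}
	(void) printf("lock [%llu, %llu)\n",
	    (unsigned long long)io_off,
	    (unsigned long long)(io_off + io_len));
	return (0);
}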
*/ *pl = NULL; return (0); @@ -3888,9 +4153,12 @@ zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg, */ cur_pp = pp; for (total = io_off + io_len; io_off < total; io_off += PAGESIZE) { + caddr_t va; + ASSERT3U(io_off, ==, cur_pp->p_offset); va = zfs_map_page(cur_pp, S_WRITE); - err = dmu_read(os, oid, io_off, PAGESIZE, va); + err = dmu_read(os, zp->z_id, io_off, PAGESIZE, va, + DMU_READ_PREFETCH); zfs_unmap_page(cur_pp, va); if (err) { /* On error, toss the entire kluster */ @@ -3902,15 +4170,14 @@ zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg, } cur_pp = cur_pp->p_next; } -out: + /* - * Fill in the page list array from the kluster. If - * there are too many pages in the kluster, return - * as many pages as possible starting from the desired - * offset `off'. + * Fill in the page list array from the kluster starting + * from the desired offset `off'. * NOTE: the page list will always be null terminated. */ pvn_plist_init(pp, pl, plsz, off, io_len, rw); + ASSERT(pl == NULL || (*pl)->p_offset == off); return (0); } @@ -3918,10 +4185,10 @@ zfs_fillpage(vnode_t *vp, u_offset_t off, struct seg *seg, /* * Return pointers to the pages for the file region [off, off + len] * in the pl array. If plsz is greater than len, this function may - * also return page pointers from before or after the specified - * region (i.e. some region [off', off' + plsz]). These additional - * pages are only returned if they are already in the cache, or were - * created as part of a klustered read. + * also return page pointers from after the specified region + * (i.e. the region [off, off + plsz]). These additional pages are + * only returned if they are already in the cache, or were created as + * part of a klustered read. * * IN: vp - vnode of file to get data from. * off - position in file to get data from. @@ -3950,9 +4217,17 @@ zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp, { znode_t *zp = VTOZ(vp); zfsvfs_t *zfsvfs = zp->z_zfsvfs; - page_t *pp, **pl0 = pl; - int need_unlock = 0, err = 0; - offset_t orig_off; + page_t **pl0 = pl; + int err = 0; + + /* we do our own caching, faultahead is unnecessary */ + if (pl == NULL) + return (0); + else if (len > plsz) + len = plsz; + else + len = P2ROUNDUP(len, PAGESIZE); + ASSERT(plsz >= len); ZFS_ENTER(zfsvfs); ZFS_VERIFY_ZP(zp); @@ -3960,104 +4235,51 @@ zfs_getpage(vnode_t *vp, offset_t off, size_t len, uint_t *protp, if (protp) *protp = PROT_ALL; - /* no faultahead (for now) */ - if (pl == NULL) { - ZFS_EXIT(zfsvfs); - return (0); - } - - /* can't fault past EOF */ - if (off >= zp->z_phys->zp_size) { - ZFS_EXIT(zfsvfs); - return (EFAULT); - } - orig_off = off; - - /* - * If we already own the lock, then we must be page faulting - * in the middle of a write to this file (i.e., we are writing - * to this file using data from a mapped region of the file). - */ - if (rw_owner(&zp->z_map_lock) != curthread) { - rw_enter(&zp->z_map_lock, RW_WRITER); - need_unlock = TRUE; - } - /* - * Loop through the requested range [off, off + len] looking + * Loop through the requested range [off, off + len) looking * for pages. If we don't find a page, we will need to create * a new page and fill it with data from the file. 
*/ while (len > 0) { - if (plsz < PAGESIZE) - break; - if (pp = page_lookup(vp, off, SE_SHARED)) { - *pl++ = pp; + if (*pl = page_lookup(vp, off, SE_SHARED)) + *(pl+1) = NULL; + else if (err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw)) + goto out; + while (*pl) { + ASSERT3U((*pl)->p_offset, ==, off); off += PAGESIZE; addr += PAGESIZE; - len -= PAGESIZE; - plsz -= PAGESIZE; - } else { - err = zfs_fillpage(vp, off, seg, addr, pl, plsz, rw); - if (err) - goto out; - /* - * klustering may have changed our region - * to be block aligned. - */ - if (((pp = *pl) != 0) && (off != pp->p_offset)) { - int delta = off - pp->p_offset; - len += delta; - off -= delta; - addr -= delta; - } - while (*pl) { - pl++; - off += PAGESIZE; - addr += PAGESIZE; - plsz -= PAGESIZE; - if (len > PAGESIZE) - len -= PAGESIZE; - else - len = 0; + if (len > 0) { + ASSERT3U(len, >=, PAGESIZE); + len -= PAGESIZE; } + ASSERT3U(plsz, >=, PAGESIZE); + plsz -= PAGESIZE; + pl++; } } /* * Fill out the page array with any pages already in the cache. */ - while (plsz > 0) { - pp = page_lookup_nowait(vp, off, SE_SHARED); - if (pp == NULL) - break; - *pl++ = pp; - off += PAGESIZE; - plsz -= PAGESIZE; + while (plsz > 0 && + (*pl++ = page_lookup_nowait(vp, off, SE_SHARED))) { + off += PAGESIZE; + plsz -= PAGESIZE; } - - ZFS_ACCESSTIME_STAMP(zfsvfs, zp); out: - /* - * We can't grab the range lock for the page as reader which would - * stop truncation as this leads to deadlock. So we need to recheck - * the file size. - */ - if (orig_off >= zp->z_phys->zp_size) - err = EFAULT; if (err) { /* * Release any pages we have previously locked. */ while (pl > pl0) page_unlock(*--pl); + } else { + ZFS_ACCESSTIME_STAMP(zfsvfs, zp); } *pl = NULL; - if (need_unlock) - rw_exit(&zp->z_map_lock); - ZFS_EXIT(zfsvfs); return (err); } @@ -4360,6 +4582,11 @@ zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, (vp->v_type == VREG || vp->v_type == VDIR); return (0); + case _PC_ACCESS_FILTERING: + *valp = vfs_has_feature(vp->v_vfsp, VFSFT_ACCESS_FILTER) && + vp->v_type == VDIR; + return (0); + case _PC_ACL_ENABLED: *valp = _ACL_ACE_ENABLED; return (0); @@ -4368,6 +4595,11 @@ zfs_pathconf(vnode_t *vp, int cmd, ulong_t *valp, cred_t *cr, *valp = (ulong_t)SPA_MINBLOCKSIZE; return (0); + case _PC_TIMESTAMP_RESOLUTION: + /* nanosecond timestamp resolution */ + *valp = 1L; + return (0); + default: return (fs_pathconf(vp, cmd, valp, cr, ct)); } @@ -4408,6 +4640,161 @@ zfs_setsecattr(vnode_t *vp, vsecattr_t *vsecp, int flag, cred_t *cr, return (error); } +/* + * Tunable, both must be a power of 2. 
+ * + * zcr_blksz_min: the smallest read we may consider to loan out an arcbuf + * zcr_blksz_max: if set to less than the file block size, allow loaning out of + * an arcbuf for a partial block read + */ +int zcr_blksz_min = (1 << 10); /* 1K */ +int zcr_blksz_max = (1 << 17); /* 128K */ + +/*ARGSUSED*/ +static int +zfs_reqzcbuf(vnode_t *vp, enum uio_rw ioflag, xuio_t *xuio, cred_t *cr, + caller_context_t *ct) +{ + znode_t *zp = VTOZ(vp); + zfsvfs_t *zfsvfs = zp->z_zfsvfs; + int max_blksz = zfsvfs->z_max_blksz; + uio_t *uio = &xuio->xu_uio; + ssize_t size = uio->uio_resid; + offset_t offset = uio->uio_loffset; + int blksz; + int fullblk, i; + arc_buf_t *abuf; + ssize_t maxsize; + int preamble, postamble; + + if (xuio->xu_type != UIOTYPE_ZEROCOPY) + return (EINVAL); + + ZFS_ENTER(zfsvfs); + ZFS_VERIFY_ZP(zp); + switch (ioflag) { + case UIO_WRITE: + /* + * Loan out an arc_buf for write if write size is bigger than + * max_blksz, and the file's block size is also max_blksz. + */ + blksz = max_blksz; + if (size < blksz || zp->z_blksz != blksz) { + ZFS_EXIT(zfsvfs); + return (EINVAL); + } + /* + * Caller requests buffers for write before knowing where the + * write offset might be (e.g. NFS TCP write). + */ + if (offset == -1) { + preamble = 0; + } else { + preamble = P2PHASE(offset, blksz); + if (preamble) { + preamble = blksz - preamble; + size -= preamble; + } + } + + postamble = P2PHASE(size, blksz); + size -= postamble; + + fullblk = size / blksz; + (void) dmu_xuio_init(xuio, + (preamble != 0) + fullblk + (postamble != 0)); + DTRACE_PROBE3(zfs_reqzcbuf_align, int, preamble, + int, postamble, int, + (preamble != 0) + fullblk + (postamble != 0)); + + /* + * Have to fix iov base/len for partial buffers. They + * currently represent full arc_buf's. + */ + if (preamble) { + /* data begins in the middle of the arc_buf */ + abuf = dmu_request_arcbuf(zp->z_dbuf, blksz); + ASSERT(abuf); + (void) dmu_xuio_add(xuio, abuf, + blksz - preamble, preamble); + } + + for (i = 0; i < fullblk; i++) { + abuf = dmu_request_arcbuf(zp->z_dbuf, blksz); + ASSERT(abuf); + (void) dmu_xuio_add(xuio, abuf, 0, blksz); + } + + if (postamble) { + /* data ends in the middle of the arc_buf */ + abuf = dmu_request_arcbuf(zp->z_dbuf, blksz); + ASSERT(abuf); + (void) dmu_xuio_add(xuio, abuf, 0, postamble); + } + break; + case UIO_READ: + /* + * Loan out an arc_buf for read if the read size is larger than + * the current file block size. Block alignment is not + * considered. Partial arc_buf will be loaned out for read. + */ + blksz = zp->z_blksz; + if (blksz < zcr_blksz_min) + blksz = zcr_blksz_min; + if (blksz > zcr_blksz_max) + blksz = zcr_blksz_max; + /* avoid potential complexity of dealing with it */ + if (blksz > max_blksz) { + ZFS_EXIT(zfsvfs); + return (EINVAL); + } + + maxsize = zp->z_phys->zp_size - uio->uio_loffset; + if (size > maxsize) + size = maxsize; + + if (size < blksz || vn_has_cached_data(vp)) { + ZFS_EXIT(zfsvfs); + return (EINVAL); + } + break; + default: + ZFS_EXIT(zfsvfs); + return (EINVAL); + } + + uio->uio_extflg = UIO_XUIO; + XUIO_XUZC_RW(xuio) = ioflag; + ZFS_EXIT(zfsvfs); + return (0); +} + +/*ARGSUSED*/ +static int +zfs_retzcbuf(vnode_t *vp, xuio_t *xuio, cred_t *cr, caller_context_t *ct) +{ + int i; + arc_buf_t *abuf; + int ioflag = XUIO_XUZC_RW(xuio); + + ASSERT(xuio->xu_type == UIOTYPE_ZEROCOPY); + + i = dmu_xuio_cnt(xuio); + while (i-- > 0) { + abuf = dmu_xuio_arcbuf(xuio, i); + /* + * if abuf == NULL, it must be a write buffer + * that has been returned in zfs_write(). 
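/*
 * Illustrative sketch (not part of the original change): the UIO_WRITE case
 * of zfs_reqzcbuf() above carves an unaligned request into an optional
 * "preamble" (up to the next block boundary), some number of full blocks,
 * and an optional "postamble" tail, loaning one arc_buf per piece. The
 * stand-alone program below reproduces only that arithmetic; MY_P2PHASE is
 * a local stand-in for the kernel macro and the offset/size values are
 * made up.
 */
#include <stdio.h>
#include <stdint.h>

#define	MY_P2PHASE(x, a)	((x) & ((a) - 1))	/* offset within block */

int
main(void)
{
	int64_t offset = 100 * 1024 + 512;	/* write offset */
	int64_t size = 300 * 1024;		/* write length */
	int64_t blksz = 128 * 1024;		/* file/max block size */
	int64_t preamble, postamble, fullblk;

	/* offset == -1 means "offset not known yet", so no preamble */
	preamble = (offset == -1) ? 0 : MY_P2PHASE(offset, blksz);
	if (preamble != 0) {
		preamble = blksz - preamble;	/* bytes to next boundary */
		size -= preamble;
	}
	postamble = MY_P2PHASE(size, blksz);	/* unaligned tail */
	size -= postamble;
	fullblk = size / blksz;

	(void) printf("preamble %lld, full blocks %lld, postamble %lld\n",
	    (long long)preamble, (long long)fullblk, (long long)postamble);
	return (0);
}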
+ */ + if (abuf) + dmu_return_arcbuf(abuf); + ASSERT(abuf || ioflag == UIO_WRITE); + } + + dmu_xuio_fini(xuio); + return (0); +} + /* * Predeclare these here so that the compiler assumes that * this is an "old style" function declaration that does @@ -4491,6 +4878,8 @@ const fs_operation_def_t zfs_fvnodeops_template[] = { VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr }, VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr }, VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, + VOPNAME_REQZCBUF, { .vop_reqzcbuf = zfs_reqzcbuf }, + VOPNAME_RETZCBUF, { .vop_retzcbuf = zfs_retzcbuf }, NULL, NULL }; @@ -4511,6 +4900,22 @@ const fs_operation_def_t zfs_symvnodeops_template[] = { NULL, NULL }; +/* + * special share hidden files vnode operations template + */ +vnodeops_t *zfs_sharevnodeops; +const fs_operation_def_t zfs_sharevnodeops_template[] = { + VOPNAME_GETATTR, { .vop_getattr = zfs_getattr }, + VOPNAME_ACCESS, { .vop_access = zfs_access }, + VOPNAME_INACTIVE, { .vop_inactive = zfs_inactive }, + VOPNAME_FID, { .vop_fid = zfs_fid }, + VOPNAME_PATHCONF, { .vop_pathconf = zfs_pathconf }, + VOPNAME_GETSECATTR, { .vop_getsecattr = zfs_getsecattr }, + VOPNAME_SETSECATTR, { .vop_setsecattr = zfs_setsecattr }, + VOPNAME_VNEVENT, { .vop_vnevent = fs_vnevent_support }, + NULL, NULL +}; + /* * Extended attribute directory vnode operations template * This template is identical to the directory vnodes diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_znode.c b/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_znode.c index 25751ae5f8541..1ff237e163cce 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_znode.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/zfs_znode.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -87,6 +87,12 @@ * (such as VFS logic) that will not compile easily in userland. */ #ifdef _KERNEL +/* + * Needed to close a small window in zfs_znode_move() that allows the zfsvfs to + * be freed before it can be safely accessed. 
+ */ +krwlock_t zfsvfs_lock; + static kmem_cache_t *znode_cache = NULL; /*ARGSUSED*/ @@ -117,7 +123,6 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) list_link_init(&zp->z_link_node); mutex_init(&zp->z_lock, NULL, MUTEX_DEFAULT, NULL); - rw_init(&zp->z_map_lock, NULL, RW_DEFAULT, NULL); rw_init(&zp->z_parent_lock, NULL, RW_DEFAULT, NULL); rw_init(&zp->z_name_lock, NULL, RW_DEFAULT, NULL); mutex_init(&zp->z_acl_lock, NULL, MUTEX_DEFAULT, NULL); @@ -128,6 +133,7 @@ zfs_znode_cache_constructor(void *buf, void *arg, int kmflags) zp->z_dbuf = NULL; zp->z_dirlocks = NULL; + zp->z_acl_cached = NULL; return (0); } @@ -142,7 +148,6 @@ zfs_znode_cache_destructor(void *buf, void *arg) vn_free(ZTOV(zp)); ASSERT(!list_link_active(&zp->z_link_node)); mutex_destroy(&zp->z_lock); - rw_destroy(&zp->z_map_lock); rw_destroy(&zp->z_parent_lock); rw_destroy(&zp->z_name_lock); mutex_destroy(&zp->z_acl_lock); @@ -151,13 +156,15 @@ zfs_znode_cache_destructor(void *buf, void *arg) ASSERT(zp->z_dbuf == NULL); ASSERT(zp->z_dirlocks == NULL); + ASSERT(zp->z_acl_cached == NULL); } #ifdef ZNODE_STATS static struct { uint64_t zms_zfsvfs_invalid; + uint64_t zms_zfsvfs_recheck1; uint64_t zms_zfsvfs_unmounted; - uint64_t zms_zfsvfs_recheck_invalid; + uint64_t zms_zfsvfs_recheck2; uint64_t zms_obj_held; uint64_t zms_vnode_locked; uint64_t zms_not_only_dnlc; @@ -194,6 +201,15 @@ zfs_znode_move_impl(znode_t *ozp, znode_t *nzp) nzp->z_phys = ozp->z_phys; nzp->z_dbuf = ozp->z_dbuf; + /* + * Since this is just an idle znode and kmem is already dealing with + * memory pressure, release any cached ACL. + */ + if (ozp->z_acl_cached) { + zfs_acl_free(ozp->z_acl_cached); + ozp->z_acl_cached = NULL; + } + /* Update back pointers. */ (void) dmu_buf_update_user(nzp->z_dbuf, ozp, nzp, &nzp->z_phys, znode_evict_error); @@ -208,17 +224,6 @@ zfs_znode_move_impl(znode_t *ozp, znode_t *nzp) POINTER_INVALIDATE(&ozp->z_zfsvfs); } -/* - * Wrapper function for ZFS_ENTER that returns 0 if successful and otherwise - * returns a non-zero error code. - */ -static int -zfs_enter(zfsvfs_t *zfsvfs) -{ - ZFS_ENTER(zfsvfs); - return (0); -} - /*ARGSUSED*/ static kmem_cbrc_t zfs_znode_move(void *buf, void *newbuf, size_t size, void *arg) @@ -242,12 +247,32 @@ zfs_znode_move(void *buf, void *newbuf, size_t size, void *arg) } /* - * Ensure that the filesystem is not unmounted during the move. + * Close a small window in which it's possible that the filesystem could + * be unmounted and freed, and zfsvfs, though valid in the previous + * statement, could point to unrelated memory by the time we try to + * prevent the filesystem from being unmounted. + */ + rw_enter(&zfsvfs_lock, RW_WRITER); + if (zfsvfs != ozp->z_zfsvfs) { + rw_exit(&zfsvfs_lock); + ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck1); + return (KMEM_CBRC_DONT_KNOW); + } + + /* + * If the znode is still valid, then so is the file system. We know that + * no valid file system can be freed while we hold zfsvfs_lock, so we + * can safely ensure that the filesystem is not and will not be + * unmounted. The next statement is equivalent to ZFS_ENTER(). 
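/*
 * Illustrative sketch (not part of the original change): the comment above
 * describes re-validating a lock-free back pointer under a global lock
 * before pinning the object it refers to. The userland analogue below
 * shows the same shape with pthreads; the fs/node types and names are
 * invented for the example and do not exist in ZFS.
 */
#include <pthread.h>
#include <stddef.h>

struct fs {
	pthread_rwlock_t	fs_teardown;	/* analogue of z_teardown_lock */
	int			fs_unmounted;
};

struct node {
	struct fs	*n_fs;			/* analogue of z_zfsvfs */
};

/* No fs may be freed while this is held (analogue of zfsvfs_lock). */
static pthread_rwlock_t fs_global_lock = PTHREAD_RWLOCK_INITIALIZER;

/* Pin the filesystem a node belongs to, or return NULL if it is going away. */
struct fs *
node_pin_fs(struct node *n, struct fs *guess)
{
	struct fs *fs;

	(void) pthread_rwlock_wrlock(&fs_global_lock);
	fs = n->n_fs;
	if (fs != guess) {		/* pointer went stale: bail out */
		(void) pthread_rwlock_unlock(&fs_global_lock);
		return (NULL);
	}
	(void) pthread_rwlock_rdlock(&fs->fs_teardown);
	if (fs->fs_unmounted) {		/* fs is being torn down: bail out */
		(void) pthread_rwlock_unlock(&fs->fs_teardown);
		(void) pthread_rwlock_unlock(&fs_global_lock);
		return (NULL);
	}
	(void) pthread_rwlock_unlock(&fs_global_lock);
	return (fs);			/* caller later drops fs_teardown */
}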
*/ - if (zfs_enter(zfsvfs) != 0) { /* ZFS_ENTER */ + rrw_enter(&zfsvfs->z_teardown_lock, RW_READER, FTAG); + if (zfsvfs->z_unmounted) { + ZFS_EXIT(zfsvfs); + rw_exit(&zfsvfs_lock); ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_unmounted); return (KMEM_CBRC_DONT_KNOW); } + rw_exit(&zfsvfs_lock); mutex_enter(&zfsvfs->z_znodes_lock); /* @@ -257,7 +282,7 @@ zfs_znode_move(void *buf, void *newbuf, size_t size, void *arg) if (zfsvfs != ozp->z_zfsvfs) { mutex_exit(&zfsvfs->z_znodes_lock); ZFS_EXIT(zfsvfs); - ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck_invalid); + ZNODE_STAT_ADD(znode_move_stats.zms_zfsvfs_recheck2); return (KMEM_CBRC_DONT_KNOW); } @@ -313,6 +338,7 @@ zfs_znode_init(void) /* * Initialize zcache */ + rw_init(&zfsvfs_lock, NULL, RW_DEFAULT, NULL); ASSERT(znode_cache == NULL); znode_cache = kmem_cache_create("zfs_znode_cache", sizeof (znode_t), 0, zfs_znode_cache_constructor, @@ -334,6 +360,7 @@ zfs_znode_fini(void) if (znode_cache) kmem_cache_destroy(znode_cache); znode_cache = NULL; + rw_destroy(&zfsvfs_lock); } struct vnodeops *zfs_dvnodeops; @@ -341,6 +368,7 @@ struct vnodeops *zfs_fvnodeops; struct vnodeops *zfs_symvnodeops; struct vnodeops *zfs_xdvnodeops; struct vnodeops *zfs_evnodeops; +struct vnodeops *zfs_sharevnodeops; void zfs_remove_op_tables() @@ -365,12 +393,15 @@ zfs_remove_op_tables() vn_freevnodeops(zfs_xdvnodeops); if (zfs_evnodeops) vn_freevnodeops(zfs_evnodeops); + if (zfs_sharevnodeops) + vn_freevnodeops(zfs_sharevnodeops); zfs_dvnodeops = NULL; zfs_fvnodeops = NULL; zfs_symvnodeops = NULL; zfs_xdvnodeops = NULL; zfs_evnodeops = NULL; + zfs_sharevnodeops = NULL; } extern const fs_operation_def_t zfs_dvnodeops_template[]; @@ -378,6 +409,7 @@ extern const fs_operation_def_t zfs_fvnodeops_template[]; extern const fs_operation_def_t zfs_xdvnodeops_template[]; extern const fs_operation_def_t zfs_symvnodeops_template[]; extern const fs_operation_def_t zfs_evnodeops_template[]; +extern const fs_operation_def_t zfs_sharevnodeops_template[]; int zfs_create_op_tables() @@ -414,103 +446,58 @@ zfs_create_op_tables() error = vn_make_ops(MNTTYPE_ZFS, zfs_evnodeops_template, &zfs_evnodeops); + if (error) + return (error); + + error = vn_make_ops(MNTTYPE_ZFS, zfs_sharevnodeops_template, + &zfs_sharevnodeops); return (error); } -/* - * zfs_init_fs - Initialize the zfsvfs struct and the file system - * incore "master" object. Verify version compatibility. - */ int -zfs_init_fs(zfsvfs_t *zfsvfs, znode_t **zpp) +zfs_create_share_dir(zfsvfs_t *zfsvfs, dmu_tx_t *tx) { - extern int zfsfstype; - - objset_t *os = zfsvfs->z_os; - int i, error; - uint64_t fsid_guid; - uint64_t zval; - - *zpp = NULL; - - error = zfs_get_zplprop(os, ZFS_PROP_VERSION, &zfsvfs->z_version); - if (error) { - return (error); - } else if (zfsvfs->z_version > ZPL_VERSION) { - (void) printf("Mismatched versions: File system " - "is version %llu on-disk format, which is " - "incompatible with this software version %lld!", - (u_longlong_t)zfsvfs->z_version, ZPL_VERSION); - return (ENOTSUP); - } - - if ((error = zfs_get_zplprop(os, ZFS_PROP_NORMALIZE, &zval)) != 0) - return (error); - zfsvfs->z_norm = (int)zval; - if ((error = zfs_get_zplprop(os, ZFS_PROP_UTF8ONLY, &zval)) != 0) - return (error); - zfsvfs->z_utf8 = (zval != 0); - if ((error = zfs_get_zplprop(os, ZFS_PROP_CASE, &zval)) != 0) - return (error); - zfsvfs->z_case = (uint_t)zval; - /* - * Fold case on file systems that are always or sometimes case - * insensitive. 
- */ - if (zfsvfs->z_case == ZFS_CASE_INSENSITIVE || - zfsvfs->z_case == ZFS_CASE_MIXED) - zfsvfs->z_norm |= U8_TEXTPREP_TOUPPER; + zfs_acl_ids_t acl_ids; + vattr_t vattr; + znode_t *sharezp; + vnode_t *vp; + znode_t *zp; + int error; - /* - * The fsid is 64 bits, composed of an 8-bit fs type, which - * separates our fsid from any other filesystem types, and a - * 56-bit objset unique ID. The objset unique ID is unique to - * all objsets open on this system, provided by unique_create(). - * The 8-bit fs type must be put in the low bits of fsid[1] - * because that's where other Solaris filesystems put it. - */ - fsid_guid = dmu_objset_fsid_guid(os); - ASSERT((fsid_guid & ~((1ULL<<56)-1)) == 0); - zfsvfs->z_vfs->vfs_fsid.val[0] = fsid_guid; - zfsvfs->z_vfs->vfs_fsid.val[1] = ((fsid_guid>>32) << 8) | - zfsfstype & 0xFF; - - error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_ROOT_OBJ, 8, 1, - &zfsvfs->z_root); - if (error) - return (error); - ASSERT(zfsvfs->z_root != 0); + vattr.va_mask = AT_MODE|AT_UID|AT_GID|AT_TYPE; + vattr.va_type = VDIR; + vattr.va_mode = S_IFDIR|0555; + vattr.va_uid = crgetuid(kcred); + vattr.va_gid = crgetgid(kcred); - error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_UNLINKED_SET, 8, 1, - &zfsvfs->z_unlinkedobj); - if (error) - return (error); + sharezp = kmem_cache_alloc(znode_cache, KM_SLEEP); + sharezp->z_unlinked = 0; + sharezp->z_atime_dirty = 0; + sharezp->z_zfsvfs = zfsvfs; - /* - * Initialize zget mutex's - */ - for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) - mutex_init(&zfsvfs->z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); + vp = ZTOV(sharezp); + vn_reinit(vp); + vp->v_type = VDIR; - error = zfs_zget(zfsvfs, zfsvfs->z_root, zpp); - if (error) { - /* - * On error, we destroy the mutexes here since it's not - * possible for the caller to determine if the mutexes were - * initialized properly. 
- */ - for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) - mutex_destroy(&zfsvfs->z_hold_mtx[i]); - return (error); - } - ASSERT3U((*zpp)->z_id, ==, zfsvfs->z_root); - error = zap_lookup(os, MASTER_NODE_OBJ, ZFS_FUID_TABLES, 8, 1, - &zfsvfs->z_fuid_obj); - if (error == ENOENT) - error = 0; + VERIFY(0 == zfs_acl_ids_create(sharezp, IS_ROOT_NODE, &vattr, + kcred, NULL, &acl_ids)); + zfs_mknode(sharezp, &vattr, tx, kcred, IS_ROOT_NODE, + &zp, 0, &acl_ids); + ASSERT3P(zp, ==, sharezp); + ASSERT(!vn_in_dnlc(ZTOV(sharezp))); /* not valid to move */ + POINTER_INVALIDATE(&sharezp->z_zfsvfs); + error = zap_add(zfsvfs->z_os, MASTER_NODE_OBJ, + ZFS_SHARES_DIR, 8, 1, &sharezp->z_id, tx); + zfsvfs->z_shares_dir = sharezp->z_id; + + zfs_acl_ids_free(&acl_ids); + ZTOV(sharezp)->v_count = 0; + dmu_buf_rele(sharezp->z_dbuf, NULL); + sharezp->z_dbuf = NULL; + kmem_cache_free(znode_cache, sharezp); - return (0); + return (error); } /* @@ -581,6 +568,7 @@ zfs_znode_dmu_init(zfsvfs_t *zfsvfs, znode_t *zp, dmu_buf_t *db) mutex_enter(&zp->z_lock); ASSERT(zp->z_dbuf == NULL); + ASSERT(zp->z_acl_cached == NULL); zp->z_dbuf = db; nzp = dmu_buf_set_user_ie(db, zp, &zp->z_phys, znode_evict_error); @@ -678,7 +666,10 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz) break; case VREG: vp->v_flag |= VMODSORT; - vn_setops(vp, zfs_fvnodeops); + if (zp->z_phys->zp_parent == zfsvfs->z_shares_dir) + vn_setops(vp, zfs_sharevnodeops); + else + vn_setops(vp, zfs_fvnodeops); break; case VLNK: vn_setops(vp, zfs_symvnodeops); @@ -712,7 +703,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz) * flag - flags: * IS_ROOT_NODE - new object will be root * IS_XATTR - new object is an attribute - * IS_REPLAY - intent log replay * bonuslen - length of bonus buffer * setaclp - File/Dir initial ACL * fuidp - Tracks fuid allocation. @@ -722,8 +712,7 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz) */ void zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, - uint_t flag, znode_t **zpp, int bonuslen, zfs_acl_t *setaclp, - zfs_fuid_info_t **fuidp) + uint_t flag, znode_t **zpp, int bonuslen, zfs_acl_ids_t *acl_ids) { dmu_buf_t *db; znode_phys_t *pzp; @@ -734,9 +723,8 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, ASSERT(vap && (vap->va_mask & (AT_TYPE|AT_MODE)) == (AT_TYPE|AT_MODE)); - if (zfsvfs->z_assign >= TXG_INITIAL) { /* ZIL replay */ + if (zfsvfs->z_replay) { obj = vap->va_nodeid; - flag |= IS_REPLAY; now = vap->va_ctime; /* see zfs_replay_create() */ gen = vap->va_nblocks; /* ditto */ } else { @@ -755,7 +743,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, * assertions below. 
*/ if (vap->va_type == VDIR) { - if (flag & IS_REPLAY) { + if (zfsvfs->z_replay) { err = zap_create_claim_norm(zfsvfs->z_os, obj, zfsvfs->z_norm, DMU_OT_DIRECTORY_CONTENTS, DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); @@ -766,7 +754,7 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); } } else { - if (flag & IS_REPLAY) { + if (zfsvfs->z_replay) { err = dmu_object_claim(zfsvfs->z_os, obj, DMU_OT_PLAIN_FILE_CONTENTS, 0, DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); @@ -777,6 +765,8 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, DMU_OT_ZNODE, sizeof (znode_phys_t) + bonuslen, tx); } } + + ZFS_OBJ_HOLD_ENTER(zfsvfs, obj); VERIFY(0 == dmu_bonus_hold(zfsvfs->z_os, obj, NULL, &db)); dmu_buf_will_dirty(db, tx); @@ -835,12 +825,11 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, } else { ZFS_TIME_ENCODE(&now, pzp->zp_mtime); } - - pzp->zp_mode = MAKEIMODE(vap->va_type, vap->va_mode); + pzp->zp_uid = acl_ids->z_fuid; + pzp->zp_gid = acl_ids->z_fgid; + pzp->zp_mode = acl_ids->z_mode; if (!(flag & IS_ROOT_NODE)) { - ZFS_OBJ_HOLD_ENTER(zfsvfs, obj); *zpp = zfs_znode_alloc(zfsvfs, db, 0); - ZFS_OBJ_HOLD_EXIT(zfsvfs, obj); } else { /* * If we are creating the root node, the "parent" we @@ -848,7 +837,11 @@ zfs_mknode(znode_t *dzp, vattr_t *vap, dmu_tx_t *tx, cred_t *cr, */ *zpp = dzp; } - zfs_perm_init(*zpp, dzp, flag, vap, tx, cr, setaclp, fuidp); + VERIFY(0 == zfs_aclset_common(*zpp, acl_ids->z_aclp, cr, tx)); + if (vap->va_mask & AT_XVATTR) + zfs_xvattr_set(*zpp, (xvattr_t *)vap); + + ZFS_OBJ_HOLD_EXIT(zfsvfs, obj); } void @@ -914,6 +907,10 @@ zfs_xvattr_set(znode_t *zp, xvattr_t *xvap) zp->z_phys->zp_flags |= ZFS_BONUS_SCANSTAMP; XVA_SET_RTN(xvap, XAT_AV_SCANSTAMP); } + if (XVA_ISSET_REQ(xvap, XAT_REPARSE)) { + ZFS_ATTR_SET(zp, ZFS_REPARSE, xoap->xoa_reparse); + XVA_SET_RTN(xvap, XAT_REPARSE); + } } int @@ -968,11 +965,25 @@ zfs_zget(zfsvfs_t *zfsvfs, uint64_t obj_num, znode_t **zpp) /* * Not found create new znode/vnode + * but only if file exists. + * + * There is a small window where zfs_vget() could + * find this object while a file create is still in + * progress. Since a gen number can never be zero + * we will check that to determine if its an allocated + * file. 
*/ - zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size); + + if (((znode_phys_t *)db->db_data)->zp_gen != 0) { + zp = zfs_znode_alloc(zfsvfs, db, doi.doi_data_block_size); + *zpp = zp; + err = 0; + } else { + dmu_buf_rele(db, NULL); + err = ENOENT; + } ZFS_OBJ_HOLD_EXIT(zfsvfs, obj_num); - *zpp = zp; - return (0); + return (err); } int @@ -1006,6 +1017,13 @@ zfs_rezget(znode_t *zp) return (EIO); } + mutex_enter(&zp->z_acl_lock); + if (zp->z_acl_cached) { + zfs_acl_free(zp->z_acl_cached); + zp->z_acl_cached = NULL; + } + mutex_exit(&zp->z_acl_lock); + zfs_znode_dmu_init(zfsvfs, zp, db); zp->z_unlinked = (zp->z_phys->zp_links == 0); zp->z_blksz = doi.doi_data_block_size; @@ -1098,6 +1116,11 @@ zfs_znode_free(znode_t *zp) list_remove(&zfsvfs->z_all_znodes, zp); mutex_exit(&zfsvfs->z_znodes_lock); + if (zp->z_acl_cached) { + zfs_acl_free(zp->z_acl_cached); + zp->z_acl_cached = NULL; + } + kmem_cache_free(znode_cache, zp); VFS_RELE(zfsvfs->z_vfs); @@ -1254,9 +1277,9 @@ zfs_extend(znode_t *zp, uint64_t end) newblksz = 0; } - error = dmu_tx_assign(tx, zfsvfs->z_assign); + error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; @@ -1358,9 +1381,9 @@ zfs_trunc(znode_t *zp, uint64_t end) top: tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_bonus(tx, zp->z_id); - error = dmu_tx_assign(tx, zfsvfs->z_assign); + error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto top; @@ -1375,15 +1398,12 @@ zfs_trunc(znode_t *zp, uint64_t end) dmu_tx_commit(tx); - zfs_range_unlock(rl); - /* * Clear any mapped pages in the truncated region. This has to * happen outside of the transaction to avoid the possibility of * a deadlock with someone trying to push a page that we are * about to invalidate. */ - rw_enter(&zp->z_map_lock, RW_WRITER); if (vn_has_cached_data(vp)) { page_t *pp; uint64_t start = end & PAGEMASK; @@ -1401,7 +1421,8 @@ zfs_trunc(znode_t *zp, uint64_t end) B_INVAL | B_TRUNC, NULL); ASSERT(error == 0); } - rw_exit(&zp->z_map_lock); + + zfs_range_unlock(rl); return (0); } @@ -1456,9 +1477,9 @@ zfs_freesp(znode_t *zp, uint64_t off, uint64_t len, int flag, boolean_t log) log: tx = dmu_tx_create(zfsvfs->z_os); dmu_tx_hold_bonus(tx, zp->z_id); - error = dmu_tx_assign(tx, zfsvfs->z_assign); + error = dmu_tx_assign(tx, TXG_NOWAIT); if (error) { - if (error == ERESTART && zfsvfs->z_assign == TXG_NOWAIT) { + if (error == ERESTART) { dmu_tx_wait(tx); dmu_tx_abort(tx); goto log; @@ -1478,15 +1499,17 @@ void zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) { zfsvfs_t zfsvfs; - uint64_t moid, doid, version; + uint64_t moid, obj, version; uint64_t sense = ZFS_CASE_SENSITIVE; uint64_t norm = 0; nvpair_t *elem; int error; + int i; znode_t *rootzp = NULL; vnode_t *vp; vattr_t vattr; znode_t *zp; + zfs_acl_ids_t acl_ids; /* * First attempt to create master node. @@ -1503,12 +1526,12 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) /* * Set starting attributes. 
*/ - if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID) + if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_USERSPACE) version = ZPL_VERSION; + else if (spa_version(dmu_objset_spa(os)) >= SPA_VERSION_FUID) + version = ZPL_VERSION_USERSPACE - 1; else version = ZPL_VERSION_FUID - 1; - error = zap_update(os, moid, ZPL_VERSION_STR, - 8, 1, &version, tx); elem = NULL; while ((elem = nvlist_next_nvpair(zplprops, elem)) != NULL) { /* For the moment we expect all zpl props to be uint64_ts */ @@ -1519,9 +1542,8 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) VERIFY(nvpair_value_uint64(elem, &val) == 0); name = nvpair_name(elem); if (strcmp(name, zfs_prop_to_name(ZFS_PROP_VERSION)) == 0) { - version = val; - error = zap_update(os, moid, ZPL_VERSION_STR, - 8, 1, &version, tx); + if (val < version) + version = val; } else { error = zap_update(os, moid, name, 8, 1, &val, tx); } @@ -1532,13 +1554,14 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) sense = val; } ASSERT(version != 0); + error = zap_update(os, moid, ZPL_VERSION_STR, 8, 1, &version, tx); /* * Create a delete queue. */ - doid = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx); + obj = zap_create(os, DMU_OT_UNLINKED_SET, DMU_OT_NONE, 0, tx); - error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &doid, tx); + error = zap_add(os, moid, ZFS_UNLINKED_SET, 8, 1, &obj, tx); ASSERT(error == 0); /* @@ -1562,7 +1585,6 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) bzero(&zfsvfs, sizeof (zfsvfs_t)); zfsvfs.z_os = os; - zfsvfs.z_assign = TXG_NOWAIT; zfsvfs.z_parent = &zfsvfs; zfsvfs.z_version = version; zfsvfs.z_use_fuids = USE_FUIDS(version, os); @@ -1578,19 +1600,36 @@ zfs_create_fs(objset_t *os, cred_t *cr, nvlist_t *zplprops, dmu_tx_t *tx) list_create(&zfsvfs.z_all_znodes, sizeof (znode_t), offsetof(znode_t, z_link_node)); + for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) + mutex_init(&zfsvfs.z_hold_mtx[i], NULL, MUTEX_DEFAULT, NULL); + ASSERT(!POINTER_IS_VALID(rootzp->z_zfsvfs)); rootzp->z_zfsvfs = &zfsvfs; - zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, 0, NULL, NULL); + VERIFY(0 == zfs_acl_ids_create(rootzp, IS_ROOT_NODE, &vattr, + cr, NULL, &acl_ids)); + zfs_mknode(rootzp, &vattr, tx, cr, IS_ROOT_NODE, &zp, 0, &acl_ids); ASSERT3P(zp, ==, rootzp); ASSERT(!vn_in_dnlc(ZTOV(rootzp))); /* not valid to move */ error = zap_add(os, moid, ZFS_ROOT_OBJ, 8, 1, &rootzp->z_id, tx); ASSERT(error == 0); + zfs_acl_ids_free(&acl_ids); POINTER_INVALIDATE(&rootzp->z_zfsvfs); ZTOV(rootzp)->v_count = 0; dmu_buf_rele(rootzp->z_dbuf, NULL); rootzp->z_dbuf = NULL; kmem_cache_free(znode_cache, rootzp); + + /* + * Create shares directory + */ + + error = zfs_create_share_dir(&zfsvfs, tx); + + ASSERT(error == 0); + + for (i = 0; i != ZFS_OBJ_MTX_SZ; i++) + mutex_destroy(&zfsvfs.z_hold_mtx[i]); } #endif /* _KERNEL */ diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/zil.c b/external/cddl/osnet/dist/uts/common/fs/zfs/zil.c index 043cdb12f33a5..d5459465b9eea 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/zil.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/zil.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -76,11 +76,17 @@ boolean_t zfs_nocacheflush = B_FALSE; static kmem_cache_t *zil_lwb_cache; +static boolean_t zil_empty(zilog_t *zilog); + +#define LWB_EMPTY(lwb) ((BP_GET_LSIZE(&lwb->lwb_blk) - \ + sizeof (zil_chain_t)) == (lwb->lwb_sz - lwb->lwb_nused)) + + static int -zil_dva_compare(const void *x1, const void *x2) +zil_bp_compare(const void *x1, const void *x2) { - const dva_t *dva1 = x1; - const dva_t *dva2 = x2; + const dva_t *dva1 = &((zil_bp_node_t *)x1)->zn_dva; + const dva_t *dva2 = &((zil_bp_node_t *)x2)->zn_dva; if (DVA_GET_VDEV(dva1) < DVA_GET_VDEV(dva2)) return (-1); @@ -96,34 +102,37 @@ zil_dva_compare(const void *x1, const void *x2) } static void -zil_dva_tree_init(avl_tree_t *t) +zil_bp_tree_init(zilog_t *zilog) { - avl_create(t, zil_dva_compare, sizeof (zil_dva_node_t), - offsetof(zil_dva_node_t, zn_node)); + avl_create(&zilog->zl_bp_tree, zil_bp_compare, + sizeof (zil_bp_node_t), offsetof(zil_bp_node_t, zn_node)); } static void -zil_dva_tree_fini(avl_tree_t *t) +zil_bp_tree_fini(zilog_t *zilog) { - zil_dva_node_t *zn; + avl_tree_t *t = &zilog->zl_bp_tree; + zil_bp_node_t *zn; void *cookie = NULL; while ((zn = avl_destroy_nodes(t, &cookie)) != NULL) - kmem_free(zn, sizeof (zil_dva_node_t)); + kmem_free(zn, sizeof (zil_bp_node_t)); avl_destroy(t); } -static int -zil_dva_tree_add(avl_tree_t *t, dva_t *dva) +int +zil_bp_tree_add(zilog_t *zilog, const blkptr_t *bp) { - zil_dva_node_t *zn; + avl_tree_t *t = &zilog->zl_bp_tree; + const dva_t *dva = BP_IDENTITY(bp); + zil_bp_node_t *zn; avl_index_t where; if (avl_find(t, dva, &where) != NULL) return (EEXIST); - zn = kmem_alloc(sizeof (zil_dva_node_t), KM_SLEEP); + zn = kmem_alloc(sizeof (zil_bp_node_t), KM_SLEEP); zn->zn_dva = *dva; avl_insert(t, zn, where); @@ -148,35 +157,31 @@ zil_init_log_chain(zilog_t *zilog, blkptr_t *bp) } /* - * Read a log block, make sure it's valid, and byteswap it if necessary. + * Read a log block and make sure it's valid. */ static int -zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, arc_buf_t **abufpp) +zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, blkptr_t *nbp, void *dst, + char **end) { - blkptr_t blk = *bp; - zbookmark_t zb; + enum zio_flag zio_flags = ZIO_FLAG_CANFAIL; uint32_t aflags = ARC_WAIT; + arc_buf_t *abuf = NULL; + zbookmark_t zb; int error; - zb.zb_objset = bp->blk_cksum.zc_word[ZIL_ZC_OBJSET]; - zb.zb_object = 0; - zb.zb_level = -1; - zb.zb_blkid = bp->blk_cksum.zc_word[ZIL_ZC_SEQ]; + if (zilog->zl_header->zh_claim_txg == 0) + zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB; - *abufpp = NULL; + if (!(zilog->zl_header->zh_flags & ZIL_CLAIM_LR_SEQ_VALID)) + zio_flags |= ZIO_FLAG_SPECULATIVE; - /* - * We shouldn't be doing any scrubbing while we're doing log - * replay, it's OK to not lock. 
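/*
 * Illustrative sketch (not part of the original change): zil_bp_tree_add()
 * above keys its AVL tree on a block's first DVA so that a log block seen
 * twice during claim or free is reported as EEXIST. The comparator below
 * mirrors that (vdev, offset) ordering with an invented sketch_dva struct
 * and made-up sample values.
 */
#include <stdio.h>
#include <stdint.h>

struct sketch_dva {
	uint64_t	d_vdev;		/* which top-level vdev */
	uint64_t	d_offset;	/* byte offset within that vdev */
};

static int
sketch_dva_compare(const struct sketch_dva *d1, const struct sketch_dva *d2)
{
	if (d1->d_vdev < d2->d_vdev)
		return (-1);
	if (d1->d_vdev > d2->d_vdev)
		return (1);
	if (d1->d_offset < d2->d_offset)
		return (-1);
	if (d1->d_offset > d2->d_offset)
		return (1);
	return (0);			/* same block: caller reports EEXIST */
}

int
main(void)
{
	struct sketch_dva a = { 0, 4096 };
	struct sketch_dva b = { 1, 0 };

	(void) printf("a vs b: %d, a vs a: %d\n",
	    sketch_dva_compare(&a, &b), sketch_dva_compare(&a, &a));
	return (0);
}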
- */ - error = arc_read_nolock(NULL, zilog->zl_spa, &blk, - arc_getbuf_func, abufpp, ZIO_PRIORITY_SYNC_READ, ZIO_FLAG_CANFAIL | - ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB, &aflags, &zb); + SET_BOOKMARK(&zb, bp->blk_cksum.zc_word[ZIL_ZC_OBJSET], + ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, bp->blk_cksum.zc_word[ZIL_ZC_SEQ]); + + error = arc_read_nolock(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf, + ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); if (error == 0) { - char *data = (*abufpp)->b_data; - uint64_t blksz = BP_GET_LSIZE(bp); - zil_trailer_t *ztp = (zil_trailer_t *)(data + blksz) - 1; zio_cksum_t cksum = bp->blk_cksum; /* @@ -189,43 +194,102 @@ zil_read_log_block(zilog_t *zilog, const blkptr_t *bp, arc_buf_t **abufpp) */ cksum.zc_word[ZIL_ZC_SEQ]++; - if (bcmp(&cksum, &ztp->zit_next_blk.blk_cksum, - sizeof (cksum)) || BP_IS_HOLE(&ztp->zit_next_blk) || - (ztp->zit_nused > (blksz - sizeof (zil_trailer_t)))) { - error = ECKSUM; - } + if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) { + zil_chain_t *zilc = abuf->b_data; + char *lr = (char *)(zilc + 1); + uint64_t len = zilc->zc_nused - sizeof (zil_chain_t); - if (error) { - VERIFY(arc_buf_remove_ref(*abufpp, abufpp) == 1); - *abufpp = NULL; + if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum, + sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk)) { + error = ECKSUM; + } else { + bcopy(lr, dst, len); + *end = (char *)dst + len; + *nbp = zilc->zc_next_blk; + } + } else { + char *lr = abuf->b_data; + uint64_t size = BP_GET_LSIZE(bp); + zil_chain_t *zilc = (zil_chain_t *)(lr + size) - 1; + + if (bcmp(&cksum, &zilc->zc_next_blk.blk_cksum, + sizeof (cksum)) || BP_IS_HOLE(&zilc->zc_next_blk) || + (zilc->zc_nused > (size - sizeof (*zilc)))) { + error = ECKSUM; + } else { + bcopy(lr, dst, zilc->zc_nused); + *end = (char *)dst + zilc->zc_nused; + *nbp = zilc->zc_next_blk; + } } + + VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1); + } + + return (error); +} + +/* + * Read a TX_WRITE log data block. + */ +static int +zil_read_log_data(zilog_t *zilog, const lr_write_t *lr, void *wbuf) +{ + enum zio_flag zio_flags = ZIO_FLAG_CANFAIL; + const blkptr_t *bp = &lr->lr_blkptr; + uint32_t aflags = ARC_WAIT; + arc_buf_t *abuf = NULL; + zbookmark_t zb; + int error; + + if (BP_IS_HOLE(bp)) { + if (wbuf != NULL) + bzero(wbuf, MAX(BP_GET_LSIZE(bp), lr->lr_length)); + return (0); } - dprintf("error %d on %llu:%llu\n", error, zb.zb_objset, zb.zb_blkid); + if (zilog->zl_header->zh_claim_txg == 0) + zio_flags |= ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB; + + SET_BOOKMARK(&zb, dmu_objset_id(zilog->zl_os), lr->lr_foid, + ZB_ZIL_LEVEL, lr->lr_offset / BP_GET_LSIZE(bp)); + + error = arc_read_nolock(NULL, zilog->zl_spa, bp, arc_getbuf_func, &abuf, + ZIO_PRIORITY_SYNC_READ, zio_flags, &aflags, &zb); + + if (error == 0) { + if (wbuf != NULL) + bcopy(abuf->b_data, wbuf, arc_buf_size(abuf)); + (void) arc_buf_remove_ref(abuf, &abuf); + } return (error); } /* * Parse the intent log, and call parse_func for each valid record within. - * Return the highest sequence number. */ -uint64_t +int zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, zil_parse_lr_func_t *parse_lr_func, void *arg, uint64_t txg) { const zil_header_t *zh = zilog->zl_header; - uint64_t claim_seq = zh->zh_claim_seq; - uint64_t seq = 0; - uint64_t max_seq = 0; - blkptr_t blk = zh->zh_log; - arc_buf_t *abuf; + boolean_t claimed = !!zh->zh_claim_txg; + uint64_t claim_blk_seq = claimed ? zh->zh_claim_blk_seq : UINT64_MAX; + uint64_t claim_lr_seq = claimed ? 
zh->zh_claim_lr_seq : UINT64_MAX; + uint64_t max_blk_seq = 0; + uint64_t max_lr_seq = 0; + uint64_t blk_count = 0; + uint64_t lr_count = 0; + blkptr_t blk, next_blk; char *lrbuf, *lrp; - zil_trailer_t *ztp; - int reclen, error; + int error = 0; - if (BP_IS_HOLE(&blk)) - return (max_seq); + /* + * Old logs didn't record the maximum zh_claim_lr_seq. + */ + if (!(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID)) + claim_lr_seq = UINT64_MAX; /* * Starting at the block pointed to by zh_log we read the log chain. @@ -236,105 +300,156 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, * If the log has been claimed, stop if we encounter a sequence * number greater than the highest claimed sequence number. */ - zil_dva_tree_init(&zilog->zl_dva_tree); - for (;;) { - seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ]; - - if (claim_seq != 0 && seq > claim_seq) - break; + lrbuf = zio_buf_alloc(SPA_MAXBLOCKSIZE); + zil_bp_tree_init(zilog); - ASSERT(max_seq < seq); - max_seq = seq; + for (blk = zh->zh_log; !BP_IS_HOLE(&blk); blk = next_blk) { + uint64_t blk_seq = blk.blk_cksum.zc_word[ZIL_ZC_SEQ]; + int reclen; + char *end; - error = zil_read_log_block(zilog, &blk, &abuf); + if (blk_seq > claim_blk_seq) + break; + if ((error = parse_blk_func(zilog, &blk, arg, txg)) != 0) + break; + ASSERT3U(max_blk_seq, <, blk_seq); + max_blk_seq = blk_seq; + blk_count++; - if (parse_blk_func != NULL) - parse_blk_func(zilog, &blk, arg, txg); + if (max_lr_seq == claim_lr_seq && max_blk_seq == claim_blk_seq) + break; + error = zil_read_log_block(zilog, &blk, &next_blk, lrbuf, &end); if (error) break; - lrbuf = abuf->b_data; - ztp = (zil_trailer_t *)(lrbuf + BP_GET_LSIZE(&blk)) - 1; - blk = ztp->zit_next_blk; - - if (parse_lr_func == NULL) { - VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1); - continue; - } - - for (lrp = lrbuf; lrp < lrbuf + ztp->zit_nused; lrp += reclen) { + for (lrp = lrbuf; lrp < end; lrp += reclen) { lr_t *lr = (lr_t *)lrp; reclen = lr->lrc_reclen; ASSERT3U(reclen, >=, sizeof (lr_t)); - parse_lr_func(zilog, lr, arg, txg); + if (lr->lrc_seq > claim_lr_seq) + goto done; + if ((error = parse_lr_func(zilog, lr, arg, txg)) != 0) + goto done; + ASSERT3U(max_lr_seq, <, lr->lrc_seq); + max_lr_seq = lr->lrc_seq; + lr_count++; } - VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1); } - zil_dva_tree_fini(&zilog->zl_dva_tree); +done: + zilog->zl_parse_error = error; + zilog->zl_parse_blk_seq = max_blk_seq; + zilog->zl_parse_lr_seq = max_lr_seq; + zilog->zl_parse_blk_count = blk_count; + zilog->zl_parse_lr_count = lr_count; + + ASSERT(!claimed || !(zh->zh_flags & ZIL_CLAIM_LR_SEQ_VALID) || + (max_blk_seq == claim_blk_seq && max_lr_seq == claim_lr_seq)); - return (max_seq); + zil_bp_tree_fini(zilog); + zio_buf_free(lrbuf, SPA_MAXBLOCKSIZE); + + return (error); } -/* ARGSUSED */ -static void +static int zil_claim_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t first_txg) { - spa_t *spa = zilog->zl_spa; - int err; - /* * Claim log block if not already committed and not already claimed. + * If tx == NULL, just verify that the block is claimable. */ - if (bp->blk_birth >= first_txg && - zil_dva_tree_add(&zilog->zl_dva_tree, BP_IDENTITY(bp)) == 0) { - err = zio_wait(zio_claim(NULL, spa, first_txg, bp, NULL, NULL, - ZIO_FLAG_MUSTSUCCEED)); - ASSERT(err == 0); - } + if (bp->blk_birth < first_txg || zil_bp_tree_add(zilog, bp) != 0) + return (0); + + return (zio_wait(zio_claim(NULL, zilog->zl_spa, + tx == NULL ? 
0 : first_txg, bp, spa_claim_notify, NULL, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SCRUB))); } -static void +static int zil_claim_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t first_txg) { - if (lrc->lrc_txtype == TX_WRITE) { - lr_write_t *lr = (lr_write_t *)lrc; - zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg); - } + lr_write_t *lr = (lr_write_t *)lrc; + int error; + + if (lrc->lrc_txtype != TX_WRITE) + return (0); + + /* + * If the block is not readable, don't claim it. This can happen + * in normal operation when a log block is written to disk before + * some of the dmu_sync() blocks it points to. In this case, the + * transaction cannot have been committed to anyone (we would have + * waited for all writes to be stable first), so it is semantically + * correct to declare this the end of the log. + */ + if (lr->lr_blkptr.blk_birth >= first_txg && + (error = zil_read_log_data(zilog, lr, NULL)) != 0) + return (error); + return (zil_claim_log_block(zilog, &lr->lr_blkptr, tx, first_txg)); } /* ARGSUSED */ -static void +static int zil_free_log_block(zilog_t *zilog, blkptr_t *bp, void *tx, uint64_t claim_txg) { - zio_free_blk(zilog->zl_spa, bp, dmu_tx_get_txg(tx)); + zio_free_zil(zilog->zl_spa, dmu_tx_get_txg(tx), bp); + + return (0); } -static void +static int zil_free_log_record(zilog_t *zilog, lr_t *lrc, void *tx, uint64_t claim_txg) { + lr_write_t *lr = (lr_write_t *)lrc; + blkptr_t *bp = &lr->lr_blkptr; + /* * If we previously claimed it, we need to free it. */ - if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE) { - lr_write_t *lr = (lr_write_t *)lrc; - blkptr_t *bp = &lr->lr_blkptr; - if (bp->blk_birth >= claim_txg && - !zil_dva_tree_add(&zilog->zl_dva_tree, BP_IDENTITY(bp))) { - (void) arc_free(NULL, zilog->zl_spa, - dmu_tx_get_txg(tx), bp, NULL, NULL, ARC_WAIT); - } + if (claim_txg != 0 && lrc->lrc_txtype == TX_WRITE && + bp->blk_birth >= claim_txg && zil_bp_tree_add(zilog, bp) == 0) + zio_free(zilog->zl_spa, dmu_tx_get_txg(tx), bp); + + return (0); +} + +static lwb_t * +zil_alloc_lwb(zilog_t *zilog, blkptr_t *bp, uint64_t txg) +{ + lwb_t *lwb; + + lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP); + lwb->lwb_zilog = zilog; + lwb->lwb_blk = *bp; + lwb->lwb_buf = zio_buf_alloc(BP_GET_LSIZE(bp)); + lwb->lwb_max_txg = txg; + lwb->lwb_zio = NULL; + lwb->lwb_tx = NULL; + if (BP_GET_CHECKSUM(bp) == ZIO_CHECKSUM_ZILOG2) { + lwb->lwb_nused = sizeof (zil_chain_t); + lwb->lwb_sz = BP_GET_LSIZE(bp); + } else { + lwb->lwb_nused = 0; + lwb->lwb_sz = BP_GET_LSIZE(bp) - sizeof (zil_chain_t); } + + mutex_enter(&zilog->zl_lock); + list_insert_tail(&zilog->zl_lwb_list, lwb); + mutex_exit(&zilog->zl_lock); + + return (lwb); } /* * Create an on-disk intent log. */ -static void +static lwb_t * zil_create(zilog_t *zilog) { const zil_header_t *zh = zilog->zl_header; - lwb_t *lwb; + lwb_t *lwb = NULL; uint64_t txg = 0; dmu_tx_t *tx = NULL; blkptr_t blk; @@ -351,22 +466,23 @@ zil_create(zilog_t *zilog) blk = zh->zh_log; /* - * If we don't already have an initial log block or we have one - * but it's the wrong endianness then allocate one. 
+	 * Allocate an initial log block if:
+	 *    - there isn't one already
+	 *    - the existing block is the wrong endianness
 	 */
 	if (BP_IS_HOLE(&blk) || BP_SHOULD_BYTESWAP(&blk)) {
 		tx = dmu_tx_create(zilog->zl_os);
-		(void) dmu_tx_assign(tx, TXG_WAIT);
+		VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0);
 		dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
 		txg = dmu_tx_get_txg(tx);
 
 		if (!BP_IS_HOLE(&blk)) {
-			zio_free_blk(zilog->zl_spa, &blk, txg);
+			zio_free_zil(zilog->zl_spa, txg, &blk);
 			BP_ZERO(&blk);
 		}
 
-		error = zio_alloc_blk(zilog->zl_spa, ZIL_MIN_BLKSZ, &blk,
-		    NULL, txg);
+		error = zio_alloc_zil(zilog->zl_spa, txg, &blk, NULL,
+		    ZIL_MIN_BLKSZ, zilog->zl_logbias == ZFS_LOGBIAS_LATENCY);
 
 		if (error == 0)
 			zil_init_log_chain(zilog, &blk);
@@ -375,20 +491,8 @@ zil_create(zilog_t *zilog)
 	/*
 	 * Allocate a log write buffer (lwb) for the first log block.
 	 */
-	if (error == 0) {
-		lwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP);
-		lwb->lwb_zilog = zilog;
-		lwb->lwb_blk = blk;
-		lwb->lwb_nused = 0;
-		lwb->lwb_sz = BP_GET_LSIZE(&lwb->lwb_blk);
-		lwb->lwb_buf = zio_buf_alloc(lwb->lwb_sz);
-		lwb->lwb_max_txg = txg;
-		lwb->lwb_zio = NULL;
-
-		mutex_enter(&zilog->zl_lock);
-		list_insert_tail(&zilog->zl_lwb_list, lwb);
-		mutex_exit(&zilog->zl_lock);
-	}
+	if (error == 0)
+		lwb = zil_alloc_lwb(zilog, &blk, txg);
 
 	/*
 	 * If we just allocated the first log block, commit our transaction
@@ -401,6 +505,8 @@ zil_create(zilog_t *zilog)
 	}
 
 	ASSERT(bcmp(&blk, &zh->zh_log, sizeof (blk)) == 0);
+
+	return (lwb);
 }
 
 /*
@@ -425,26 +531,18 @@ zil_destroy(zilog_t *zilog, boolean_t keep_first)
 	 */
 	txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg);
 
+	zilog->zl_old_header = *zh;	/* debugging aid */
+
 	if (BP_IS_HOLE(&zh->zh_log))
 		return;
 
 	tx = dmu_tx_create(zilog->zl_os);
-	(void) dmu_tx_assign(tx, TXG_WAIT);
+	VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0);
 	dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx);
 	txg = dmu_tx_get_txg(tx);
 
 	mutex_enter(&zilog->zl_lock);
 
-	/*
-	 * It is possible for the ZIL to get the previously mounted zilog
-	 * structure of the same dataset if quickly remounted and the dbuf
-	 * eviction has not completed. In this case we can see a non
-	 * empty lwb list and keep_first will be set. We fix this by
-	 * clearing the keep_first. This will be slower but it's very rare.
-	 */
-	if (!list_is_empty(&zilog->zl_lwb_list) && keep_first)
-		keep_first = B_FALSE;
-
 	ASSERT3U(zilog->zl_destroy_txg, <, txg);
 	zilog->zl_destroy_txg = txg;
 	zilog->zl_keep_first = keep_first;
@@ -456,53 +554,20 @@ zil_destroy(zilog_t *zilog, boolean_t keep_first)
 			list_remove(&zilog->zl_lwb_list, lwb);
 			if (lwb->lwb_buf != NULL)
 				zio_buf_free(lwb->lwb_buf, lwb->lwb_sz);
-			zio_free_blk(zilog->zl_spa, &lwb->lwb_blk, txg);
+			zio_free_zil(zilog->zl_spa, txg, &lwb->lwb_blk);
 			kmem_cache_free(zil_lwb_cache, lwb);
 		}
-	} else {
-		if (!keep_first) {
-			(void) zil_parse(zilog, zil_free_log_block,
-			    zil_free_log_record, tx, zh->zh_claim_txg);
-		}
+	} else if (!keep_first) {
+		(void) zil_parse(zilog, zil_free_log_block,
+		    zil_free_log_record, tx, zh->zh_claim_txg);
 	}
 	mutex_exit(&zilog->zl_lock);
 
 	dmu_tx_commit(tx);
 }
 
-/*
- * zil_rollback_destroy() is only called by the rollback code.
- * We already have a syncing tx. Rollback has exclusive access to the
- * dataset, so we don't have to worry about concurrent zil access.
- * The actual freeing of any log blocks occurs in zil_sync() later in
- * this txg syncing phase.
- */ -void -zil_rollback_destroy(zilog_t *zilog, dmu_tx_t *tx) -{ - const zil_header_t *zh = zilog->zl_header; - uint64_t txg; - - if (BP_IS_HOLE(&zh->zh_log)) - return; - - txg = dmu_tx_get_txg(tx); - ASSERT3U(zilog->zl_destroy_txg, <, txg); - zilog->zl_destroy_txg = txg; - zilog->zl_keep_first = B_FALSE; - - /* - * Ensure there's no outstanding ZIL IO. No lwbs or just the - * unused one that allocated in advance is ok. - */ - ASSERT(zilog->zl_lwb_list.list_head.list_next == - zilog->zl_lwb_list.list_head.list_prev); - (void) zil_parse(zilog, zil_free_log_block, zil_free_log_record, - tx, zh->zh_claim_txg); -} - int -zil_claim(char *osname, void *txarg) +zil_claim(const char *osname, void *txarg) { dmu_tx_t *tx = txarg; uint64_t first_txg = dmu_tx_get_txg(tx); @@ -511,7 +576,7 @@ zil_claim(char *osname, void *txarg) objset_t *os; int error; - error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_USER, &os); + error = dmu_objset_hold(osname, FTAG, &os); if (error) { cmn_err(CE_WARN, "can't open objset for %s", osname); return (0); @@ -520,6 +585,15 @@ zil_claim(char *osname, void *txarg) zilog = dmu_objset_zil(os); zh = zil_header_in_syncing_context(zilog); + if (spa_get_log_state(zilog->zl_spa) == SPA_LOG_CLEAR) { + if (!BP_IS_HOLE(&zh->zh_log)) + zio_free_zil(zilog->zl_spa, first_txg, &zh->zh_log); + BP_ZERO(&zh->zh_log); + dsl_dataset_dirty(dmu_objset_ds(os), tx); + dmu_objset_rele(os, FTAG); + return (0); + } + /* * Claim all log blocks if we haven't already done so, and remember * the highest claimed sequence number. This ensures that if we can @@ -529,14 +603,19 @@ zil_claim(char *osname, void *txarg) */ ASSERT3U(zh->zh_claim_txg, <=, first_txg); if (zh->zh_claim_txg == 0 && !BP_IS_HOLE(&zh->zh_log)) { - zh->zh_claim_txg = first_txg; - zh->zh_claim_seq = zil_parse(zilog, zil_claim_log_block, + (void) zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx, first_txg); + zh->zh_claim_txg = first_txg; + zh->zh_claim_blk_seq = zilog->zl_parse_blk_seq; + zh->zh_claim_lr_seq = zilog->zl_parse_lr_seq; + if (zilog->zl_parse_lr_count || zilog->zl_parse_blk_count > 1) + zh->zh_flags |= ZIL_REPLAY_NEEDED; + zh->zh_flags |= ZIL_CLAIM_LR_SEQ_VALID; dsl_dataset_dirty(dmu_objset_ds(os), tx); } ASSERT3U(first_txg, ==, (spa_last_synced_txg(zilog->zl_spa) + 1)); - dmu_objset_close(os); + dmu_objset_rele(os, FTAG); return (0); } @@ -545,76 +624,36 @@ zil_claim(char *osname, void *txarg) * Checksum errors are ok as they indicate the end of the chain. * Any other error (no device or read failure) returns an error. 
*/ -/* ARGSUSED */ int -zil_check_log_chain(char *osname, void *txarg) +zil_check_log_chain(const char *osname, void *tx) { zilog_t *zilog; - zil_header_t *zh; - blkptr_t blk; - arc_buf_t *abuf; objset_t *os; - char *lrbuf; - zil_trailer_t *ztp; int error; - error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_USER, &os); + ASSERT(tx == NULL); + + error = dmu_objset_hold(osname, FTAG, &os); if (error) { cmn_err(CE_WARN, "can't open objset for %s", osname); return (0); } zilog = dmu_objset_zil(os); - zh = zil_header_in_syncing_context(zilog); - blk = zh->zh_log; - if (BP_IS_HOLE(&blk)) { - dmu_objset_close(os); - return (0); /* no chain */ - } - - for (;;) { - error = zil_read_log_block(zilog, &blk, &abuf); - if (error) - break; - lrbuf = abuf->b_data; - ztp = (zil_trailer_t *)(lrbuf + BP_GET_LSIZE(&blk)) - 1; - blk = ztp->zit_next_blk; - VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1); - } - dmu_objset_close(os); - if (error == ECKSUM) - return (0); /* normal end of chain */ - return (error); -} -/* - * Clear a log chain - */ -/* ARGSUSED */ -int -zil_clear_log_chain(char *osname, void *txarg) -{ - zilog_t *zilog; - zil_header_t *zh; - objset_t *os; - dmu_tx_t *tx; - int error; + /* + * Because tx == NULL, zil_claim_log_block() will not actually claim + * any blocks, but just determine whether it is possible to do so. + * In addition to checking the log chain, zil_claim_log_block() + * will invoke zio_claim() with a done func of spa_claim_notify(), + * which will update spa_max_claim_txg. See spa_load() for details. + */ + error = zil_parse(zilog, zil_claim_log_block, zil_claim_log_record, tx, + zilog->zl_header->zh_claim_txg ? -1ULL : spa_first_txg(os->os_spa)); - error = dmu_objset_open(osname, DMU_OST_ANY, DS_MODE_USER, &os); - if (error) { - cmn_err(CE_WARN, "can't open objset for %s", osname); - return (0); - } + dmu_objset_rele(os, FTAG); - zilog = dmu_objset_zil(os); - tx = dmu_tx_create(zilog->zl_os); - (void) dmu_tx_assign(tx, TXG_WAIT); - zh = zil_header_in_syncing_context(zilog); - BP_ZERO(&zh->zh_log); - dsl_dataset_dirty(dmu_objset_ds(os), tx); - dmu_tx_commit(tx); - dmu_objset_close(os); - return (0); + return ((error == ECKSUM || error == ENOENT) ? 0 : error); } static int @@ -632,7 +671,7 @@ zil_vdev_compare(const void *x1, const void *x2) } void -zil_add_block(zilog_t *zilog, blkptr_t *bp) +zil_add_block(zilog_t *zilog, const blkptr_t *bp) { avl_tree_t *t = &zilog->zl_vdev_tree; avl_index_t where; @@ -708,9 +747,9 @@ zil_lwb_write_done(zio_t *zio) { lwb_t *lwb = zio->io_private; zilog_t *zilog = lwb->lwb_zilog; + dmu_tx_t *tx = lwb->lwb_tx; ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); - ASSERT(BP_GET_CHECKSUM(zio->io_bp) == ZIO_CHECKSUM_ZILOG); ASSERT(BP_GET_TYPE(zio->io_bp) == DMU_OT_INTENT_LOG); ASSERT(BP_GET_LEVEL(zio->io_bp) == 0); ASSERT(BP_GET_BYTEORDER(zio->io_bp) == ZFS_HOST_BYTEORDER); @@ -719,18 +758,25 @@ zil_lwb_write_done(zio_t *zio) ASSERT(zio->io_bp->blk_fill == 0); /* - * Now that we've written this log block, we have a stable pointer - * to the next block in the chain, so it's OK to let the txg in - * which we allocated the next block sync. + * Ensure the lwb buffer pointer is cleared before releasing + * the txg. If we have had an allocation failure and + * the txg is waiting to sync then we want zil_sync() + * to remove the lwb so that it's not picked up as the next new + * one in zil_commit_writer(). zil_sync() will only remove + * the lwb if lwb_buf is null.
*/ - txg_rele_to_sync(&lwb->lwb_txgh); - zio_buf_free(lwb->lwb_buf, lwb->lwb_sz); mutex_enter(&zilog->zl_lock); lwb->lwb_buf = NULL; - if (zio->io_error) - zilog->zl_log_error = B_TRUE; + lwb->lwb_tx = NULL; mutex_exit(&zilog->zl_lock); + + /* + * Now that we've written this log block, we have a stable pointer + * to the next block in the chain, so it's OK to let the txg in + * which we allocated the next block sync. + */ + dmu_tx_commit(tx); } /* @@ -741,10 +787,9 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb) { zbookmark_t zb; - zb.zb_objset = lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET]; - zb.zb_object = 0; - zb.zb_level = -1; - zb.zb_blkid = lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]; + SET_BOOKMARK(&zb, lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_OBJSET], + ZB_ZIL_OBJECT, ZB_ZIL_LEVEL, + lwb->lwb_blk.blk_cksum.zc_word[ZIL_ZC_SEQ]); if (zilog->zl_root_zio == NULL) { zilog->zl_root_zio = zio_root(zilog->zl_spa, NULL, NULL, @@ -752,12 +797,35 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb) } if (lwb->lwb_zio == NULL) { lwb->lwb_zio = zio_rewrite(zilog->zl_root_zio, zilog->zl_spa, - 0, &lwb->lwb_blk, lwb->lwb_buf, - lwb->lwb_sz, zil_lwb_write_done, lwb, - ZIO_PRIORITY_LOG_WRITE, ZIO_FLAG_CANFAIL, &zb); + 0, &lwb->lwb_blk, lwb->lwb_buf, BP_GET_LSIZE(&lwb->lwb_blk), + zil_lwb_write_done, lwb, ZIO_PRIORITY_LOG_WRITE, + ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE, &zb); } } +/* + * Define a limited set of intent log block sizes. + * These must be a multiple of 4KB. Note only the amount used (again + * aligned to 4KB) actually gets written. However, we can't always just + * allocate SPA_MAXBLOCKSIZE as the slog space could be exhausted. + */ +uint64_t zil_block_buckets[] = { + 4096, /* non TX_WRITE */ + 8192+4096, /* data base */ + 32*1024 + 4096, /* NFS writes */ + UINT64_MAX +}; + +/* + * Use the slog as long as the logbias is 'latency' and the current commit size + * is less than the limit or the total list size is less than 2X the limit. + * Limit checking is disabled by setting zil_slog_limit to UINT64_MAX. + */ +uint64_t zil_slog_limit = 1024 * 1024; +#define USE_SLOG(zilog) (((zilog)->zl_logbias == ZFS_LOGBIAS_LATENCY) && \ + (((zilog)->zl_cur_used < zil_slog_limit) || \ + ((zilog)->zl_itx_list_sz < (zil_slog_limit << 1)))) + /* * Start a log block write and advance to the next log block. * Calls are serialized. @@ -765,105 +833,105 @@ zil_lwb_write_init(zilog_t *zilog, lwb_t *lwb) static lwb_t * zil_lwb_write_start(zilog_t *zilog, lwb_t *lwb) { - lwb_t *nlwb; - zil_trailer_t *ztp = (zil_trailer_t *)(lwb->lwb_buf + lwb->lwb_sz) - 1; + lwb_t *nlwb = NULL; + zil_chain_t *zilc; spa_t *spa = zilog->zl_spa; - blkptr_t *bp = &ztp->zit_next_blk; + blkptr_t *bp; + dmu_tx_t *tx; uint64_t txg; uint64_t zil_blksz; - int error; + int i, error; + + if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) { + zilc = (zil_chain_t *)lwb->lwb_buf; + bp = &zilc->zc_next_blk; + } else { + zilc = (zil_chain_t *)(lwb->lwb_buf + lwb->lwb_sz); + bp = &zilc->zc_next_blk; + } - ASSERT(lwb->lwb_nused <= ZIL_BLK_DATA_SZ(lwb)); + ASSERT(lwb->lwb_nused <= lwb->lwb_sz); /* * Allocate the next block and save its address in this block * before writing it in order to establish the log chain. * Note that if the allocation of nlwb synced before we wrote * the block that points at it (lwb), we'd leak it if we crashed. - * Therefore, we don't do txg_rele_to_sync() until zil_lwb_write_done(). + * Therefore, we don't do dmu_tx_commit() until zil_lwb_write_done(). 
+ * We dirty the dataset to ensure that zil_sync() will be called + * to clean up in the event of allocation failure or I/O failure. */ - txg = txg_hold_open(zilog->zl_dmu_pool, &lwb->lwb_txgh); - txg_rele_to_quiesce(&lwb->lwb_txgh); + tx = dmu_tx_create(zilog->zl_os); + VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0); + dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); + txg = dmu_tx_get_txg(tx); + + lwb->lwb_tx = tx; /* - * Pick a ZIL blocksize. We request a size that is the - * maximum of the previous used size, the current used size and - * the amount waiting in the queue. + * Log blocks are pre-allocated. Here we select the size of the next + * block, based on size used in the last block. + * - first find the smallest bucket that will fit the block from a + * limited set of block sizes. This is because it's faster to write + * blocks allocated from the same metaslab as they are adjacent or + * close. + * - next find the maximum from the new suggested size and an array of + * previous sizes. This lessens a picket fence effect of wrongly + * guessing the size if we have a stream of say 2k, 64k, 2k, 64k + * requests. + * + * Note we only write what is used, but we can't just allocate + * the maximum block size because we can exhaust the available + * pool log space. */ - zil_blksz = MAX(zilog->zl_prev_used, - zilog->zl_cur_used + sizeof (*ztp)); - zil_blksz = MAX(zil_blksz, zilog->zl_itx_list_sz + sizeof (*ztp)); - zil_blksz = P2ROUNDUP_TYPED(zil_blksz, ZIL_MIN_BLKSZ, uint64_t); - if (zil_blksz > ZIL_MAX_BLKSZ) - zil_blksz = ZIL_MAX_BLKSZ; + zil_blksz = zilog->zl_cur_used + sizeof (zil_chain_t); + for (i = 0; zil_blksz > zil_block_buckets[i]; i++) + continue; + zil_blksz = zil_block_buckets[i]; + if (zil_blksz == UINT64_MAX) + zil_blksz = SPA_MAXBLOCKSIZE; + zilog->zl_prev_blks[zilog->zl_prev_rotor] = zil_blksz; + for (i = 0; i < ZIL_PREV_BLKS; i++) + zil_blksz = MAX(zil_blksz, zilog->zl_prev_blks[i]); + zilog->zl_prev_rotor = (zilog->zl_prev_rotor + 1) & (ZIL_PREV_BLKS - 1); BP_ZERO(bp); /* pass the old blkptr in order to spread log blocks across devs */ - error = zio_alloc_blk(spa, zil_blksz, bp, &lwb->lwb_blk, txg); - if (error) { - dmu_tx_t *tx = dmu_tx_create_assigned(zilog->zl_dmu_pool, txg); + error = zio_alloc_zil(spa, txg, bp, &lwb->lwb_blk, zil_blksz, + USE_SLOG(zilog)); + if (!error) { + ASSERT3U(bp->blk_birth, ==, txg); + bp->blk_cksum = lwb->lwb_blk.blk_cksum; + bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++; /* - * We dirty the dataset to ensure that zil_sync() will - * be called to remove this lwb from our zl_lwb_list. - * Failing to do so, may leave an lwb with a NULL lwb_buf - * hanging around on the zl_lwb_list. + * Allocate a new log write buffer (lwb). */ - dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); - dmu_tx_commit(tx); + nlwb = zil_alloc_lwb(zilog, bp, txg); - /* - * Since we've just experienced an allocation failure so we - * terminate the current lwb and send it on its way.
- */ - ztp->zit_pad = 0; - ztp->zit_nused = lwb->lwb_nused; - ztp->zit_bt.zbt_cksum = lwb->lwb_blk.blk_cksum; - zio_nowait(lwb->lwb_zio); - - /* - * By returning NULL the caller will call tx_wait_synced() - */ - return (NULL); + /* Record the block for later vdev flushing */ + zil_add_block(zilog, &lwb->lwb_blk); } - ASSERT3U(bp->blk_birth, ==, txg); - ztp->zit_pad = 0; - ztp->zit_nused = lwb->lwb_nused; - ztp->zit_bt.zbt_cksum = lwb->lwb_blk.blk_cksum; - bp->blk_cksum = lwb->lwb_blk.blk_cksum; - bp->blk_cksum.zc_word[ZIL_ZC_SEQ]++; + if (BP_GET_CHECKSUM(&lwb->lwb_blk) == ZIO_CHECKSUM_ZILOG2) { + uint64_t len; - /* - * Allocate a new log write buffer (lwb). - */ - nlwb = kmem_cache_alloc(zil_lwb_cache, KM_SLEEP); - - nlwb->lwb_zilog = zilog; - nlwb->lwb_blk = *bp; - nlwb->lwb_nused = 0; - nlwb->lwb_sz = BP_GET_LSIZE(&nlwb->lwb_blk); - nlwb->lwb_buf = zio_buf_alloc(nlwb->lwb_sz); - nlwb->lwb_max_txg = txg; - nlwb->lwb_zio = NULL; + /* For Slim ZIL only write what is used. */ + len = P2ROUNDUP_TYPED(lwb->lwb_nused, ZIL_MIN_BLKSZ, uint64_t); + ASSERT3U(len, <=, lwb->lwb_sz); + zio_shrink(lwb->lwb_zio, len); - /* - * Put new lwb at the end of the log chain - */ - mutex_enter(&zilog->zl_lock); - list_insert_tail(&zilog->zl_lwb_list, nlwb); - mutex_exit(&zilog->zl_lock); + } + zilc->zc_pad = 0; + zilc->zc_nused = lwb->lwb_nused; + zilc->zc_eck.zec_cksum = lwb->lwb_blk.blk_cksum; - /* Record the block for later vdev flushing */ - zil_add_block(zilog, &lwb->lwb_blk); + zio_nowait(lwb->lwb_zio); /* Kick off the write for the old log block */ /* - * kick off the write for the old log block + * If there was an allocation failure then nlwb will be null which + * forces a txg_wait_synced(). */ - dprintf_bp(&lwb->lwb_blk, "lwb %p txg %llu: ", lwb, txg); - ASSERT(lwb->lwb_zio); - zio_nowait(lwb->lwb_zio); - return (nlwb); } @@ -871,20 +939,20 @@ static lwb_t * zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) { lr_t *lrc = &itx->itx_lr; /* common log record */ - lr_write_t *lr = (lr_write_t *)lrc; + lr_write_t *lrw = (lr_write_t *)lrc; + char *lr_buf; uint64_t txg = lrc->lrc_txg; uint64_t reclen = lrc->lrc_reclen; - uint64_t dlen; + uint64_t dlen = 0; if (lwb == NULL) return (NULL); + ASSERT(lwb->lwb_buf != NULL); if (lrc->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) dlen = P2ROUNDUP_TYPED( - lr->lr_length, sizeof (uint64_t), uint64_t); - else - dlen = 0; + lrw->lr_length, sizeof (uint64_t), uint64_t); zilog->zl_cur_used += (reclen + dlen); @@ -893,24 +961,22 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) /* * If this record won't fit in the current log block, start a new one. */ - if (lwb->lwb_nused + reclen + dlen > ZIL_BLK_DATA_SZ(lwb)) { + if (lwb->lwb_nused + reclen + dlen > lwb->lwb_sz) { lwb = zil_lwb_write_start(zilog, lwb); if (lwb == NULL) return (NULL); zil_lwb_write_init(zilog, lwb); - ASSERT(lwb->lwb_nused == 0); - if (reclen + dlen > ZIL_BLK_DATA_SZ(lwb)) { + ASSERT(LWB_EMPTY(lwb)); + if (lwb->lwb_nused + reclen + dlen > lwb->lwb_sz) { txg_wait_synced(zilog->zl_dmu_pool, txg); return (lwb); } } - /* - * Update the lrc_seq, to be log record sequence number. See zil.h - * Then copy the record to the log buffer. - */ - lrc->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */ - bcopy(lrc, lwb->lwb_buf + lwb->lwb_nused, reclen); + lr_buf = lwb->lwb_buf + lwb->lwb_nused; + bcopy(lrc, lr_buf, reclen); + lrc = (lr_t *)lr_buf; + lrw = (lr_write_t *)lrc; /* * If it's a write, fetch the data or get its blkptr as appropriate. 
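The block-size selection added in the hunk above is easy to lose in diff form, so here is a minimal standalone sketch of just that step: take the smallest bucket that fits the bytes queued so far, then widen it to the largest of the recent picks to damp the 2k/64k/2k/64k "picket fence" effect the comment describes. The helper name pick_zil_blksz(), the history depth of 16, and the 128K stand-in for SPA_MAXBLOCKSIZE are illustrative assumptions, not part of the patch.

/*
 * Reading aid only: the next-block sizing step from zil_lwb_write_start(),
 * restated as a self-contained helper.
 */
#include <stdint.h>

#define SKETCH_ZIL_PREV_BLKS	16	/* history depth (assumed) */

static const uint64_t sketch_zil_block_buckets[] = {
	4096,			/* non TX_WRITE */
	8192 + 4096,		/* data base */
	32 * 1024 + 4096,	/* NFS writes */
	UINT64_MAX		/* sentinel */
};

static uint64_t
pick_zil_blksz(uint64_t bytes_needed, uint64_t *prev_blks, unsigned *rotor)
{
	uint64_t blksz = bytes_needed;	/* zl_cur_used + sizeof (zil_chain_t) */
	int i;

	/* Smallest bucket that fits; the UINT64_MAX sentinel ends the scan. */
	for (i = 0; blksz > sketch_zil_block_buckets[i]; i++)
		continue;
	blksz = sketch_zil_block_buckets[i];
	if (blksz == UINT64_MAX)
		blksz = 128 * 1024;	/* stand-in for SPA_MAXBLOCKSIZE */

	/* Remember this pick and take the maximum over the recent picks. */
	prev_blks[*rotor] = blksz;
	for (i = 0; i < SKETCH_ZIL_PREV_BLKS; i++)
		if (prev_blks[i] > blksz)
			blksz = prev_blks[i];
	*rotor = (*rotor + 1) & (SKETCH_ZIL_PREV_BLKS - 1);

	return (blksz);
}

Fed an alternating 2k/64k stream, this converges on the larger bucket instead of flip-flopping, while the ZILOG2 path above still shrinks the write to the 4K-aligned portion actually used, so the larger allocation does not inflate the amount written.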
@@ -922,18 +988,20 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) char *dbuf; int error; - /* alignment is guaranteed */ - lr = (lr_write_t *)(lwb->lwb_buf + lwb->lwb_nused); if (dlen) { ASSERT(itx->itx_wr_state == WR_NEED_COPY); - dbuf = lwb->lwb_buf + lwb->lwb_nused + reclen; - lr->lr_common.lrc_reclen += dlen; + dbuf = lr_buf + reclen; + lrw->lr_common.lrc_reclen += dlen; } else { ASSERT(itx->itx_wr_state == WR_INDIRECT); dbuf = NULL; } error = zilog->zl_get_data( - itx->itx_private, lr, dbuf, lwb->lwb_zio); + itx->itx_private, lrw, dbuf, lwb->lwb_zio); + if (error == EIO) { + txg_wait_synced(zilog->zl_dmu_pool, txg); + return (lwb); + } if (error) { ASSERT(error == ENOENT || error == EEXIST || error == EALREADY); @@ -942,9 +1010,16 @@ zil_lwb_commit(zilog_t *zilog, itx_t *itx, lwb_t *lwb) } } + /* + * We're actually making an entry, so update lrc_seq to be the + * log record sequence number. Note that this is generally not + * equal to the itx sequence number because not all transactions + * are synchronous, and sometimes spa_sync() gets there first. + */ + lrc->lrc_seq = ++zilog->zl_lr_seq; /* we are single threaded */ lwb->lwb_nused += reclen + dlen; lwb->lwb_max_txg = MAX(lwb->lwb_max_txg, txg); - ASSERT3U(lwb->lwb_nused, <=, ZIL_BLK_DATA_SZ(lwb)); + ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_sz); ASSERT3U(P2PHASE(lwb->lwb_nused, sizeof (uint64_t)), ==, 0); return (lwb); @@ -966,12 +1041,19 @@ zil_itx_create(uint64_t txtype, size_t lrsize) return (itx); } +void +zil_itx_destroy(itx_t *itx) +{ + kmem_free(itx, offsetof(itx_t, itx_lr) + itx->itx_lr.lrc_reclen); +} + uint64_t zil_itx_assign(zilog_t *zilog, itx_t *itx, dmu_tx_t *tx) { uint64_t seq; ASSERT(itx->itx_lr.lrc_seq == 0); + ASSERT(!zilog->zl_replay); mutex_enter(&zilog->zl_lock); list_insert_tail(&zilog->zl_itx_list, itx); @@ -1020,8 +1102,7 @@ zil_itx_clean(zilog_t *zilog) /* destroy sync'd log transactions */ while ((itx = list_head(&clean_list)) != NULL) { list_remove(&clean_list, itx); - kmem_free(itx, offsetof(itx_t, itx_lr) - + itx->itx_lr.lrc_reclen); + zil_itx_destroy(itx); } list_destroy(&clean_list); } @@ -1040,7 +1121,7 @@ zil_clean(zilog_t *zilog) if ((itx != NULL) && (itx->itx_lr.lrc_txg <= spa_last_synced_txg(zilog->zl_spa))) { (void) taskq_dispatch(zilog->zl_clean_taskq, - (void (*)(void *))zil_itx_clean, zilog, TQ_NOSLEEP); + (task_func_t *)zil_itx_clean, zilog, TQ_NOSLEEP); } mutex_exit(&zilog->zl_lock); } @@ -1050,9 +1131,10 @@ zil_commit_writer(zilog_t *zilog, uint64_t seq, uint64_t foid) { uint64_t txg; uint64_t commit_seq = 0; - itx_t *itx, *itx_next = (itx_t *)-1; + itx_t *itx, *itx_next; lwb_t *lwb; spa_t *spa; + int error = 0; zilog->zl_writer = B_TRUE; ASSERT(zilog->zl_root_zio == NULL); @@ -1072,77 +1154,64 @@ zil_commit_writer(zilog_t *zilog, uint64_t seq, uint64_t foid) return; } mutex_exit(&zilog->zl_lock); - zil_create(zilog); + lwb = zil_create(zilog); mutex_enter(&zilog->zl_lock); - lwb = list_tail(&zilog->zl_lwb_list); } } + ASSERT(lwb == NULL || lwb->lwb_zio == NULL); /* Loop through in-memory log transactions filling log blocks. */ DTRACE_PROBE1(zil__cw1, zilog_t *, zilog); - for (;;) { + + for (itx = list_head(&zilog->zl_itx_list); itx; itx = itx_next) { /* - * Find the next itx to push: - * Push all transactions related to specified foid and all - * other transactions except TX_WRITE, TX_TRUNCATE, - * TX_SETATTR and TX_ACL for all other files. + * Save the next pointer. 
Even though we drop zl_lock below, + * all threads that can remove itx list entries (other writers + * and zil_itx_clean()) can't do so until they have zl_writer. */ - if (itx_next != (itx_t *)-1) - itx = itx_next; - else - itx = list_head(&zilog->zl_itx_list); - for (; itx != NULL; itx = list_next(&zilog->zl_itx_list, itx)) { - if (foid == 0) /* push all foids? */ - break; - if (itx->itx_sync) /* push all O_[D]SYNC */ - break; - switch (itx->itx_lr.lrc_txtype) { - case TX_SETATTR: - case TX_WRITE: - case TX_TRUNCATE: - case TX_ACL: - /* lr_foid is same offset for these records */ - if (((lr_write_t *)&itx->itx_lr)->lr_foid - != foid) { - continue; /* skip this record */ - } - } - break; - } - if (itx == NULL) - break; + itx_next = list_next(&zilog->zl_itx_list, itx); + + /* + * Determine whether to push this itx. + * Push all transactions related to specified foid and + * all other transactions except those that can be logged + * out of order (TX_WRITE, TX_TRUNCATE, TX_SETATTR, TX_ACL) + * for all other files. + * + * If foid == 0 (meaning "push all foids") or + * itx->itx_sync is set (meaning O_[D]SYNC), push regardless. + */ + if (foid != 0 && !itx->itx_sync && + TX_OOO(itx->itx_lr.lrc_txtype) && + ((lr_ooo_t *)&itx->itx_lr)->lr_foid != foid) + continue; /* skip this record */ if ((itx->itx_lr.lrc_seq > seq) && - ((lwb == NULL) || (lwb->lwb_nused == 0) || - (lwb->lwb_nused + itx->itx_sod > ZIL_BLK_DATA_SZ(lwb)))) { + ((lwb == NULL) || (LWB_EMPTY(lwb)) || + (lwb->lwb_nused + itx->itx_sod > lwb->lwb_sz))) break; - } - /* - * Save the next pointer. Even though we soon drop - * zl_lock all threads that may change the list - * (another writer or zil_itx_clean) can't do so until - * they have zl_writer. - */ - itx_next = list_next(&zilog->zl_itx_list, itx); list_remove(&zilog->zl_itx_list, itx); zilog->zl_itx_list_sz -= itx->itx_sod; + mutex_exit(&zilog->zl_lock); + txg = itx->itx_lr.lrc_txg; ASSERT(txg); if (txg > spa_last_synced_txg(spa) || txg > spa_freeze_txg(spa)) lwb = zil_lwb_commit(zilog, itx, lwb); - kmem_free(itx, offsetof(itx_t, itx_lr) - + itx->itx_lr.lrc_reclen); + + zil_itx_destroy(itx); + mutex_enter(&zilog->zl_lock); } DTRACE_PROBE1(zil__cw2, zilog_t *, zilog); /* determine commit sequence number */ itx = list_head(&zilog->zl_itx_list); if (itx) - commit_seq = itx->itx_lr.lrc_seq; + commit_seq = itx->itx_lr.lrc_seq - 1; else commit_seq = zilog->zl_itx_seq; mutex_exit(&zilog->zl_lock); @@ -1159,22 +1228,28 @@ zil_commit_writer(zilog_t *zilog, uint64_t seq, uint64_t foid) */ if (zilog->zl_root_zio) { DTRACE_PROBE1(zil__cw3, zilog_t *, zilog); - (void) zio_wait(zilog->zl_root_zio); + error = zio_wait(zilog->zl_root_zio); zilog->zl_root_zio = NULL; DTRACE_PROBE1(zil__cw4, zilog_t *, zilog); zil_flush_vdevs(zilog); } - if (zilog->zl_log_error || lwb == NULL) { - zilog->zl_log_error = 0; + if (error || lwb == NULL) txg_wait_synced(zilog->zl_dmu_pool, 0); - } mutex_enter(&zilog->zl_lock); zilog->zl_writer = B_FALSE; ASSERT3U(commit_seq, >=, zilog->zl_commit_seq); zilog->zl_commit_seq = commit_seq; + + /* + * Remember the highest committed log sequence number for ztest. + * We only update this value when all the log writes succeeded, + * because ztest wants to ASSERT that it got the whole log chain. 
+ */ + if (error == 0 && lwb != NULL) + zilog->zl_commit_lr_seq = zilog->zl_lr_seq; } /* @@ -1194,7 +1269,7 @@ zil_commit(zilog_t *zilog, uint64_t seq, uint64_t foid) while (zilog->zl_writer) { cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock); - if (seq < zilog->zl_commit_seq) { + if (seq <= zilog->zl_commit_seq) { mutex_exit(&zilog->zl_lock); return; } @@ -1205,6 +1280,33 @@ zil_commit(zilog_t *zilog, uint64_t seq, uint64_t foid) mutex_exit(&zilog->zl_lock); } +/* + * Report whether all transactions are committed. + */ +static boolean_t +zil_is_committed(zilog_t *zilog) +{ + lwb_t *lwb; + boolean_t committed; + + mutex_enter(&zilog->zl_lock); + + while (zilog->zl_writer) + cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock); + + if (!list_is_empty(&zilog->zl_itx_list)) + committed = B_FALSE; /* unpushed transactions */ + else if ((lwb = list_head(&zilog->zl_lwb_list)) == NULL) + committed = B_TRUE; /* intent log never used */ + else if (list_next(&zilog->zl_lwb_list, lwb) != NULL) + committed = B_FALSE; /* zil_sync() not done yet */ + else + committed = B_TRUE; /* everything synced */ + + mutex_exit(&zilog->zl_lock); + return (committed); +} + /* * Called in syncing context to free committed log blocks and update log header. */ @@ -1214,22 +1316,33 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx) zil_header_t *zh = zil_header_in_syncing_context(zilog); uint64_t txg = dmu_tx_get_txg(tx); spa_t *spa = zilog->zl_spa; + uint64_t *replayed_seq = &zilog->zl_replayed_seq[txg & TXG_MASK]; lwb_t *lwb; + /* + * We don't zero out zl_destroy_txg, so make sure we don't try + * to destroy it twice. + */ + if (spa_sync_pass(spa) != 1) + return; + mutex_enter(&zilog->zl_lock); ASSERT(zilog->zl_stop_sync == 0); - zh->zh_replay_seq = zilog->zl_replay_seq[txg & TXG_MASK]; + if (*replayed_seq != 0) { + ASSERT(zh->zh_replay_seq < *replayed_seq); + zh->zh_replay_seq = *replayed_seq; + *replayed_seq = 0; + } if (zilog->zl_destroy_txg == txg) { blkptr_t blk = zh->zh_log; ASSERT(list_head(&zilog->zl_lwb_list) == NULL); - ASSERT(spa_sync_pass(spa) == 1); bzero(zh, sizeof (zil_header_t)); - bzero(zilog->zl_replay_seq, sizeof (zilog->zl_replay_seq)); + bzero(zilog->zl_replayed_seq, sizeof (zilog->zl_replayed_seq)); if (zilog->zl_keep_first) { /* @@ -1245,17 +1358,12 @@ zil_sync(zilog_t *zilog, dmu_tx_t *tx) } } - for (;;) { - lwb = list_head(&zilog->zl_lwb_list); - if (lwb == NULL) { - mutex_exit(&zilog->zl_lock); - return; - } + while ((lwb = list_head(&zilog->zl_lwb_list)) != NULL) { zh->zh_log = lwb->lwb_blk; if (lwb->lwb_buf != NULL || lwb->lwb_max_txg > txg) break; list_remove(&zilog->zl_lwb_list, lwb); - zio_free_blk(spa, &lwb->lwb_blk, txg); + zio_free_zil(spa, txg, &lwb->lwb_blk); kmem_cache_free(zil_lwb_cache, lwb); /* @@ -1283,6 +1391,12 @@ zil_fini(void) kmem_cache_destroy(zil_lwb_cache); } +void +zil_set_logbias(zilog_t *zilog, uint64_t logbias) +{ + zilog->zl_logbias = logbias; +} + zilog_t * zil_alloc(objset_t *os, zil_header_t *zh_phys) { @@ -1295,6 +1409,7 @@ zil_alloc(objset_t *os, zil_header_t *zh_phys) zilog->zl_spa = dmu_objset_spa(os); zilog->zl_dmu_pool = dmu_objset_pool(os); zilog->zl_destroy_txg = TXG_INITIAL - 1; + zilog->zl_logbias = dmu_objset_logbias(os); mutex_init(&zilog->zl_lock, NULL, MUTEX_DEFAULT, NULL); @@ -1343,25 +1458,6 @@ zil_free(zilog_t *zilog) kmem_free(zilog, sizeof (zilog_t)); } -/* - * return true if the initial log block is not valid - */ -static boolean_t -zil_empty(zilog_t *zilog) -{ - const zil_header_t *zh = zilog->zl_header; - arc_buf_t *abuf = NULL; - - if 
(BP_IS_HOLE(&zh->zh_log)) - return (B_TRUE); - - if (zil_read_log_block(zilog, &zh->zh_log, &abuf) != 0) - return (B_TRUE); - - VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1); - return (B_FALSE); -} - /* * Open an intent log. */ @@ -1390,7 +1486,7 @@ zil_close(zilog_t *zilog) if (!zil_is_committed(zilog)) { uint64_t txg; dmu_tx_t *tx = dmu_tx_create(zilog->zl_os); - (void) dmu_tx_assign(tx, TXG_WAIT); + VERIFY(dmu_tx_assign(tx, TXG_WAIT) == 0); dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); txg = dmu_tx_get_txg(tx); dmu_tx_commit(tx); @@ -1417,7 +1513,7 @@ zil_suspend(zilog_t *zilog) const zil_header_t *zh = zilog->zl_header; mutex_enter(&zilog->zl_lock); - if (zh->zh_claim_txg != 0) { /* unplayed log */ + if (zh->zh_flags & ZIL_REPLAY_NEEDED) { /* unplayed log */ mutex_exit(&zilog->zl_lock); return (EBUSY); } @@ -1464,278 +1560,191 @@ zil_resume(zilog_t *zilog) } typedef struct zil_replay_arg { - objset_t *zr_os; zil_replay_func_t **zr_replay; - zil_replay_cleaner_t *zr_replay_cleaner; void *zr_arg; - uint64_t *zr_txgp; boolean_t zr_byteswap; - char *zr_lrbuf; + char *zr_lr; } zil_replay_arg_t; -static void +static int +zil_replay_error(zilog_t *zilog, lr_t *lr, int error) +{ + char name[MAXNAMELEN]; + + zilog->zl_replaying_seq--; /* didn't actually replay this one */ + + dmu_objset_name(zilog->zl_os, name); + + cmn_err(CE_WARN, "ZFS replay transaction error %d, " + "dataset %s, seq 0x%llx, txtype %llu %s\n", error, name, + (u_longlong_t)lr->lrc_seq, + (u_longlong_t)(lr->lrc_txtype & ~TX_CI), + (lr->lrc_txtype & TX_CI) ? "CI" : ""); + + return (error); +} + +static int zil_replay_log_record(zilog_t *zilog, lr_t *lr, void *zra, uint64_t claim_txg) { zil_replay_arg_t *zr = zra; const zil_header_t *zh = zilog->zl_header; uint64_t reclen = lr->lrc_reclen; uint64_t txtype = lr->lrc_txtype; - char *name; - int pass, error, sunk; - - if (zilog->zl_stop_replay) - return; + int error = 0; - if (lr->lrc_txg < claim_txg) /* already committed */ - return; + zilog->zl_replaying_seq = lr->lrc_seq; if (lr->lrc_seq <= zh->zh_replay_seq) /* already replayed */ - return; + return (0); + + if (lr->lrc_txg < claim_txg) /* already committed */ + return (0); /* Strip case-insensitive bit, still present in log record */ txtype &= ~TX_CI; + if (txtype == 0 || txtype >= TX_MAX_TYPE) + return (zil_replay_error(zilog, lr, EINVAL)); + /* - * Make a copy of the data so we can revise and extend it. + * If this record type can be logged out of order, the object + * (lr_foid) may no longer exist. That's legitimate, not an error. */ - bcopy(lr, zr->zr_lrbuf, reclen); + if (TX_OOO(txtype)) { + error = dmu_object_info(zilog->zl_os, + ((lr_ooo_t *)lr)->lr_foid, NULL); + if (error == ENOENT || error == EEXIST) + return (0); + } /* - * The log block containing this lr may have been byteswapped - * so that we can easily examine common fields like lrc_txtype. - * However, the log is a mix of different data types, and only the - * replay vectors know how to byteswap their records. Therefore, if - * the lr was byteswapped, undo it before invoking the replay vector. + * Make a copy of the data so we can revise and extend it. */ - if (zr->zr_byteswap) - byteswap_uint64_array(zr->zr_lrbuf, reclen); + bcopy(lr, zr->zr_lr, reclen); /* * If this is a TX_WRITE with a blkptr, suck in the data. 
*/ if (txtype == TX_WRITE && reclen == sizeof (lr_write_t)) { - lr_write_t *lrw = (lr_write_t *)lr; - blkptr_t *wbp = &lrw->lr_blkptr; - uint64_t wlen = lrw->lr_length; - char *wbuf = zr->zr_lrbuf + reclen; - - if (BP_IS_HOLE(wbp)) { /* compressed to a hole */ - bzero(wbuf, wlen); - } else { - /* - * A subsequent write may have overwritten this block, - * in which case wbp may have been been freed and - * reallocated, and our read of wbp may fail with a - * checksum error. We can safely ignore this because - * the later write will provide the correct data. - */ - zbookmark_t zb; - - zb.zb_objset = dmu_objset_id(zilog->zl_os); - zb.zb_object = lrw->lr_foid; - zb.zb_level = -1; - zb.zb_blkid = lrw->lr_offset / BP_GET_LSIZE(wbp); - - (void) zio_wait(zio_read(NULL, zilog->zl_spa, - wbp, wbuf, BP_GET_LSIZE(wbp), NULL, NULL, - ZIO_PRIORITY_SYNC_READ, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, &zb)); - (void) memmove(wbuf, wbuf + lrw->lr_blkoff, wlen); - } + error = zil_read_log_data(zilog, (lr_write_t *)lr, + zr->zr_lr + reclen); + if (error) + return (zil_replay_error(zilog, lr, error)); } /* - * Replay of large truncates can end up needing additional txs - * and a different txg. If they are nested within the replay tx - * as below then a hang is possible. So we do the truncate here - * and redo the truncate later (a no-op) and update the sequence - * number whilst in the replay tx. Fortunately, it's safe to repeat - * a truncate if we crash and the truncate commits. A create over - * an existing file will also come in as a TX_TRUNCATE record. - * - * Note, remove of large files and renames over large files is - * handled by putting the deleted object on a stable list - * and if necessary force deleting the object outside of the replay - * transaction using the zr_replay_cleaner. + * The log block containing this lr may have been byteswapped + * so that we can easily examine common fields like lrc_txtype. + * However, the log is a mix of different record types, and only the + * replay vectors know how to byteswap their records. Therefore, if + * the lr was byteswapped, undo it before invoking the replay vector. */ - if (txtype == TX_TRUNCATE) { - *zr->zr_txgp = TXG_NOWAIT; - error = zr->zr_replay[TX_TRUNCATE](zr->zr_arg, zr->zr_lrbuf, - zr->zr_byteswap); - if (error) - goto bad; - zr->zr_byteswap = 0; /* only byteswap once */ - } + if (zr->zr_byteswap) + byteswap_uint64_array(zr->zr_lr, reclen); /* * We must now do two things atomically: replay this log record, - * and update the log header to reflect the fact that we did so. - * We use the DMU's ability to assign into a specific txg to do this. + * and update the log header sequence number to reflect the fact that + * we did so. At the end of each replay function the sequence number + * is updated if we are in replay mode. */ - for (pass = 1, sunk = B_FALSE; /* CONSTANTCONDITION */; pass++) { - uint64_t replay_txg; - dmu_tx_t *replay_tx; - - replay_tx = dmu_tx_create(zr->zr_os); - error = dmu_tx_assign(replay_tx, TXG_WAIT); - if (error) { - dmu_tx_abort(replay_tx); - break; - } - - replay_txg = dmu_tx_get_txg(replay_tx); - - if (txtype == 0 || txtype >= TX_MAX_TYPE) { - error = EINVAL; - } else { - /* - * On the first pass, arrange for the replay vector - * to fail its dmu_tx_assign(). That's the only way - * to ensure that those code paths remain well tested. - * - * Only byteswap (if needed) on the 1st pass. 
- */ - *zr->zr_txgp = replay_txg - (pass == 1); - error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lrbuf, - zr->zr_byteswap && pass == 1); - *zr->zr_txgp = TXG_NOWAIT; - } - - if (error == 0) { - dsl_dataset_dirty(dmu_objset_ds(zr->zr_os), replay_tx); - zilog->zl_replay_seq[replay_txg & TXG_MASK] = - lr->lrc_seq; - } - - dmu_tx_commit(replay_tx); - - if (!error) - return; - + error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, zr->zr_byteswap); + if (error) { /* * The DMU's dnode layer doesn't see removes until the txg * commits, so a subsequent claim can spuriously fail with - * EEXIST. So if we receive any error other than ERESTART - * we try syncing out any removes then retrying the - * transaction. + * EEXIST. So if we receive any error we try syncing out + * any removes then retry the transaction. Note that we + * specify B_FALSE for byteswap now, so we don't do it twice. */ - if (error != ERESTART && !sunk) { - if (zr->zr_replay_cleaner) - zr->zr_replay_cleaner(zr->zr_arg); - txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0); - sunk = B_TRUE; - continue; /* retry */ - } - - if (error != ERESTART) - break; - - if (pass != 1) - txg_wait_open(spa_get_dsl(zilog->zl_spa), - replay_txg + 1); - - dprintf("pass %d, retrying\n", pass); + txg_wait_synced(spa_get_dsl(zilog->zl_spa), 0); + error = zr->zr_replay[txtype](zr->zr_arg, zr->zr_lr, B_FALSE); + if (error) + return (zil_replay_error(zilog, lr, error)); } - -bad: - ASSERT(error && error != ERESTART); - name = kmem_alloc(MAXNAMELEN, KM_SLEEP); - dmu_objset_name(zr->zr_os, name); - cmn_err(CE_WARN, "ZFS replay transaction error %d, " - "dataset %s, seq 0x%llx, txtype %llu %s\n", - error, name, (u_longlong_t)lr->lrc_seq, (u_longlong_t)txtype, - (lr->lrc_txtype & TX_CI) ? "CI" : ""); - zilog->zl_stop_replay = 1; - kmem_free(name, MAXNAMELEN); + return (0); } /* ARGSUSED */ -static void +static int zil_incr_blks(zilog_t *zilog, blkptr_t *bp, void *arg, uint64_t claim_txg) { zilog->zl_replay_blks++; + + return (0); } /* * If this dataset has a non-empty intent log, replay it and destroy it. */ void -zil_replay(objset_t *os, void *arg, uint64_t *txgp, - zil_replay_func_t *replay_func[TX_MAX_TYPE], - zil_replay_cleaner_t *replay_cleaner) +zil_replay(objset_t *os, void *arg, zil_replay_func_t *replay_func[TX_MAX_TYPE]) { zilog_t *zilog = dmu_objset_zil(os); const zil_header_t *zh = zilog->zl_header; zil_replay_arg_t zr; - if (zil_empty(zilog)) { + if ((zh->zh_flags & ZIL_REPLAY_NEEDED) == 0) { zil_destroy(zilog, B_TRUE); return; } - zr.zr_os = os; zr.zr_replay = replay_func; - zr.zr_replay_cleaner = replay_cleaner; zr.zr_arg = arg; - zr.zr_txgp = txgp; zr.zr_byteswap = BP_SHOULD_BYTESWAP(&zh->zh_log); - zr.zr_lrbuf = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP); + zr.zr_lr = kmem_alloc(2 * SPA_MAXBLOCKSIZE, KM_SLEEP); /* * Wait for in-progress removes to sync before starting replay. 
*/ txg_wait_synced(zilog->zl_dmu_pool, 0); - zilog->zl_stop_replay = 0; - zilog->zl_replay_time = lbolt; + zilog->zl_replay = B_TRUE; + zilog->zl_replay_time = ddi_get_lbolt(); ASSERT(zilog->zl_replay_blks == 0); (void) zil_parse(zilog, zil_incr_blks, zil_replay_log_record, &zr, zh->zh_claim_txg); - kmem_free(zr.zr_lrbuf, 2 * SPA_MAXBLOCKSIZE); + kmem_free(zr.zr_lr, 2 * SPA_MAXBLOCKSIZE); zil_destroy(zilog, B_FALSE); txg_wait_synced(zilog->zl_dmu_pool, zilog->zl_destroy_txg); + zilog->zl_replay = B_FALSE; } -/* - * Report whether all transactions are committed - */ -int -zil_is_committed(zilog_t *zilog) +boolean_t +zil_replaying(zilog_t *zilog, dmu_tx_t *tx) { - lwb_t *lwb; - int ret; - - mutex_enter(&zilog->zl_lock); - while (zilog->zl_writer) - cv_wait(&zilog->zl_cv_writer, &zilog->zl_lock); + if (zilog == NULL) + return (B_TRUE); - /* recent unpushed intent log transactions? */ - if (!list_is_empty(&zilog->zl_itx_list)) { - ret = B_FALSE; - goto out; + if (zilog->zl_replay) { + dsl_dataset_dirty(dmu_objset_ds(zilog->zl_os), tx); + zilog->zl_replayed_seq[dmu_tx_get_txg(tx) & TXG_MASK] = + zilog->zl_replaying_seq; + return (B_TRUE); } - /* intent log never used? */ - lwb = list_head(&zilog->zl_lwb_list); - if (lwb == NULL) { - ret = B_TRUE; - goto out; - } + return (B_FALSE); +} - /* - * more than 1 log buffer means zil_sync() hasn't yet freed - * entries after a txg has committed - */ - if (list_next(&zilog->zl_lwb_list, lwb)) { - ret = B_FALSE; - goto out; - } +/* ARGSUSED */ +int +zil_vdev_offline(const char *osname, void *arg) +{ + objset_t *os; + zilog_t *zilog; + int error; - ASSERT(zil_empty(zilog)); - ret = B_TRUE; -out: - cv_broadcast(&zilog->zl_cv_writer); - mutex_exit(&zilog->zl_lock); - return (ret); + error = dmu_objset_hold(osname, FTAG, &os); + if (error) + return (error); + + zilog = dmu_objset_zil(os); + if (zil_suspend(zilog) != 0) + error = EEXIST; + else + zil_resume(zilog); + dmu_objset_rele(os, FTAG); + return (error); } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/zio.c b/external/cddl/osnet/dist/uts/common/fs/zfs/zio.c index d347920ea6bb7..4e481b16b7786 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/zio.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/zio.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -32,6 +32,9 @@ #include #include #include +#include +#include +#include /* * ========================================================================== @@ -42,11 +45,12 @@ uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = { 0, /* ZIO_PRIORITY_NOW */ 0, /* ZIO_PRIORITY_SYNC_READ */ 0, /* ZIO_PRIORITY_SYNC_WRITE */ - 6, /* ZIO_PRIORITY_ASYNC_READ */ - 4, /* ZIO_PRIORITY_ASYNC_WRITE */ - 4, /* ZIO_PRIORITY_FREE */ - 0, /* ZIO_PRIORITY_CACHE_FILL */ 0, /* ZIO_PRIORITY_LOG_WRITE */ + 1, /* ZIO_PRIORITY_CACHE_FILL */ + 1, /* ZIO_PRIORITY_AGG */ + 4, /* ZIO_PRIORITY_FREE */ + 4, /* ZIO_PRIORITY_ASYNC_WRITE */ + 6, /* ZIO_PRIORITY_ASYNC_READ */ 10, /* ZIO_PRIORITY_RESILVER */ 20, /* ZIO_PRIORITY_SCRUB */ }; @@ -57,11 +61,9 @@ uint8_t zio_priority_table[ZIO_PRIORITY_TABLE_SIZE] = { * ========================================================================== */ char *zio_type_name[ZIO_TYPES] = { - "null", "read", "write", "free", "claim", "ioctl" }; - -#define SYNC_PASS_DEFERRED_FREE 1 /* defer frees after this pass */ -#define SYNC_PASS_DONT_COMPRESS 4 /* don't compress after this pass */ -#define SYNC_PASS_REWRITE 1 /* rewrite new bps after this pass */ + "zio_null", "zio_read", "zio_write", "zio_free", "zio_claim", + "zio_ioctl" +}; /* * ========================================================================== @@ -69,6 +71,7 @@ char *zio_type_name[ZIO_TYPES] = { * ========================================================================== */ kmem_cache_t *zio_cache; +kmem_cache_t *zio_link_cache; kmem_cache_t *zio_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; kmem_cache_t *zio_data_buf_cache[SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT]; @@ -80,8 +83,15 @@ extern vmem_t *zio_alloc_arena; * An allocating zio is one that either currently has the DVA allocate * stage set or will have it later in its lifetime. */ -#define IO_IS_ALLOCATING(zio) \ - ((zio)->io_orig_pipeline & (1U << ZIO_STAGE_DVA_ALLOCATE)) +#define IO_IS_ALLOCATING(zio) ((zio)->io_orig_pipeline & ZIO_STAGE_DVA_ALLOCATE) + +boolean_t zio_requeue_io_start_cut_in_line = B_TRUE; + +#ifdef ZFS_DEBUG +int zio_buf_debug_limit = 16384; +#else +int zio_buf_debug_limit = 0; +#endif void zio_init(void) @@ -92,8 +102,10 @@ zio_init(void) #ifdef _KERNEL data_alloc_arena = zio_alloc_arena; #endif - zio_cache = kmem_cache_create("zio_cache", sizeof (zio_t), 0, - NULL, NULL, NULL, NULL, NULL, 0); + zio_cache = kmem_cache_create("zio_cache", + sizeof (zio_t), 0, NULL, NULL, NULL, NULL, NULL, 0); + zio_link_cache = kmem_cache_create("zio_link_cache", + sizeof (zio_link_t), 0, NULL, NULL, NULL, NULL, NULL, 0); /* * For small buffers, we want a cache for each multiple of @@ -121,12 +133,13 @@ zio_init(void) char name[36]; (void) sprintf(name, "zio_buf_%lu", (ulong_t)size); zio_buf_cache[c] = kmem_cache_create(name, size, - align, NULL, NULL, NULL, NULL, NULL, KMC_NODEBUG); + align, NULL, NULL, NULL, NULL, NULL, + size > zio_buf_debug_limit ? KMC_NODEBUG : 0); (void) sprintf(name, "zio_data_buf_%lu", (ulong_t)size); zio_data_buf_cache[c] = kmem_cache_create(name, size, align, NULL, NULL, NULL, NULL, data_alloc_arena, - KMC_NODEBUG); + size > zio_buf_debug_limit ? 
KMC_NODEBUG : 0); } } @@ -164,6 +177,7 @@ zio_fini(void) zio_data_buf_cache[c] = NULL; } + kmem_cache_destroy(zio_link_cache); kmem_cache_destroy(zio_cache); zio_inject_fini(); @@ -260,7 +274,8 @@ zio_pop_transforms(zio_t *zio) zt->zt_transform(zio, zt->zt_orig_data, zt->zt_orig_size); - zio_buf_free(zio->io_data, zt->zt_bufsize); + if (zt->zt_bufsize != 0) + zio_buf_free(zio->io_data, zt->zt_bufsize); zio->io_data = zt->zt_orig_data; zio->io_size = zt->zt_orig_size; @@ -289,7 +304,7 @@ zio_decompress(zio_t *zio, void *data, uint64_t size) { if (zio->io_error == 0 && zio_decompress_data(BP_GET_COMPRESS(zio->io_bp), - zio->io_data, zio->io_size, data, size) != 0) + zio->io_data, data, zio->io_size, size) != 0) zio->io_error = EIO; } @@ -298,41 +313,108 @@ zio_decompress(zio_t *zio, void *data, uint64_t size) * I/O parent/child relationships and pipeline interlocks * ========================================================================== */ +/* + * NOTE - Callers to zio_walk_parents() and zio_walk_children must + * continue calling these functions until they return NULL. + * Otherwise, the next caller will pick up the list walk in + * some indeterminate state. (Otherwise every caller would + * have to pass in a cookie to keep the state represented by + * io_walk_link, which gets annoying.) + */ +zio_t * +zio_walk_parents(zio_t *cio) +{ + zio_link_t *zl = cio->io_walk_link; + list_t *pl = &cio->io_parent_list; -static void -zio_add_child(zio_t *pio, zio_t *zio) + zl = (zl == NULL) ? list_head(pl) : list_next(pl, zl); + cio->io_walk_link = zl; + + if (zl == NULL) + return (NULL); + + ASSERT(zl->zl_child == cio); + return (zl->zl_parent); +} + +zio_t * +zio_walk_children(zio_t *pio) { + zio_link_t *zl = pio->io_walk_link; + list_t *cl = &pio->io_child_list; + + zl = (zl == NULL) ? list_head(cl) : list_next(cl, zl); + pio->io_walk_link = zl; + + if (zl == NULL) + return (NULL); + + ASSERT(zl->zl_parent == pio); + return (zl->zl_child); +} + +zio_t * +zio_unique_parent(zio_t *cio) +{ + zio_t *pio = zio_walk_parents(cio); + + VERIFY(zio_walk_parents(cio) == NULL); + return (pio); +} + +void +zio_add_child(zio_t *pio, zio_t *cio) +{ + zio_link_t *zl = kmem_cache_alloc(zio_link_cache, KM_SLEEP); + + /* + * Logical I/Os can have logical, gang, or vdev children. + * Gang I/Os can have gang or vdev children. + * Vdev I/Os can only have vdev children. + * The following ASSERT captures all of these constraints. 
+ */ + ASSERT(cio->io_child_type <= pio->io_child_type); + + zl->zl_parent = pio; + zl->zl_child = cio; + + mutex_enter(&cio->io_lock); mutex_enter(&pio->io_lock); - if (zio->io_stage < ZIO_STAGE_READY) - pio->io_children[zio->io_child_type][ZIO_WAIT_READY]++; - if (zio->io_stage < ZIO_STAGE_DONE) - pio->io_children[zio->io_child_type][ZIO_WAIT_DONE]++; - zio->io_sibling_prev = NULL; - zio->io_sibling_next = pio->io_child; - if (pio->io_child != NULL) - pio->io_child->io_sibling_prev = zio; - pio->io_child = zio; - zio->io_parent = pio; + + ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); + + for (int w = 0; w < ZIO_WAIT_TYPES; w++) + pio->io_children[cio->io_child_type][w] += !cio->io_state[w]; + + list_insert_head(&pio->io_child_list, zl); + list_insert_head(&cio->io_parent_list, zl); + + pio->io_child_count++; + cio->io_parent_count++; + mutex_exit(&pio->io_lock); + mutex_exit(&cio->io_lock); } static void -zio_remove_child(zio_t *pio, zio_t *zio) +zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl) { - zio_t *next, *prev; - - ASSERT(zio->io_parent == pio); + ASSERT(zl->zl_parent == pio); + ASSERT(zl->zl_child == cio); + mutex_enter(&cio->io_lock); mutex_enter(&pio->io_lock); - next = zio->io_sibling_next; - prev = zio->io_sibling_prev; - if (next != NULL) - next->io_sibling_prev = prev; - if (prev != NULL) - prev->io_sibling_next = next; - if (pio->io_child == zio) - pio->io_child = next; + + list_remove(&pio->io_child_list, zl); + list_remove(&cio->io_parent_list, zl); + + pio->io_child_count--; + cio->io_parent_count--; + mutex_exit(&pio->io_lock); + mutex_exit(&cio->io_lock); + + kmem_cache_free(zio_link_cache, zl); } static boolean_t @@ -344,7 +426,7 @@ zio_wait_for_children(zio_t *zio, enum zio_child child, enum zio_wait_type wait) mutex_enter(&zio->io_lock); ASSERT(zio->io_stall == NULL); if (*countp != 0) { - zio->io_stage--; + zio->io_stage >>= 1; zio->io_stall = countp; waiting = B_TRUE; } @@ -386,10 +468,11 @@ zio_inherit_child_errors(zio_t *zio, enum zio_child c) * ========================================================================== */ static zio_t * -zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, +zio_create(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, void *data, uint64_t size, zio_done_func_t *done, void *private, - zio_type_t type, int priority, int flags, vdev_t *vd, uint64_t offset, - const zbookmark_t *zb, uint8_t stage, uint32_t pipeline) + zio_type_t type, int priority, enum zio_flag flags, + vdev_t *vd, uint64_t offset, const zbookmark_t *zb, + enum zio_stage stage, enum zio_stage pipeline) { zio_t *zio; @@ -407,53 +490,58 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, mutex_init(&zio->io_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&zio->io_cv, NULL, CV_DEFAULT, NULL); + list_create(&zio->io_parent_list, sizeof (zio_link_t), + offsetof(zio_link_t, zl_parent_node)); + list_create(&zio->io_child_list, sizeof (zio_link_t), + offsetof(zio_link_t, zl_child_node)); + if (vd != NULL) zio->io_child_type = ZIO_CHILD_VDEV; else if (flags & ZIO_FLAG_GANG_CHILD) zio->io_child_type = ZIO_CHILD_GANG; + else if (flags & ZIO_FLAG_DDT_CHILD) + zio->io_child_type = ZIO_CHILD_DDT; else zio->io_child_type = ZIO_CHILD_LOGICAL; if (bp != NULL) { - zio->io_bp = bp; + zio->io_bp = (blkptr_t *)bp; zio->io_bp_copy = *bp; zio->io_bp_orig = *bp; - if (type != ZIO_TYPE_WRITE) + if (type != ZIO_TYPE_WRITE || + zio->io_child_type == ZIO_CHILD_DDT) zio->io_bp = &zio->io_bp_copy; /* so caller can free */ - if (zio->io_child_type == 
ZIO_CHILD_LOGICAL) { - if (BP_IS_GANG(bp)) - pipeline |= ZIO_GANG_STAGES; + if (zio->io_child_type == ZIO_CHILD_LOGICAL) zio->io_logical = zio; - } + if (zio->io_child_type > ZIO_CHILD_GANG && BP_IS_GANG(bp)) + pipeline |= ZIO_GANG_STAGES; } zio->io_spa = spa; zio->io_txg = txg; - zio->io_data = data; - zio->io_size = size; zio->io_done = done; zio->io_private = private; zio->io_type = type; zio->io_priority = priority; zio->io_vd = vd; zio->io_offset = offset; + zio->io_orig_data = zio->io_data = data; + zio->io_orig_size = zio->io_size = size; zio->io_orig_flags = zio->io_flags = flags; zio->io_orig_stage = zio->io_stage = stage; zio->io_orig_pipeline = zio->io_pipeline = pipeline; + zio->io_state[ZIO_WAIT_READY] = (stage >= ZIO_STAGE_READY); + zio->io_state[ZIO_WAIT_DONE] = (stage >= ZIO_STAGE_DONE); + if (zb != NULL) zio->io_bookmark = *zb; if (pio != NULL) { - /* - * Logical I/Os can have logical, gang, or vdev children. - * Gang I/Os can have gang or vdev children. - * Vdev I/Os can only have vdev children. - * The following ASSERT captures all of these constraints. - */ - ASSERT(zio->io_child_type <= pio->io_child_type); if (zio->io_logical == NULL) zio->io_logical = pio->io_logical; + if (zio->io_child_type == ZIO_CHILD_GANG) + zio->io_gang_leader = pio->io_gang_leader; zio_add_child(pio, zio); } @@ -463,70 +551,53 @@ zio_create(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, static void zio_destroy(zio_t *zio) { - spa_t *spa = zio->io_spa; - uint8_t async_root = zio->io_async_root; - + list_destroy(&zio->io_parent_list); + list_destroy(&zio->io_child_list); mutex_destroy(&zio->io_lock); cv_destroy(&zio->io_cv); kmem_cache_free(zio_cache, zio); - - if (async_root) { - mutex_enter(&spa->spa_async_root_lock); - if (--spa->spa_async_root_count == 0) - cv_broadcast(&spa->spa_async_root_cv); - mutex_exit(&spa->spa_async_root_lock); - } } zio_t * -zio_null(zio_t *pio, spa_t *spa, zio_done_func_t *done, void *private, - int flags) +zio_null(zio_t *pio, spa_t *spa, vdev_t *vd, zio_done_func_t *done, + void *private, enum zio_flag flags) { zio_t *zio; zio = zio_create(pio, spa, 0, NULL, NULL, 0, done, private, - ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, NULL, 0, NULL, + ZIO_TYPE_NULL, ZIO_PRIORITY_NOW, flags, vd, 0, NULL, ZIO_STAGE_OPEN, ZIO_INTERLOCK_PIPELINE); return (zio); } zio_t * -zio_root(spa_t *spa, zio_done_func_t *done, void *private, int flags) +zio_root(spa_t *spa, zio_done_func_t *done, void *private, enum zio_flag flags) { - return (zio_null(NULL, spa, done, private, flags)); + return (zio_null(NULL, spa, NULL, done, private, flags)); } zio_t * zio_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, void *data, uint64_t size, zio_done_func_t *done, void *private, - int priority, int flags, const zbookmark_t *zb) + int priority, enum zio_flag flags, const zbookmark_t *zb) { zio_t *zio; - zio = zio_create(pio, spa, bp->blk_birth, (blkptr_t *)bp, + zio = zio_create(pio, spa, BP_PHYSICAL_BIRTH(bp), bp, data, size, done, private, ZIO_TYPE_READ, priority, flags, NULL, 0, zb, - ZIO_STAGE_OPEN, ZIO_READ_PIPELINE); + ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? 
+ ZIO_DDT_CHILD_READ_PIPELINE : ZIO_READ_PIPELINE); return (zio); } -void -zio_skip_write(zio_t *zio) -{ - ASSERT(zio->io_type == ZIO_TYPE_WRITE); - ASSERT(zio->io_stage == ZIO_STAGE_READY); - ASSERT(!BP_IS_GANG(zio->io_bp)); - - zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; -} - zio_t * zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - void *data, uint64_t size, zio_prop_t *zp, + void *data, uint64_t size, const zio_prop_t *zp, zio_done_func_t *ready, zio_done_func_t *done, void *private, - int priority, int flags, const zbookmark_t *zb) + int priority, enum zio_flag flags, const zbookmark_t *zb) { zio_t *zio; @@ -536,13 +607,15 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, zp->zp_compress < ZIO_COMPRESS_FUNCTIONS && zp->zp_type < DMU_OT_NUMTYPES && zp->zp_level < 32 && - zp->zp_ndvas > 0 && - zp->zp_ndvas <= spa_max_replication(spa)); - ASSERT(ready != NULL); + zp->zp_copies > 0 && + zp->zp_copies <= spa_max_replication(spa) && + zp->zp_dedup <= 1 && + zp->zp_dedup_verify <= 1); zio = zio_create(pio, spa, txg, bp, data, size, done, private, ZIO_TYPE_WRITE, priority, flags, NULL, 0, zb, - ZIO_STAGE_OPEN, ZIO_WRITE_PIPELINE); + ZIO_STAGE_OPEN, (flags & ZIO_FLAG_DDT_CHILD) ? + ZIO_DDT_CHILD_WRITE_PIPELINE : ZIO_WRITE_PIPELINE); zio->io_ready = ready; zio->io_prop = *zp; @@ -553,7 +626,7 @@ zio_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, zio_t * zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, uint64_t size, zio_done_func_t *done, void *private, int priority, - int flags, zbookmark_t *zb) + enum zio_flag flags, zbookmark_t *zb) { zio_t *zio; @@ -564,33 +637,44 @@ zio_rewrite(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, void *data, return (zio); } +void +zio_write_override(zio_t *zio, blkptr_t *bp, int copies) +{ + ASSERT(zio->io_type == ZIO_TYPE_WRITE); + ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); + ASSERT(zio->io_stage == ZIO_STAGE_OPEN); + ASSERT(zio->io_txg == spa_syncing_txg(zio->io_spa)); + + zio->io_prop.zp_copies = copies; + zio->io_bp_override = bp; +} + +void +zio_free(spa_t *spa, uint64_t txg, const blkptr_t *bp) +{ + bplist_enqueue_deferred(&spa->spa_free_bplist[txg & TXG_MASK], bp); +} + zio_t * -zio_free(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - zio_done_func_t *done, void *private, int flags) +zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, + enum zio_flag flags) { zio_t *zio; ASSERT(!BP_IS_HOLE(bp)); - - if (bp->blk_fill == BLK_FILL_ALREADY_FREED) - return (zio_null(pio, spa, NULL, NULL, flags)); - - if (txg == spa->spa_syncing_txg && - spa_sync_pass(spa) > SYNC_PASS_DEFERRED_FREE) { - bplist_enqueue_deferred(&spa->spa_sync_bplist, bp); - return (zio_null(pio, spa, NULL, NULL, flags)); - } + ASSERT(spa_syncing_txg(spa) == txg); + ASSERT(spa_sync_pass(spa) <= SYNC_PASS_DEFERRED_FREE); zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), - done, private, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags, + NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags, NULL, 0, NULL, ZIO_STAGE_OPEN, ZIO_FREE_PIPELINE); return (zio); } zio_t * -zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, - zio_done_func_t *done, void *private, int flags) +zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, + zio_done_func_t *done, void *private, enum zio_flag flags) { zio_t *zio; @@ -604,9 +688,11 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, * * All claims *must* be resolved in the first txg -- before the SPA * starts allocating blocks -- so 
that nothing is allocated twice. + * If txg == 0 we just verify that the block is claimable. */ ASSERT3U(spa->spa_uberblock.ub_rootbp.blk_birth, <, spa_first_txg(spa)); - ASSERT3U(spa_first_txg(spa), <=, txg); + ASSERT(txg == spa_first_txg(spa) || txg == 0); + ASSERT(!BP_GET_DEDUP(bp) || !spa_writeable(spa)); /* zdb(1M) */ zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp), done, private, ZIO_TYPE_CLAIM, ZIO_PRIORITY_NOW, flags, @@ -617,7 +703,7 @@ zio_claim(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, zio_t * zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, - zio_done_func_t *done, void *private, int priority, int flags) + zio_done_func_t *done, void *private, int priority, enum zio_flag flags) { zio_t *zio; int c; @@ -629,7 +715,7 @@ zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, zio->io_cmd = cmd; } else { - zio = zio_null(pio, spa, NULL, NULL, flags); + zio = zio_null(pio, spa, NULL, NULL, NULL, flags); for (c = 0; c < vd->vdev_children; c++) zio_nowait(zio_ioctl(zio, spa, vd->vdev_child[c], cmd, @@ -642,7 +728,7 @@ zio_ioctl(zio_t *pio, spa_t *spa, vdev_t *vd, int cmd, zio_t * zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, void *data, int checksum, zio_done_func_t *done, void *private, - int priority, int flags, boolean_t labels) + int priority, enum zio_flag flags, boolean_t labels) { zio_t *zio; @@ -663,7 +749,7 @@ zio_read_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, zio_t * zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, void *data, int checksum, zio_done_func_t *done, void *private, - int priority, int flags, boolean_t labels) + int priority, enum zio_flag flags, boolean_t labels) { zio_t *zio; @@ -678,9 +764,9 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, zio->io_prop.zp_checksum = checksum; - if (zio_checksum_table[checksum].ci_zbt) { + if (zio_checksum_table[checksum].ci_eck) { /* - * zbt checksums are necessarily destructive -- they modify + * zec checksums are necessarily destructive -- they modify * the end of the write buffer to hold the verifier/checksum. * Therefore, we must make a local copy in case the data is * being written to multiple places in parallel. @@ -698,10 +784,10 @@ zio_write_phys(zio_t *pio, vdev_t *vd, uint64_t offset, uint64_t size, */ zio_t * zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, - void *data, uint64_t size, int type, int priority, int flags, + void *data, uint64_t size, int type, int priority, enum zio_flag flags, zio_done_func_t *done, void *private) { - uint32_t pipeline = ZIO_VDEV_CHILD_PIPELINE; + enum zio_stage pipeline = ZIO_VDEV_CHILD_PIPELINE; zio_t *zio; ASSERT(vd->vdev_parent == @@ -714,26 +800,33 @@ zio_vdev_child_io(zio_t *pio, blkptr_t *bp, vdev_t *vd, uint64_t offset, * detection as close to the leaves as possible and * eliminates redundant checksums in the interior nodes. */ - pipeline |= 1U << ZIO_STAGE_CHECKSUM_VERIFY; - pio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY); + pipeline |= ZIO_STAGE_CHECKSUM_VERIFY; + pio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; } if (vd->vdev_children == 0) offset += VDEV_LABEL_START_SIZE; + flags |= ZIO_VDEV_CHILD_FLAGS(pio) | ZIO_FLAG_DONT_PROPAGATE; + + /* + * If we've decided to do a repair, the write is not speculative -- + * even if the original read was. 
+ */ + if (flags & ZIO_FLAG_IO_REPAIR) + flags &= ~ZIO_FLAG_SPECULATIVE; + zio = zio_create(pio, pio->io_spa, pio->io_txg, bp, data, size, - done, private, type, priority, - (pio->io_flags & ZIO_FLAG_VDEV_INHERIT) | - ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | flags, - vd, offset, &pio->io_bookmark, - ZIO_STAGE_VDEV_IO_START - 1, pipeline); + done, private, type, priority, flags, vd, offset, &pio->io_bookmark, + ZIO_STAGE_VDEV_IO_START >> 1, pipeline); return (zio); } zio_t * zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size, - int type, int priority, int flags, zio_done_func_t *done, void *private) + int type, int priority, enum zio_flag flags, + zio_done_func_t *done, void *private) { zio_t *zio; @@ -743,7 +836,7 @@ zio_vdev_delegated_io(vdev_t *vd, uint64_t offset, void *data, uint64_t size, data, size, done, private, type, priority, flags | ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_RETRY, vd, offset, NULL, - ZIO_STAGE_VDEV_IO_START - 1, ZIO_VDEV_CHILD_PIPELINE); + ZIO_STAGE_VDEV_IO_START >> 1, ZIO_VDEV_CHILD_PIPELINE); return (zio); } @@ -756,6 +849,23 @@ zio_flush(zio_t *zio, vdev_t *vd) ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY)); } +void +zio_shrink(zio_t *zio, uint64_t size) +{ + ASSERT(zio->io_executor == NULL); + ASSERT(zio->io_orig_size == zio->io_size); + ASSERT(size <= zio->io_size); + + /* + * We don't shrink for raidz because of problems with the + * reconstruction when reading back less than the block size. + * Note, BP_IS_RAIDZ() assumes no compression. + */ + ASSERT(BP_GET_COMPRESS(zio->io_bp) == ZIO_COMPRESS_OFF); + if (!BP_IS_RAIDZ(zio->io_bp)) + zio->io_orig_size = zio->io_size = size; +} + /* * ========================================================================== * Prepare to read and write logical blocks @@ -767,29 +877,36 @@ zio_read_bp_init(zio_t *zio) { blkptr_t *bp = zio->io_bp; - if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && zio->io_logical == zio) { - uint64_t csize = BP_GET_PSIZE(bp); - void *cbuf = zio_buf_alloc(csize); + if (BP_GET_COMPRESS(bp) != ZIO_COMPRESS_OFF && + zio->io_child_type == ZIO_CHILD_LOGICAL && + !(zio->io_flags & ZIO_FLAG_RAW)) { + uint64_t psize = BP_GET_PSIZE(bp); + void *cbuf = zio_buf_alloc(psize); - zio_push_transform(zio, cbuf, csize, csize, zio_decompress); + zio_push_transform(zio, cbuf, psize, psize, zio_decompress); } if (!dmu_ot[BP_GET_TYPE(bp)].ot_metadata && BP_GET_LEVEL(bp) == 0) zio->io_flags |= ZIO_FLAG_DONT_CACHE; + if (BP_GET_TYPE(bp) == DMU_OT_DDT_ZAP) + zio->io_flags |= ZIO_FLAG_DONT_CACHE; + + if (BP_GET_DEDUP(bp) && zio->io_child_type == ZIO_CHILD_LOGICAL) + zio->io_pipeline = ZIO_DDT_READ_PIPELINE; + return (ZIO_PIPELINE_CONTINUE); } static int zio_write_bp_init(zio_t *zio) { + spa_t *spa = zio->io_spa; zio_prop_t *zp = &zio->io_prop; - int compress = zp->zp_compress; + enum zio_compress compress = zp->zp_compress; blkptr_t *bp = zio->io_bp; - void *cbuf; uint64_t lsize = zio->io_size; - uint64_t csize = lsize; - uint64_t cbufsize = 0; + uint64_t psize = lsize; int pass = 1; /* @@ -803,7 +920,29 @@ zio_write_bp_init(zio_t *zio) if (!IO_IS_ALLOCATING(zio)) return (ZIO_PIPELINE_CONTINUE); - ASSERT(compress != ZIO_COMPRESS_INHERIT); + ASSERT(zio->io_child_type != ZIO_CHILD_DDT); + + if (zio->io_bp_override) { + ASSERT(bp->blk_birth != zio->io_txg); + ASSERT(BP_GET_DEDUP(zio->io_bp_override) == 0); + + *bp = *zio->io_bp_override; + zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; + + if (BP_IS_HOLE(bp) || !zp->zp_dedup) + return (ZIO_PIPELINE_CONTINUE); + + 
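The last hunk reworks how zio_taskq_dispatch() picks a queue. As a reading aid, the sketch below restates the decision order with simplified stand-in types: config writers and probes, and writes destined for aux vdevs, fall back to the otherwise-unused ZIO_TYPE_NULL queues, and ZIO_PRIORITY_NOW I/Os are bumped to the high-priority taskq when one exists (with cut-in-line requests using TQ_FRONT). The select_taskq() helper and the enum names are assumptions for illustration; only the branch order mirrors the patched code.

/*
 * Reading aid only: the taskq-selection policy of the patched
 * zio_taskq_dispatch(), restated with stand-in types.
 */
#include <stdbool.h>

enum sketch_zio_type { SKETCH_TYPE_NULL, SKETCH_TYPE_READ, SKETCH_TYPE_WRITE };
enum sketch_taskq { SKETCH_Q_ISSUE, SKETCH_Q_ISSUE_HIGH,
	SKETCH_Q_INTERRUPT, SKETCH_Q_INTERRUPT_HIGH };

struct sketch_zio {
	enum sketch_zio_type type;
	bool config_writer_or_probe;	/* ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE */
	bool aux_vdev_write;		/* write to a spare/l2arc (aux) vdev */
	bool priority_now;		/* ZIO_PRIORITY_NOW */
};

static enum sketch_taskq
select_taskq(const struct sketch_zio *zio, enum sketch_taskq q,
    bool high_queue_exists, enum sketch_zio_type *type_out)
{
	enum sketch_zio_type t = zio->type;

	/* Config writers and probes may block the normal queues on the
	 * config lock, so borrow the otherwise-unused NULL-type taskq. */
	if (zio->config_writer_or_probe)
		t = SKETCH_TYPE_NULL;

	/* Writes to auxiliary vdevs likewise use the NULL-type taskq. */
	if (t == SKETCH_TYPE_WRITE && zio->aux_vdev_write)
		t = SKETCH_TYPE_NULL;

	/* ZIO_PRIORITY_NOW moves up to the matching high-priority queue. */
	if (zio->priority_now && high_queue_exists)
		q = (enum sketch_taskq)(q + 1);

	*type_out = t;
	return (q);
}

The high_queue_exists parameter stands in for the patch's check that spa_zio_taskq[t][q + 1] is non-NULL before bumping the queue index.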
ASSERT(zio_checksum_table[zp->zp_checksum].ci_dedup || + zp->zp_dedup_verify); + + if (BP_GET_CHECKSUM(bp) == zp->zp_checksum) { + BP_SET_DEDUP(bp, 1); + zio->io_pipeline |= ZIO_STAGE_DDT_WRITE; + return (ZIO_PIPELINE_CONTINUE); + } + zio->io_bp_override = NULL; + BP_ZERO(bp); + } if (bp->blk_birth == zio->io_txg) { /* @@ -815,28 +954,29 @@ zio_write_bp_init(zio_t *zio) * convergence take longer. Therefore, after the first * few passes, stop compressing to ensure convergence. */ - pass = spa_sync_pass(zio->io_spa); - ASSERT(pass > 1); + pass = spa_sync_pass(spa); + + ASSERT(zio->io_txg == spa_syncing_txg(spa)); + ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); + ASSERT(!BP_GET_DEDUP(bp)); if (pass > SYNC_PASS_DONT_COMPRESS) compress = ZIO_COMPRESS_OFF; - /* - * Only MOS (objset 0) data should need to be rewritten. - */ - ASSERT(zio->io_logical->io_bookmark.zb_objset == 0); - /* Make sure someone doesn't change their mind on overwrites */ - ASSERT(MIN(zp->zp_ndvas + BP_IS_GANG(bp), - spa_max_replication(zio->io_spa)) == BP_GET_NDVAS(bp)); + ASSERT(MIN(zp->zp_copies + BP_IS_GANG(bp), + spa_max_replication(spa)) == BP_GET_NDVAS(bp)); } if (compress != ZIO_COMPRESS_OFF) { - if (!zio_compress_data(compress, zio->io_data, zio->io_size, - &cbuf, &csize, &cbufsize)) { + void *cbuf = zio_buf_alloc(lsize); + psize = zio_compress_data(compress, zio->io_data, cbuf, lsize); + if (psize == 0 || psize == lsize) { compress = ZIO_COMPRESS_OFF; - } else if (csize != 0) { - zio_push_transform(zio, cbuf, csize, cbufsize, NULL); + zio_buf_free(cbuf, lsize); + } else { + ASSERT(psize < lsize); + zio_push_transform(zio, cbuf, psize, lsize, NULL); } } @@ -848,10 +988,10 @@ zio_write_bp_init(zio_t *zio) * spa_sync() to allocate new blocks, but force rewrites after that. * There should only be a handful of blocks after pass 1 in any case. 
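The write path above now consumes a different zio_compress_data() contract: the function returns the physical size, where 0 means the block was all zeroes and a value equal to the input size means compression did not save enough to keep. A minimal self-contained sketch of that caller-side interpretation; fake_compress and decide_write are illustrative placeholders, not ZFS interfaces.

#include <stdio.h>
#include <string.h>

enum write_kind { WRITE_HOLE, WRITE_UNCOMPRESSED, WRITE_COMPRESSED };

/* placeholder compressor: detects all-zero input, otherwise reports "no gain" */
static size_t
fake_compress(const void *src, void *dst, size_t s_len)
{
	const unsigned char *p = src;
	size_t i;

	for (i = 0; i < s_len; i++)
		if (p[i] != 0)
			break;
	if (i == s_len)
		return (0);		/* all zeroes: caller writes a hole */
	memcpy(dst, src, s_len);
	return (s_len);			/* no savings: caller stores the data as-is */
}

static enum write_kind
decide_write(const void *src, void *dst, size_t lsize)
{
	size_t psize = fake_compress(src, dst, lsize);

	if (psize == 0)
		return (WRITE_HOLE);		/* becomes the interlock pipeline above */
	if (psize == lsize)
		return (WRITE_UNCOMPRESSED);
	return (WRITE_COMPRESSED);		/* psize < lsize: dst is the physical data */
}

int
main(void)
{
	unsigned char src[512] = { 0 }, dst[512];

	printf("%d\n", decide_write(src, dst, sizeof (src)));	/* 0: WRITE_HOLE */
	return (0);
}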
*/ - if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == csize && + if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize && pass > SYNC_PASS_REWRITE) { - ASSERT(csize != 0); - uint32_t gang_stages = zio->io_pipeline & ZIO_GANG_STAGES; + ASSERT(psize != 0); + enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES; zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages; zio->io_flags |= ZIO_FLAG_IO_REWRITE; } else { @@ -859,17 +999,38 @@ zio_write_bp_init(zio_t *zio) zio->io_pipeline = ZIO_WRITE_PIPELINE; } - if (csize == 0) { + if (psize == 0) { zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; } else { ASSERT(zp->zp_checksum != ZIO_CHECKSUM_GANG_HEADER); BP_SET_LSIZE(bp, lsize); - BP_SET_PSIZE(bp, csize); + BP_SET_PSIZE(bp, psize); BP_SET_COMPRESS(bp, compress); BP_SET_CHECKSUM(bp, zp->zp_checksum); BP_SET_TYPE(bp, zp->zp_type); BP_SET_LEVEL(bp, zp->zp_level); + BP_SET_DEDUP(bp, zp->zp_dedup); BP_SET_BYTEORDER(bp, ZFS_HOST_BYTEORDER); + if (zp->zp_dedup) { + ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); + ASSERT(!(zio->io_flags & ZIO_FLAG_IO_REWRITE)); + zio->io_pipeline = ZIO_DDT_WRITE_PIPELINE; + } + } + + return (ZIO_PIPELINE_CONTINUE); +} + +static int +zio_free_bp_init(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + + if (zio->io_child_type == ZIO_CHILD_LOGICAL) { + if (BP_GET_DEDUP(bp)) + zio->io_pipeline = ZIO_DDT_FREE_PIPELINE; + else + arc_free(zio->io_spa, bp); } return (ZIO_PIPELINE_CONTINUE); @@ -882,16 +1043,18 @@ zio_write_bp_init(zio_t *zio) */ static void -zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q) +zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q, boolean_t cutinline) { + spa_t *spa = zio->io_spa; zio_type_t t = zio->io_type; + int flags = TQ_SLEEP | (cutinline ? TQ_FRONT : 0); /* - * If we're a config writer, the normal issue and interrupt threads - * may all be blocked waiting for the config lock. In this case, - * select the otherwise-unused taskq for ZIO_TYPE_NULL. + * If we're a config writer or a probe, the normal issue and + * interrupt threads may all be blocked waiting for the config lock. + * In this case, select the otherwise-unused taskq for ZIO_TYPE_NULL. */ - if (zio->io_flags & ZIO_FLAG_CONFIG_WRITER) + if (zio->io_flags & (ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_PROBE)) t = ZIO_TYPE_NULL; /* @@ -900,8 +1063,16 @@ zio_taskq_dispatch(zio_t *zio, enum zio_taskq_type q) if (t == ZIO_TYPE_WRITE && zio->io_vd && zio->io_vd->vdev_aux) t = ZIO_TYPE_NULL; - (void) taskq_dispatch(zio->io_spa->spa_zio_taskq[t][q], - (task_func_t *)zio_execute, zio, TQ_SLEEP); + /* + * If this is a high priority I/O, then use the high priority taskq. + */ + if (zio->io_priority == ZIO_PRIORITY_NOW && + spa->spa_zio_taskq[t][q + 1] != NULL) + q++; + + ASSERT3U(q, <, ZIO_TASKQ_TYPES); + (void) taskq_dispatch(spa->spa_zio_taskq[t][q], + (task_func_t *)zio_execute, zio, flags); } static boolean_t @@ -920,7 +1091,7 @@ zio_taskq_member(zio_t *zio, enum zio_taskq_type q) static int zio_issue_async(zio_t *zio) { - zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE); + zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); return (ZIO_PIPELINE_STOP); } @@ -928,7 +1099,7 @@ zio_issue_async(zio_t *zio) void zio_interrupt(zio_t *zio) { - zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT); + zio_taskq_dispatch(zio, ZIO_TASKQ_INTERRUPT, B_FALSE); } /* @@ -944,7 +1115,7 @@ zio_interrupt(zio_t *zio) * There's no locking on io_stage because there's no legitimate way * for multiple threads to be attempting to process the same I/O. 
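The pipeline rework that follows turns stage numbers into one-hot bit flags, so a pipeline is a bitmask, zio_execute() advances by shifting the current stage left until it lands on a bit present in the mask, and dispatch goes through zio_pipeline[highbit(stage) - 1]. That is also why "requeue at the stage before X" is spelled X >> 1 throughout this change. A small standalone sketch of the same walk, using invented stage names rather than the real zio stages:

#include <stdio.h>

/* invented one-hot stage bits in the style of the new enum zio_stage */
enum stage {
	STAGE_OPEN  = 1 << 0,
	STAGE_PREP  = 1 << 1,
	STAGE_ISSUE = 1 << 2,
	STAGE_DONE  = 1 << 3
};

static const char *stage_name[] = { "open", "prep", "issue", "done" };

/* 1-based index of the highest set bit, the way highbit() is used above */
static int
highbit(unsigned int v)
{
	int h = 0;

	while (v != 0) {
		h++;
		v >>= 1;
	}
	return (h);
}

int
main(void)
{
	unsigned int pipeline = STAGE_OPEN | STAGE_ISSUE | STAGE_DONE;
	unsigned int stage = STAGE_OPEN;

	while (stage < STAGE_DONE) {
		do {
			stage <<= 1;			/* advance one stage bit... */
		} while ((stage & pipeline) == 0);	/* ...until it is in this pipeline */

		/* dispatch by bit position, like zio_pipeline[highbit(stage) - 1](zio) */
		printf("running stage %s\n", stage_name[highbit(stage) - 1]);
	}
	return (0);
}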
*/ -static zio_pipe_stage_t *zio_pipeline[ZIO_STAGES]; +static zio_pipe_stage_t *zio_pipeline[]; void zio_execute(zio_t *zio) @@ -952,32 +1123,39 @@ zio_execute(zio_t *zio) zio->io_executor = curthread; while (zio->io_stage < ZIO_STAGE_DONE) { - uint32_t pipeline = zio->io_pipeline; - zio_stage_t stage = zio->io_stage; + enum zio_stage pipeline = zio->io_pipeline; + enum zio_stage stage = zio->io_stage; int rv; ASSERT(!MUTEX_HELD(&zio->io_lock)); + ASSERT(ISP2(stage)); + ASSERT(zio->io_stall == NULL); - while (((1U << ++stage) & pipeline) == 0) - continue; + do { + stage <<= 1; + } while ((stage & pipeline) == 0); ASSERT(stage <= ZIO_STAGE_DONE); - ASSERT(zio->io_stall == NULL); /* * If we are in interrupt context and this pipeline stage * will grab a config lock that is held across I/O, - * issue async to avoid deadlock. + * or may wait for an I/O that needs an interrupt thread + * to complete, issue async to avoid deadlock. + * + * For VDEV_IO_START, we cut in line so that the io will + * be sent to disk promptly. */ - if (((1U << stage) & ZIO_CONFIG_LOCK_BLOCKING_STAGES) && - zio->io_vd == NULL && + if ((stage & ZIO_BLOCKING_STAGES) && zio->io_vd == NULL && zio_taskq_member(zio, ZIO_TASKQ_INTERRUPT)) { - zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE); + boolean_t cut = (stage == ZIO_STAGE_VDEV_IO_START) ? + zio_requeue_io_start_cut_in_line : B_FALSE; + zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, cut); return; } zio->io_stage = stage; - rv = zio_pipeline[stage](zio); + rv = zio_pipeline[highbit(stage) - 1](zio); if (rv == ZIO_PIPELINE_STOP) return; @@ -1019,17 +1197,16 @@ zio_nowait(zio_t *zio) { ASSERT(zio->io_executor == NULL); - if (zio->io_parent == NULL && zio->io_child_type == ZIO_CHILD_LOGICAL) { + if (zio->io_child_type == ZIO_CHILD_LOGICAL && + zio_unique_parent(zio) == NULL) { /* * This is a logical async I/O with no parent to wait for it. - * Attach it to the pool's global async root zio so that - * spa_unload() has a way of waiting for async I/O to finish. + * We add it to the spa_async_root_zio "Godfather" I/O which + * will ensure they complete prior to unloading the pool. */ spa_t *spa = zio->io_spa; - zio->io_async_root = B_TRUE; - mutex_enter(&spa->spa_async_root_lock); - spa->spa_async_root_count++; - mutex_exit(&spa->spa_async_root_lock); + + zio_add_child(spa->spa_async_zio_root, zio); } zio_execute(zio); @@ -1044,50 +1221,49 @@ zio_nowait(zio_t *zio) static void zio_reexecute(zio_t *pio) { - zio_t *zio, *zio_next; + zio_t *cio, *cio_next; + + ASSERT(pio->io_child_type == ZIO_CHILD_LOGICAL); + ASSERT(pio->io_orig_stage == ZIO_STAGE_OPEN); + ASSERT(pio->io_gang_leader == NULL); + ASSERT(pio->io_gang_tree == NULL); pio->io_flags = pio->io_orig_flags; pio->io_stage = pio->io_orig_stage; pio->io_pipeline = pio->io_orig_pipeline; pio->io_reexecute = 0; pio->io_error = 0; + for (int w = 0; w < ZIO_WAIT_TYPES; w++) + pio->io_state[w] = 0; for (int c = 0; c < ZIO_CHILD_TYPES; c++) pio->io_child_error[c] = 0; - if (IO_IS_ALLOCATING(pio)) { - /* - * Remember the failed bp so that the io_ready() callback - * can update its accounting upon reexecution. The block - * was already freed in zio_done(); we indicate this with - * a fill count of -1 so that zio_free() knows to skip it. - */ - blkptr_t *bp = pio->io_bp; - ASSERT(bp->blk_birth == 0 || bp->blk_birth == pio->io_txg); - bp->blk_fill = BLK_FILL_ALREADY_FREED; - pio->io_bp_orig = *bp; - BP_ZERO(bp); - } + if (IO_IS_ALLOCATING(pio)) + BP_ZERO(pio->io_bp); /* * As we reexecute pio's children, new children could be created. 
- * New children go to the head of the io_child list, however, + * New children go to the head of pio's io_child_list, however, * so we will (correctly) not reexecute them. The key is that - * the remainder of the io_child list, from 'zio_next' onward, - * cannot be affected by any side effects of reexecuting 'zio'. + * the remainder of pio's io_child_list, from 'cio_next' onward, + * cannot be affected by any side effects of reexecuting 'cio'. */ - for (zio = pio->io_child; zio != NULL; zio = zio_next) { - zio_next = zio->io_sibling_next; + for (cio = zio_walk_children(pio); cio != NULL; cio = cio_next) { + cio_next = zio_walk_children(pio); mutex_enter(&pio->io_lock); - pio->io_children[zio->io_child_type][ZIO_WAIT_READY]++; - pio->io_children[zio->io_child_type][ZIO_WAIT_DONE]++; + for (int w = 0; w < ZIO_WAIT_TYPES; w++) + pio->io_children[cio->io_child_type][w]++; mutex_exit(&pio->io_lock); - zio_reexecute(zio); + zio_reexecute(cio); } /* * Now that all children have been reexecuted, execute the parent. + * We don't reexecute "The Godfather" I/O here as it's the + * responsibility of the caller to wait on him. */ - zio_execute(pio); + if (!(pio->io_flags & ZIO_FLAG_GODFATHER)) + zio_execute(pio); } void @@ -1103,14 +1279,17 @@ zio_suspend(spa_t *spa, zio_t *zio) mutex_enter(&spa->spa_suspend_lock); if (spa->spa_suspend_zio_root == NULL) - spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL, 0); + spa->spa_suspend_zio_root = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | + ZIO_FLAG_GODFATHER); spa->spa_suspended = B_TRUE; if (zio != NULL) { + ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); ASSERT(zio != spa->spa_suspend_zio_root); ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); - ASSERT(zio->io_parent == NULL); + ASSERT(zio_unique_parent(zio) == NULL); ASSERT(zio->io_stage == ZIO_STAGE_DONE); zio_add_child(spa->spa_suspend_zio_root, zio); } @@ -1118,10 +1297,10 @@ zio_suspend(spa_t *spa, zio_t *zio) mutex_exit(&spa->spa_suspend_lock); } -void +int zio_resume(spa_t *spa) { - zio_t *pio, *zio; + zio_t *pio; /* * Reexecute all previously suspended i/o. @@ -1134,17 +1313,10 @@ zio_resume(spa_t *spa) mutex_exit(&spa->spa_suspend_lock); if (pio == NULL) - return; - - while ((zio = pio->io_child) != NULL) { - zio_remove_child(pio, zio); - zio->io_parent = NULL; - zio_reexecute(zio); - } - - ASSERT(pio->io_children[ZIO_CHILD_LOGICAL][ZIO_WAIT_DONE] == 0); + return (0); - (void) zio_wait(pio); + zio_reexecute(pio); + return (zio_wait(pio)); } void @@ -1251,10 +1423,16 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) * (Presently, nothing actually uses interior data checksums; * this is just good hygiene.) */ - if (gn != pio->io_logical->io_gang_tree) { + if (gn != pio->io_gang_leader->io_gang_tree) { zio_checksum_compute(zio, BP_GET_CHECKSUM(bp), data, BP_GET_PSIZE(bp)); } + /* + * If we are here to damage data for testing purposes, + * leave the GBH alone so that we can detect the damage. 
+ */ + if (pio->io_gang_leader->io_flags & ZIO_FLAG_INDUCE_DAMAGE) + zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; } else { zio = zio_rewrite(pio, pio->io_spa, pio->io_txg, bp, data, BP_GET_PSIZE(bp), NULL, NULL, pio->io_priority, @@ -1268,8 +1446,8 @@ zio_rewrite_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) zio_t * zio_free_gang(zio_t *pio, blkptr_t *bp, zio_gang_node_t *gn, void *data) { - return (zio_free(pio, pio->io_spa, pio->io_txg, bp, - NULL, NULL, ZIO_GANG_CHILD_FLAGS(pio))); + return (zio_free_sync(pio, pio->io_spa, pio->io_txg, bp, + ZIO_GANG_CHILD_FLAGS(pio))); } /* ARGSUSED */ @@ -1333,27 +1511,27 @@ zio_gang_tree_free(zio_gang_node_t **gnpp) } static void -zio_gang_tree_assemble(zio_t *lio, blkptr_t *bp, zio_gang_node_t **gnpp) +zio_gang_tree_assemble(zio_t *gio, blkptr_t *bp, zio_gang_node_t **gnpp) { zio_gang_node_t *gn = zio_gang_node_alloc(gnpp); - ASSERT(lio->io_logical == lio); + ASSERT(gio->io_gang_leader == gio); ASSERT(BP_IS_GANG(bp)); - zio_nowait(zio_read(lio, lio->io_spa, bp, gn->gn_gbh, + zio_nowait(zio_read(gio, gio->io_spa, bp, gn->gn_gbh, SPA_GANGBLOCKSIZE, zio_gang_tree_assemble_done, gn, - lio->io_priority, ZIO_GANG_CHILD_FLAGS(lio), &lio->io_bookmark)); + gio->io_priority, ZIO_GANG_CHILD_FLAGS(gio), &gio->io_bookmark)); } static void zio_gang_tree_assemble_done(zio_t *zio) { - zio_t *lio = zio->io_logical; + zio_t *gio = zio->io_gang_leader; zio_gang_node_t *gn = zio->io_private; blkptr_t *bp = zio->io_bp; - ASSERT(zio->io_parent == lio); - ASSERT(zio->io_child == NULL); + ASSERT(gio == zio_unique_parent(zio)); + ASSERT(zio->io_child_count == 0); if (zio->io_error) return; @@ -1363,34 +1541,34 @@ zio_gang_tree_assemble_done(zio_t *zio) ASSERT(zio->io_data == gn->gn_gbh); ASSERT(zio->io_size == SPA_GANGBLOCKSIZE); - ASSERT(gn->gn_gbh->zg_tail.zbt_magic == ZBT_MAGIC); + ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; if (!BP_IS_GANG(gbp)) continue; - zio_gang_tree_assemble(lio, gbp, &gn->gn_child[g]); + zio_gang_tree_assemble(gio, gbp, &gn->gn_child[g]); } } static void zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data) { - zio_t *lio = pio->io_logical; + zio_t *gio = pio->io_gang_leader; zio_t *zio; ASSERT(BP_IS_GANG(bp) == !!gn); - ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(lio->io_bp)); - ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == lio->io_gang_tree); + ASSERT(BP_GET_CHECKSUM(bp) == BP_GET_CHECKSUM(gio->io_bp)); + ASSERT(BP_GET_LSIZE(bp) == BP_GET_PSIZE(bp) || gn == gio->io_gang_tree); /* * If you're a gang header, your data is in gn->gn_gbh. * If you're a gang member, your data is in 'data' and gn == NULL. 
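For orientation on the walk used here: each gang header carries an array of block pointers, any of which may itself be a gang header, and zio_gang_tree_issue() recurses over gn_child[] to reach every member block. A toy depth-first walk over a hand-built two-level tree; the structures are simplified stand-ins, not zio_gang_node_t:

#include <stdio.h>

#define	NPTRS	3	/* stands in for SPA_GBH_NBLKPTRS */

struct gang_node {
	struct gang_node *child[NPTRS];	/* non-NULL where that pointer is itself a gang header */
	int member_id[NPTRS];		/* stands in for the member block pointers */
};

/* depth-first issue over the tree, in the spirit of zio_gang_tree_issue() */
static void
gang_issue(const struct gang_node *gn, void (*func)(int))
{
	for (int g = 0; g < NPTRS; g++) {
		if (gn->child[g] != NULL)
			gang_issue(gn->child[g], func);
		else
			func(gn->member_id[g]);
	}
}

static void
read_member(int id)
{
	printf("reading gang member %d\n", id);
}

int
main(void)
{
	struct gang_node inner = { { NULL, NULL, NULL }, { 4, 5, 6 } };
	struct gang_node root = { { NULL, &inner, NULL }, { 1, 0, 3 } };

	gang_issue(&root, read_member);
	return (0);
}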
*/ - zio = zio_gang_issue_func[lio->io_type](pio, bp, gn, data); + zio = zio_gang_issue_func[gio->io_type](pio, bp, gn, data); if (gn != NULL) { - ASSERT(gn->gn_gbh->zg_tail.zbt_magic == ZBT_MAGIC); + ASSERT(gn->gn_gbh->zg_tail.zec_magic == ZEC_MAGIC); for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { blkptr_t *gbp = &gn->gn_gbh->zg_blkptr[g]; @@ -1401,8 +1579,8 @@ zio_gang_tree_issue(zio_t *pio, zio_gang_node_t *gn, blkptr_t *bp, void *data) } } - if (gn == lio->io_gang_tree) - ASSERT3P((char *)lio->io_data + lio->io_size, ==, data); + if (gn == gio->io_gang_tree) + ASSERT3P((char *)gio->io_data + gio->io_size, ==, data); if (zio != pio) zio_nowait(zio); @@ -1413,7 +1591,10 @@ zio_gang_assemble(zio_t *zio) { blkptr_t *bp = zio->io_bp; - ASSERT(BP_IS_GANG(bp) && zio == zio->io_logical); + ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == NULL); + ASSERT(zio->io_child_type > ZIO_CHILD_GANG); + + zio->io_gang_leader = zio; zio_gang_tree_assemble(zio, bp, &zio->io_gang_tree); @@ -1423,18 +1604,18 @@ zio_gang_assemble(zio_t *zio) static int zio_gang_issue(zio_t *zio) { - zio_t *lio = zio->io_logical; blkptr_t *bp = zio->io_bp; if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE)) return (ZIO_PIPELINE_STOP); - ASSERT(BP_IS_GANG(bp) && zio == lio); + ASSERT(BP_IS_GANG(bp) && zio->io_gang_leader == zio); + ASSERT(zio->io_child_type > ZIO_CHILD_GANG); if (zio->io_child_error[ZIO_CHILD_GANG] == 0) - zio_gang_tree_issue(lio, lio->io_gang_tree, bp, lio->io_data); + zio_gang_tree_issue(zio, zio->io_gang_tree, bp, zio->io_data); else - zio_gang_tree_free(&lio->io_gang_tree); + zio_gang_tree_free(&zio->io_gang_tree); zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; @@ -1444,8 +1625,8 @@ zio_gang_issue(zio_t *zio) static void zio_write_gang_member_ready(zio_t *zio) { - zio_t *pio = zio->io_parent; - zio_t *lio = zio->io_logical; + zio_t *pio = zio_unique_parent(zio); + zio_t *gio = zio->io_gang_leader; dva_t *cdva = zio->io_bp->blk_dva; dva_t *pdva = pio->io_bp->blk_dva; uint64_t asize; @@ -1456,9 +1637,9 @@ zio_write_gang_member_ready(zio_t *zio) ASSERT(BP_IS_HOLE(&zio->io_bp_orig)); ASSERT(zio->io_child_type == ZIO_CHILD_GANG); - ASSERT3U(zio->io_prop.zp_ndvas, ==, lio->io_prop.zp_ndvas); - ASSERT3U(zio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(zio->io_bp)); - ASSERT3U(pio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(pio->io_bp)); + ASSERT3U(zio->io_prop.zp_copies, ==, gio->io_prop.zp_copies); + ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(zio->io_bp)); + ASSERT3U(pio->io_prop.zp_copies, <=, BP_GET_NDVAS(pio->io_bp)); ASSERT3U(BP_GET_NDVAS(zio->io_bp), <=, BP_GET_NDVAS(pio->io_bp)); mutex_enter(&pio->io_lock); @@ -1476,28 +1657,28 @@ zio_write_gang_block(zio_t *pio) { spa_t *spa = pio->io_spa; blkptr_t *bp = pio->io_bp; - zio_t *lio = pio->io_logical; + zio_t *gio = pio->io_gang_leader; zio_t *zio; zio_gang_node_t *gn, **gnpp; zio_gbh_phys_t *gbh; uint64_t txg = pio->io_txg; uint64_t resid = pio->io_size; uint64_t lsize; - int ndvas = lio->io_prop.zp_ndvas; - int gbh_ndvas = MIN(ndvas + 1, spa_max_replication(spa)); + int copies = gio->io_prop.zp_copies; + int gbh_copies = MIN(copies + 1, spa_max_replication(spa)); zio_prop_t zp; int error; - error = metaslab_alloc(spa, spa->spa_normal_class, SPA_GANGBLOCKSIZE, - bp, gbh_ndvas, txg, pio == lio ? NULL : lio->io_bp, + error = metaslab_alloc(spa, spa_normal_class(spa), SPA_GANGBLOCKSIZE, + bp, gbh_copies, txg, pio == gio ? 
NULL : gio->io_bp, METASLAB_HINTBP_FAVOR | METASLAB_GANG_HEADER); if (error) { pio->io_error = error; return (ZIO_PIPELINE_CONTINUE); } - if (pio == lio) { - gnpp = &lio->io_gang_tree; + if (pio == gio) { + gnpp = &gio->io_gang_tree; } else { gnpp = pio->io_private; ASSERT(pio->io_ready == zio_write_gang_member_ready); @@ -1521,11 +1702,13 @@ zio_write_gang_block(zio_t *pio) SPA_MINBLOCKSIZE); ASSERT(lsize >= SPA_MINBLOCKSIZE && lsize <= resid); - zp.zp_checksum = lio->io_prop.zp_checksum; + zp.zp_checksum = gio->io_prop.zp_checksum; zp.zp_compress = ZIO_COMPRESS_OFF; zp.zp_type = DMU_OT_NONE; zp.zp_level = 0; - zp.zp_ndvas = lio->io_prop.zp_ndvas; + zp.zp_copies = gio->io_prop.zp_copies; + zp.zp_dedup = 0; + zp.zp_dedup_verify = 0; zio_nowait(zio_write(zio, spa, txg, &gbh->zg_blkptr[g], (char *)pio->io_data + (pio->io_size - resid), lsize, &zp, @@ -1546,26 +1729,397 @@ zio_write_gang_block(zio_t *pio) /* * ========================================================================== - * Allocate and free blocks + * Dedup * ========================================================================== */ +static void +zio_ddt_child_read_done(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + ddt_entry_t *dde = zio->io_private; + ddt_phys_t *ddp; + zio_t *pio = zio_unique_parent(zio); + + mutex_enter(&pio->io_lock); + ddp = ddt_phys_select(dde, bp); + if (zio->io_error == 0) + ddt_phys_clear(ddp); /* this ddp doesn't need repair */ + if (zio->io_error == 0 && dde->dde_repair_data == NULL) + dde->dde_repair_data = zio->io_data; + else + zio_buf_free(zio->io_data, zio->io_size); + mutex_exit(&pio->io_lock); +} + +static int +zio_ddt_read_start(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + + ASSERT(BP_GET_DEDUP(bp)); + ASSERT(BP_GET_PSIZE(bp) == zio->io_size); + ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); + + if (zio->io_child_error[ZIO_CHILD_DDT]) { + ddt_t *ddt = ddt_select(zio->io_spa, bp); + ddt_entry_t *dde = ddt_repair_start(ddt, bp); + ddt_phys_t *ddp = dde->dde_phys; + ddt_phys_t *ddp_self = ddt_phys_select(dde, bp); + blkptr_t blk; + + ASSERT(zio->io_vsd == NULL); + zio->io_vsd = dde; + + if (ddp_self == NULL) + return (ZIO_PIPELINE_CONTINUE); + + for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { + if (ddp->ddp_phys_birth == 0 || ddp == ddp_self) + continue; + ddt_bp_create(ddt->ddt_checksum, &dde->dde_key, ddp, + &blk); + zio_nowait(zio_read(zio, zio->io_spa, &blk, + zio_buf_alloc(zio->io_size), zio->io_size, + zio_ddt_child_read_done, dde, zio->io_priority, + ZIO_DDT_CHILD_FLAGS(zio) | ZIO_FLAG_DONT_PROPAGATE, + &zio->io_bookmark)); + } + return (ZIO_PIPELINE_CONTINUE); + } + + zio_nowait(zio_read(zio, zio->io_spa, bp, + zio->io_data, zio->io_size, NULL, NULL, zio->io_priority, + ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark)); + + return (ZIO_PIPELINE_CONTINUE); +} + +static int +zio_ddt_read_done(zio_t *zio) +{ + blkptr_t *bp = zio->io_bp; + + if (zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE)) + return (ZIO_PIPELINE_STOP); + + ASSERT(BP_GET_DEDUP(bp)); + ASSERT(BP_GET_PSIZE(bp) == zio->io_size); + ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); + + if (zio->io_child_error[ZIO_CHILD_DDT]) { + ddt_t *ddt = ddt_select(zio->io_spa, bp); + ddt_entry_t *dde = zio->io_vsd; + if (ddt == NULL) { + ASSERT(spa_load_state(zio->io_spa) != SPA_LOAD_NONE); + return (ZIO_PIPELINE_CONTINUE); + } + if (dde == NULL) { + zio->io_stage = ZIO_STAGE_DDT_READ_START >> 1; + zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, B_FALSE); + return (ZIO_PIPELINE_STOP); + } + if (dde->dde_repair_data != NULL) { + 
bcopy(dde->dde_repair_data, zio->io_data, zio->io_size); + zio->io_child_error[ZIO_CHILD_DDT] = 0; + } + ddt_repair_done(ddt, dde); + zio->io_vsd = NULL; + } + + ASSERT(zio->io_vsd == NULL); + + return (ZIO_PIPELINE_CONTINUE); +} + +static boolean_t +zio_ddt_collision(zio_t *zio, ddt_t *ddt, ddt_entry_t *dde) +{ + spa_t *spa = zio->io_spa; + + /* + * Note: we compare the original data, not the transformed data, + * because when zio->io_bp is an override bp, we will not have + * pushed the I/O transforms. That's an important optimization + * because otherwise we'd compress/encrypt all dmu_sync() data twice. + */ + for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { + zio_t *lio = dde->dde_lead_zio[p]; + + if (lio != NULL) { + return (lio->io_orig_size != zio->io_orig_size || + bcmp(zio->io_orig_data, lio->io_orig_data, + zio->io_orig_size) != 0); + } + } + + for (int p = DDT_PHYS_SINGLE; p <= DDT_PHYS_TRIPLE; p++) { + ddt_phys_t *ddp = &dde->dde_phys[p]; + + if (ddp->ddp_phys_birth != 0) { + arc_buf_t *abuf = NULL; + uint32_t aflags = ARC_WAIT; + blkptr_t blk = *zio->io_bp; + int error; + + ddt_bp_fill(ddp, &blk, ddp->ddp_phys_birth); + + ddt_exit(ddt); + + error = arc_read_nolock(NULL, spa, &blk, + arc_getbuf_func, &abuf, ZIO_PRIORITY_SYNC_READ, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE, + &aflags, &zio->io_bookmark); + + if (error == 0) { + if (arc_buf_size(abuf) != zio->io_orig_size || + bcmp(abuf->b_data, zio->io_orig_data, + zio->io_orig_size) != 0) + error = EEXIST; + VERIFY(arc_buf_remove_ref(abuf, &abuf) == 1); + } + + ddt_enter(ddt); + return (error != 0); + } + } + + return (B_FALSE); +} + +static void +zio_ddt_child_write_ready(zio_t *zio) +{ + int p = zio->io_prop.zp_copies; + ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); + ddt_entry_t *dde = zio->io_private; + ddt_phys_t *ddp = &dde->dde_phys[p]; + zio_t *pio; + + if (zio->io_error) + return; + + ddt_enter(ddt); + + ASSERT(dde->dde_lead_zio[p] == zio); + + ddt_phys_fill(ddp, zio->io_bp); + + while ((pio = zio_walk_parents(zio)) != NULL) + ddt_bp_fill(ddp, pio->io_bp, zio->io_txg); + + ddt_exit(ddt); +} + +static void +zio_ddt_child_write_done(zio_t *zio) +{ + int p = zio->io_prop.zp_copies; + ddt_t *ddt = ddt_select(zio->io_spa, zio->io_bp); + ddt_entry_t *dde = zio->io_private; + ddt_phys_t *ddp = &dde->dde_phys[p]; + + ddt_enter(ddt); + + ASSERT(ddp->ddp_refcnt == 0); + ASSERT(dde->dde_lead_zio[p] == zio); + dde->dde_lead_zio[p] = NULL; + + if (zio->io_error == 0) { + while (zio_walk_parents(zio) != NULL) + ddt_phys_addref(ddp); + } else { + ddt_phys_clear(ddp); + } + + ddt_exit(ddt); +} + +static void +zio_ddt_ditto_write_done(zio_t *zio) +{ + int p = DDT_PHYS_DITTO; + zio_prop_t *zp = &zio->io_prop; + blkptr_t *bp = zio->io_bp; + ddt_t *ddt = ddt_select(zio->io_spa, bp); + ddt_entry_t *dde = zio->io_private; + ddt_phys_t *ddp = &dde->dde_phys[p]; + ddt_key_t *ddk = &dde->dde_key; + + ddt_enter(ddt); + + ASSERT(ddp->ddp_refcnt == 0); + ASSERT(dde->dde_lead_zio[p] == zio); + dde->dde_lead_zio[p] = NULL; + + if (zio->io_error == 0) { + ASSERT(ZIO_CHECKSUM_EQUAL(bp->blk_cksum, ddk->ddk_cksum)); + ASSERT(zp->zp_copies < SPA_DVAS_PER_BP); + ASSERT(zp->zp_copies == BP_GET_NDVAS(bp) - BP_IS_GANG(bp)); + if (ddp->ddp_phys_birth != 0) + ddt_phys_free(ddt, ddk, ddp, zio->io_txg); + ddt_phys_fill(ddp, bp); + } + + ddt_exit(ddt); +} +static int +zio_ddt_write(zio_t *zio) +{ + spa_t *spa = zio->io_spa; + blkptr_t *bp = zio->io_bp; + uint64_t txg = zio->io_txg; + zio_prop_t *zp = &zio->io_prop; + int p = zp->zp_copies; + int 
ditto_copies; + zio_t *cio = NULL; + zio_t *dio = NULL; + ddt_t *ddt = ddt_select(spa, bp); + ddt_entry_t *dde; + ddt_phys_t *ddp; + + ASSERT(BP_GET_DEDUP(bp)); + ASSERT(BP_GET_CHECKSUM(bp) == zp->zp_checksum); + ASSERT(BP_IS_HOLE(bp) || zio->io_bp_override); + + ddt_enter(ddt); + dde = ddt_lookup(ddt, bp, B_TRUE); + ddp = &dde->dde_phys[p]; + + if (zp->zp_dedup_verify && zio_ddt_collision(zio, ddt, dde)) { + /* + * If we're using a weak checksum, upgrade to a strong checksum + * and try again. If we're already using a strong checksum, + * we can't resolve it, so just convert to an ordinary write. + * (And automatically e-mail a paper to Nature?) + */ + if (!zio_checksum_table[zp->zp_checksum].ci_dedup) { + zp->zp_checksum = spa_dedup_checksum(spa); + zio_pop_transforms(zio); + zio->io_stage = ZIO_STAGE_OPEN; + BP_ZERO(bp); + } else { + zp->zp_dedup = 0; + } + zio->io_pipeline = ZIO_WRITE_PIPELINE; + ddt_exit(ddt); + return (ZIO_PIPELINE_CONTINUE); + } + + ditto_copies = ddt_ditto_copies_needed(ddt, dde, ddp); + ASSERT(ditto_copies < SPA_DVAS_PER_BP); + + if (ditto_copies > ddt_ditto_copies_present(dde) && + dde->dde_lead_zio[DDT_PHYS_DITTO] == NULL) { + zio_prop_t czp = *zp; + + czp.zp_copies = ditto_copies; + + /* + * If we arrived here with an override bp, we won't have run + * the transform stack, so we won't have the data we need to + * generate a child i/o. So, toss the override bp and restart. + * This is safe, because using the override bp is just an + * optimization; and it's rare, so the cost doesn't matter. + */ + if (zio->io_bp_override) { + zio_pop_transforms(zio); + zio->io_stage = ZIO_STAGE_OPEN; + zio->io_pipeline = ZIO_WRITE_PIPELINE; + zio->io_bp_override = NULL; + BP_ZERO(bp); + ddt_exit(ddt); + return (ZIO_PIPELINE_CONTINUE); + } + + dio = zio_write(zio, spa, txg, bp, zio->io_orig_data, + zio->io_orig_size, &czp, NULL, + zio_ddt_ditto_write_done, dde, zio->io_priority, + ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); + + zio_push_transform(dio, zio->io_data, zio->io_size, 0, NULL); + dde->dde_lead_zio[DDT_PHYS_DITTO] = dio; + } + + if (ddp->ddp_phys_birth != 0 || dde->dde_lead_zio[p] != NULL) { + if (ddp->ddp_phys_birth != 0) + ddt_bp_fill(ddp, bp, txg); + if (dde->dde_lead_zio[p] != NULL) + zio_add_child(zio, dde->dde_lead_zio[p]); + else + ddt_phys_addref(ddp); + } else if (zio->io_bp_override) { + ASSERT(bp->blk_birth == txg); + ASSERT(BP_EQUAL(bp, zio->io_bp_override)); + ddt_phys_fill(ddp, bp); + ddt_phys_addref(ddp); + } else { + cio = zio_write(zio, spa, txg, bp, zio->io_orig_data, + zio->io_orig_size, zp, zio_ddt_child_write_ready, + zio_ddt_child_write_done, dde, zio->io_priority, + ZIO_DDT_CHILD_FLAGS(zio), &zio->io_bookmark); + + zio_push_transform(cio, zio->io_data, zio->io_size, 0, NULL); + dde->dde_lead_zio[p] = cio; + } + + ddt_exit(ddt); + + if (cio) + zio_nowait(cio); + if (dio) + zio_nowait(dio); + + return (ZIO_PIPELINE_CONTINUE); +} + +static int +zio_ddt_free(zio_t *zio) +{ + spa_t *spa = zio->io_spa; + blkptr_t *bp = zio->io_bp; + ddt_t *ddt = ddt_select(spa, bp); + ddt_entry_t *dde; + ddt_phys_t *ddp; + + ASSERT(BP_GET_DEDUP(bp)); + ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); + + ddt_enter(ddt); + dde = ddt_lookup(ddt, bp, B_TRUE); + ddp = ddt_phys_select(dde, bp); + ddt_phys_decref(ddp); + ddt_exit(ddt); + + return (ZIO_PIPELINE_CONTINUE); +} + +/* + * ========================================================================== + * Allocate and free blocks + * ========================================================================== + */ 
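Summarizing how zio_ddt_write() above resolves: a dedup-verify collision drops back to an ordinary write (after upgrading a weak checksum to the pool's dedup checksum when possible), an entry already on disk or already being written is merely referenced, and only the first writer issues a real child write. A simplified decision sketch with booleans standing in for the ddt_entry_t state; the locking and ditto-copy handling are omitted:

#include <stdio.h>
#include <stdbool.h>

enum ddt_action {
	DDT_FALL_BACK_TO_WRITE,		/* verify found different data behind the same checksum */
	DDT_REFERENCE_EXISTING,		/* block already on disk: bump the refcount, fill the bp */
	DDT_WAIT_FOR_LEAD_IO,		/* another zio is writing it: become a child of that zio */
	DDT_ISSUE_CHILD_WRITE		/* first writer: do the real write, later writers reference it */
};

static enum ddt_action
ddt_write_action(bool verify_collision, bool phys_on_disk, bool lead_io_in_flight)
{
	if (verify_collision)
		return (DDT_FALL_BACK_TO_WRITE);
	if (phys_on_disk)
		return (DDT_REFERENCE_EXISTING);
	if (lead_io_in_flight)
		return (DDT_WAIT_FOR_LEAD_IO);
	return (DDT_ISSUE_CHILD_WRITE);
}

int
main(void)
{
	printf("%d\n", ddt_write_action(false, false, true));	/* prints 2 */
	return (0);
}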
static int zio_dva_allocate(zio_t *zio) { spa_t *spa = zio->io_spa; - metaslab_class_t *mc = spa->spa_normal_class; + metaslab_class_t *mc = spa_normal_class(spa); blkptr_t *bp = zio->io_bp; int error; + if (zio->io_gang_leader == NULL) { + ASSERT(zio->io_child_type > ZIO_CHILD_GANG); + zio->io_gang_leader = zio; + } + ASSERT(BP_IS_HOLE(bp)); ASSERT3U(BP_GET_NDVAS(bp), ==, 0); - ASSERT3U(zio->io_prop.zp_ndvas, >, 0); - ASSERT3U(zio->io_prop.zp_ndvas, <=, spa_max_replication(spa)); + ASSERT3U(zio->io_prop.zp_copies, >, 0); + ASSERT3U(zio->io_prop.zp_copies, <=, spa_max_replication(spa)); ASSERT3U(zio->io_size, ==, BP_GET_PSIZE(bp)); error = metaslab_alloc(spa, mc, zio->io_size, bp, - zio->io_prop.zp_ndvas, zio->io_txg, NULL, 0); + zio->io_prop.zp_copies, zio->io_txg, NULL, 0); if (error) { if (error == ENOSPC && zio->io_size > SPA_MINBLOCKSIZE) @@ -1604,36 +2158,11 @@ zio_dva_claim(zio_t *zio) static void zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) { - spa_t *spa = zio->io_spa; - boolean_t now = !(zio->io_flags & ZIO_FLAG_IO_REWRITE); - ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); - - if (zio->io_bp == bp && !now) { - /* - * This is a rewrite for sync-to-convergence. - * We can't do a metaslab_free(NOW) because bp wasn't allocated - * during this sync pass, which means that metaslab_sync() - * already committed the allocation. - */ - ASSERT(DVA_EQUAL(BP_IDENTITY(bp), - BP_IDENTITY(&zio->io_bp_orig))); - ASSERT(spa_sync_pass(spa) > 1); - - if (BP_IS_GANG(bp) && gn == NULL) { - /* - * This is a gang leader whose gang header(s) we - * couldn't read now, so defer the free until later. - * The block should still be intact because without - * the headers, we'd never even start the rewrite. - */ - bplist_enqueue_deferred(&spa->spa_sync_bplist, bp); - return; - } - } + ASSERT(zio->io_bp_override == NULL); if (!BP_IS_HOLE(bp)) - metaslab_free(spa, bp, bp->blk_birth, now); + metaslab_free(zio->io_spa, bp, bp->blk_birth, B_TRUE); if (gn != NULL) { for (int g = 0; g < SPA_GBH_NBLKPTRS; g++) { @@ -1647,25 +2176,31 @@ zio_dva_unallocate(zio_t *zio, zio_gang_node_t *gn, blkptr_t *bp) * Try to allocate an intent log block. Return 0 on success, errno on failure. */ int -zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, blkptr_t *old_bp, - uint64_t txg) +zio_alloc_zil(spa_t *spa, uint64_t txg, blkptr_t *new_bp, blkptr_t *old_bp, + uint64_t size, boolean_t use_slog) { - int error; + int error = 1; + + ASSERT(txg > spa_syncing_txg(spa)); - error = metaslab_alloc(spa, spa->spa_log_class, size, - new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID); + if (use_slog) + error = metaslab_alloc(spa, spa_log_class(spa), size, + new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID); if (error) - error = metaslab_alloc(spa, spa->spa_normal_class, size, + error = metaslab_alloc(spa, spa_normal_class(spa), size, new_bp, 1, txg, old_bp, METASLAB_HINTBP_AVOID); if (error == 0) { BP_SET_LSIZE(new_bp, size); BP_SET_PSIZE(new_bp, size); BP_SET_COMPRESS(new_bp, ZIO_COMPRESS_OFF); - BP_SET_CHECKSUM(new_bp, ZIO_CHECKSUM_ZILOG); + BP_SET_CHECKSUM(new_bp, + spa_version(spa) >= SPA_VERSION_SLIM_ZIL + ? ZIO_CHECKSUM_ZILOG2 : ZIO_CHECKSUM_ZILOG); BP_SET_TYPE(new_bp, DMU_OT_INTENT_LOG); BP_SET_LEVEL(new_bp, 0); + BP_SET_DEDUP(new_bp, 0); BP_SET_BYTEORDER(new_bp, ZFS_HOST_BYTEORDER); } @@ -1673,15 +2208,15 @@ zio_alloc_blk(spa_t *spa, uint64_t size, blkptr_t *new_bp, blkptr_t *old_bp, } /* - * Free an intent log block. We know it can't be a gang block, so there's - * nothing to do except metaslab_free() it. 
+ * Free an intent log block. */ void -zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg) +zio_free_zil(spa_t *spa, uint64_t txg, blkptr_t *bp) { + ASSERT(BP_GET_TYPE(bp) == DMU_OT_INTENT_LOG); ASSERT(!BP_IS_GANG(bp)); - metaslab_free(spa, bp, txg, B_FALSE); + zio_free(spa, txg, bp); } /* @@ -1689,72 +2224,6 @@ zio_free_blk(spa_t *spa, blkptr_t *bp, uint64_t txg) * Read and write to physical devices * ========================================================================== */ - -static void -zio_vdev_io_probe_done(zio_t *zio) -{ - zio_t *dio; - vdev_t *vd = zio->io_private; - - mutex_enter(&vd->vdev_probe_lock); - ASSERT(vd->vdev_probe_zio == zio); - vd->vdev_probe_zio = NULL; - mutex_exit(&vd->vdev_probe_lock); - - while ((dio = zio->io_delegate_list) != NULL) { - zio->io_delegate_list = dio->io_delegate_next; - dio->io_delegate_next = NULL; - if (!vdev_accessible(vd, dio)) - dio->io_error = ENXIO; - zio_execute(dio); - } -} - -/* - * Probe the device to determine whether I/O failure is specific to this - * zio (e.g. a bad sector) or affects the entire vdev (e.g. unplugged). - */ -static int -zio_vdev_io_probe(zio_t *zio) -{ - vdev_t *vd = zio->io_vd; - zio_t *pio = NULL; - boolean_t created_pio = B_FALSE; - - /* - * Don't probe the probe. - */ - if (zio->io_flags & ZIO_FLAG_PROBE) - return (ZIO_PIPELINE_CONTINUE); - - /* - * To prevent 'probe storms' when a device fails, we create - * just one probe i/o at a time. All zios that want to probe - * this vdev will join the probe zio's io_delegate_list. - */ - mutex_enter(&vd->vdev_probe_lock); - - if ((pio = vd->vdev_probe_zio) == NULL) { - vd->vdev_probe_zio = pio = zio_root(zio->io_spa, - zio_vdev_io_probe_done, vd, ZIO_FLAG_CANFAIL); - created_pio = B_TRUE; - vd->vdev_probe_wanted = B_TRUE; - spa_async_request(zio->io_spa, SPA_ASYNC_PROBE); - } - - zio->io_delegate_next = pio->io_delegate_list; - pio->io_delegate_list = zio; - - mutex_exit(&vd->vdev_probe_lock); - - if (created_pio) { - zio_nowait(vdev_probe(vd, pio)); - zio_nowait(pio); - } - - return (ZIO_PIPELINE_STOP); -} - static int zio_vdev_io_start(zio_t *zio) { @@ -1790,13 +2259,35 @@ zio_vdev_io_start(zio_t *zio) ASSERT(P2PHASE(zio->io_offset, align) == 0); ASSERT(P2PHASE(zio->io_size, align) == 0); - ASSERT(zio->io_type != ZIO_TYPE_WRITE || (spa_mode & FWRITE)); + ASSERT(zio->io_type != ZIO_TYPE_WRITE || spa_writeable(spa)); + + /* + * If this is a repair I/O, and there's no self-healing involved -- + * that is, we're just resilvering what we expect to resilver -- + * then don't do the I/O unless zio's txg is actually in vd's DTL. + * This prevents spurious resilvering with nested replication. + * For example, given a mirror of mirrors, (A+B)+(C+D), if only + * A is out of date, we'll read from C+D, then use the data to + * resilver A+B -- but we don't actually want to resilver B, just A. + * The top-level mirror has no way to know this, so instead we just + * discard unnecessary repairs as we work our way down the vdev tree. + * The same logic applies to any form of nested replication: + * ditto + mirror, RAID-Z + replacing, etc. This covers them all. 
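The DTL test above is the entire mechanism for discarding unneeded nested repairs: a repair write is accepted only when this vdev's dirty-time log actually covers the txg being repaired. A toy version with the DTL reduced to an array of inclusive txg ranges (real DTLs are space maps):

#include <stdio.h>
#include <stdbool.h>
#include <stdint.h>

/* a toy DTL: inclusive txg ranges during which this vdev missed writes */
struct txg_range {
	uint64_t start;
	uint64_t end;
};

static bool
dtl_contains(const struct txg_range *dtl, int n, uint64_t txg)
{
	for (int i = 0; i < n; i++)
		if (txg >= dtl[i].start && txg <= dtl[i].end)
			return (true);
	return (false);
}

int
main(void)
{
	struct txg_range dtl[] = { { 100, 120 }, { 300, 310 } };
	uint64_t repair_txg = 200;

	/* only take the repair write if this vdev actually missed that txg */
	if (!dtl_contains(dtl, 2, repair_txg))
		printf("txg %llu not in DTL: bypass the repair\n",
		    (unsigned long long)repair_txg);
	return (0);
}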
+ */ + if ((zio->io_flags & ZIO_FLAG_IO_REPAIR) && + !(zio->io_flags & ZIO_FLAG_SELF_HEAL) && + zio->io_txg != 0 && /* not a delegated i/o */ + !vdev_dtl_contains(vd, DTL_PARTIAL, zio->io_txg, 1)) { + ASSERT(zio->io_type == ZIO_TYPE_WRITE); + zio_vdev_io_bypass(zio); + return (ZIO_PIPELINE_CONTINUE); + } if (vd->vdev_ops->vdev_op_leaf && (zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_WRITE)) { if (zio->io_type == ZIO_TYPE_READ && vdev_cache_read(zio) == 0) - return (ZIO_PIPELINE_STOP); + return (ZIO_PIPELINE_CONTINUE); if ((zio = vdev_queue_io(zio)) == NULL) return (ZIO_PIPELINE_STOP); @@ -1806,7 +2297,6 @@ zio_vdev_io_start(zio_t *zio) zio_interrupt(zio); return (ZIO_PIPELINE_STOP); } - } return (vd->vdev_ops->vdev_op_io_start(zio)); @@ -1832,7 +2322,8 @@ zio_vdev_io_done(zio_t *zio) vdev_cache_write(zio); if (zio_injection_enabled && zio->io_error == 0) - zio->io_error = zio_handle_device_injection(vd, EIO); + zio->io_error = zio_handle_device_injection(vd, + zio, EIO); if (zio_injection_enabled && zio->io_error == 0) zio->io_error = zio_handle_label_injection(zio, EIO); @@ -1849,11 +2340,37 @@ zio_vdev_io_done(zio_t *zio) ops->vdev_op_io_done(zio); if (unexpected_error) - return (zio_vdev_io_probe(zio)); + VERIFY(vdev_probe(vd, zio) == NULL); return (ZIO_PIPELINE_CONTINUE); } +/* + * For non-raidz ZIOs, we can just copy aside the bad data read from the + * disk, and use that to finish the checksum ereport later. + */ +static void +zio_vsd_default_cksum_finish(zio_cksum_report_t *zcr, + const void *good_buf) +{ + /* no processing needed */ + zfs_ereport_finish_checksum(zcr, good_buf, zcr->zcr_cbdata, B_FALSE); +} + +/*ARGSUSED*/ +void +zio_vsd_default_cksum_report(zio_t *zio, zio_cksum_report_t *zcr, void *ignored) +{ + void *buf = zio_buf_alloc(zio->io_size); + + bcopy(zio->io_data, buf, zio->io_size); + + zcr->zcr_cbinfo = zio->io_size; + zcr->zcr_cbdata = buf; + zcr->zcr_finish = zio_vsd_default_cksum_finish; + zcr->zcr_free = zio_buf_free; +} + static int zio_vdev_io_assess(zio_t *zio) { @@ -1866,7 +2383,7 @@ zio_vdev_io_assess(zio_t *zio) spa_config_exit(zio->io_spa, SCL_ZIO, zio); if (zio->io_vsd != NULL) { - zio->io_vsd_free(zio); + zio->io_vsd_ops->vsd_free(zio); zio->io_vsd = NULL; } @@ -1875,6 +2392,9 @@ zio_vdev_io_assess(zio_t *zio) /* * If the I/O failed, determine whether we should attempt to retry it. + * + * On retry, we cut in line in the issue queue, since we don't want + * compression/checksumming/etc. work to prevent our (cheap) IO reissue. 
*/ if (zio->io_error && vd == NULL && !(zio->io_flags & (ZIO_FLAG_DONT_RETRY | ZIO_FLAG_IO_RETRY))) { @@ -1883,8 +2403,9 @@ zio_vdev_io_assess(zio_t *zio) zio->io_error = 0; zio->io_flags |= ZIO_FLAG_IO_RETRY | ZIO_FLAG_DONT_CACHE | ZIO_FLAG_DONT_AGGREGATE; - zio->io_stage = ZIO_STAGE_VDEV_IO_START - 1; - zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE); + zio->io_stage = ZIO_STAGE_VDEV_IO_START >> 1; + zio_taskq_dispatch(zio, ZIO_TASKQ_ISSUE, + zio_requeue_io_start_cut_in_line); return (ZIO_PIPELINE_STOP); } @@ -1916,7 +2437,7 @@ zio_vdev_io_reissue(zio_t *zio) ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_START); ASSERT(zio->io_error == 0); - zio->io_stage--; + zio->io_stage >>= 1; } void @@ -1924,7 +2445,7 @@ zio_vdev_io_redone(zio_t *zio) { ASSERT(zio->io_stage == ZIO_STAGE_VDEV_IO_DONE); - zio->io_stage--; + zio->io_stage >>= 1; } void @@ -1934,7 +2455,7 @@ zio_vdev_io_bypass(zio_t *zio) ASSERT(zio->io_error == 0); zio->io_flags |= ZIO_FLAG_IO_BYPASS; - zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS - 1; + zio->io_stage = ZIO_STAGE_VDEV_IO_ASSESS >> 1; } /* @@ -1976,9 +2497,12 @@ zio_checksum_generate(zio_t *zio) static int zio_checksum_verify(zio_t *zio) { + zio_bad_cksum_t info; blkptr_t *bp = zio->io_bp; int error; + ASSERT(zio->io_vd != NULL); + if (bp == NULL) { /* * This is zio_read_phys(). @@ -1990,11 +2514,12 @@ zio_checksum_verify(zio_t *zio) ASSERT(zio->io_prop.zp_checksum == ZIO_CHECKSUM_LABEL); } - if ((error = zio_checksum_error(zio)) != 0) { + if ((error = zio_checksum_error(zio, &info)) != 0) { zio->io_error = error; if (!(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { - zfs_ereport_post(FM_EREPORT_ZFS_CHECKSUM, - zio->io_spa, zio->io_vd, zio, 0, 0); + zfs_ereport_start_checksum(zio->io_spa, + zio->io_vd, zio, zio->io_offset, + zio->io_size, NULL, &info); } } @@ -2007,7 +2532,7 @@ zio_checksum_verify(zio_t *zio) void zio_checksum_verified(zio_t *zio) { - zio->io_pipeline &= ~(1U << ZIO_STAGE_CHECKSUM_VERIFY); + zio->io_pipeline &= ~ZIO_STAGE_CHECKSUM_VERIFY; } /* @@ -2045,13 +2570,13 @@ static int zio_ready(zio_t *zio) { blkptr_t *bp = zio->io_bp; - zio_t *pio = zio->io_parent; + zio_t *pio, *pio_next; - if (zio->io_ready) { - if (BP_IS_GANG(bp) && - zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY)) - return (ZIO_PIPELINE_STOP); + if (zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_READY) || + zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_READY)) + return (ZIO_PIPELINE_STOP); + if (zio->io_ready) { ASSERT(IO_IS_ALLOCATING(zio)); ASSERT(bp->blk_birth == zio->io_txg || BP_IS_HOLE(bp)); ASSERT(zio->io_children[ZIO_CHILD_GANG][ZIO_WAIT_READY] == 0); @@ -2065,8 +2590,35 @@ zio_ready(zio_t *zio) if (zio->io_error) zio->io_pipeline = ZIO_INTERLOCK_PIPELINE; - if (pio != NULL) + mutex_enter(&zio->io_lock); + zio->io_state[ZIO_WAIT_READY] = 1; + pio = zio_walk_parents(zio); + mutex_exit(&zio->io_lock); + + /* + * As we notify zio's parents, new parents could be added. + * New parents go to the head of zio's io_parent_list, however, + * so we will (correctly) not notify them. The remainder of zio's + * io_parent_list, from 'pio_next' onward, cannot change because + * all parents must wait for us to be done before they can be done. 
+ */ + for (; pio != NULL; pio = pio_next) { + pio_next = zio_walk_parents(zio); zio_notify_parent(pio, zio, ZIO_WAIT_READY); + } + + if (zio->io_flags & ZIO_FLAG_NODATA) { + if (BP_IS_GANG(bp)) { + zio->io_flags &= ~ZIO_FLAG_NODATA; + } else { + ASSERT((uintptr_t)zio->io_data < SPA_MAXBLOCKSIZE); + zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; + } + } + + if (zio_injection_enabled && + zio->io_spa->spa_syncing_txg == zio->io_txg) + zio_handle_ignored_writes(zio); return (ZIO_PIPELINE_CONTINUE); } @@ -2075,18 +2627,19 @@ static int zio_done(zio_t *zio) { spa_t *spa = zio->io_spa; - zio_t *pio = zio->io_parent; zio_t *lio = zio->io_logical; blkptr_t *bp = zio->io_bp; vdev_t *vd = zio->io_vd; uint64_t psize = zio->io_size; + zio_t *pio, *pio_next; /* - * If our of children haven't all completed, + * If our children haven't all completed, * wait for them and then repeat this pipeline stage. */ if (zio_wait_for_children(zio, ZIO_CHILD_VDEV, ZIO_WAIT_DONE) || zio_wait_for_children(zio, ZIO_CHILD_GANG, ZIO_WAIT_DONE) || + zio_wait_for_children(zio, ZIO_CHILD_DDT, ZIO_WAIT_DONE) || zio_wait_for_children(zio, ZIO_CHILD_LOGICAL, ZIO_WAIT_DONE)) return (ZIO_PIPELINE_STOP); @@ -2097,23 +2650,51 @@ zio_done(zio_t *zio) if (bp != NULL) { ASSERT(bp->blk_pad[0] == 0); ASSERT(bp->blk_pad[1] == 0); - ASSERT(bp->blk_pad[2] == 0); ASSERT(bcmp(bp, &zio->io_bp_copy, sizeof (blkptr_t)) == 0 || - (pio != NULL && bp == pio->io_bp)); + (bp == zio_unique_parent(zio)->io_bp)); if (zio->io_type == ZIO_TYPE_WRITE && !BP_IS_HOLE(bp) && + zio->io_bp_override == NULL && !(zio->io_flags & ZIO_FLAG_IO_REPAIR)) { ASSERT(!BP_SHOULD_BYTESWAP(bp)); - ASSERT3U(zio->io_prop.zp_ndvas, <=, BP_GET_NDVAS(bp)); + ASSERT3U(zio->io_prop.zp_copies, <=, BP_GET_NDVAS(bp)); ASSERT(BP_COUNT_GANG(bp) == 0 || (BP_COUNT_GANG(bp) == BP_GET_NDVAS(bp))); } } /* - * If there were child vdev or gang errors, they apply to us now. + * If there were child vdev/gang/ddt errors, they apply to us now. */ zio_inherit_child_errors(zio, ZIO_CHILD_VDEV); zio_inherit_child_errors(zio, ZIO_CHILD_GANG); + zio_inherit_child_errors(zio, ZIO_CHILD_DDT); + + /* + * If the I/O on the transformed data was successful, generate any + * checksum reports now while we still have the transformed data. + */ + if (zio->io_error == 0) { + while (zio->io_cksum_report != NULL) { + zio_cksum_report_t *zcr = zio->io_cksum_report; + uint64_t align = zcr->zcr_align; + uint64_t asize = P2ROUNDUP(psize, align); + char *abuf = zio->io_data; + + if (asize != psize) { + abuf = zio_buf_alloc(asize); + bcopy(zio->io_data, abuf, psize); + bzero(abuf + psize, asize - psize); + } + + zio->io_cksum_report = zcr->zcr_next; + zcr->zcr_next = NULL; + zcr->zcr_finish(zcr, abuf); + zfs_ereport_free_checksum(zcr); + + if (asize != psize) + zio_buf_free(abuf, asize); + } + } zio_pop_transforms(zio); /* note: may set zio->io_error */ @@ -2129,8 +2710,9 @@ zio_done(zio_t *zio) if (zio->io_error != ECKSUM && vd != NULL && !vdev_is_dead(vd)) zfs_ereport_post(FM_EREPORT_ZFS_IO, spa, vd, zio, 0, 0); - if ((zio->io_error == EIO || - !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) && zio == lio) { + if ((zio->io_error == EIO || !(zio->io_flags & + (ZIO_FLAG_SPECULATIVE | ZIO_FLAG_DONT_PROPAGATE))) && + zio == lio) { /* * For logical I/O requests, tell the SPA to log the * error and generate a logical data ereport. @@ -2147,21 +2729,33 @@ zio_done(zio_t *zio) * propagate all the way to the root via zio_notify_parent(). 
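The checksum-report code a few lines above pads the transformed data out to the report's alignment before calling zcr_finish(), so the callback always receives a whole, aligned buffer with a zeroed tail. A standalone demonstration of that rounding and padding, using made-up sizes:

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#define	P2ROUNDUP(x, align)	(-(-(x) & -(align)))	/* round x up to a multiple of align (power of two) */

int
main(void)
{
	uint64_t psize = 1000, align = 512;
	uint64_t asize = P2ROUNDUP(psize, align);	/* 1024 */
	char *abuf = malloc(asize);

	if (abuf == NULL)
		return (1);
	memset(abuf, 0xab, psize);			/* stand-in for the transformed I/O data */
	memset(abuf + psize, 0, asize - psize);		/* zero the pad, as bzero() does above */
	printf("psize=%llu asize=%llu\n",
	    (unsigned long long)psize, (unsigned long long)asize);
	free(abuf);
	return (0);
}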
*/ ASSERT(vd == NULL && bp != NULL); + ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); - if (IO_IS_ALLOCATING(zio)) + if (IO_IS_ALLOCATING(zio) && + !(zio->io_flags & ZIO_FLAG_CANFAIL)) { if (zio->io_error != ENOSPC) zio->io_reexecute |= ZIO_REEXECUTE_NOW; else zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; + } if ((zio->io_type == ZIO_TYPE_READ || zio->io_type == ZIO_TYPE_FREE) && zio->io_error == ENXIO && + spa_load_state(spa) == SPA_LOAD_NONE && spa_get_failmode(spa) != ZIO_FAILURE_MODE_CONTINUE) zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; if (!(zio->io_flags & ZIO_FLAG_CANFAIL) && !zio->io_reexecute) zio->io_reexecute |= ZIO_REEXECUTE_SUSPEND; + + /* + * Here is a possibly good place to attempt to do + * either combinatorial reconstruction or error correction + * based on checksums. It also might be a good place + * to send out preliminary ereports before we suspend + * processing. + */ } /* @@ -2172,6 +2766,20 @@ zio_done(zio_t *zio) */ zio_inherit_child_errors(zio, ZIO_CHILD_LOGICAL); + if ((zio->io_error || zio->io_reexecute) && + IO_IS_ALLOCATING(zio) && zio->io_gang_leader == zio && + !(zio->io_flags & ZIO_FLAG_IO_REWRITE)) + zio_dva_unallocate(zio, zio->io_gang_tree, bp); + + zio_gang_tree_free(&zio->io_gang_tree); + + /* + * Godfather I/Os should never suspend. + */ + if ((zio->io_flags & ZIO_FLAG_GODFATHER) && + (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) + zio->io_reexecute = 0; + if (zio->io_reexecute) { /* * This is a logical I/O that wants to reexecute. @@ -2188,17 +2796,37 @@ zio_done(zio_t *zio) */ ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL); - if (IO_IS_ALLOCATING(zio)) - zio_dva_unallocate(zio, zio->io_gang_tree, bp); + zio->io_gang_leader = NULL; - zio_gang_tree_free(&zio->io_gang_tree); + mutex_enter(&zio->io_lock); + zio->io_state[ZIO_WAIT_DONE] = 1; + mutex_exit(&zio->io_lock); + + /* + * "The Godfather" I/O monitors its children but is + * not a true parent to them. It will track them through + * the pipeline but severs its ties whenever they get into + * trouble (e.g. suspended). This allows "The Godfather" + * I/O to return status without blocking. + */ + for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { + zio_link_t *zl = zio->io_walk_link; + pio_next = zio_walk_parents(zio); + + if ((pio->io_flags & ZIO_FLAG_GODFATHER) && + (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND)) { + zio_remove_child(pio, zio, zl); + zio_notify_parent(pio, zio, ZIO_WAIT_DONE); + } + } - if (pio != NULL) { + if ((pio = zio_unique_parent(zio)) != NULL) { /* * We're not a root i/o, so there's nothing to do * but notify our parent. Don't propagate errors * upward since we haven't permanently failed yet. */ + ASSERT(!(zio->io_flags & ZIO_FLAG_GODFATHER)); zio->io_flags |= ZIO_FLAG_DONT_PROPAGATE; zio_notify_parent(pio, zio, ZIO_WAIT_DONE); } else if (zio->io_reexecute & ZIO_REEXECUTE_SUSPEND) { @@ -2219,20 +2847,37 @@ zio_done(zio_t *zio) return (ZIO_PIPELINE_STOP); } - ASSERT(zio->io_child == NULL); + ASSERT(zio->io_child_count == 0); ASSERT(zio->io_reexecute == 0); ASSERT(zio->io_error == 0 || (zio->io_flags & ZIO_FLAG_CANFAIL)); + /* + * Report any checksum errors, since the I/O is complete. 
+ */ + while (zio->io_cksum_report != NULL) { + zio_cksum_report_t *zcr = zio->io_cksum_report; + zio->io_cksum_report = zcr->zcr_next; + zcr->zcr_next = NULL; + zcr->zcr_finish(zcr, NULL); + zfs_ereport_free_checksum(zcr); + } + + /* + * It is the responsibility of the done callback to ensure that this + * particular zio is no longer discoverable for adoption, and as + * such, cannot acquire any new parents. + */ if (zio->io_done) zio->io_done(zio); - zio_gang_tree_free(&zio->io_gang_tree); - - ASSERT(zio->io_delegate_list == NULL); - ASSERT(zio->io_delegate_next == NULL); + mutex_enter(&zio->io_lock); + zio->io_state[ZIO_WAIT_DONE] = 1; + mutex_exit(&zio->io_lock); - if (pio != NULL) { - zio_remove_child(pio, zio); + for (pio = zio_walk_parents(zio); pio != NULL; pio = pio_next) { + zio_link_t *zl = zio->io_walk_link; + pio_next = zio_walk_parents(zio); + zio_remove_child(pio, zio, zl); zio_notify_parent(pio, zio, ZIO_WAIT_DONE); } @@ -2253,12 +2898,17 @@ zio_done(zio_t *zio) * I/O pipeline definition * ========================================================================== */ -static zio_pipe_stage_t *zio_pipeline[ZIO_STAGES] = { +static zio_pipe_stage_t *zio_pipeline[] = { NULL, - zio_issue_async, zio_read_bp_init, + zio_free_bp_init, + zio_issue_async, zio_write_bp_init, zio_checksum_generate, + zio_ddt_read_start, + zio_ddt_read_done, + zio_ddt_write, + zio_ddt_free, zio_gang_assemble, zio_gang_issue, zio_dva_allocate, diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/zio_checksum.c b/external/cddl/osnet/dist/uts/common/fs/zfs/zio_checksum.c index bf7fe733fe0c8..699e5c87605ee 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/zio_checksum.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/zio_checksum.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -27,6 +27,7 @@ #include #include #include +#include /* * Checksum vectors. @@ -49,13 +50,13 @@ * we want the ability to take advantage of that hardware. * * Of course, we don't want a checksum upgrade to invalidate existing - * data, so we store the checksum *function* in five bits of the DVA. - * This gives us room for up to 32 different checksum functions. + * data, so we store the checksum *function* in eight bits of the bp. + * This gives us room for up to 256 different checksum functions. * * When writing a block, we always checksum it with the latest-and-greatest * checksum function of the appropriate strength. When reading a block, * we compare the expected checksum against the actual checksum, which we - * compute via the checksum function specified in the DVA encoding. + * compute via the checksum function specified by BP_GET_CHECKSUM(bp). 
*/ /*ARGSUSED*/ @@ -66,19 +67,20 @@ zio_checksum_off(const void *buf, uint64_t size, zio_cksum_t *zcp) } zio_checksum_info_t zio_checksum_table[ZIO_CHECKSUM_FUNCTIONS] = { - {{NULL, NULL}, 0, 0, "inherit"}, - {{NULL, NULL}, 0, 0, "on"}, - {{zio_checksum_off, zio_checksum_off}, 0, 0, "off"}, - {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, "label"}, - {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, "gang_header"}, - {{fletcher_2_native, fletcher_2_byteswap}, 0, 1, "zilog"}, - {{fletcher_2_native, fletcher_2_byteswap}, 0, 0, "fletcher2"}, - {{fletcher_4_native, fletcher_4_byteswap}, 1, 0, "fletcher4"}, - {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 0, "SHA256"}, + {{NULL, NULL}, 0, 0, 0, "inherit"}, + {{NULL, NULL}, 0, 0, 0, "on"}, + {{zio_checksum_off, zio_checksum_off}, 0, 0, 0, "off"}, + {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, 0, "label"}, + {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 1, 0, "gang_header"}, + {{fletcher_2_native, fletcher_2_byteswap}, 0, 1, 0, "zilog"}, + {{fletcher_2_native, fletcher_2_byteswap}, 0, 0, 0, "fletcher2"}, + {{fletcher_4_native, fletcher_4_byteswap}, 1, 0, 0, "fletcher4"}, + {{zio_checksum_SHA256, zio_checksum_SHA256}, 1, 0, 1, "sha256"}, + {{fletcher_4_native, fletcher_4_byteswap}, 0, 1, 0, "zilog2"}, }; -uint8_t -zio_checksum_select(uint8_t child, uint8_t parent) +enum zio_checksum +zio_checksum_select(enum zio_checksum child, enum zio_checksum parent) { ASSERT(child < ZIO_CHECKSUM_FUNCTIONS); ASSERT(parent < ZIO_CHECKSUM_FUNCTIONS); @@ -93,6 +95,29 @@ zio_checksum_select(uint8_t child, uint8_t parent) return (child); } +enum zio_checksum +zio_checksum_dedup_select(spa_t *spa, enum zio_checksum child, + enum zio_checksum parent) +{ + ASSERT((child & ZIO_CHECKSUM_MASK) < ZIO_CHECKSUM_FUNCTIONS); + ASSERT((parent & ZIO_CHECKSUM_MASK) < ZIO_CHECKSUM_FUNCTIONS); + ASSERT(parent != ZIO_CHECKSUM_INHERIT && parent != ZIO_CHECKSUM_ON); + + if (child == ZIO_CHECKSUM_INHERIT) + return (parent); + + if (child == ZIO_CHECKSUM_ON) + return (spa_dedup_checksum(spa)); + + if (child == (ZIO_CHECKSUM_ON | ZIO_CHECKSUM_VERIFY)) + return (spa_dedup_checksum(spa) | ZIO_CHECKSUM_VERIFY); + + ASSERT(zio_checksum_table[child & ZIO_CHECKSUM_MASK].ci_dedup || + (child & ZIO_CHECKSUM_VERIFY) || child == ZIO_CHECKSUM_OFF); + + return (child); +} + /* * Set the external verifier for a gang block based on , * a tuple which is guaranteed to be unique for the life of the pool. 
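zio_checksum_dedup_select() above maps the dedup property onto a concrete checksum: inherit defers to the parent, on selects the pool-wide dedup checksum, on plus verify carries the verify bit through, and an explicit setting is passed along (it must be dedup-capable, verified, or off). A compact restatement with invented enum values and verify-flag encoding, not the real ones:

#include <stdio.h>

enum cksum { CK_INHERIT, CK_ON, CK_OFF, CK_FLETCHER4, CK_SHA256 };
#define	CK_VERIFY	0x100	/* invented stand-in for ZIO_CHECKSUM_VERIFY */
#define	CK_MASK		0xff

static int pool_dedup_checksum = CK_SHA256;	/* stand-in for spa_dedup_checksum(spa) */

static int
dedup_checksum_select(int child, int parent)
{
	if (child == CK_INHERIT)
		return (parent);
	if (child == CK_ON)
		return (pool_dedup_checksum);
	if (child == (CK_ON | CK_VERIFY))
		return (pool_dedup_checksum | CK_VERIFY);
	return (child);		/* explicit algorithm, possibly with the verify bit */
}

int
main(void)
{
	int prop = dedup_checksum_select(CK_ON | CK_VERIFY, CK_SHA256);

	printf("algorithm=%d verify=%d\n", prop & CK_MASK, (prop & CK_VERIFY) != 0);
	return (0);
}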
@@ -101,7 +126,7 @@ static void zio_checksum_gang_verifier(zio_cksum_t *zcp, blkptr_t *bp) { dva_t *dva = BP_IDENTITY(bp); - uint64_t txg = bp->blk_birth; + uint64_t txg = BP_PHYSICAL_BIRTH(bp); ASSERT(BP_IS_GANG(bp)); @@ -128,47 +153,79 @@ zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, { blkptr_t *bp = zio->io_bp; uint64_t offset = zio->io_offset; - zio_block_tail_t *zbt = (zio_block_tail_t *)((char *)data + size) - 1; zio_checksum_info_t *ci = &zio_checksum_table[checksum]; - zio_cksum_t zbt_cksum; + zio_cksum_t cksum; ASSERT((uint_t)checksum < ZIO_CHECKSUM_FUNCTIONS); ASSERT(ci->ci_func[0] != NULL); - if (ci->ci_zbt) { + if (ci->ci_eck) { + zio_eck_t *eck; + + if (checksum == ZIO_CHECKSUM_ZILOG2) { + zil_chain_t *zilc = data; + + size = P2ROUNDUP_TYPED(zilc->zc_nused, ZIL_MIN_BLKSZ, + uint64_t); + eck = &zilc->zc_eck; + } else { + eck = (zio_eck_t *)((char *)data + size) - 1; + } if (checksum == ZIO_CHECKSUM_GANG_HEADER) - zio_checksum_gang_verifier(&zbt->zbt_cksum, bp); + zio_checksum_gang_verifier(&eck->zec_cksum, bp); else if (checksum == ZIO_CHECKSUM_LABEL) - zio_checksum_label_verifier(&zbt->zbt_cksum, offset); + zio_checksum_label_verifier(&eck->zec_cksum, offset); else - bp->blk_cksum = zbt->zbt_cksum; - zbt->zbt_magic = ZBT_MAGIC; - ci->ci_func[0](data, size, &zbt_cksum); - zbt->zbt_cksum = zbt_cksum; + bp->blk_cksum = eck->zec_cksum; + eck->zec_magic = ZEC_MAGIC; + ci->ci_func[0](data, size, &cksum); + eck->zec_cksum = cksum; } else { ci->ci_func[0](data, size, &bp->blk_cksum); } } int -zio_checksum_error(zio_t *zio) +zio_checksum_error(zio_t *zio, zio_bad_cksum_t *info) { blkptr_t *bp = zio->io_bp; uint_t checksum = (bp == NULL ? zio->io_prop.zp_checksum : (BP_IS_GANG(bp) ? ZIO_CHECKSUM_GANG_HEADER : BP_GET_CHECKSUM(bp))); int byteswap; - void *data = zio->io_data; + int error; uint64_t size = (bp == NULL ? zio->io_size : (BP_IS_GANG(bp) ? 
SPA_GANGBLOCKSIZE : BP_GET_PSIZE(bp))); uint64_t offset = zio->io_offset; - zio_block_tail_t *zbt = (zio_block_tail_t *)((char *)data + size) - 1; + void *data = zio->io_data; zio_checksum_info_t *ci = &zio_checksum_table[checksum]; zio_cksum_t actual_cksum, expected_cksum, verifier; if (checksum >= ZIO_CHECKSUM_FUNCTIONS || ci->ci_func[0] == NULL) return (EINVAL); - if (ci->ci_zbt) { + if (ci->ci_eck) { + zio_eck_t *eck; + + if (checksum == ZIO_CHECKSUM_ZILOG2) { + zil_chain_t *zilc = data; + uint64_t nused; + + eck = &zilc->zc_eck; + if (eck->zec_magic == ZEC_MAGIC) + nused = zilc->zc_nused; + else if (eck->zec_magic == BSWAP_64(ZEC_MAGIC)) + nused = BSWAP_64(zilc->zc_nused); + else + return (ECKSUM); + + if (nused > size) + return (ECKSUM); + + size = P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t); + } else { + eck = (zio_eck_t *)((char *)data + size) - 1; + } + if (checksum == ZIO_CHECKSUM_GANG_HEADER) zio_checksum_gang_verifier(&verifier, bp); else if (checksum == ZIO_CHECKSUM_LABEL) @@ -176,15 +233,15 @@ zio_checksum_error(zio_t *zio) else verifier = bp->blk_cksum; - byteswap = (zbt->zbt_magic == BSWAP_64(ZBT_MAGIC)); + byteswap = (eck->zec_magic == BSWAP_64(ZEC_MAGIC)); if (byteswap) byteswap_uint64_array(&verifier, sizeof (zio_cksum_t)); - expected_cksum = zbt->zbt_cksum; - zbt->zbt_cksum = verifier; + expected_cksum = eck->zec_cksum; + eck->zec_cksum = verifier; ci->ci_func[byteswap](data, size, &actual_cksum); - zbt->zbt_cksum = expected_cksum; + eck->zec_cksum = expected_cksum; if (byteswap) byteswap_uint64_array(&expected_cksum, @@ -196,11 +253,22 @@ zio_checksum_error(zio_t *zio) ci->ci_func[byteswap](data, size, &actual_cksum); } + info->zbc_expected = expected_cksum; + info->zbc_actual = actual_cksum; + info->zbc_checksum_name = ci->ci_name; + info->zbc_byteswapped = byteswap; + info->zbc_injected = 0; + info->zbc_has_cksum = 1; + if (!ZIO_CHECKSUM_EQUAL(actual_cksum, expected_cksum)) return (ECKSUM); - if (zio_injection_enabled && !zio->io_error) - return (zio_handle_fault_injection(zio, ECKSUM)); + if (zio_injection_enabled && !zio->io_error && + (error = zio_handle_fault_injection(zio, ECKSUM)) != 0) { + + info->zbc_injected = 1; + return (error); + } return (0); } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/zio_compress.c b/external/cddl/osnet/dist/uts/common/fs/zfs/zio_compress.c index c563be4eb9557..f148977c44680 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/zio_compress.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/zio_compress.c @@ -20,12 +20,10 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -51,10 +49,11 @@ zio_compress_info_t zio_compress_table[ZIO_COMPRESS_FUNCTIONS] = { {gzip_compress, gzip_decompress, 7, "gzip-7"}, {gzip_compress, gzip_decompress, 8, "gzip-8"}, {gzip_compress, gzip_decompress, 9, "gzip-9"}, + {zle_compress, zle_decompress, 64, "zle"}, }; -uint8_t -zio_compress_select(uint8_t child, uint8_t parent) +enum zio_compress +zio_compress_select(enum zio_compress child, enum zio_compress parent) { ASSERT(child < ZIO_COMPRESS_FUNCTIONS); ASSERT(parent < ZIO_COMPRESS_FUNCTIONS); @@ -69,80 +68,65 @@ zio_compress_select(uint8_t child, uint8_t parent) return (child); } -int -zio_compress_data(int cpfunc, void *src, uint64_t srcsize, void **destp, - uint64_t *destsizep, uint64_t *destbufsizep) +size_t +zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len) { uint64_t *word, *word_end; - uint64_t ciosize, gapsize, destbufsize; - zio_compress_info_t *ci = &zio_compress_table[cpfunc]; - char *dest; - uint_t allzero; + size_t c_len, d_len, r_len; + zio_compress_info_t *ci = &zio_compress_table[c]; - ASSERT((uint_t)cpfunc < ZIO_COMPRESS_FUNCTIONS); - ASSERT((uint_t)cpfunc == ZIO_COMPRESS_EMPTY || ci->ci_compress != NULL); + ASSERT((uint_t)c < ZIO_COMPRESS_FUNCTIONS); + ASSERT((uint_t)c == ZIO_COMPRESS_EMPTY || ci->ci_compress != NULL); /* * If the data is all zeroes, we don't even need to allocate - * a block for it. We indicate this by setting *destsizep = 0. + * a block for it. We indicate this by returning zero size. */ - allzero = 1; - word = src; - word_end = (uint64_t *)(uintptr_t)((uintptr_t)word + srcsize); - while (word < word_end) { - if (*word++ != 0) { - allzero = 0; + word_end = (uint64_t *)((char *)src + s_len); + for (word = src; word < word_end; word++) + if (*word != 0) break; - } - } - if (allzero) { - *destp = NULL; - *destsizep = 0; - *destbufsizep = 0; - return (1); - } - if (cpfunc == ZIO_COMPRESS_EMPTY) + if (word == word_end) return (0); + if (c == ZIO_COMPRESS_EMPTY) + return (s_len); + /* Compress at least 12.5% */ - destbufsize = P2ALIGN(srcsize - (srcsize >> 3), SPA_MINBLOCKSIZE); - if (destbufsize == 0) - return (0); - dest = zio_buf_alloc(destbufsize); + d_len = P2ALIGN(s_len - (s_len >> 3), (size_t)SPA_MINBLOCKSIZE); + if (d_len == 0) + return (s_len); - ciosize = ci->ci_compress(src, dest, (size_t)srcsize, - (size_t)destbufsize, ci->ci_level); - if (ciosize > destbufsize) { - zio_buf_free(dest, destbufsize); - return (0); - } + c_len = ci->ci_compress(src, dst, s_len, d_len, ci->ci_level); - /* Cool. We compressed at least as much as we were hoping to. */ + if (c_len > d_len) + return (s_len); - /* For security, make sure we don't write random heap crap to disk */ - gapsize = P2ROUNDUP(ciosize, SPA_MINBLOCKSIZE) - ciosize; - if (gapsize != 0) { - bzero(dest + ciosize, gapsize); - ciosize += gapsize; + /* + * Cool. We compressed at least as much as we were hoping to. + * For both security and repeatability, pad out the last sector. 
+ */ + r_len = P2ROUNDUP(c_len, (size_t)SPA_MINBLOCKSIZE); + if (r_len > c_len) { + bzero((char *)dst + c_len, r_len - c_len); + c_len = r_len; } - ASSERT3U(ciosize, <=, destbufsize); - ASSERT(P2PHASE(ciosize, SPA_MINBLOCKSIZE) == 0); - *destp = dest; - *destsizep = ciosize; - *destbufsizep = destbufsize; + ASSERT3U(c_len, <=, d_len); + ASSERT(P2PHASE(c_len, (size_t)SPA_MINBLOCKSIZE) == 0); - return (1); + return (c_len); } int -zio_decompress_data(int cpfunc, void *src, uint64_t srcsize, - void *dest, uint64_t destsize) +zio_decompress_data(enum zio_compress c, void *src, void *dst, + size_t s_len, size_t d_len) { - zio_compress_info_t *ci = &zio_compress_table[cpfunc]; + zio_compress_info_t *ci = &zio_compress_table[c]; - ASSERT((uint_t)cpfunc < ZIO_COMPRESS_FUNCTIONS); + if ((uint_t)c >= ZIO_COMPRESS_FUNCTIONS || ci->ci_decompress == NULL) + return (EINVAL); - return (ci->ci_decompress(src, dest, srcsize, destsize, ci->ci_level)); + return (ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level)); } diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/zio_inject.c b/external/cddl/osnet/dist/uts/common/fs/zfs/zio_inject.c index b3469fdd5c243..fa040ea4b31a1 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/zio_inject.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/zio_inject.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -43,8 +43,8 @@ #include #include #include -#include #include +#include #include uint32_t zio_injection_enabled; @@ -70,8 +70,9 @@ zio_match_handler(zbookmark_t *zb, uint64_t type, /* * Check for a match against the MOS, which is based on type */ - if (zb->zb_objset == 0 && record->zi_objset == 0 && - record->zi_object == 0) { + if (zb->zb_objset == DMU_META_OBJSET && + record->zi_objset == DMU_META_OBJSET && + record->zi_object == DMU_META_DNODE_OBJECT) { if (record->zi_type == DMU_OT_NONE || type == record->zi_type) return (record->zi_freq == 0 || @@ -95,6 +96,31 @@ zio_match_handler(zbookmark_t *zb, uint64_t type, return (B_FALSE); } +/* + * Panic the system when a config change happens in the function + * specified by tag. + */ +void +zio_handle_panic_injection(spa_t *spa, char *tag, uint64_t type) +{ + inject_handler_t *handler; + + rw_enter(&inject_lock, RW_READER); + + for (handler = list_head(&inject_handlers); handler != NULL; + handler = list_next(&inject_handlers, handler)) { + + if (spa != handler->zi_spa) + continue; + + if (handler->zi_record.zi_type == type && + strcmp(tag, handler->zi_record.zi_func) == 0) + panic("Panic requested in function %s\n", tag); + } + + rw_exit(&inject_lock); +} + /* * Determine if the I/O in question should return failure. Returns the errno * to be returned to the caller. 
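Before the zio_inject.c hunks that follow, a worked example of the new zio_compress_data() return convention shown above (figures assume SPA_MINBLOCKSIZE == 512):

/*
 * s_len = 4096 (one 4K block, not all zeroes, c != ZIO_COMPRESS_EMPTY):
 *   d_len = P2ALIGN(4096 - (4096 >> 3), 512) = 3584
 * so the compressor must save at least 12.5% or the function returns
 * s_len and the caller stores the block uncompressed.  A c_len of 2000
 * is zero-padded up to P2ROUNDUP(2000, 512) = 2048 and that padded
 * length is returned; a return of 0 still means the block was all
 * zeroes and needs no allocation at all.
 */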
@@ -126,8 +152,10 @@ zio_handle_fault_injection(zio_t *zio, int error) if (zio->io_spa != handler->zi_spa) continue; - /* Ignore device errors */ - if (handler->zi_record.zi_guid != 0) + /* Ignore device errors and panic injection */ + if (handler->zi_record.zi_guid != 0 || + handler->zi_record.zi_func[0] != '\0' || + handler->zi_record.zi_duration != 0) continue; /* If this handler matches, return EIO */ @@ -159,7 +187,7 @@ zio_handle_label_injection(zio_t *zio, int error) int label; int ret = 0; - if (offset + zio->io_size > VDEV_LABEL_START_SIZE && + if (offset >= VDEV_LABEL_START_SIZE && offset < vd->vdev_psize - VDEV_LABEL_END_SIZE) return (0); @@ -170,8 +198,10 @@ zio_handle_label_injection(zio_t *zio, int error) uint64_t start = handler->zi_record.zi_start; uint64_t end = handler->zi_record.zi_end; - /* Ignore device only faults */ - if (handler->zi_record.zi_start == 0) + /* Ignore device only faults or panic injection */ + if (handler->zi_record.zi_start == 0 || + handler->zi_record.zi_func[0] != '\0' || + handler->zi_record.zi_duration != 0) continue; /* @@ -195,21 +225,50 @@ zio_handle_label_injection(zio_t *zio, int error) int -zio_handle_device_injection(vdev_t *vd, int error) +zio_handle_device_injection(vdev_t *vd, zio_t *zio, int error) { inject_handler_t *handler; int ret = 0; + /* + * We skip over faults in the labels unless it's during + * device open (i.e. zio == NULL). + */ + if (zio != NULL) { + uint64_t offset = zio->io_offset; + + if (offset < VDEV_LABEL_START_SIZE || + offset >= vd->vdev_psize - VDEV_LABEL_END_SIZE) + return (0); + } + rw_enter(&inject_lock, RW_READER); for (handler = list_head(&inject_handlers); handler != NULL; handler = list_next(&inject_handlers, handler)) { - /* Ignore label specific faults */ - if (handler->zi_record.zi_start != 0) + /* + * Ignore label specific faults, panic injection + * or fake writes + */ + if (handler->zi_record.zi_start != 0 || + handler->zi_record.zi_func[0] != '\0' || + handler->zi_record.zi_duration != 0) continue; if (vd->vdev_guid == handler->zi_record.zi_guid) { + if (handler->zi_record.zi_failfast && + (zio == NULL || (zio->io_flags & + (ZIO_FLAG_IO_RETRY | ZIO_FLAG_TRYHARD)))) { + continue; + } + + /* Handle type specific I/O failures */ + if (zio != NULL && + handler->zi_record.zi_iotype != ZIO_TYPES && + handler->zi_record.zi_iotype != zio->io_type) + continue; + if (handler->zi_record.zi_error == error) { /* * For a failed open, pretend like the device @@ -233,6 +292,84 @@ zio_handle_device_injection(vdev_t *vd, int error) return (ret); } +/* + * Simulate hardware that ignores cache flushes. For requested number + * of seconds nix the actual writing to disk. 
+ */ +void +zio_handle_ignored_writes(zio_t *zio) +{ + inject_handler_t *handler; + + rw_enter(&inject_lock, RW_READER); + + for (handler = list_head(&inject_handlers); handler != NULL; + handler = list_next(&inject_handlers, handler)) { + + /* Ignore errors not destined for this pool */ + if (zio->io_spa != handler->zi_spa) + continue; + + if (handler->zi_record.zi_duration == 0) + continue; + + /* + * Positive duration implies # of seconds, negative + * a number of txgs + */ + if (handler->zi_record.zi_timer == 0) { + if (handler->zi_record.zi_duration > 0) + handler->zi_record.zi_timer = ddi_get_lbolt64(); + else + handler->zi_record.zi_timer = zio->io_txg; + } + + /* Have a "problem" writing 60% of the time */ + if (spa_get_random(100) < 60) + zio->io_pipeline &= ~ZIO_VDEV_IO_STAGES; + break; + } + + rw_exit(&inject_lock); +} + +void +spa_handle_ignored_writes(spa_t *spa) +{ + inject_handler_t *handler; + + if (zio_injection_enabled == 0) + return; + + rw_enter(&inject_lock, RW_READER); + + for (handler = list_head(&inject_handlers); handler != NULL; + handler = list_next(&inject_handlers, handler)) { + + /* Ignore errors not destined for this pool */ + if (spa != handler->zi_spa) + continue; + + if (handler->zi_record.zi_duration == 0) + continue; + + if (handler->zi_record.zi_duration > 0) { + VERIFY(handler->zi_record.zi_timer == 0 || + handler->zi_record.zi_timer + + handler->zi_record.zi_duration * hz > + ddi_get_lbolt64()); + } else { + /* duration is negative so the subtraction here adds */ + VERIFY(handler->zi_record.zi_timer == 0 || + handler->zi_record.zi_timer - + handler->zi_record.zi_duration >= + spa_syncing_txg(spa)); + } + } + + rw_exit(&inject_lock); +} + /* * Create a new handler for the given record. We add it to the list, adding * a reference to the spa_t in the process. We increment zio_injection_enabled, diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/zle.c b/external/cddl/osnet/dist/uts/common/fs/zfs/zle.c new file mode 100644 index 0000000000000..13c5673fbe267 --- /dev/null +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/zle.c @@ -0,0 +1,86 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Zero-length encoding. This is a fast and simple algorithm to eliminate + * runs of zeroes. Each chunk of compressed data begins with a length byte, b. + * If b < n (where n is the compression parameter) then the next b + 1 bytes + * are literal values. If b >= n then the next (256 - b + 1) bytes are zero. 
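A worked example of the zle.c encoding added here. Note that, reading zle_decompress() below, a length byte b >= n actually expands to b - n + 1 zero bytes; the zio_compress_table entry above passes n = 64 as the level:

/*
 * n = 64 (the ci_level passed from zio_compress_table):
 *
 *   input  (14 bytes): aa bb cc 00 00 00 00 00 00 00 00 00 00 dd
 *   output ( 7 bytes): 02 aa bb cc 49 00 dd
 *
 *   02 -> 0x02 < 64: the next 0x02 + 1 = 3 bytes are literals (aa bb cc)
 *   49 -> 0x49 (73) >= 64: emit 73 - 64 + 1 = 10 zero bytes
 *   00 -> 0x00 < 64: the next 1 byte is a literal (dd)
 */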
+ */ +#include +#include + +size_t +zle_compress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) +{ + uchar_t *src = s_start; + uchar_t *dst = d_start; + uchar_t *s_end = src + s_len; + uchar_t *d_end = dst + d_len; + + while (src < s_end && dst < d_end - 1) { + uchar_t *first = src; + uchar_t *len = dst++; + if (src[0] == 0) { + uchar_t *last = src + (256 - n); + while (src < MIN(last, s_end) && src[0] == 0) + src++; + *len = src - first - 1 + n; + } else { + uchar_t *last = src + n; + if (d_end - dst < n) + break; + while (src < MIN(last, s_end) - 1 && (src[0] | src[1])) + *dst++ = *src++; + if (src[0]) + *dst++ = *src++; + *len = src - first - 1; + } + } + return (src == s_end ? dst - (uchar_t *)d_start : s_len); +} + +int +zle_decompress(void *s_start, void *d_start, size_t s_len, size_t d_len, int n) +{ + uchar_t *src = s_start; + uchar_t *dst = d_start; + uchar_t *s_end = src + s_len; + uchar_t *d_end = dst + d_len; + + while (src < s_end && dst < d_end) { + int len = 1 + *src++; + if (len <= n) { + while (len-- != 0) + *dst++ = *src++; + } else { + len -= n; + while (len-- != 0) + *dst++ = 0; + } + } + return (dst == d_end ? 0 : -1); +} diff --git a/external/cddl/osnet/dist/uts/common/fs/zfs/zvol.c b/external/cddl/osnet/dist/uts/common/fs/zfs/zvol.c index 4e993060ceb27..2b4a0b2bdb93b 100644 --- a/external/cddl/osnet/dist/uts/common/fs/zfs/zvol.c +++ b/external/cddl/osnet/dist/uts/common/fs/zfs/zvol.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -32,7 +32,7 @@ * /dev/zvol/dsk// * /dev/zvol/rdsk// * - * These links are created by the ZFS-specific devfsadm link generator. + * These links are created by the /dev filesystem (sdev_zvolops.c). * Volumes are persistent through reboot. No user command needs to be * run before opening and using a device. */ @@ -75,10 +75,12 @@ #include #include #include +#include #include "zfs_namecheck.h" static void *zvol_state; +static char *zvol_tag = "zvol_tag"; #define ZVOL_DUMPSIZE "dumpsize" @@ -106,14 +108,12 @@ typedef struct zvol_state { uint64_t zv_volblocksize; /* volume block size */ minor_t zv_minor; /* minor number */ uint8_t zv_min_bs; /* minimum addressable block shift */ - uint8_t zv_flags; /* readonly; dumpified */ + uint8_t zv_flags; /* readonly, dumpified, etc. */ objset_t *zv_objset; /* objset handle */ - uint32_t zv_mode; /* DS_MODE_* flags at open time */ uint32_t zv_open_count[OTYPCNT]; /* open counts */ uint32_t zv_total_opens; /* total open count */ zilog_t *zv_zilog; /* ZIL handle */ list_t zv_extents; /* List of extents for dump */ - uint64_t zv_txg_assign; /* txg to assign during ZIL replay */ znode_t zv_znode; /* for range locking */ } zvol_state_t; @@ -123,27 +123,30 @@ typedef struct zvol_state { #define ZVOL_RDONLY 0x1 #define ZVOL_DUMPIFIED 0x2 #define ZVOL_EXCL 0x4 +#define ZVOL_WCE 0x8 /* * zvol maximum transfer in one DMU tx. 
*/ int zvol_maxphys = DMU_MAX_ACCESS/2; -extern int zfs_set_prop_nvlist(const char *, nvlist_t *); +extern int zfs_set_prop_nvlist(const char *, zprop_source_t, + nvlist_t *, nvlist_t **); +static int zvol_remove_zv(zvol_state_t *); static int zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio); static int zvol_dumpify(zvol_state_t *zv); static int zvol_dump_fini(zvol_state_t *zv); static int zvol_dump_init(zvol_state_t *zv, boolean_t resize); static void -zvol_size_changed(zvol_state_t *zv, major_t maj) +zvol_size_changed(uint64_t volsize, major_t maj, minor_t min) { - dev_t dev = makedevice(maj, zv->zv_minor); + dev_t dev = makedevice(maj, min); VERIFY(ddi_prop_update_int64(dev, zfs_dip, - "Size", zv->zv_volsize) == DDI_SUCCESS); + "Size", volsize) == DDI_SUCCESS); VERIFY(ddi_prop_update_int64(dev, zfs_dip, - "Nblocks", lbtodb(zv->zv_volsize)) == DDI_SUCCESS); + "Nblocks", lbtodb(volsize)) == DDI_SUCCESS); /* Notify specfs to invalidate the cached size */ spec_size_invalidate(dev, VBLK); @@ -177,17 +180,6 @@ zvol_check_volblocksize(uint64_t volblocksize) return (0); } -static void -zvol_readonly_changed_cb(void *arg, uint64_t newval) -{ - zvol_state_t *zv = arg; - - if (newval) - zv->zv_flags |= ZVOL_RDONLY; - else - zv->zv_flags &= ~ZVOL_RDONLY; -} - int zvol_get_stats(objset_t *os, nvlist_t *nv) { @@ -195,7 +187,6 @@ zvol_get_stats(objset_t *os, nvlist_t *nv) dmu_object_info_t doi; uint64_t val; - error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &val); if (error) return (error); @@ -256,8 +247,8 @@ struct maparg { /*ARGSUSED*/ static int -zvol_map_block(spa_t *spa, blkptr_t *bp, const zbookmark_t *zb, - const dnode_phys_t *dnp, void *arg) +zvol_map_block(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, + const zbookmark_t *zb, const dnode_phys_t *dnp, void *arg) { struct maparg *ma = arg; zvol_extent_t *ze; @@ -309,6 +300,7 @@ zvol_free_extents(zvol_state_t *zv) static int zvol_get_lbas(zvol_state_t *zv) { + objset_t *os = zv->zv_objset; struct maparg ma; int err; @@ -316,7 +308,9 @@ zvol_get_lbas(zvol_state_t *zv) ma.ma_blks = 0; zvol_free_extents(zv); - err = traverse_dataset(dmu_objset_ds(zv->zv_objset), 0, + /* commit any in-flight changes before traversing the dataset */ + txg_wait_synced(dmu_objset_pool(os), 0); + err = traverse_dataset(dmu_objset_ds(os), 0, TRAVERSE_PRE | TRAVERSE_PREFETCH_METADATA, zvol_map_block, &ma); if (err || ma.ma_blks != (zv->zv_volsize / zv->zv_volblocksize)) { zvol_free_extents(zv); @@ -371,21 +365,32 @@ zvol_replay_write(zvol_state_t *zv, lr_write_t *lr, boolean_t byteswap) { objset_t *os = zv->zv_objset; char *data = (char *)(lr + 1); /* data follows lr_write_t */ - uint64_t off = lr->lr_offset; - uint64_t len = lr->lr_length; + uint64_t offset, length; dmu_tx_t *tx; int error; if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); + offset = lr->lr_offset; + length = lr->lr_length; + + /* If it's a dmu_sync() block, write the whole block */ + if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { + uint64_t blocksize = BP_GET_LSIZE(&lr->lr_blkptr); + if (length < blocksize) { + offset -= offset % blocksize; + length = blocksize; + } + } + tx = dmu_tx_create(os); - dmu_tx_hold_write(tx, ZVOL_OBJ, off, len); - error = dmu_tx_assign(tx, zv->zv_txg_assign); + dmu_tx_hold_write(tx, ZVOL_OBJ, offset, length); + error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); } else { - dmu_write(os, ZVOL_OBJ, off, len, data, tx); + dmu_write(os, ZVOL_OBJ, offset, length, data, tx); dmu_tx_commit(tx); } @@ -417,137 +422,99 @@ 
zil_replay_func_t *zvol_replay_vector[TX_MAX_TYPE] = { zvol_replay_err, /* TX_TRUNCATE */ zvol_replay_err, /* TX_SETATTR */ zvol_replay_err, /* TX_ACL */ + zvol_replay_err, /* TX_CREATE_ACL */ + zvol_replay_err, /* TX_CREATE_ATTR */ + zvol_replay_err, /* TX_CREATE_ACL_ATTR */ + zvol_replay_err, /* TX_MKDIR_ACL */ + zvol_replay_err, /* TX_MKDIR_ATTR */ + zvol_replay_err, /* TX_MKDIR_ACL_ATTR */ + zvol_replay_err, /* TX_WRITE2 */ }; +int +zvol_name2minor(const char *name, minor_t *minor) +{ + zvol_state_t *zv; + + mutex_enter(&zvol_state_lock); + zv = zvol_minor_lookup(name); + if (minor && zv) + *minor = zv->zv_minor; + mutex_exit(&zvol_state_lock); + return (zv ? 0 : -1); +} + /* * Create a minor node (plus a whole lot more) for the specified volume. */ int -zvol_create_minor(const char *name, major_t maj) +zvol_create_minor(const char *name) { zvol_state_t *zv; objset_t *os; dmu_object_info_t doi; - uint64_t volsize; minor_t minor = 0; - struct pathname linkpath; - int ds_mode = DS_MODE_OWNER; - vnode_t *vp = NULL; - char *devpath; - size_t devpathlen = strlen(ZVOL_FULL_DEV_DIR) + strlen(name) + 1; char chrbuf[30], blkbuf[30]; int error; mutex_enter(&zvol_state_lock); - if ((zv = zvol_minor_lookup(name)) != NULL) { + if (zvol_minor_lookup(name) != NULL) { mutex_exit(&zvol_state_lock); return (EEXIST); } - if (strchr(name, '@') != 0) - ds_mode |= DS_MODE_READONLY; - - error = dmu_objset_open(name, DMU_OST_ZVOL, ds_mode, &os); + /* lie and say we're read-only */ + error = dmu_objset_own(name, DMU_OST_ZVOL, B_TRUE, zvol_tag, &os); if (error) { mutex_exit(&zvol_state_lock); return (error); } - error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); - - if (error) { - dmu_objset_close(os); - mutex_exit(&zvol_state_lock); - return (error); - } - - /* - * If there's an existing /dev/zvol symlink, try to use the - * same minor number we used last time. - */ - devpath = kmem_alloc(devpathlen, KM_SLEEP); - - (void) sprintf(devpath, "%s%s", ZVOL_FULL_DEV_DIR, name); - - error = lookupname(devpath, UIO_SYSSPACE, NO_FOLLOW, NULL, &vp); - - kmem_free(devpath, devpathlen); - - if (error == 0 && vp->v_type != VLNK) - error = EINVAL; - - if (error == 0) { - pn_alloc(&linkpath); - error = pn_getsymlink(vp, &linkpath, kcred); - if (error == 0) { - char *ms = strstr(linkpath.pn_path, ZVOL_PSEUDO_DEV); - if (ms != NULL) { - ms += strlen(ZVOL_PSEUDO_DEV); - minor = stoi(&ms); - } - } - pn_free(&linkpath); - } - - if (vp != NULL) - VN_RELE(vp); - - /* - * If we found a minor but it's already in use, we must pick a new one. 
- */ - if (minor != 0 && ddi_get_soft_state(zvol_state, minor) != NULL) - minor = 0; - - if (minor == 0) - minor = zvol_minor_alloc(); - - if (minor == 0) { - dmu_objset_close(os); + if ((minor = zvol_minor_alloc()) == 0) { + dmu_objset_disown(os, zvol_tag); mutex_exit(&zvol_state_lock); return (ENXIO); } if (ddi_soft_state_zalloc(zvol_state, minor) != DDI_SUCCESS) { - dmu_objset_close(os); + dmu_objset_disown(os, zvol_tag); mutex_exit(&zvol_state_lock); return (EAGAIN); } - (void) ddi_prop_update_string(minor, zfs_dip, ZVOL_PROP_NAME, (char *)name); - (void) sprintf(chrbuf, "%uc,raw", minor); + (void) snprintf(chrbuf, sizeof (chrbuf), "%u,raw", minor); if (ddi_create_minor_node(zfs_dip, chrbuf, S_IFCHR, minor, DDI_PSEUDO, 0) == DDI_FAILURE) { ddi_soft_state_free(zvol_state, minor); - dmu_objset_close(os); + dmu_objset_disown(os, zvol_tag); mutex_exit(&zvol_state_lock); return (EAGAIN); } - (void) sprintf(blkbuf, "%uc", minor); + (void) snprintf(blkbuf, sizeof (blkbuf), "%u", minor); if (ddi_create_minor_node(zfs_dip, blkbuf, S_IFBLK, minor, DDI_PSEUDO, 0) == DDI_FAILURE) { ddi_remove_minor_node(zfs_dip, chrbuf); ddi_soft_state_free(zvol_state, minor); - dmu_objset_close(os); + dmu_objset_disown(os, zvol_tag); mutex_exit(&zvol_state_lock); return (EAGAIN); } zv = ddi_get_soft_state(zvol_state, minor); - (void) strcpy(zv->zv_name, name); + (void) strlcpy(zv->zv_name, name, MAXPATHLEN); zv->zv_min_bs = DEV_BSHIFT; zv->zv_minor = minor; - zv->zv_volsize = volsize; zv->zv_objset = os; - zv->zv_mode = ds_mode; - zv->zv_zilog = zil_open(os, zvol_get_data); + if (dmu_objset_is_snapshot(os)) + zv->zv_flags |= ZVOL_RDONLY; mutex_init(&zv->zv_znode.z_range_lock, NULL, MUTEX_DEFAULT, NULL); avl_create(&zv->zv_znode.z_range_avl, zfs_range_compare, sizeof (rl_t), offsetof(rl_t, r_node)); @@ -558,12 +525,9 @@ zvol_create_minor(const char *name, major_t maj) ASSERT(error == 0); zv->zv_volblocksize = doi.doi_data_block_size; - zil_replay(os, zv, &zv->zv_txg_assign, zvol_replay_vector, NULL); - zvol_size_changed(zv, maj); - - /* XXX this should handle the possible i/o error */ - VERIFY(dsl_prop_register(dmu_objset_ds(zv->zv_objset), - "readonly", zvol_readonly_changed_cb, zv) == 0); + zil_replay(os, zv, zvol_replay_vector); + dmu_objset_disown(os, zvol_tag); + zv->zv_objset = NULL; zvol_minors++; @@ -575,47 +539,88 @@ zvol_create_minor(const char *name, major_t maj) /* * Remove minor node for the specified volume. 
*/ +static int +zvol_remove_zv(zvol_state_t *zv) +{ + char nmbuf[20]; + + ASSERT(MUTEX_HELD(&zvol_state_lock)); + if (zv->zv_total_opens != 0) + return (EBUSY); + + (void) snprintf(nmbuf, sizeof (nmbuf), "%u,raw", zv->zv_minor); + ddi_remove_minor_node(zfs_dip, nmbuf); + + (void) snprintf(nmbuf, sizeof (nmbuf), "%u", zv->zv_minor); + ddi_remove_minor_node(zfs_dip, nmbuf); + + avl_destroy(&zv->zv_znode.z_range_avl); + mutex_destroy(&zv->zv_znode.z_range_lock); + + ddi_soft_state_free(zvol_state, zv->zv_minor); + + zvol_minors--; + return (0); +} + int zvol_remove_minor(const char *name) { zvol_state_t *zv; - char namebuf[30]; + int rc; mutex_enter(&zvol_state_lock); - if ((zv = zvol_minor_lookup(name)) == NULL) { mutex_exit(&zvol_state_lock); return (ENXIO); } + rc = zvol_remove_zv(zv); + mutex_exit(&zvol_state_lock); + return (rc); +} - if (zv->zv_total_opens != 0) { - mutex_exit(&zvol_state_lock); - return (EBUSY); - } +int +zvol_first_open(zvol_state_t *zv) +{ + objset_t *os; + uint64_t volsize; + int error; + uint64_t readonly; - (void) sprintf(namebuf, "%uc,raw", zv->zv_minor); - ddi_remove_minor_node(zfs_dip, namebuf); + /* lie and say we're read-only */ + error = dmu_objset_own(zv->zv_name, DMU_OST_ZVOL, B_TRUE, + zvol_tag, &os); + if (error) + return (error); - (void) sprintf(namebuf, "%uc", zv->zv_minor); - ddi_remove_minor_node(zfs_dip, namebuf); + error = zap_lookup(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize); + if (error) { + ASSERT(error == 0); + dmu_objset_disown(os, zvol_tag); + return (error); + } + zv->zv_objset = os; + zv->zv_volsize = volsize; + zv->zv_zilog = zil_open(os, zvol_get_data); + zvol_size_changed(zv->zv_volsize, ddi_driver_major(zfs_dip), + zv->zv_minor); - VERIFY(dsl_prop_unregister(dmu_objset_ds(zv->zv_objset), - "readonly", zvol_readonly_changed_cb, zv) == 0); + VERIFY(dsl_prop_get_integer(zv->zv_name, "readonly", &readonly, + NULL) == 0); + if (readonly || dmu_objset_is_snapshot(os)) + zv->zv_flags |= ZVOL_RDONLY; + else + zv->zv_flags &= ~ZVOL_RDONLY; + return (error); +} +void +zvol_last_close(zvol_state_t *zv) +{ zil_close(zv->zv_zilog); zv->zv_zilog = NULL; - dmu_objset_close(zv->zv_objset); + dmu_objset_disown(zv->zv_objset, zvol_tag); zv->zv_objset = NULL; - avl_destroy(&zv->zv_znode.z_range_avl); - mutex_destroy(&zv->zv_znode.z_range_lock); - - ddi_soft_state_free(zvol_state, zv->zv_minor); - - zvol_minors--; - - mutex_exit(&zvol_state_lock); - - return (0); } int @@ -658,14 +663,14 @@ zvol_prealloc(zvol_state_t *zv) } int -zvol_update_volsize(zvol_state_t *zv, major_t maj, uint64_t volsize) +zvol_update_volsize(objset_t *os, uint64_t volsize) { dmu_tx_t *tx; int error; ASSERT(MUTEX_HELD(&zvol_state_lock)); - tx = dmu_tx_create(zv->zv_objset); + tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { @@ -673,127 +678,117 @@ zvol_update_volsize(zvol_state_t *zv, major_t maj, uint64_t volsize) return (error); } - error = zap_update(zv->zv_objset, ZVOL_ZAP_OBJ, "size", 8, 1, + error = zap_update(os, ZVOL_ZAP_OBJ, "size", 8, 1, &volsize, tx); dmu_tx_commit(tx); if (error == 0) - error = dmu_free_long_range(zv->zv_objset, + error = dmu_free_long_range(os, ZVOL_OBJ, volsize, DMU_OBJECT_END); + return (error); +} - /* - * If we are using a faked-up state (zv_minor == 0) then don't - * try to update the in-core zvol state. 
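Read together with the zvol_open()/zvol_close() hunks further below, the zvol_first_open()/zvol_last_close() pair above changes the dataset ownership model: zvol_create_minor() now disowns the objset after ZIL replay, and the dataset is held only while the minor is actually open. A condensed sketch of the resulting flow (not a verbatim copy of the hunks):

/* zvol_open(): the first opener brings the dataset online */
if (zv->zv_total_opens == 0)
	err = zvol_first_open(zv);	/* dmu_objset_own() + zil_open() */

/* zvol_close(): the last closer releases it again */
zv->zv_total_opens--;
if (zv->zv_total_opens == 0)
	zvol_last_close(zv);		/* zil_close() + dmu_objset_disown() */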
- */ - if (error == 0 && zv->zv_minor) { - zv->zv_volsize = volsize; - zvol_size_changed(zv, maj); +void +zvol_remove_minors(const char *name) +{ + zvol_state_t *zv; + char *namebuf; + minor_t minor; + + namebuf = kmem_zalloc(strlen(name) + 2, KM_SLEEP); + (void) strncpy(namebuf, name, strlen(name)); + (void) strcat(namebuf, "/"); + mutex_enter(&zvol_state_lock); + for (minor = 1; minor <= ZVOL_MAX_MINOR; minor++) { + + zv = ddi_get_soft_state(zvol_state, minor); + if (zv == NULL) + continue; + if (strncmp(namebuf, zv->zv_name, strlen(namebuf)) == 0) + (void) zvol_remove_zv(zv); } - return (error); + kmem_free(namebuf, strlen(name) + 2); + + mutex_exit(&zvol_state_lock); } int zvol_set_volsize(const char *name, major_t maj, uint64_t volsize) { - zvol_state_t *zv; + zvol_state_t *zv = NULL; + objset_t *os; int error; dmu_object_info_t doi; uint64_t old_volsize = 0ULL; - zvol_state_t state = { 0 }; + uint64_t readonly; mutex_enter(&zvol_state_lock); - - if ((zv = zvol_minor_lookup(name)) == NULL) { - /* - * If we are doing a "zfs clone -o volsize=", then the - * minor node won't exist yet. - */ - error = dmu_objset_open(name, DMU_OST_ZVOL, DS_MODE_OWNER, - &state.zv_objset); - if (error != 0) - goto out; - zv = &state; + zv = zvol_minor_lookup(name); + if ((error = dmu_objset_hold(name, FTAG, &os)) != 0) { + mutex_exit(&zvol_state_lock); + return (error); } - old_volsize = zv->zv_volsize; - if ((error = dmu_object_info(zv->zv_objset, ZVOL_OBJ, &doi)) != 0 || + if ((error = dmu_object_info(os, ZVOL_OBJ, &doi)) != 0 || (error = zvol_check_volsize(volsize, doi.doi_data_block_size)) != 0) goto out; - if (zv->zv_flags & ZVOL_RDONLY || (zv->zv_mode & DS_MODE_READONLY)) { + VERIFY(dsl_prop_get_integer(name, "readonly", &readonly, + NULL) == 0); + if (readonly) { error = EROFS; goto out; } - error = zvol_update_volsize(zv, maj, volsize); - + error = zvol_update_volsize(os, volsize); /* * Reinitialize the dump area to the new size. If we - * failed to resize the dump area then restore the it back to - * it's original size. + * failed to resize the dump area then restore it back to + * its original size. */ - if (error == 0 && zv->zv_flags & ZVOL_DUMPIFIED) { - if ((error = zvol_dumpify(zv)) != 0 || - (error = dumpvp_resize()) != 0) { - (void) zvol_update_volsize(zv, maj, old_volsize); - error = zvol_dumpify(zv); + if (zv && error == 0) { + if (zv->zv_flags & ZVOL_DUMPIFIED) { + old_volsize = zv->zv_volsize; + zv->zv_volsize = volsize; + if ((error = zvol_dumpify(zv)) != 0 || + (error = dumpvp_resize()) != 0) { + (void) zvol_update_volsize(os, old_volsize); + zv->zv_volsize = old_volsize; + error = zvol_dumpify(zv); + } + } + if (error == 0) { + zv->zv_volsize = volsize; + zvol_size_changed(volsize, maj, zv->zv_minor); } } -out: - if (state.zv_objset) - dmu_objset_close(state.zv_objset); + /* + * Generate a LUN expansion event. 
+ */ + if (zv && error == 0) { + sysevent_id_t eid; + nvlist_t *attr; + char *physpath = kmem_zalloc(MAXPATHLEN, KM_SLEEP); - mutex_exit(&zvol_state_lock); + (void) snprintf(physpath, MAXPATHLEN, "%s%u", ZVOL_PSEUDO_DEV, + zv->zv_minor); - return (error); -} + VERIFY(nvlist_alloc(&attr, NV_UNIQUE_NAME, KM_SLEEP) == 0); + VERIFY(nvlist_add_string(attr, DEV_PHYS_PATH, physpath) == 0); -int -zvol_set_volblocksize(const char *name, uint64_t volblocksize) -{ - zvol_state_t *zv; - dmu_tx_t *tx; - int error; - boolean_t needlock; + (void) ddi_log_sysevent(zfs_dip, SUNW_VENDOR, EC_DEV_STATUS, + ESC_DEV_DLE, attr, &eid, DDI_SLEEP); - /* - * The lock may already be held if we are being called from - * zvol_dump_init(). - */ - needlock = !MUTEX_HELD(&zvol_state_lock); - if (needlock) - mutex_enter(&zvol_state_lock); - - if ((zv = zvol_minor_lookup(name)) == NULL) { - if (needlock) - mutex_exit(&zvol_state_lock); - return (ENXIO); - } - if (zv->zv_flags & ZVOL_RDONLY || (zv->zv_mode & DS_MODE_READONLY)) { - if (needlock) - mutex_exit(&zvol_state_lock); - return (EROFS); + nvlist_free(attr); + kmem_free(physpath, MAXPATHLEN); } - tx = dmu_tx_create(zv->zv_objset); - dmu_tx_hold_bonus(tx, ZVOL_OBJ); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - } else { - error = dmu_object_set_blocksize(zv->zv_objset, ZVOL_OBJ, - volblocksize, 0, tx); - if (error == ENOTSUP) - error = EBUSY; - dmu_tx_commit(tx); - if (error == 0) - zv->zv_volblocksize = volblocksize; - } +out: + dmu_objset_rele(os, FTAG); - if (needlock) - mutex_exit(&zvol_state_lock); + mutex_exit(&zvol_state_lock); return (error); } @@ -804,6 +799,7 @@ zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr) { minor_t minor = getminor(*devp); zvol_state_t *zv; + int err = 0; if (minor == 0) /* This is the control device */ return (0); @@ -816,21 +812,24 @@ zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr) return (ENXIO); } - ASSERT(zv->zv_objset != NULL); - - if ((flag & FWRITE) && - (zv->zv_flags & ZVOL_RDONLY || (zv->zv_mode & DS_MODE_READONLY))) { + if (zv->zv_total_opens == 0) + err = zvol_first_open(zv); + if (err) { mutex_exit(&zvol_state_lock); - return (EROFS); + return (err); + } + if ((flag & FWRITE) && (zv->zv_flags & ZVOL_RDONLY)) { + err = EROFS; + goto out; } if (zv->zv_flags & ZVOL_EXCL) { - mutex_exit(&zvol_state_lock); - return (EBUSY); + err = EBUSY; + goto out; } if (flag & FEXCL) { if (zv->zv_total_opens != 0) { - mutex_exit(&zvol_state_lock); - return (EBUSY); + err = EBUSY; + goto out; } zv->zv_flags |= ZVOL_EXCL; } @@ -839,10 +838,14 @@ zvol_open(dev_t *devp, int flag, int otyp, cred_t *cr) zv->zv_open_count[otyp]++; zv->zv_total_opens++; } - mutex_exit(&zvol_state_lock); - return (0); + return (err); +out: + if (zv->zv_total_opens == 0) + zvol_last_close(zv); + mutex_exit(&zvol_state_lock); + return (err); } /*ARGSUSED*/ @@ -851,6 +854,7 @@ zvol_close(dev_t dev, int flag, int otyp, cred_t *cr) { minor_t minor = getminor(dev); zvol_state_t *zv; + int error = 0; if (minor == 0) /* This is the control device */ return (0); @@ -881,20 +885,24 @@ zvol_close(dev_t dev, int flag, int otyp, cred_t *cr) zv->zv_open_count[otyp]--; zv->zv_total_opens--; - mutex_exit(&zvol_state_lock); + if (zv->zv_total_opens == 0) + zvol_last_close(zv); - return (0); + mutex_exit(&zvol_state_lock); + return (error); } static void -zvol_get_done(dmu_buf_t *db, void *vzgd) +zvol_get_done(zgd_t *zgd, int error) { - zgd_t *zgd = (zgd_t *)vzgd; - rl_t *rl = zgd->zgd_rl; + if (zgd->zgd_db) + dmu_buf_rele(zgd->zgd_db, zgd); + + 
zfs_range_unlock(zgd->zgd_rl); + + if (error == 0 && zgd->zgd_bp) + zil_add_block(zgd->zgd_zilog, zgd->zgd_bp); - dmu_buf_rele(db, vzgd); - zfs_range_unlock(rl); - zil_add_block(zgd->zgd_zilog, zgd->zgd_bp); kmem_free(zgd, sizeof (zgd_t)); } @@ -906,15 +914,20 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) { zvol_state_t *zv = arg; objset_t *os = zv->zv_objset; + uint64_t object = ZVOL_OBJ; + uint64_t offset = lr->lr_offset; + uint64_t size = lr->lr_length; /* length of user data */ + blkptr_t *bp = &lr->lr_blkptr; dmu_buf_t *db; - rl_t *rl; zgd_t *zgd; - uint64_t boff; /* block starting offset */ - int dlen = lr->lr_length; /* length of user data */ int error; - ASSERT(zio); - ASSERT(dlen != 0); + ASSERT(zio != NULL); + ASSERT(size != 0); + + zgd = kmem_zalloc(sizeof (zgd_t), KM_SLEEP); + zgd->zgd_zilog = zv->zv_zilog; + zgd->zgd_rl = zfs_range_lock(&zv->zv_znode, offset, size, RL_READER); /* * Write records come in two flavors: immediate and indirect. @@ -923,39 +936,30 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) * sync the data and get a pointer to it (indirect) so that * we don't have to write the data twice. */ - if (buf != NULL) /* immediate write */ - return (dmu_read(os, ZVOL_OBJ, lr->lr_offset, dlen, buf)); + if (buf != NULL) { /* immediate write */ + error = dmu_read(os, object, offset, size, buf, + DMU_READ_NO_PREFETCH); + } else { + size = zv->zv_volblocksize; + offset = P2ALIGN(offset, size); + error = dmu_buf_hold(os, object, offset, zgd, &db); + if (error == 0) { + zgd->zgd_db = db; + zgd->zgd_bp = bp; - zgd = (zgd_t *)kmem_alloc(sizeof (zgd_t), KM_SLEEP); - zgd->zgd_zilog = zv->zv_zilog; - zgd->zgd_bp = &lr->lr_blkptr; + ASSERT(db->db_offset == offset); + ASSERT(db->db_size == size); - /* - * Lock the range of the block to ensure that when the data is - * written out and its checksum is being calculated that no other - * thread can change the block. - */ - boff = P2ALIGN_TYPED(lr->lr_offset, zv->zv_volblocksize, uint64_t); - rl = zfs_range_lock(&zv->zv_znode, boff, zv->zv_volblocksize, - RL_READER); - zgd->zgd_rl = rl; + error = dmu_sync(zio, lr->lr_common.lrc_txg, + zvol_get_done, zgd); + + if (error == 0) + return (0); + } + } + + zvol_get_done(zgd, error); - VERIFY(0 == dmu_buf_hold(os, ZVOL_OBJ, lr->lr_offset, zgd, &db)); - error = dmu_sync(zio, db, &lr->lr_blkptr, - lr->lr_common.lrc_txg, zvol_get_done, zgd); - if (error == 0) - zil_add_block(zv->zv_zilog, &lr->lr_blkptr); - /* - * If we get EINPROGRESS, then we need to wait for a - * write IO initiated by dmu_sync() to complete before - * we can release this dbuf. We will finish everything - * up in the zvol_get_done() callback. - */ - if (error == EINPROGRESS) - return (0); - dmu_buf_rele(db, zgd); - zfs_range_unlock(rl); - kmem_free(zgd, sizeof (zgd_t)); return (error); } @@ -968,28 +972,75 @@ zvol_get_data(void *arg, lr_write_t *lr, char *buf, zio_t *zio) ssize_t zvol_immediate_write_sz = 32768; static void -zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t len) +zvol_log_write(zvol_state_t *zv, dmu_tx_t *tx, offset_t off, ssize_t resid, + boolean_t sync) { uint32_t blocksize = zv->zv_volblocksize; - lr_write_t *lr; + zilog_t *zilog = zv->zv_zilog; + boolean_t slogging; + ssize_t immediate_write_sz; - while (len) { - ssize_t nbytes = MIN(len, blocksize - P2PHASE(off, blocksize)); - itx_t *itx = zil_itx_create(TX_WRITE, sizeof (*lr)); + if (zil_disable) + return; - itx->itx_wr_state = - len > zvol_immediate_write_sz ? 
WR_INDIRECT : WR_NEED_COPY; - itx->itx_private = zv; + if (zil_replaying(zilog, tx)) + return; + + immediate_write_sz = (zilog->zl_logbias == ZFS_LOGBIAS_THROUGHPUT) + ? 0 : zvol_immediate_write_sz; + + slogging = spa_has_slogs(zilog->zl_spa) && + (zilog->zl_logbias == ZFS_LOGBIAS_LATENCY); + + while (resid) { + itx_t *itx; + lr_write_t *lr; + ssize_t len; + itx_wr_state_t write_state; + + /* + * Unlike zfs_log_write() we can be called with + * upto DMU_MAX_ACCESS/2 (5MB) writes. + */ + if (blocksize > immediate_write_sz && !slogging && + resid >= blocksize && off % blocksize == 0) { + write_state = WR_INDIRECT; /* uses dmu_sync */ + len = blocksize; + } else if (sync) { + write_state = WR_COPIED; + len = MIN(ZIL_MAX_LOG_DATA, resid); + } else { + write_state = WR_NEED_COPY; + len = MIN(ZIL_MAX_LOG_DATA, resid); + } + + itx = zil_itx_create(TX_WRITE, sizeof (*lr) + + (write_state == WR_COPIED ? len : 0)); lr = (lr_write_t *)&itx->itx_lr; + if (write_state == WR_COPIED && dmu_read(zv->zv_objset, + ZVOL_OBJ, off, len, lr + 1, DMU_READ_NO_PREFETCH) != 0) { + zil_itx_destroy(itx); + itx = zil_itx_create(TX_WRITE, sizeof (*lr)); + lr = (lr_write_t *)&itx->itx_lr; + write_state = WR_NEED_COPY; + } + + itx->itx_wr_state = write_state; + if (write_state == WR_NEED_COPY) + itx->itx_sod += len; lr->lr_foid = ZVOL_OBJ; lr->lr_offset = off; - lr->lr_length = nbytes; - lr->lr_blkoff = off - P2ALIGN_TYPED(off, blocksize, uint64_t); + lr->lr_length = len; + lr->lr_blkoff = 0; BP_ZERO(&lr->lr_blkptr); - (void) zil_itx_assign(zv->zv_zilog, itx, tx); - len -= nbytes; - off += nbytes; + itx->itx_private = zv; + itx->itx_sync = sync; + + (void) zil_itx_assign(zilog, itx, tx); + + off += len; + resid -= len; } } @@ -1002,7 +1053,9 @@ zvol_dumpio_vdev(vdev_t *vd, void *addr, uint64_t offset, uint64_t size, int numerrors = 0; for (c = 0; c < vd->vdev_children; c++) { - ASSERT(vd->vdev_ops == &vdev_mirror_ops); + ASSERT(vd->vdev_ops == &vdev_mirror_ops || + vd->vdev_ops == &vdev_replacing_ops || + vd->vdev_ops == &vdev_spare_ops); int err = zvol_dumpio_vdev(vd->vdev_child[c], addr, offset, size, doread, isdump); if (err != 0) { @@ -1078,6 +1131,7 @@ zvol_strategy(buf_t *bp) int error = 0; boolean_t doread = bp->b_flags & B_READ; boolean_t is_dump = zv->zv_flags & ZVOL_DUMPIFIED; + boolean_t sync; if (zv == NULL) { bioerror(bp, ENXIO); @@ -1091,9 +1145,7 @@ zvol_strategy(buf_t *bp) return (0); } - if (!(bp->b_flags & B_READ) && - (zv->zv_flags & ZVOL_RDONLY || - zv->zv_mode & DS_MODE_READONLY)) { + if (!(bp->b_flags & B_READ) && (zv->zv_flags & ZVOL_RDONLY)) { bioerror(bp, EROFS); biodone(bp); return (0); @@ -1115,6 +1167,9 @@ zvol_strategy(buf_t *bp) return (0); } + sync = !(bp->b_flags & B_ASYNC) && !doread && !is_dump && + !(zv->zv_flags & ZVOL_WCE) && !zil_disable; + /* * There must be no buffer changes when doing a dmu_sync() because * we can't change the data whilst calculating the checksum. 
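A concrete reading of the record-type choice in the zvol_log_write() rewrite above (sizes are only an example; zvol_immediate_write_sz defaults to 32768 per the declaration above):

/*
 * Example: 128K-volblocksize zvol, logbias=latency, no slog device,
 * synchronous 512K write at a volblocksize-aligned offset:
 *   blocksize (131072) > immediate_write_sz (32768), slogging == B_FALSE,
 *   resid >= blocksize, off % blocksize == 0
 * => four WR_INDIRECT records of one volblocksize each; the data itself
 *    is written once, later, via dmu_sync().
 * The same write with a separate log device present (slogging == B_TRUE)
 * falls through to the sync case and is logged as WR_COPIED records of
 * at most ZIL_MAX_LOG_DATA bytes, with the data copied into each itx.
 */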
@@ -1129,7 +1184,8 @@ zvol_strategy(buf_t *bp) error = zvol_dumpio(zv, addr, off, size, doread, B_FALSE); } else if (doread) { - error = dmu_read(os, ZVOL_OBJ, off, size, addr); + error = dmu_read(os, ZVOL_OBJ, off, size, addr, + DMU_READ_PREFETCH); } else { dmu_tx_t *tx = dmu_tx_create(os); dmu_tx_hold_write(tx, ZVOL_OBJ, off, size); @@ -1138,7 +1194,7 @@ zvol_strategy(buf_t *bp) dmu_tx_abort(tx); } else { dmu_write(os, ZVOL_OBJ, off, size, addr, tx); - zvol_log_write(zv, tx, off, size); + zvol_log_write(zv, tx, off, size, sync); dmu_tx_commit(tx); } } @@ -1157,7 +1213,7 @@ zvol_strategy(buf_t *bp) if ((bp->b_resid = resid) == bp->b_bcount) bioerror(bp, off > volsize ? EINVAL : error); - if (!(bp->b_flags & B_ASYNC) && !doread && !zil_disable && !is_dump) + if (sync) zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ); biodone(bp); @@ -1272,6 +1328,7 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr) uint64_t volsize; rl_t *rl; int error = 0; + boolean_t sync; if (minor == 0) /* This is the control device */ return (ENXIO); @@ -1291,6 +1348,8 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr) return (error); } + sync = !(zv->zv_flags & ZVOL_WCE) && !zil_disable; + rl = zfs_range_lock(&zv->zv_znode, uio->uio_loffset, uio->uio_resid, RL_WRITER); while (uio->uio_resid > 0 && uio->uio_loffset < volsize) { @@ -1309,13 +1368,15 @@ zvol_write(dev_t dev, uio_t *uio, cred_t *cr) } error = dmu_write_uio(zv->zv_objset, ZVOL_OBJ, uio, bytes, tx); if (error == 0) - zvol_log_write(zv, tx, off, bytes); + zvol_log_write(zv, tx, off, bytes, sync); dmu_tx_commit(tx); if (error) break; } zfs_range_unlock(rl); + if (sync) + zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ); return (error); } @@ -1398,6 +1459,7 @@ zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) mutex_exit(&zvol_state_lock); return (ENXIO); } + ASSERT(zv->zv_total_opens > 0); switch (cmd) { @@ -1406,6 +1468,7 @@ zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) (void) strcpy(dki.dki_cname, "zvol"); (void) strcpy(dki.dki_dname, "zvol"); dki.dki_ctype = DKC_UNKNOWN; + dki.dki_unit = getminor(dev); dki.dki_maxtransfer = 1 << (SPA_MAXBLOCKSHIFT - zv->zv_min_bs); mutex_exit(&zvol_state_lock); if (ddi_copyout(&dki, (void *)arg, sizeof (dki), flag)) @@ -1434,12 +1497,40 @@ zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) case DKIOCFLUSHWRITECACHE: dkc = (struct dk_callback *)arg; + mutex_exit(&zvol_state_lock); zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ); if ((flag & FKIOCTL) && dkc != NULL && dkc->dkc_callback) { (*dkc->dkc_callback)(dkc->dkc_cookie, error); error = 0; } - break; + return (error); + + case DKIOCGETWCE: + { + int wce = (zv->zv_flags & ZVOL_WCE) ? 
1 : 0; + if (ddi_copyout(&wce, (void *)arg, sizeof (int), + flag)) + error = EFAULT; + break; + } + case DKIOCSETWCE: + { + int wce; + if (ddi_copyin((void *)arg, &wce, sizeof (int), + flag)) { + error = EFAULT; + break; + } + if (wce) { + zv->zv_flags |= ZVOL_WCE; + mutex_exit(&zvol_state_lock); + } else { + zv->zv_flags &= ~ZVOL_WCE; + mutex_exit(&zvol_state_lock); + zil_commit(zv->zv_zilog, UINT64_MAX, ZVOL_OBJ); + } + return (0); + } case DKIOCGGEOM: case DKIOCGVTOC: @@ -1458,6 +1549,8 @@ zvol_ioctl(dev_t dev, int cmd, intptr_t arg, int flag, cred_t *cr, int *rvalp) break; case DKIOCDUMPFINI: + if (!(zv->zv_flags & ZVOL_DUMPIFIED)) + break; rl = zfs_range_lock(&zv->zv_znode, 0, zv->zv_volsize, RL_WRITER); error = zvol_dump_fini(zv); @@ -1493,29 +1586,6 @@ zvol_fini(void) ddi_soft_state_fini(&zvol_state); } -static boolean_t -zvol_is_swap(zvol_state_t *zv) -{ - vnode_t *vp; - boolean_t ret = B_FALSE; - char *devpath; - size_t devpathlen; - int error; - - devpathlen = strlen(ZVOL_FULL_DEV_DIR) + strlen(zv->zv_name) + 1; - devpath = kmem_alloc(devpathlen, KM_SLEEP); - (void) sprintf(devpath, "%s%s", ZVOL_FULL_DEV_DIR, zv->zv_name); - error = lookupname(devpath, UIO_SYSSPACE, FOLLOW, NULLVPP, &vp); - kmem_free(devpath, devpathlen); - - ret = !error && IS_SWAPVP(common_specvp(vp)); - - if (vp != NULL) - VN_RELE(vp); - - return (ret); -} - static int zvol_dump_init(zvol_state_t *zv, boolean_t resize) { @@ -1523,11 +1593,17 @@ zvol_dump_init(zvol_state_t *zv, boolean_t resize) int error = 0; objset_t *os = zv->zv_objset; nvlist_t *nv = NULL; + uint64_t version = spa_version(dmu_objset_spa(zv->zv_objset)); ASSERT(MUTEX_HELD(&zvol_state_lock)); + error = dmu_free_long_range(zv->zv_objset, ZVOL_OBJ, 0, + DMU_OBJECT_END); + /* wait for dmu_free_long_range to actually free the blocks */ + txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0); tx = dmu_tx_create(os); dmu_tx_hold_zap(tx, ZVOL_ZAP_OBJ, TRUE, NULL); + dmu_tx_hold_bonus(tx, ZVOL_OBJ); error = dmu_tx_assign(tx, TXG_WAIT); if (error) { dmu_tx_abort(tx); @@ -1545,7 +1621,7 @@ zvol_dump_init(zvol_state_t *zv, boolean_t resize) zfs_prop_to_name(ZFS_PROP_REFRESERVATION), 8, 1, &zv->zv_volsize, tx); } else { - uint64_t checksum, compress, refresrv, vbs; + uint64_t checksum, compress, refresrv, vbs, dedup; error = dsl_prop_get_integer(zv->zv_name, zfs_prop_to_name(ZFS_PROP_COMPRESSION), &compress, NULL); @@ -1555,6 +1631,11 @@ zvol_dump_init(zvol_state_t *zv, boolean_t resize) zfs_prop_to_name(ZFS_PROP_REFRESERVATION), &refresrv, NULL); error = error ? error : dsl_prop_get_integer(zv->zv_name, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), &vbs, NULL); + if (version >= SPA_VERSION_DEDUP) { + error = error ? error : + dsl_prop_get_integer(zv->zv_name, + zfs_prop_to_name(ZFS_PROP_DEDUP), &dedup, NULL); + } error = error ? error : zap_update(os, ZVOL_ZAP_OBJ, zfs_prop_to_name(ZFS_PROP_COMPRESSION), 8, 1, @@ -1567,17 +1648,18 @@ zvol_dump_init(zvol_state_t *zv, boolean_t resize) error = error ? error : zap_update(os, ZVOL_ZAP_OBJ, zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), 8, 1, &vbs, tx); + error = error ? error : dmu_object_set_blocksize( + os, ZVOL_OBJ, SPA_MAXBLOCKSIZE, 0, tx); + if (version >= SPA_VERSION_DEDUP) { + error = error ? 
error : zap_update(os, ZVOL_ZAP_OBJ, + zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1, + &dedup, tx); + } + if (error == 0) + zv->zv_volblocksize = SPA_MAXBLOCKSIZE; } dmu_tx_commit(tx); - /* Truncate the file */ - if (!error) - error = dmu_free_long_range(zv->zv_objset, - ZVOL_OBJ, 0, DMU_OBJECT_END); - - if (error) - return (error); - /* * We only need update the zvol's property if we are initializing * the dump area for the first time. @@ -1592,11 +1674,14 @@ zvol_dump_init(zvol_state_t *zv, boolean_t resize) VERIFY(nvlist_add_uint64(nv, zfs_prop_to_name(ZFS_PROP_CHECKSUM), ZIO_CHECKSUM_OFF) == 0); - VERIFY(nvlist_add_uint64(nv, - zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), - SPA_MAXBLOCKSIZE) == 0); + if (version >= SPA_VERSION_DEDUP) { + VERIFY(nvlist_add_uint64(nv, + zfs_prop_to_name(ZFS_PROP_DEDUP), + ZIO_CHECKSUM_OFF) == 0); + } - error = zfs_set_prop_nvlist(zv->zv_name, nv); + error = zfs_set_prop_nvlist(zv->zv_name, ZPROP_SRC_LOCAL, + nv, NULL); nvlist_free(nv); if (error) @@ -1616,15 +1701,9 @@ zvol_dumpify(zvol_state_t *zv) dmu_tx_t *tx; objset_t *os = zv->zv_objset; - if (zv->zv_flags & ZVOL_RDONLY || (zv->zv_mode & DS_MODE_READONLY)) + if (zv->zv_flags & ZVOL_RDONLY) return (EROFS); - /* - * We do not support swap devices acting as dump devices. - */ - if (zvol_is_swap(zv)) - return (ENOTSUP); - if (zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, ZVOL_DUMPSIZE, 8, 1, &dumpsize) != 0 || dumpsize != zv->zv_volsize) { boolean_t resize = (dumpsize > 0) ? B_TRUE : B_FALSE; @@ -1674,7 +1753,8 @@ zvol_dump_fini(zvol_state_t *zv) objset_t *os = zv->zv_objset; nvlist_t *nv; int error = 0; - uint64_t checksum, compress, refresrv, vbs; + uint64_t checksum, compress, refresrv, vbs, dedup; + uint64_t version = spa_version(dmu_objset_spa(zv->zv_objset)); /* * Attempt to restore the zvol back to its pre-dumpified state. @@ -1709,14 +1789,31 @@ zvol_dump_fini(zvol_state_t *zv) zfs_prop_to_name(ZFS_PROP_COMPRESSION), compress); (void) nvlist_add_uint64(nv, zfs_prop_to_name(ZFS_PROP_REFRESERVATION), refresrv); - (void) nvlist_add_uint64(nv, - zfs_prop_to_name(ZFS_PROP_VOLBLOCKSIZE), vbs); - (void) zfs_set_prop_nvlist(zv->zv_name, nv); + if (version >= SPA_VERSION_DEDUP && + zap_lookup(zv->zv_objset, ZVOL_ZAP_OBJ, + zfs_prop_to_name(ZFS_PROP_DEDUP), 8, 1, &dedup) == 0) { + (void) nvlist_add_uint64(nv, + zfs_prop_to_name(ZFS_PROP_DEDUP), dedup); + } + (void) zfs_set_prop_nvlist(zv->zv_name, ZPROP_SRC_LOCAL, + nv, NULL); nvlist_free(nv); zvol_free_extents(zv); zv->zv_flags &= ~ZVOL_DUMPIFIED; (void) dmu_free_long_range(os, ZVOL_OBJ, 0, DMU_OBJECT_END); + /* wait for dmu_free_long_range to actually free the blocks */ + txg_wait_synced(dmu_objset_pool(zv->zv_objset), 0); + tx = dmu_tx_create(os); + dmu_tx_hold_bonus(tx, ZVOL_OBJ); + error = dmu_tx_assign(tx, TXG_WAIT); + if (error) { + dmu_tx_abort(tx); + return (error); + } + if (dmu_object_set_blocksize(os, ZVOL_OBJ, vbs, 0, tx) == 0) + zv->zv_volblocksize = vbs; + dmu_tx_commit(tx); return (0); } diff --git a/external/cddl/osnet/dist/uts/common/rpc/xdr.c b/external/cddl/osnet/dist/uts/common/rpc/xdr.c index 8514f67300bb7..6720324cddbc0 100644 --- a/external/cddl/osnet/dist/uts/common/rpc/xdr.c +++ b/external/cddl/osnet/dist/uts/common/rpc/xdr.c @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -306,6 +306,29 @@ xdr_char(XDR *xdrs, char *cp) return (TRUE); } +/* + * XDR an unsigned char + */ +bool_t +xdr_u_char(XDR *xdrs, uchar_t *cp) +{ + int i; + + switch (xdrs->x_op) { + case XDR_ENCODE: + i = (*cp); + return (XDR_PUTINT32(xdrs, &i)); + case XDR_DECODE: + if (!XDR_GETINT32(xdrs, &i)) + return (FALSE); + *cp = (uchar_t)i; + return (TRUE); + case XDR_FREE: + return (TRUE); + } + return (FALSE); +} + /* * XDR booleans * @@ -607,6 +630,32 @@ xdr_string(XDR *xdrs, char **cpp, const uint_t maxsize) return (FALSE); } +/* + * xdr_vector(): + * + * XDR a fixed length array. Unlike variable-length arrays, the storage + * of fixed length arrays is static and unfreeable. + * > basep: base of the array + * > size: size of the array + * > elemsize: size of each element + * > xdr_elem: routine to XDR each element + */ +bool_t +xdr_vector(XDR *xdrs, char *basep, const uint_t nelem, + const uint_t elemsize, const xdrproc_t xdr_elem) +{ + uint_t i; + char *elptr; + + elptr = basep; + for (i = 0; i < nelem; i++) { + if (!(*xdr_elem)(xdrs, elptr, LASTUNSIGNED)) + return (FALSE); + elptr += elemsize; + } + return (TRUE); +} + /* * Wrapper for xdr_string that can be called directly from * routines like clnt_call diff --git a/external/cddl/osnet/dist/uts/common/rpc/xdr.h b/external/cddl/osnet/dist/uts/common/rpc/xdr.h index e335e4b83c428..3db775893c88a 100644 --- a/external/cddl/osnet/dist/uts/common/rpc/xdr.h +++ b/external/cddl/osnet/dist/uts/common/rpc/xdr.h @@ -18,7 +18,7 @@ * * CDDL HEADER END * - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */ @@ -420,6 +420,8 @@ extern bool_t xdr_opaque(XDR *, caddr_t, const uint_t); extern bool_t xdr_string(XDR *, char **, const uint_t); extern bool_t xdr_union(XDR *, enum_t *, char *, const struct xdr_discrim *, const xdrproc_t); +extern bool_t xdr_vector(XDR *, char *, const uint_t, const uint_t, + const xdrproc_t); extern unsigned int xdr_sizeof(xdrproc_t, void *); extern bool_t xdr_hyper(XDR *, longlong_t *); @@ -428,6 +430,7 @@ extern bool_t xdr_u_hyper(XDR *, u_longlong_t *); extern bool_t xdr_u_longlong_t(XDR *, u_longlong_t *); extern bool_t xdr_char(XDR *, char *); +extern bool_t xdr_u_char(XDR *, uchar_t *); extern bool_t xdr_wrapstring(XDR *, char **); extern bool_t xdr_reference(XDR *, caddr_t *, uint_t, const xdrproc_t); extern bool_t xdr_pointer(XDR *, char **, uint_t, const xdrproc_t); @@ -446,9 +449,6 @@ extern bool_t xdr_uint64_t(XDR *, uint64_t *); #endif #ifndef _KERNEL -extern bool_t xdr_u_char(XDR *, uchar_t *); -extern bool_t xdr_vector(XDR *, char *, const uint_t, const uint_t, const -xdrproc_t); extern bool_t xdr_float(XDR *, float *); extern bool_t xdr_double(XDR *, double *); extern bool_t xdr_quadruple(XDR *, long double *); @@ -468,12 +468,14 @@ extern bool_t xdr_bytes(); extern bool_t xdr_opaque(); extern bool_t xdr_string(); extern bool_t xdr_union(); +extern bool_t xdr_vector(); extern bool_t xdr_hyper(); extern bool_t xdr_longlong_t(); extern bool_t xdr_u_hyper(); extern bool_t xdr_u_longlong_t(); extern bool_t xdr_char(); +extern bool_t xdr_u_char(); extern bool_t xdr_reference(); extern bool_t xdr_pointer(); extern void xdr_free(); @@ -492,8 +494,6 @@ extern bool_t xdr_uint64_t(); #endif #ifndef _KERNEL -extern bool_t xdr_u_char(); -extern bool_t xdr_vector(); extern bool_t xdr_float(); extern bool_t xdr_double(); extern bool_t 
xdr_quadruple(); @@ -585,6 +585,8 @@ extern uint_t xdrrec_readbytes(); #endif #else +#define DLEN(mp) (mp->b_cont ? msgdsize(mp) : (mp->b_wptr - mp->b_rptr)) + extern void xdrmem_create(XDR *, caddr_t, uint_t, enum xdr_op); extern void xdrmblk_init(XDR *, mblk_t *, enum xdr_op, int); extern bool_t xdrmblk_getmblk(XDR *, mblk_t **, uint_t *); diff --git a/external/cddl/osnet/dist/uts/common/sys/acl.h b/external/cddl/osnet/dist/uts/common/sys/acl.h index 27fd577371a97..35c9772b8e9be 100644 --- a/external/cddl/osnet/dist/uts/common/sys/acl.h +++ b/external/cddl/osnet/dist/uts/common/sys/acl.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_ACL_H #define _SYS_ACL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include @@ -156,6 +154,10 @@ typedef struct ace_object { ACE_WRITE_ATTRIBUTES|ACE_DELETE|ACE_READ_ACL|ACE_WRITE_ACL| \ ACE_WRITE_OWNER|ACE_SYNCHRONIZE) +#define ACE_ALL_WRITE_PERMS (ACE_WRITE_DATA|ACE_APPEND_DATA| \ + ACE_WRITE_ATTRIBUTES|ACE_WRITE_NAMED_ATTRS|ACE_WRITE_ACL| \ + ACE_WRITE_OWNER|ACE_DELETE|ACE_DELETE_CHILD) + #define ACE_READ_PERMS (ACE_READ_DATA|ACE_READ_ACL|ACE_READ_ATTRIBUTES| \ ACE_READ_NAMED_ATTRS) diff --git a/external/cddl/osnet/dist/uts/common/sys/attr.h b/external/cddl/osnet/dist/uts/common/sys/attr.h index 86c4cd5d6c80b..b312b5a4297f1 100644 --- a/external/cddl/osnet/dist/uts/common/sys/attr.h +++ b/external/cddl/osnet/dist/uts/common/sys/attr.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_ATTR_H #define _SYS_ATTR_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -56,6 +54,7 @@ extern "C" { #define A_AV_SCANSTAMP "av_scanstamp" #define A_OWNERSID "ownersid" #define A_GROUPSID "groupsid" +#define A_REPARSE_POINT "reparse" /* Attribute option for utilities */ #define O_HIDDEN "H" @@ -68,6 +67,7 @@ extern "C" { #define O_NODUMP "d" #define O_AV_QUARANTINED "q" #define O_AV_MODIFIED "m" +#define O_REPARSE_POINT "r" #define O_NONE "" /* ownersid and groupsid are composed of two nvpairs */ @@ -92,6 +92,7 @@ typedef enum { F_OWNERSID, F_GROUPSID, F_FSID, + F_REPARSE, F_ATTR_ALL } f_attr_t; diff --git a/external/cddl/osnet/dist/uts/common/sys/avl.h b/external/cddl/osnet/dist/uts/common/sys/avl.h index 02263a5a0cf14..ba305c9082392 100644 --- a/external/cddl/osnet/dist/uts/common/sys/avl.h +++ b/external/cddl/osnet/dist/uts/common/sys/avl.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _AVL_H #define _AVL_H -#pragma ident "%Z%%M% %I% %E% SMI" - /* * This is a private header file. Applications should not directly include * this file. @@ -163,7 +161,7 @@ extern void avl_create(avl_tree_t *tree, * node - node that has the value being looked for * where - position for use with avl_nearest() or avl_insert(), may be NULL */ -extern void *avl_find(avl_tree_t *tree, void *node, avl_index_t *where); +extern void *avl_find(avl_tree_t *tree, const void *node, avl_index_t *where); /* * Insert a node into the tree. 
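A minimal usage sketch for the xdr_vector() and xdr_u_char() routines made available above. The helper name and the 16-byte key are hypothetical; each element is marshalled as a full XDR int, as the xdr_u_char() body shows:

static bool_t
xdr_example_key(XDR *xdrs, uchar_t key[16])
{
	/* fixed-length array: storage is caller-owned, nothing to free */
	return (xdr_vector(xdrs, (char *)key, 16, sizeof (uchar_t),
	    (xdrproc_t)xdr_u_char));
}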
diff --git a/external/cddl/osnet/dist/uts/common/sys/byteorder.h b/external/cddl/osnet/dist/uts/common/sys/byteorder.h index 2f4545c65da36..fd9f8a1d98289 100644 --- a/external/cddl/osnet/dist/uts/common/sys/byteorder.h +++ b/external/cddl/osnet/dist/uts/common/sys/byteorder.h @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -181,14 +181,13 @@ extern uint64_t ntohll(uint64_t); #define BE_IN32(xa) htonl(*((uint32_t *)(void *)(xa))) #endif /* !__i386 && !__amd64 */ -#if !defined(_LP64) && !defined(_LONGLONG_TYPE) -#if (!defined(__i386) && !defined(__amd64)) /* sparc */ +#if (!defined(__i386) && !defined(__amd64)) || \ + (!defined(_LP64) && !defined(_LONGLONG_TYPE)) #define BE_IN64(xa) \ (((uint64_t)BE_IN32(xa) << 32) | BE_IN32((uint8_t *)(xa) + 4)) #else /* x86 */ #define BE_IN64(xa) htonll(*((uint64_t *)(void *)(xa))) -#endif /* (!__i386 && !__amd64) */ -#endif /* !_LP64 && !_LONGLONG_TYPE */ +#endif /* (!__i386 && !__amd64) || (!_LP64 && !_LONGLONG_TYPE) */ #define LE_IN8(xa) \ *((uint8_t *)(xa)) diff --git a/external/cddl/osnet/dist/uts/common/sys/callb.h b/external/cddl/osnet/dist/uts/common/sys/callb.h index b548f4ca23b2e..302f314b800ae 100644 --- a/external/cddl/osnet/dist/uts/common/sys/callb.h +++ b/external/cddl/osnet/dist/uts/common/sys/callb.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_CALLB_H #define _SYS_CALLB_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include @@ -69,7 +66,8 @@ extern "C" { #define CB_CL_MDBOOT CB_CL_UADMIN #define CB_CL_ENTER_DEBUGGER 14 #define CB_CL_CPR_POST_KERNEL 15 -#define NCBCLASS 16 /* CHANGE ME if classes are added/removed */ +#define CB_CL_CPU_DEEP_IDLE 16 +#define NCBCLASS 17 /* CHANGE ME if classes are added/removed */ /* * CB_CL_CPR_DAEMON class specific definitions are given below: diff --git a/external/cddl/osnet/dist/uts/common/sys/cpupart.h b/external/cddl/osnet/dist/uts/common/sys/cpupart.h index b9e0da4e1993e..508637fa2680b 100644 --- a/external/cddl/osnet/dist/uts/common/sys/cpupart.h +++ b/external/cddl/osnet/dist/uts/common/sys/cpupart.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _SYS_CPUPART_H #define _SYS_CPUPART_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -58,16 +56,6 @@ typedef int cpupartid_t; #define CP_ALL 0 /* return all cpu partitions */ #define CP_NONEMPTY 1 /* return only non-empty ones */ -#if defined(_MACHDEP) -struct mach_cpupart { - cpuset_t mc_haltset; -}; - -extern struct mach_cpupart cp_default_mach; -#else -struct mach_cpupart; -#endif - typedef struct cpupart { disp_t cp_kp_queue; /* partition-wide kpreempt queue */ cpupartid_t cp_id; /* partition ID */ @@ -103,8 +91,7 @@ typedef struct cpupart { lgrp_gen_t cp_gen; /* generation number */ lgrp_id_t cp_lgrp_hint; /* last home lgroup chosen */ bitset_t cp_cmt_pgs; /* CMT PGs represented */ - - struct mach_cpupart *cp_mach; /* mach-specific */ + bitset_t cp_haltset; /* halted CPUs */ } cpupart_t; typedef struct cpupart_kstat { diff --git a/external/cddl/osnet/dist/uts/common/sys/cpuvar.h b/external/cddl/osnet/dist/uts/common/sys/cpuvar.h index 0a038e00d0e44..b52192b4197f0 100644 --- a/external/cddl/osnet/dist/uts/common/sys/cpuvar.h +++ b/external/cddl/osnet/dist/uts/common/sys/cpuvar.h @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -168,7 +168,7 @@ typedef struct cpu { ftrace_data_t cpu_ftrace; /* per cpu ftrace data */ - clock_t cpu_deadman_lbolt; /* used by deadman() */ + clock_t cpu_deadman_counter; /* used by deadman() */ uint_t cpu_deadman_countdown; /* used by deadman() */ kmutex_t cpu_cpc_ctxlock; /* protects context for idle thread */ @@ -211,12 +211,27 @@ typedef struct cpu { uint64_t cpu_curr_clock; /* current clock freq in Hz */ char *cpu_supp_freqs; /* supported freqs in Hz */ + uintptr_t cpu_cpcprofile_pc; /* kernel PC in cpc interrupt */ + uintptr_t cpu_cpcprofile_upc; /* user PC in cpc interrupt */ + /* * Interrupt load factor used by dispatcher & softcall */ hrtime_t cpu_intrlast; /* total interrupt time (nsec) */ int cpu_intrload; /* interrupt load factor (0-99%) */ + uint_t cpu_rotor; /* for cheap pseudo-random numbers */ + + struct cu_cpu_info *cpu_cu_info; /* capacity & util. info */ + + /* + * cpu_generation is updated whenever CPU goes on-line or off-line. + * Updates to cpu_generation are protected by cpu_lock. + * + * See CPU_NEW_GENERATION() macro below. + */ + volatile uint_t cpu_generation; /* tracking on/off-line */ + /* * New members must be added /before/ this member, as the CTF tools * rely on this being the last field before cpu_m, so they can @@ -238,12 +253,13 @@ typedef struct cpu { * is up to the platform to assure that this is performed properly. Note that * the structure is sized to avoid false sharing. */ -#define CPUC_SIZE (sizeof (uint16_t) + sizeof (uintptr_t) + \ - sizeof (kmutex_t)) +#define CPUC_SIZE (sizeof (uint16_t) + sizeof (uint8_t) + \ + sizeof (uintptr_t) + sizeof (kmutex_t)) #define CPUC_PADSIZE CPU_CACHE_COHERENCE_SIZE - CPUC_SIZE typedef struct cpu_core { uint16_t cpuc_dtrace_flags; /* DTrace flags */ + uint8_t cpuc_dcpc_intr_state; /* DCPC provider intr state */ uint8_t cpuc_pad[CPUC_PADSIZE]; /* padding */ uintptr_t cpuc_dtrace_illval; /* DTrace illegal value */ kmutex_t cpuc_pid_lock; /* DTrace pid provider lock */ @@ -261,6 +277,28 @@ extern cpu_core_t cpu_core[]; */ #define CPU_ON_INTR(cpup) ((cpup)->cpu_intr_actv >> (LOCK_LEVEL + 1)) +/* + * Check to see if an interrupt thread might be active at a given ipl. + * If so return true. 
+ * We must be conservative--it is ok to give a false yes, but a false no + * will cause disaster. (But if the situation changes after we check it is + * ok--the caller is trying to ensure that an interrupt routine has been + * exited). + * This is used when trying to remove an interrupt handler from an autovector + * list in avintr.c. + */ +#define INTR_ACTIVE(cpup, level) \ + ((level) <= LOCK_LEVEL ? \ + ((cpup)->cpu_intr_actv & (1 << (level))) : (CPU_ON_INTR(cpup))) + +/* + * CPU_PSEUDO_RANDOM() returns a per CPU value that changes each time one + * looks at it. It's meant as a cheap mechanism to be incorporated in routines + * wanting to avoid biasing, but where true randomness isn't needed (just + * something that changes). + */ +#define CPU_PSEUDO_RANDOM() (CPU->cpu_rotor++) + #if defined(_KERNEL) || defined(_KMEMUSER) #define INTR_STACK_SIZE MAX(DEFAULTSTKSZ, PAGESIZE) @@ -352,7 +390,6 @@ extern cpu_core_t cpu_core[]; #define CPU_DISP_DONTSTEAL 0x01 /* CPU undergoing context swtch */ #define CPU_DISP_HALTED 0x02 /* CPU halted waiting for interrupt */ - #endif /* _KERNEL || _KMEMUSER */ #if (defined(_KERNEL) || defined(_KMEMUSER)) && defined(_MACHDEP) @@ -516,6 +553,7 @@ extern cpuset_t cpu_seqid_inuse; #if defined(_KERNEL) || defined(_KMEMUSER) extern struct cpu *cpu[]; /* indexed by CPU number */ +extern struct cpu **cpu_seq; /* indexed by sequential CPU id */ extern cpu_t *cpu_list; /* list of CPUs */ extern cpu_t *cpu_active; /* list of active CPUs */ extern int ncpus; /* number of CPUs present */ @@ -569,6 +607,13 @@ extern struct cpu *curcpup(void); #define CPU_STATS(cp, stat) \ ((cp)->cpu_stats.stat) +/* + * Increment CPU generation value. + * This macro should be called whenever CPU goes on-line or off-line. + * Updates to cpu_generation should be protected by cpu_lock. + */ +#define CPU_NEW_GENERATION(cp) ((cp)->cpu_generation++) + #endif /* _KERNEL || _KMEMUSER */ /* @@ -658,6 +703,7 @@ int cpu_get_state(cpu_t *); /* get current cpu state */ const char *cpu_get_state_str(cpu_t *); /* get current cpu state as string */ +void cpu_set_curr_clock(uint64_t); /* indicate the current CPU's freq */ void cpu_set_supp_freqs(cpu_t *, const char *); /* set the CPU supported */ /* frequencies */ @@ -697,6 +743,49 @@ void cpu_enable_intr(struct cpu *cp); /* start issuing interrupts to cpu */ */ extern kmutex_t cpu_lock; /* lock protecting CPU data */ +/* + * CPU state change events + * + * Various subsystems need to know when CPUs change their state. They get this + * information by registering CPU state change callbacks using + * register_cpu_setup_func(). Whenever any CPU changes its state, the callback + * function is called. The callback function is passed three arguments: + * + * Event, described by cpu_setup_t + * CPU ID + * Transparent pointer passed when registering the callback + * + * The callback function is called with cpu_lock held. The return value from the + * callback function is usually ignored, except for CPU_CONFIG and CPU_UNCONFIG + * events. For these two events, non-zero return value indicates a failure and + * prevents successful completion of the operation. + * + * New events may be added in the future. Callback functions should ignore any + * events that they do not understand. 
+ * + * The following events provide notification callbacks: + * + * CPU_INIT A new CPU is started and added to the list of active CPUs + * This event is only used during boot + * + * CPU_CONFIG A newly inserted CPU is prepared for starting running code + * This event is called by DR code + * + * CPU_UNCONFIG CPU has been powered off and needs cleanup + * This event is called by DR code + * + * CPU_ON CPU is enabled but does not run anything yet + * + * CPU_INTR_ON CPU is enabled and has interrupts enabled + * + * CPU_OFF CPU is going offline but can still run threads + * + * CPU_CPUPART_OUT CPU is going to move out of its partition + * + * CPU_CPUPART_IN CPU is going to move to a new partition + * + * CPU_SETUP CPU is set up during boot and can run threads + */ typedef enum { CPU_INIT, CPU_CONFIG, @@ -704,7 +793,9 @@ typedef enum { CPU_ON, CPU_OFF, CPU_CPUPART_IN, - CPU_CPUPART_OUT + CPU_CPUPART_OUT, + CPU_SETUP, + CPU_INTR_ON } cpu_setup_t; typedef int cpu_setup_func_t(cpu_setup_t, int, void *); @@ -717,6 +808,13 @@ extern void register_cpu_setup_func(cpu_setup_func_t *, void *); extern void unregister_cpu_setup_func(cpu_setup_func_t *, void *); extern void cpu_state_change_notify(int, cpu_setup_t); +/* + * Call specified function on the given CPU + */ +typedef void (*cpu_call_func_t)(uintptr_t, uintptr_t); +extern void cpu_call(cpu_t *, cpu_call_func_t, uintptr_t, uintptr_t); + + /* * Create various strings that describe the given CPU for the * processor_info system call and configuration-related kstats. diff --git a/external/cddl/osnet/dist/uts/common/sys/cred.h b/external/cddl/osnet/dist/uts/common/sys/cred.h index e84f1e04305dd..5056f9a511053 100644 --- a/external/cddl/osnet/dist/uts/common/sys/cred.h +++ b/external/cddl/osnet/dist/uts/common/sys/cred.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -34,8 +34,6 @@ #ifndef _SYS_CRED_H #define _SYS_CRED_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #ifdef __cplusplus @@ -58,6 +56,7 @@ struct prcred; struct ksid; struct ksidlist; struct credklpd; +struct credgrp; struct auditinfo_addr; /* cred.h is included in audit.h */ @@ -79,6 +78,7 @@ extern cred_t *crdup(cred_t *); extern void crdup_to(cred_t *, cred_t *); extern cred_t *crgetcred(void); extern void crset(struct proc *, cred_t *); +extern void crset_zone_privall(cred_t *); extern int groupmember(gid_t, const cred_t *); extern int supgroupmember(gid_t, const cred_t *); extern int hasprocperm(const cred_t *, const cred_t *); @@ -104,6 +104,7 @@ extern struct auditinfo_addr *crgetauinfo_modifiable(cred_t *); extern uint_t crgetref(const cred_t *); extern const gid_t *crgetgroups(const cred_t *); +extern const gid_t *crgetggroups(const struct credgrp *); extern int crgetngroups(const cred_t *); @@ -120,7 +121,13 @@ extern int crsetresgid(cred_t *, gid_t, gid_t, gid_t); */ extern int crsetugid(cred_t *, uid_t, gid_t); +/* + * Functions to handle the supplemental group list. + */ extern int crsetgroups(cred_t *, int, gid_t *); +extern struct credgrp *crgrpcopyin(int, gid_t *); +extern void crgrprele(struct credgrp *); +extern void crsetcredgrp(cred_t *, struct credgrp *); /* * Private interface for setting zone association of credential. 
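The cpuvar.h hunk above spells out the CPU state-change callback contract: a callback registered with register_cpu_setup_func() receives the event (cpu_setup_t), the CPU id, and the opaque argument supplied at registration, is invoked with cpu_lock held, and may veto CPU_CONFIG/CPU_UNCONFIG by returning non-zero. A minimal consumer might look like the sketch below; it is illustrative only and not part of the patch, my_cpu_alloc()/my_cpu_free() are hypothetical per-CPU bookkeeping hooks, and the usual illumos/Solaris kernel environment is assumed.

	#include <sys/types.h>
	#include <sys/mutex.h>
	#include <sys/cpuvar.h>

	/* Hypothetical per-CPU bookkeeping hooks, for illustration only. */
	extern int  my_cpu_alloc(int);		/* returns non-zero on failure */
	extern void my_cpu_free(int);

	/*
	 * cpu_setup_func_t callback: called with cpu_lock held.  A non-zero
	 * return from the CPU_CONFIG/CPU_UNCONFIG cases aborts the
	 * (un)configuration; the return value is ignored for other events.
	 */
	static int
	my_cpu_setup(cpu_setup_t what, int cpuid, void *arg)
	{
		switch (what) {
		case CPU_CONFIG:
			if (my_cpu_alloc(cpuid) != 0)
				return (-1);	/* veto the configuration */
			break;
		case CPU_UNCONFIG:
			my_cpu_free(cpuid);
			break;
		default:
			break;			/* ignore unknown events */
		}
		return (0);
	}

	static void
	my_subsystem_cpu_init(void)
	{
		/* Registration is conventionally done under cpu_lock. */
		mutex_enter(&cpu_lock);
		register_cpu_setup_func(my_cpu_setup, NULL);
		mutex_exit(&cpu_lock);
	}

Unregistration is symmetric via unregister_cpu_setup_func(); per the comment in the hunk, callbacks should ignore events they do not recognize, since new cpu_setup_t values (such as CPU_SETUP and CPU_INTR_ON added here) may appear over time.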
diff --git a/external/cddl/osnet/dist/uts/common/sys/debug.h b/external/cddl/osnet/dist/uts/common/sys/debug.h index c156e7c463c3b..4de39d255e71e 100644 --- a/external/cddl/osnet/dist/uts/common/sys/debug.h +++ b/external/cddl/osnet/dist/uts/common/sys/debug.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,21 +19,19 @@ * CDDL HEADER END */ /* - * Copyright 2005 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */ /* All Rights Reserved */ - #ifndef _SYS_DEBUG_H #define _SYS_DEBUG_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include +#include #ifdef __cplusplus extern "C" { @@ -51,7 +48,7 @@ extern "C" { extern int assfail(const char *, const char *, int); #define VERIFY(EX) ((void)((EX) || assfail(#EX, __FILE__, __LINE__))) #if DEBUG -#define ASSERT(EX) VERIFY(EX) +#define ASSERT(EX) ((void)((EX) || assfail(#EX, __FILE__, __LINE__))) #else #define ASSERT(x) ((void)0) #endif @@ -59,7 +56,7 @@ extern int assfail(const char *, const char *, int); extern int assfail(); #define VERIFY(EX) ((void)((EX) || assfail("EX", __FILE__, __LINE__))) #if DEBUG -#define ASSERT(EX) VERIFY(EX) +#define ASSERT(EX) ((void)((EX) || assfail("EX", __FILE__, __LINE__))) #else #define ASSERT(x) ((void)0) #endif @@ -76,6 +73,25 @@ extern int assfail(); #define ASSERT32(x) ASSERT(x) #endif +/* + * IMPLY and EQUIV are assertions of the form: + * + * if (a) then (b) + * and + * if (a) then (b) *AND* if (b) then (a) + */ +#if DEBUG +#define IMPLY(A, B) \ + ((void)(((!(A)) || (B)) || \ + assfail("(" #A ") implies (" #B ")", __FILE__, __LINE__))) +#define EQUIV(A, B) \ + ((void)((!!(A) == !!(B)) || \ + assfail("(" #A ") is equivalent to (" #B ")", __FILE__, __LINE__))) +#else +#define IMPLY(A, B) ((void)0) +#define EQUIV(A, B) ((void)0) +#endif + /* * ASSERT3() behaves like ASSERT() except that it is an explicit conditional, * and prints out the values of the left and right hand expressions as part of @@ -99,9 +115,9 @@ _NOTE(CONSTCOND) } while (0) #define VERIFY3U(x, y, z) VERIFY3_IMPL(x, y, z, uint64_t) #define VERIFY3P(x, y, z) VERIFY3_IMPL(x, y, z, uintptr_t) #if DEBUG -#define ASSERT3S(x, y, z) VERIFY3S(x, y, z) -#define ASSERT3U(x, y, z) VERIFY3U(x, y, z) -#define ASSERT3P(x, y, z) VERIFY3P(x, y, z) +#define ASSERT3S(x, y, z) VERIFY3_IMPL(x, y, z, int64_t) +#define ASSERT3U(x, y, z) VERIFY3_IMPL(x, y, z, uint64_t) +#define ASSERT3P(x, y, z) VERIFY3_IMPL(x, y, z, uintptr_t) #else #define ASSERT3S(x, y, z) ((void)0) #define ASSERT3U(x, y, z) ((void)0) diff --git a/external/cddl/osnet/dist/uts/common/sys/dklabel.h b/external/cddl/osnet/dist/uts/common/sys/dklabel.h index 01baa7157cafc..457c1ecadc938 100644 --- a/external/cddl/osnet/dist/uts/common/sys/dklabel.h +++ b/external/cddl/osnet/dist/uts/common/sys/dklabel.h @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -207,7 +207,7 @@ struct dk_label { uint16_t dkl_ncyl; /* # of data cylinders */ uint16_t dkl_acyl; /* # of alternate cylinders */ uint16_t dkl_nhead; /* # of heads in this partition */ - uint16_t dkl_nsect; /* # of 512 byte sectors per track */ + uint16_t dkl_nsect; /* # of sectors per track */ uint16_t dkl_obs3; /* obsolete */ uint16_t dkl_obs4; /* obsolete */ struct dk_map32 dkl_map[NDKMAP]; /* logical partition headers */ diff --git a/external/cddl/osnet/dist/uts/common/sys/dtrace.h b/external/cddl/osnet/dist/uts/common/sys/dtrace.h index b6e52ec1c4da5..007502d7d8562 100644 --- a/external/cddl/osnet/dist/uts/common/sys/dtrace.h +++ b/external/cddl/osnet/dist/uts/common/sys/dtrace.h @@ -20,15 +20,13 @@ */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_DTRACE_H #define _SYS_DTRACE_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -664,6 +662,20 @@ typedef struct dof_sec { #define DOF_SECF_LOAD 1 /* section should be loaded */ +#define DOF_SEC_ISLOADABLE(x) \ + (((x) == DOF_SECT_ECBDESC) || ((x) == DOF_SECT_PROBEDESC) || \ + ((x) == DOF_SECT_ACTDESC) || ((x) == DOF_SECT_DIFOHDR) || \ + ((x) == DOF_SECT_DIF) || ((x) == DOF_SECT_STRTAB) || \ + ((x) == DOF_SECT_VARTAB) || ((x) == DOF_SECT_RELTAB) || \ + ((x) == DOF_SECT_TYPTAB) || ((x) == DOF_SECT_URELHDR) || \ + ((x) == DOF_SECT_KRELHDR) || ((x) == DOF_SECT_OPTDESC) || \ + ((x) == DOF_SECT_PROVIDER) || ((x) == DOF_SECT_PROBES) || \ + ((x) == DOF_SECT_PRARGS) || ((x) == DOF_SECT_PROFFS) || \ + ((x) == DOF_SECT_INTTAB) || ((x) == DOF_SECT_XLTAB) || \ + ((x) == DOF_SECT_XLMEMBERS) || ((x) == DOF_SECT_XLIMPORT) || \ + ((x) == DOF_SECT_XLIMPORT) || ((x) == DOF_SECT_XLEXPORT) || \ + ((x) == DOF_SECT_PREXPORT) || ((x) == DOF_SECT_PRENOFFS)) + typedef struct dof_ecbdesc { dof_secidx_t dofe_probes; /* link to DOF_SECT_PROBEDESC */ dof_secidx_t dofe_pred; /* link to DOF_SECT_DIFOHDR */ @@ -1382,7 +1394,7 @@ typedef struct dof_helper { * dtps_provide_module(); see "Arguments and Notes" for dtrace_register(), * below. * - * 1.4 void dtps_enable(void *arg, dtrace_id_t id, void *parg) + * 1.4 int dtps_enable(void *arg, dtrace_id_t id, void *parg) * * 1.4.1 Overview * @@ -1403,7 +1415,8 @@ typedef struct dof_helper { * * 1.4.3 Return value * - * None. + * On success, dtps_enable() should return 0. On failure, -1 should be + * returned. * * 1.4.4 Caller's context * @@ -1957,7 +1970,7 @@ typedef struct dof_helper { typedef struct dtrace_pops { void (*dtps_provide)(void *arg, const dtrace_probedesc_t *spec); void (*dtps_provide_module)(void *arg, struct modctl *mp); - void (*dtps_enable)(void *arg, dtrace_id_t id, void *parg); + int (*dtps_enable)(void *arg, dtrace_id_t id, void *parg); void (*dtps_disable)(void *arg, dtrace_id_t id, void *parg); void (*dtps_suspend)(void *arg, dtrace_id_t id, void *parg); void (*dtps_resume)(void *arg, dtrace_id_t id, void *parg); diff --git a/external/cddl/osnet/dist/uts/common/sys/fm/fs/zfs.h b/external/cddl/osnet/dist/uts/common/sys/fm/fs/zfs.h index 66ca9c5d7108b..c752edc99bbd9 100644 --- a/external/cddl/osnet/dist/uts/common/sys/fm/fs/zfs.h +++ b/external/cddl/osnet/dist/uts/common/sys/fm/fs/zfs.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _SYS_FM_FS_ZFS_H #define _SYS_FM_FS_ZFS_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -57,6 +55,7 @@ extern "C" { #define FM_EREPORT_PAYLOAD_ZFS_VDEV_TYPE "vdev_type" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH "vdev_path" #define FM_EREPORT_PAYLOAD_ZFS_VDEV_DEVID "vdev_devid" +#define FM_EREPORT_PAYLOAD_ZFS_VDEV_FRU "vdev_fru" #define FM_EREPORT_PAYLOAD_ZFS_PARENT_GUID "parent_guid" #define FM_EREPORT_PAYLOAD_ZFS_PARENT_TYPE "parent_type" #define FM_EREPORT_PAYLOAD_ZFS_PARENT_PATH "parent_path" @@ -69,6 +68,18 @@ extern "C" { #define FM_EREPORT_PAYLOAD_ZFS_ZIO_OFFSET "zio_offset" #define FM_EREPORT_PAYLOAD_ZFS_ZIO_SIZE "zio_size" #define FM_EREPORT_PAYLOAD_ZFS_PREV_STATE "prev_state" +#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_EXPECTED "cksum_expected" +#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_ACTUAL "cksum_actual" +#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_ALGO "cksum_algorithm" +#define FM_EREPORT_PAYLOAD_ZFS_CKSUM_BYTESWAP "cksum_byteswap" +#define FM_EREPORT_PAYLOAD_ZFS_BAD_OFFSET_RANGES "bad_ranges" +#define FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_MIN_GAP "bad_ranges_min_gap" +#define FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_SETS "bad_range_sets" +#define FM_EREPORT_PAYLOAD_ZFS_BAD_RANGE_CLEARS "bad_range_clears" +#define FM_EREPORT_PAYLOAD_ZFS_BAD_SET_BITS "bad_set_bits" +#define FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_BITS "bad_cleared_bits" +#define FM_EREPORT_PAYLOAD_ZFS_BAD_SET_HISTOGRAM "bad_set_histogram" +#define FM_EREPORT_PAYLOAD_ZFS_BAD_CLEARED_HISTOGRAM "bad_cleared_histogram" #define FM_EREPORT_FAILMODE_WAIT "wait" #define FM_EREPORT_FAILMODE_CONTINUE "continue" @@ -76,6 +87,7 @@ extern "C" { #define FM_RESOURCE_REMOVED "removed" #define FM_RESOURCE_AUTOREPLACE "autoreplace" +#define FM_RESOURCE_STATECHANGE "statechange" #ifdef __cplusplus } diff --git a/external/cddl/osnet/dist/uts/common/sys/fm/protocol.h b/external/cddl/osnet/dist/uts/common/sys/fm/protocol.h index 388951bfce65f..fbf614caa20e9 100644 --- a/external/cddl/osnet/dist/uts/common/sys/fm/protocol.h +++ b/external/cddl/osnet/dist/uts/common/sys/fm/protocol.h @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ @@ -47,6 +47,7 @@ extern "C" { /* FM event class values */ #define FM_EREPORT_CLASS "ereport" #define FM_FAULT_CLASS "fault" +#define FM_DEFECT_CLASS "defect" #define FM_RSRC_CLASS "resource" #define FM_LIST_EVENT "list" @@ -83,9 +84,11 @@ extern "C" { #define FM_SUSPECT_FAULT_LIST "fault-list" #define FM_SUSPECT_FAULT_SZ "fault-list-sz" #define FM_SUSPECT_FAULT_STATUS "fault-status" +#define FM_SUSPECT_INJECTED "__injected" #define FM_SUSPECT_MESSAGE "message" #define FM_SUSPECT_RETIRE "retire" #define FM_SUSPECT_RESPONSE "response" +#define FM_SUSPECT_SEVERITY "severity" #define FM_SUSPECT_VERS0 0 #define FM_SUSPECT_VERSION FM_SUSPECT_VERS0 @@ -121,6 +124,7 @@ extern "C" { #define FM_RSRC_ASRU_REPAIRED "repaired" #define FM_RSRC_ASRU_REPLACED "replaced" #define FM_RSRC_ASRU_ACQUITTED "acquitted" +#define FM_RSRC_ASRU_RESOLVED "resolved" #define FM_RSRC_ASRU_UNUSABLE "unusable" #define FM_RSRC_ASRU_EVENT "event" @@ -129,6 +133,8 @@ extern "C" { #define FM_RSRC_XPRT_VERSION FM_RSRC_XPRT_VERS0 #define FM_RSRC_XPRT_UUID "uuid" #define FM_RSRC_XPRT_SUBCLASS "subclass" +#define FM_RSRC_XPRT_FAULT_STATUS "fault-status" +#define FM_RSRC_XPRT_FAULT_HAS_ASRU "fault-has-asru" /* * FM ENA Format Macros @@ -167,6 +173,7 @@ extern "C" { /* FMRI authority-type member names */ #define FM_FMRI_AUTH_CHASSIS "chassis-id" +#define FM_FMRI_AUTH_PRODUCT_SN "product-sn" #define FM_FMRI_AUTH_PRODUCT "product-id" #define FM_FMRI_AUTH_DOMAIN "domain-id" #define FM_FMRI_AUTH_SERVER "server-id" @@ -205,6 +212,8 @@ extern "C" { #define FM_PKG_SCHEME_VERSION PKG_SCHEME_VERSION0 #define LEGACY_SCHEME_VERSION0 0 #define FM_LEGACY_SCHEME_VERSION LEGACY_SCHEME_VERSION0 +#define SVC_SCHEME_VERSION0 0 +#define FM_SVC_SCHEME_VERSION SVC_SCHEME_VERSION0 #define ZFS_SCHEME_VERSION0 0 #define FM_ZFS_SCHEME_VERSION ZFS_SCHEME_VERSION0 @@ -246,14 +255,13 @@ extern "C" { #define FM_FMRI_PKG_VERSION "pkg-version" /* svc scheme member names */ -#define FM_FMRI_SVC_NAME "service-name" -#define FM_FMRI_SVC_VERSION "service-version" -#define FM_FMRI_SVC_INSTANCE "instance" -#define FM_FMRI_SVC_CONTRACT_ID "contract-id" +#define FM_FMRI_SVC_NAME "svc-name" +#define FM_FMRI_SVC_INSTANCE "svc-instance" +#define FM_FMRI_SVC_CONTRACT_ID "svc-contract-id" /* svc-authority member names */ #define FM_FMRI_SVC_AUTH_SCOPE "scope" -#define FM_FMRI_SVC_AUTH_SYSTEM_FQN "system-FQN" +#define FM_FMRI_SVC_AUTH_SYSTEM_FQN "system-fqn" /* cpu scheme member names */ #define FM_FMRI_CPU_ID "cpuid" @@ -316,6 +324,8 @@ extern void fm_fmri_mem_set(nvlist_t *, int, const nvlist_t *, const char *, extern void fm_authority_set(nvlist_t *, int, const char *, const char *, const char *, const char *); extern void fm_fmri_zfs_set(nvlist_t *, int, uint64_t, uint64_t); +extern void fm_fmri_hc_create(nvlist_t *, int, const nvlist_t *, nvlist_t *, + nvlist_t *, int, ...); extern uint64_t fm_ena_increment(uint64_t); extern uint64_t fm_ena_generate(uint64_t, uchar_t); diff --git a/external/cddl/osnet/dist/uts/common/sys/fs/zfs.h b/external/cddl/osnet/dist/uts/common/sys/fs/zfs.h index 95f04d842efa1..e986759a2d3eb 100644 --- a/external/cddl/osnet/dist/uts/common/sys/fs/zfs.h +++ b/external/cddl/osnet/dist/uts/common/sys/fs/zfs.h @@ -18,14 +18,17 @@ * * CDDL HEADER END */ + /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _SYS_FS_ZFS_H #define _SYS_FS_ZFS_H +#include + #ifdef __cplusplus extern "C" { #endif @@ -48,6 +51,10 @@ typedef enum { #define ZFS_TYPE_DATASET \ (ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME | ZFS_TYPE_SNAPSHOT) +#define ZAP_MAXNAMELEN 256 +#define ZAP_MAXVALUELEN (1024 * 8) +#define ZAP_OLDMAXVALUELEN 1024 + /* * Dataset properties are identified by these constants and must be added to * the end of this list to ensure that external consumers are not affected @@ -105,9 +112,28 @@ typedef enum { ZFS_PROP_USEDDS, ZFS_PROP_USEDCHILD, ZFS_PROP_USEDREFRESERV, + ZFS_PROP_USERACCOUNTING, /* not exposed to the user */ + ZFS_PROP_STMF_SHAREINFO, /* not exposed to the user */ + ZFS_PROP_DEFER_DESTROY, + ZFS_PROP_USERREFS, + ZFS_PROP_LOGBIAS, + ZFS_PROP_UNIQUE, /* not exposed to the user */ + ZFS_PROP_OBJSETID, /* not exposed to the user */ + ZFS_PROP_DEDUP, + ZFS_PROP_MLSLABEL, ZFS_NUM_PROPS } zfs_prop_t; +typedef enum { + ZFS_PROP_USERUSED, + ZFS_PROP_USERQUOTA, + ZFS_PROP_GROUPUSED, + ZFS_PROP_GROUPQUOTA, + ZFS_NUM_USERQUOTA_PROPS +} zfs_userquota_prop_t; + +extern const char *zfs_userquota_prop_prefixes[ZFS_NUM_USERQUOTA_PROPS]; + /* * Pool properties are identified by these constants and must be added to the * end of this list to ensure that external consumers are not affected @@ -117,8 +143,6 @@ typedef enum { typedef enum { ZPOOL_PROP_NAME, ZPOOL_PROP_SIZE, - ZPOOL_PROP_USED, - ZPOOL_PROP_AVAILABLE, ZPOOL_PROP_CAPACITY, ZPOOL_PROP_ALTROOT, ZPOOL_PROP_HEALTH, @@ -130,6 +154,11 @@ typedef enum { ZPOOL_PROP_CACHEFILE, ZPOOL_PROP_FAILUREMODE, ZPOOL_PROP_LISTSNAPS, + ZPOOL_PROP_AUTOEXPAND, + ZPOOL_PROP_DEDUPDITTO, + ZPOOL_PROP_DEDUPRATIO, + ZPOOL_PROP_FREE, + ZPOOL_PROP_ALLOCATED, ZPOOL_NUM_PROPS } zpool_prop_t; @@ -144,10 +173,27 @@ typedef enum { ZPROP_SRC_DEFAULT = 0x2, ZPROP_SRC_TEMPORARY = 0x4, ZPROP_SRC_LOCAL = 0x8, - ZPROP_SRC_INHERITED = 0x10 + ZPROP_SRC_INHERITED = 0x10, + ZPROP_SRC_RECEIVED = 0x20 } zprop_source_t; -#define ZPROP_SRC_ALL 0x1f +#define ZPROP_SRC_ALL 0x3f + +#define ZPROP_SOURCE_VAL_RECVD "$recvd" +#define ZPROP_N_MORE_ERRORS "N_MORE_ERRORS" +/* + * Dataset flag implemented as a special entry in the props zap object + * indicating that the dataset has received properties on or after + * SPA_VERSION_RECVD_PROPS. The first such receive blows away local properties + * just as it did in earlier versions, and thereafter, local properties are + * preserved. + */ +#define ZPROP_HAS_RECVD "$hasrecvd" + +typedef enum { + ZPROP_ERR_NOCLEAR = 0x1, /* failure to clear existing props */ + ZPROP_ERR_NORESTORE = 0x2 /* failure to restore props on error */ +} zprop_errflags_t; typedef int (*zprop_func)(int, void *); @@ -169,8 +215,10 @@ boolean_t zfs_prop_setonce(zfs_prop_t); const char *zfs_prop_to_name(zfs_prop_t); zfs_prop_t zfs_name_to_prop(const char *); boolean_t zfs_prop_user(const char *); +boolean_t zfs_prop_userquota(const char *); int zfs_prop_index_to_string(zfs_prop_t, uint64_t, const char **); int zfs_prop_string_to_index(zfs_prop_t, const char *, uint64_t *); +uint64_t zfs_prop_random_value(zfs_prop_t, uint64_t seed); boolean_t zfs_prop_valid_for_type(int, zfs_type_t); /* @@ -183,6 +231,7 @@ uint64_t zpool_prop_default_numeric(zpool_prop_t); boolean_t zpool_prop_readonly(zpool_prop_t); int zpool_prop_index_to_string(zpool_prop_t, uint64_t, const char **); int zpool_prop_string_to_index(zpool_prop_t, const char *, uint64_t *); +uint64_t zpool_prop_random_value(zpool_prop_t, uint64_t seed); /* * Definitions for the Delegation. 
@@ -213,12 +262,22 @@ typedef enum { #define ZFS_DELEG_PERM_GID "gid" #define ZFS_DELEG_PERM_GROUPS "groups" +#define ZFS_MLSLABEL_DEFAULT "none" + +#define ZFS_SMB_ACL_SRC "src" +#define ZFS_SMB_ACL_TARGET "target" + typedef enum { ZFS_CANMOUNT_OFF = 0, ZFS_CANMOUNT_ON = 1, ZFS_CANMOUNT_NOAUTO = 2 } zfs_canmount_type_t; +typedef enum { + ZFS_LOGBIAS_LATENCY = 0, + ZFS_LOGBIAS_THROUGHPUT = 1 +} zfs_logbias_op_t; + typedef enum zfs_share_op { ZFS_SHARE_NFS = 0, ZFS_UNSHARE_NFS = 1, @@ -226,6 +285,13 @@ typedef enum zfs_share_op { ZFS_UNSHARE_SMB = 3 } zfs_share_op_t; +typedef enum zfs_smb_acl_op { + ZFS_SMB_ACL_ADD, + ZFS_SMB_ACL_REMOVE, + ZFS_SMB_ACL_RENAME, + ZFS_SMB_ACL_PURGE +} zfs_smb_acl_op_t; + typedef enum zfs_cache_type { ZFS_CACHE_NONE = 0, ZFS_CACHE_METADATA = 1, @@ -250,13 +316,23 @@ typedef enum zfs_cache_type { #define SPA_VERSION_12 12ULL #define SPA_VERSION_13 13ULL #define SPA_VERSION_14 14ULL +#define SPA_VERSION_15 15ULL +#define SPA_VERSION_16 16ULL +#define SPA_VERSION_17 17ULL +#define SPA_VERSION_18 18ULL +#define SPA_VERSION_19 19ULL +#define SPA_VERSION_20 20ULL +#define SPA_VERSION_21 21ULL +#define SPA_VERSION_22 22ULL +#define SPA_VERSION_23 23ULL /* * When bumping up SPA_VERSION, make sure GRUB ZFS understands the on-disk - * format change. Go to usr/src/grub/grub-0.95/stage2/{zfs-include/, fsys_zfs*}, - * and do the appropriate changes. + * format change. Go to usr/src/grub/grub-0.97/stage2/{zfs-include/, fsys_zfs*}, + * and do the appropriate changes. Also bump the version number in + * usr/src/grub/capability. */ -#define SPA_VERSION SPA_VERSION_14 -#define SPA_VERSION_STRING "14" +#define SPA_VERSION SPA_VERSION_23 +#define SPA_VERSION_STRING "23" /* * Symbolic names for the changes that caused a SPA_VERSION switch. @@ -272,7 +348,7 @@ typedef enum zfs_cache_type { #define SPA_VERSION_INITIAL SPA_VERSION_1 #define SPA_VERSION_DITTO_BLOCKS SPA_VERSION_2 #define SPA_VERSION_SPARES SPA_VERSION_3 -#define SPA_VERSION_RAID6 SPA_VERSION_3 +#define SPA_VERSION_RAIDZ2 SPA_VERSION_3 #define SPA_VERSION_BPLIST_ACCOUNT SPA_VERSION_3 #define SPA_VERSION_RAIDZ_DEFLATE SPA_VERSION_3 #define SPA_VERSION_DNODE_BYTES SPA_VERSION_3 @@ -292,6 +368,15 @@ typedef enum zfs_cache_type { #define SPA_VERSION_SNAP_PROPS SPA_VERSION_12 #define SPA_VERSION_USED_BREAKDOWN SPA_VERSION_13 #define SPA_VERSION_PASSTHROUGH_X SPA_VERSION_14 +#define SPA_VERSION_USERSPACE SPA_VERSION_15 +#define SPA_VERSION_STMF_PROP SPA_VERSION_16 +#define SPA_VERSION_RAIDZ3 SPA_VERSION_17 +#define SPA_VERSION_USERREFS SPA_VERSION_18 +#define SPA_VERSION_HOLES SPA_VERSION_19 +#define SPA_VERSION_ZLE_COMPRESSION SPA_VERSION_20 +#define SPA_VERSION_DEDUP SPA_VERSION_21 +#define SPA_VERSION_RECVD_PROPS SPA_VERSION_22 +#define SPA_VERSION_SLIM_ZIL SPA_VERSION_23 /* * ZPL version - rev'd whenever an incompatible on-disk format change @@ -299,19 +384,37 @@ typedef enum zfs_cache_type { * also update the version_table[] and help message in zfs_prop.c. * * When changing, be sure to teach GRUB how to read the new format! 
- * See usr/src/grub/grub-0.95/stage2/{zfs-include/,fsys_zfs*} + * See usr/src/grub/grub-0.97/stage2/{zfs-include/,fsys_zfs*} */ #define ZPL_VERSION_1 1ULL #define ZPL_VERSION_2 2ULL #define ZPL_VERSION_3 3ULL -#define ZPL_VERSION ZPL_VERSION_3 -#define ZPL_VERSION_STRING "3" +#define ZPL_VERSION_4 4ULL +#define ZPL_VERSION ZPL_VERSION_4 +#define ZPL_VERSION_STRING "4" #define ZPL_VERSION_INITIAL ZPL_VERSION_1 #define ZPL_VERSION_DIRENT_TYPE ZPL_VERSION_2 #define ZPL_VERSION_FUID ZPL_VERSION_3 #define ZPL_VERSION_NORMALIZATION ZPL_VERSION_3 #define ZPL_VERSION_SYSATTR ZPL_VERSION_3 +#define ZPL_VERSION_USERSPACE ZPL_VERSION_4 + +/* Rewind request information */ +#define ZPOOL_NO_REWIND 1 /* No policy - default behavior */ +#define ZPOOL_NEVER_REWIND 2 /* Do not search for best txg or rewind */ +#define ZPOOL_TRY_REWIND 4 /* Search for best txg, but do not rewind */ +#define ZPOOL_DO_REWIND 8 /* Rewind to best txg w/in deferred frees */ +#define ZPOOL_EXTREME_REWIND 16 /* Allow extreme measures to find best txg */ +#define ZPOOL_REWIND_MASK 28 /* All the possible rewind bits */ +#define ZPOOL_REWIND_POLICIES 31 /* All the possible policy bits */ + +typedef struct zpool_rewind_policy { + uint32_t zrp_request; /* rewind behavior requested */ + uint64_t zrp_maxmeta; /* max acceptable meta-data errors */ + uint64_t zrp_maxdata; /* max acceptable data errors */ + uint64_t zrp_txg; /* specific txg to load */ +} zpool_rewind_policy_t; /* * The following are configuration names used in the nvlist describing a pool's @@ -349,6 +452,16 @@ typedef enum zfs_cache_type { #define ZPOOL_CONFIG_PHYS_PATH "phys_path" #define ZPOOL_CONFIG_IS_LOG "is_log" #define ZPOOL_CONFIG_L2CACHE "l2cache" +#define ZPOOL_CONFIG_HOLE_ARRAY "hole_array" +#define ZPOOL_CONFIG_VDEV_CHILDREN "vdev_children" +#define ZPOOL_CONFIG_IS_HOLE "is_hole" +#define ZPOOL_CONFIG_DDT_HISTOGRAM "ddt_histogram" +#define ZPOOL_CONFIG_DDT_OBJ_STATS "ddt_object_stats" +#define ZPOOL_CONFIG_DDT_STATS "ddt_stats" +#define ZPOOL_CONFIG_SPLIT "splitcfg" +#define ZPOOL_CONFIG_ORIG_GUID "orig_guid" +#define ZPOOL_CONFIG_SPLIT_GUID "split_guid" +#define ZPOOL_CONFIG_SPLIT_LIST "guid_list" #define ZPOOL_CONFIG_SUSPENDED "suspended" /* not stored on disk */ #define ZPOOL_CONFIG_TIMESTAMP "timestamp" /* not stored on disk */ #define ZPOOL_CONFIG_BOOTFS "bootfs" /* not stored on disk */ @@ -361,6 +474,20 @@ typedef enum zfs_cache_type { #define ZPOOL_CONFIG_FAULTED "faulted" #define ZPOOL_CONFIG_DEGRADED "degraded" #define ZPOOL_CONFIG_REMOVED "removed" +#define ZPOOL_CONFIG_FRU "fru" +#define ZPOOL_CONFIG_AUX_STATE "aux_state" + +/* Rewind policy parameters */ +#define ZPOOL_REWIND_POLICY "rewind-policy" +#define ZPOOL_REWIND_REQUEST "rewind-request" +#define ZPOOL_REWIND_REQUEST_TXG "rewind-request-txg" +#define ZPOOL_REWIND_META_THRESH "rewind-meta-thresh" +#define ZPOOL_REWIND_DATA_THRESH "rewind-data-thresh" + +/* Rewind data discovered */ +#define ZPOOL_CONFIG_LOAD_TIME "rewind_txg_ts" +#define ZPOOL_CONFIG_LOAD_DATA_ERRORS "verify_data_errors" +#define ZPOOL_CONFIG_REWIND_TIME "seconds_of_rewind" #define VDEV_TYPE_ROOT "root" #define VDEV_TYPE_MIRROR "mirror" @@ -369,6 +496,7 @@ typedef enum zfs_cache_type { #define VDEV_TYPE_DISK "disk" #define VDEV_TYPE_FILE "file" #define VDEV_TYPE_MISSING "missing" +#define VDEV_TYPE_HOLE "hole" #define VDEV_TYPE_SPARE "spare" #define VDEV_TYPE_LOG "log" #define VDEV_TYPE_L2CACHE "l2cache" @@ -418,7 +546,9 @@ typedef enum vdev_aux { VDEV_AUX_SPARED, /* hot spare used in another pool */ VDEV_AUX_ERR_EXCEEDED, 
/* too many errors */ VDEV_AUX_IO_FAILURE, /* experienced I/O failure */ - VDEV_AUX_BAD_LOG /* cannot read log chain(s) */ + VDEV_AUX_BAD_LOG, /* cannot read log chain(s) */ + VDEV_AUX_EXTERNAL, /* external diagnosis */ + VDEV_AUX_SPLIT_POOL /* vdev was split off into another pool */ } vdev_aux_t; /* @@ -488,25 +618,45 @@ typedef struct vdev_stat { uint64_t vs_scrub_end; /* UTC scrub end time */ } vdev_stat_t; +/* + * DDT statistics. Note: all fields should be 64-bit because this + * is passed between kernel and userland as an nvlist uint64 array. + */ +typedef struct ddt_object { + uint64_t ddo_count; /* number of elments in ddt */ + uint64_t ddo_dspace; /* size of ddt on disk */ + uint64_t ddo_mspace; /* size of ddt in-core */ +} ddt_object_t; + +typedef struct ddt_stat { + uint64_t dds_blocks; /* blocks */ + uint64_t dds_lsize; /* logical size */ + uint64_t dds_psize; /* physical size */ + uint64_t dds_dsize; /* deflated allocated size */ + uint64_t dds_ref_blocks; /* referenced blocks */ + uint64_t dds_ref_lsize; /* referenced lsize * refcnt */ + uint64_t dds_ref_psize; /* referenced psize * refcnt */ + uint64_t dds_ref_dsize; /* referenced dsize * refcnt */ +} ddt_stat_t; + +typedef struct ddt_histogram { + ddt_stat_t ddh_stat[64]; /* power-of-two histogram buckets */ +} ddt_histogram_t; + #define ZVOL_DRIVER "zvol" #define ZFS_DRIVER "zfs" #define ZFS_DEV "/dev/zfs" -/* - * zvol paths. Irritatingly, the devfsadm interfaces want all these - * paths without the /dev prefix, but for some things, we want the - * /dev prefix. Below are the names without /dev. - */ -#define ZVOL_DEV_DIR "zvol/dsk" -#define ZVOL_RDEV_DIR "zvol/rdsk" - -/* - * And here are the things we need with /dev, etc. in front of them. - */ -#define ZVOL_PSEUDO_DEV "/devices/pseudo/zvol@0:" -#define ZVOL_FULL_DEV_DIR "/dev/" ZVOL_DEV_DIR "/" +/* general zvol path */ +#define ZVOL_DIR "/dev/zvol" +/* expansion */ +#define ZVOL_PSEUDO_DEV "/devices/pseudo/zfs@0:" +/* for dump and swap */ +#define ZVOL_FULL_DEV_DIR ZVOL_DIR "/dsk/" +#define ZVOL_FULL_RDEV_DIR ZVOL_DIR "/rdsk/" #define ZVOL_PROP_NAME "name" +#define ZVOL_DEFAULT_BLOCKSIZE 8192 /* * /dev/zfs ioctl numbers. @@ -531,13 +681,12 @@ typedef enum zfs_ioc { ZFS_IOC_VDEV_ATTACH, ZFS_IOC_VDEV_DETACH, ZFS_IOC_VDEV_SETPATH, + ZFS_IOC_VDEV_SETFRU, ZFS_IOC_OBJSET_STATS, ZFS_IOC_OBJSET_ZPLPROPS, ZFS_IOC_DATASET_LIST_NEXT, ZFS_IOC_SNAPSHOT_LIST_NEXT, ZFS_IOC_SET_PROP, - ZFS_IOC_CREATE_MINOR, - ZFS_IOC_REMOVE_MINOR, ZFS_IOC_CREATE, ZFS_IOC_DESTROY, ZFS_IOC_ROLLBACK, @@ -560,17 +709,28 @@ typedef enum zfs_ioc { ZFS_IOC_GET_FSACL, ZFS_IOC_ISCSI_PERM_CHECK, ZFS_IOC_SHARE, - ZFS_IOC_INHERIT_PROP + ZFS_IOC_INHERIT_PROP, + ZFS_IOC_SMB_ACL, + ZFS_IOC_USERSPACE_ONE, + ZFS_IOC_USERSPACE_MANY, + ZFS_IOC_USERSPACE_UPGRADE, + ZFS_IOC_HOLD, + ZFS_IOC_RELEASE, + ZFS_IOC_GET_HOLDS, + ZFS_IOC_OBJSET_RECVD_PROPS, + ZFS_IOC_VDEV_SPLIT } zfs_ioc_t; /* * Internal SPA load state. Used by FMA diagnosis engine. 
*/ typedef enum { - SPA_LOAD_NONE, /* no load in progress */ - SPA_LOAD_OPEN, /* normal open */ - SPA_LOAD_IMPORT, /* import in progress */ - SPA_LOAD_TRYIMPORT /* tryimport in progress */ + SPA_LOAD_NONE, /* no load in progress */ + SPA_LOAD_OPEN, /* normal open */ + SPA_LOAD_IMPORT, /* import in progress */ + SPA_LOAD_TRYIMPORT, /* tryimport in progress */ + SPA_LOAD_RECOVER, /* recovery requested */ + SPA_LOAD_ERROR /* load failed */ } spa_load_state_t; /* @@ -602,6 +762,7 @@ typedef enum { #define ZFS_ONLINE_CHECKREMOVE 0x1 #define ZFS_ONLINE_UNSPARE 0x2 #define ZFS_ONLINE_FORCEFAULT 0x4 +#define ZFS_ONLINE_EXPAND 0x8 #define ZFS_OFFLINE_TEMPORARY 0x1 /* @@ -632,7 +793,7 @@ typedef enum { /* * Note: This is encoded on-disk, so new events must be added to the * end, and unused events can not be removed. Be sure to edit - * zpool_main.c: hist_event_table[]. + * libzfs_pool.c: hist_event_table[]. */ typedef enum history_internal_events { LOG_NO_EVENT = 0, @@ -673,6 +834,9 @@ typedef enum history_internal_events { LOG_DS_REFQUOTA, LOG_DS_REFRESERV, LOG_POOL_SCRUB_DONE, + LOG_DS_USER_HOLD, + LOG_DS_USER_RELEASE, + LOG_POOL_SPLIT, LOG_END } history_internal_events_t; diff --git a/external/cddl/osnet/dist/uts/common/sys/mnttab.h b/external/cddl/osnet/dist/uts/common/sys/mnttab.h index eeddd96a2efca..ff086370ec3ee 100644 --- a/external/cddl/osnet/dist/uts/common/sys/mnttab.h +++ b/external/cddl/osnet/dist/uts/common/sys/mnttab.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -24,15 +23,13 @@ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_MNTTAB_H #define _SYS_MNTTAB_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #ifdef __cplusplus @@ -53,6 +50,14 @@ extern "C" { #define putmntent(fd, mp) (-1) +/* + * The fields in struct extmnttab should match those in struct mnttab until new + * fields are encountered. This allows hasmntopt(), getmntent_common() and + * mntioctl() to cast one type to the other safely. + * + * The fields in struct mnttab, struct extmnttab and struct mntentbuf must all + * match those in the corresponding 32-bit versions defined in mntvnops.c. + */ struct mnttab { char *mnt_special; char *mnt_mountp; @@ -61,11 +66,6 @@ struct mnttab { char *mnt_time; }; -/* - * NOTE: fields in extmnttab should match struct mnttab till new fields - * are encountered, this allows hasmntopt to work properly when its arg is - * a pointer to an extmnttab struct cast to a mnttab struct pointer. 
- */ struct extmnttab { char *mnt_special; char *mnt_mountp; @@ -76,6 +76,12 @@ struct extmnttab { uint_t mnt_minor; }; +struct mntentbuf { + struct extmnttab *mbuf_emp; + size_t mbuf_bufsize; + char *mbuf_buf; +}; + #if !defined(_KERNEL) #ifdef __STDC__ extern void resetmnttab(FILE *); diff --git a/external/cddl/osnet/dist/uts/common/sys/nvpair.h b/external/cddl/osnet/dist/uts/common/sys/nvpair.h index 9e768541f2e71..58037b06537e5 100644 --- a/external/cddl/osnet/dist/uts/common/sys/nvpair.h +++ b/external/cddl/osnet/dist/uts/common/sys/nvpair.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_NVPAIR_H #define _SYS_NVPAIR_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include #include @@ -199,6 +197,7 @@ int nvlist_add_double(nvlist_t *, const char *, double); int nvlist_remove(nvlist_t *, const char *, data_type_t); int nvlist_remove_all(nvlist_t *, const char *); +int nvlist_remove_nvpair(nvlist_t *, nvpair_t *); int nvlist_lookup_boolean(nvlist_t *, const char *); int nvlist_lookup_boolean_value(nvlist_t *, const char *, boolean_t *); @@ -237,9 +236,11 @@ int nvlist_lookup_nvpair(nvlist_t *, const char *, nvpair_t **); int nvlist_lookup_nvpair_embedded_index(nvlist_t *, const char *, nvpair_t **, int *, char **); boolean_t nvlist_exists(nvlist_t *, const char *); +boolean_t nvlist_empty(nvlist_t *); /* processing nvpair */ nvpair_t *nvlist_next_nvpair(nvlist_t *, nvpair_t *); +nvpair_t *nvlist_prev_nvpair(nvlist_t *, nvpair_t *); char *nvpair_name(nvpair_t *); data_type_t nvpair_type(nvpair_t *); int nvpair_type_is_array(nvpair_t *); diff --git a/external/cddl/osnet/dist/uts/common/sys/priv.h b/external/cddl/osnet/dist/uts/common/sys/priv.h index d9be377cd9fef..2683446bd236d 100644 --- a/external/cddl/osnet/dist/uts/common/sys/priv.h +++ b/external/cddl/osnet/dist/uts/common/sys/priv.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2010 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _SYS_PRIV_H #define _SYS_PRIV_H -#pragma ident "%Z%%M% %I% %E% SMI" /* from TSOL 8 */ - #include #include #include @@ -137,11 +135,12 @@ typedef struct priv_impl_info { #define __PROC_PROTECT 0x0008 /* Private */ #define NET_MAC_AWARE 0x0010 /* Is MAC aware */ #define NET_MAC_AWARE_INHERIT 0x0020 /* Inherit MAC aware */ +#define PRIV_AWARE_RESET 0x0040 /* Reset on setuid() */ #define PRIV_XPOLICY 0x0080 /* Extended policy */ /* user-settable flags: */ #define PRIV_USER (PRIV_DEBUG | NET_MAC_AWARE | NET_MAC_AWARE_INHERIT |\ - PRIV_XPOLICY) + PRIV_XPOLICY | PRIV_AWARE_RESET) /* * Header of the privilege info data structure; multiple structures can @@ -199,6 +198,9 @@ typedef struct priv_info_names { #define PRIV_ALLOC 0x1 +extern int priv_debug; +extern int priv_basic_test; + struct proc; struct prpriv; struct cred; @@ -234,6 +236,7 @@ extern void priv_inverse(priv_set_t *); extern void priv_set_PA(cred_t *); extern void priv_adjust_PA(cred_t *); +extern void priv_reset_PA(cred_t *, boolean_t); extern boolean_t priv_can_clear_PA(const cred_t *); extern int setpflags(uint_t, uint_t, cred_t *); diff --git a/external/cddl/osnet/dist/uts/common/sys/processor.h b/external/cddl/osnet/dist/uts/common/sys/processor.h new file mode 100644 index 0000000000000..3a76c8c9b4200 --- /dev/null +++ b/external/cddl/osnet/dist/uts/common/sys/processor.h @@ -0,0 +1,150 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T + * All Rights Reserved + * + */ + +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +#ifndef _SYS_PROCESSOR_H +#define _SYS_PROCESSOR_H + +#pragma ident "%Z%%M% %I% %E% SMI" + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Definitions for p_online, processor_info & lgrp system calls. + */ + +/* + * Type for an lgrpid + */ +typedef uint16_t lgrpid_t; + +/* + * Type for processor name (CPU number). + */ +typedef int processorid_t; +typedef int chipid_t; + +/* + * Flags and return values for p_online(2), and pi_state for processor_info(2). + * These flags are *not* for in-kernel examination of CPU states. + * See for appropriate informational functions. 
+ */ +#define P_OFFLINE 0x0001 /* processor is offline, as quiet as possible */ +#define P_ONLINE 0x0002 /* processor is online */ +#define P_STATUS 0x0003 /* value passed to p_online to request status */ +#define P_FAULTED 0x0004 /* processor is offline, in faulted state */ +#define P_POWEROFF 0x0005 /* processor is powered off */ +#define P_NOINTR 0x0006 /* processor is online, but no I/O interrupts */ +#define P_SPARE 0x0007 /* processor is offline, can be reactivated */ +#define P_BAD P_FAULTED /* unused but defined by USL */ +#define P_FORCED 0x10000000 /* force processor offline */ + +/* + * String names for processor states defined above. + */ +#define PS_OFFLINE "off-line" +#define PS_ONLINE "on-line" +#define PS_FAULTED "faulted" +#define PS_POWEROFF "powered-off" +#define PS_NOINTR "no-intr" +#define PS_SPARE "spare" + +/* + * Structure filled in by processor_info(2). This structure + * SHOULD NOT BE MODIFIED. Changes to the structure would + * negate ABI compatibility. + * + * The string fields are guaranteed to contain a NULL. + * + * The pi_fputypes field contains a (possibly empty) comma-separated + * list of floating point identifier strings. + */ +#define PI_TYPELEN 16 /* max size of CPU type string */ +#define PI_FPUTYPE 32 /* max size of FPU types string */ + +typedef struct { + int pi_state; /* processor state, see above */ + char pi_processor_type[PI_TYPELEN]; /* ASCII CPU type */ + char pi_fputypes[PI_FPUTYPE]; /* ASCII FPU types */ + int pi_clock; /* CPU clock freq in MHz */ +} processor_info_t; + +/* + * Binding values for processor_bind(2) + */ +#define PBIND_NONE -1 /* LWP/thread is not bound */ +#define PBIND_QUERY -2 /* don't set, just return the binding */ +#define PBIND_HARD -3 /* prevents offlining CPU (default) */ +#define PBIND_SOFT -4 /* allows offlining CPU */ +#define PBIND_QUERY_TYPE -5 /* Return binding type */ + +/* + * User-level system call interface prototypes + */ +#ifndef _KERNEL +#ifdef __STDC__ + +extern int p_online(processorid_t processorid, int flag); +extern int processor_info(processorid_t processorid, + processor_info_t *infop); +extern int processor_bind(idtype_t idtype, id_t id, + processorid_t processorid, processorid_t *obind); +extern processorid_t getcpuid(void); +extern lgrpid_t gethomelgroup(void); + +#else + +extern int p_online(); +extern int processor_info(); +extern int processor_bind(); +extern processorid_t getcpuid(); +extern lgrpid_t gethomelgroup(); + +#endif /* __STDC__ */ + +#else /* _KERNEL */ + +/* + * Internal interface prototypes + */ +extern int p_online_internal(processorid_t, int, int *); + +#endif /* !_KERNEL */ + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_PROCESSOR_H */ diff --git a/external/cddl/osnet/dist/uts/common/sys/sdt.h b/external/cddl/osnet/dist/uts/common/sys/sdt.h index ff04802196533..6ca064c9782b1 100644 --- a/external/cddl/osnet/dist/uts/common/sys/sdt.h +++ b/external/cddl/osnet/dist/uts/common/sys/sdt.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
*/ #ifndef _SYS_SDT_H #define _SYS_SDT_H -#pragma ident "%Z%%M% %I% %E% SMI" - #ifdef __cplusplus extern "C" { #endif @@ -131,6 +129,16 @@ extern "C" { (uintptr_t)(arg6), (uintptr_t)(arg7)); \ } +#define DTRACE_PROBE8(name, type1, arg1, type2, arg2, type3, arg3, \ + type4, arg4, type5, arg5, type6, arg6, type7, arg7, type8, arg8) { \ + extern void __dtrace_probe_##name(uintptr_t, uintptr_t, \ + uintptr_t, uintptr_t, uintptr_t, uintptr_t, \ + uintptr_t, uintptr_t); \ + __dtrace_probe_##name((uintptr_t)(arg1), (uintptr_t)(arg2), \ + (uintptr_t)(arg3), (uintptr_t)(arg4), (uintptr_t)(arg5), \ + (uintptr_t)(arg6), (uintptr_t)(arg7), (uintptr_t)(arg8)); \ +} + #define DTRACE_SCHED(name) \ DTRACE_PROBE(__sched_##name); @@ -182,6 +190,40 @@ extern "C" { DTRACE_PROBE4(__io_##name, type1, arg1, type2, arg2, \ type3, arg3, type4, arg4); +#define DTRACE_ISCSI_2(name, type1, arg1, type2, arg2) \ + DTRACE_PROBE2(__iscsi_##name, type1, arg1, type2, arg2); + +#define DTRACE_ISCSI_3(name, type1, arg1, type2, arg2, type3, arg3) \ + DTRACE_PROBE3(__iscsi_##name, type1, arg1, type2, arg2, type3, arg3); + +#define DTRACE_ISCSI_4(name, type1, arg1, type2, arg2, \ + type3, arg3, type4, arg4) \ + DTRACE_PROBE4(__iscsi_##name, type1, arg1, type2, arg2, \ + type3, arg3, type4, arg4); + +#define DTRACE_ISCSI_5(name, type1, arg1, type2, arg2, \ + type3, arg3, type4, arg4, type5, arg5) \ + DTRACE_PROBE5(__iscsi_##name, type1, arg1, type2, arg2, \ + type3, arg3, type4, arg4, type5, arg5); + +#define DTRACE_ISCSI_6(name, type1, arg1, type2, arg2, \ + type3, arg3, type4, arg4, type5, arg5, type6, arg6) \ + DTRACE_PROBE6(__iscsi_##name, type1, arg1, type2, arg2, \ + type3, arg3, type4, arg4, type5, arg5, type6, arg6); + +#define DTRACE_ISCSI_7(name, type1, arg1, type2, arg2, \ + type3, arg3, type4, arg4, type5, arg5, type6, arg6, type7, arg7) \ + DTRACE_PROBE7(__iscsi_##name, type1, arg1, type2, arg2, \ + type3, arg3, type4, arg4, type5, arg5, type6, arg6, \ + type7, arg7); + +#define DTRACE_ISCSI_8(name, type1, arg1, type2, arg2, \ + type3, arg3, type4, arg4, type5, arg5, type6, arg6, \ + type7, arg7, type8, arg8) \ + DTRACE_PROBE8(__iscsi_##name, type1, arg1, type2, arg2, \ + type3, arg3, type4, arg4, type5, arg5, type6, arg6, \ + type7, arg7, type8, arg8); + #define DTRACE_NFSV3_3(name, type1, arg1, type2, arg2, \ type3, arg3) \ DTRACE_PROBE3(__nfsv3_##name, type1, arg1, type2, arg2, \ @@ -259,6 +301,59 @@ extern "C" { DTRACE_PROBE4(__xpv_##name, type1, arg1, type2, arg2, \ type3, arg3, type4, arg4); +#define DTRACE_FC_1(name, type1, arg1) \ + DTRACE_PROBE1(__fc_##name, type1, arg1); + +#define DTRACE_FC_2(name, type1, arg1, type2, arg2) \ + DTRACE_PROBE2(__fc_##name, type1, arg1, type2, arg2); + +#define DTRACE_FC_3(name, type1, arg1, type2, arg2, type3, arg3) \ + DTRACE_PROBE3(__fc_##name, type1, arg1, type2, arg2, type3, arg3); + +#define DTRACE_FC_4(name, type1, arg1, type2, arg2, type3, arg3, type4, arg4) \ + DTRACE_PROBE4(__fc_##name, type1, arg1, type2, arg2, type3, arg3, \ + type4, arg4); + +#define DTRACE_FC_5(name, type1, arg1, type2, arg2, type3, arg3, \ + type4, arg4, type5, arg5) \ + DTRACE_PROBE5(__fc_##name, type1, arg1, type2, arg2, type3, arg3, \ + type4, arg4, type5, arg5); + +#define DTRACE_SRP_1(name, type1, arg1) \ + DTRACE_PROBE1(__srp_##name, type1, arg1); + +#define DTRACE_SRP_2(name, type1, arg1, type2, arg2) \ + DTRACE_PROBE2(__srp_##name, type1, arg1, type2, arg2); + +#define DTRACE_SRP_3(name, type1, arg1, type2, arg2, type3, arg3) \ + DTRACE_PROBE3(__srp_##name, type1, arg1, type2, arg2, 
type3, arg3); + +#define DTRACE_SRP_4(name, type1, arg1, type2, arg2, type3, arg3, \ + type4, arg4) \ + DTRACE_PROBE4(__srp_##name, type1, arg1, type2, arg2, \ + type3, arg3, type4, arg4); + +#define DTRACE_SRP_5(name, type1, arg1, type2, arg2, type3, arg3, \ + type4, arg4, type5, arg5) \ + DTRACE_PROBE5(__srp_##name, type1, arg1, type2, arg2, \ + type3, arg3, type4, arg4, type5, arg5); + +#define DTRACE_SRP_6(name, type1, arg1, type2, arg2, type3, arg3, \ + type4, arg4, type5, arg5, type6, arg6) \ + DTRACE_PROBE6(__srp_##name, type1, arg1, type2, arg2, \ + type3, arg3, type4, arg4, type5, arg5, type6, arg6); + +#define DTRACE_SRP_7(name, type1, arg1, type2, arg2, type3, arg3, \ + type4, arg4, type5, arg5, type6, arg6, type7, arg7) \ + DTRACE_PROBE7(__srp_##name, type1, arg1, type2, arg2, \ + type3, arg3, type4, arg4, type5, arg5, type6, arg6, type7, arg7); + +#define DTRACE_SRP_8(name, type1, arg1, type2, arg2, type3, arg3, \ + type4, arg4, type5, arg5, type6, arg6, type7, arg7, type8, arg8) \ + DTRACE_PROBE8(__srp_##name, type1, arg1, type2, arg2, \ + type3, arg3, type4, arg4, type5, arg5, type6, arg6, \ + type7, arg7, type8, arg8); + #endif /* _KERNEL */ extern const char *sdt_prefix; diff --git a/external/cddl/osnet/dist/uts/common/sys/sysevent.h b/external/cddl/osnet/dist/uts/common/sys/sysevent.h index 0a61e41de8490..44b5d069fb265 100644 --- a/external/cddl/osnet/dist/uts/common/sys/sysevent.h +++ b/external/cddl/osnet/dist/uts/common/sys/sysevent.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -19,16 +18,15 @@ * * CDDL HEADER END */ + /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_SYSEVENT_H #define _SYS_SYSEVENT_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #ifdef __cplusplus @@ -164,18 +162,50 @@ typedef struct sysevent_value { #define EVCH_QWAIT 0x0008 /* Wait for slot in event queue */ /* - * Meaning of flags for subscribe/unsubscribe. Bits 0 to 7 are dedicated to - * the consolidation private interface. + * Meaning of flags for subscribe. Bits 8 to 15 are dedicated to + * the consolidation private interface, so flags defined here are restricted + * to the LSB. + * + * EVCH_SUB_KEEP indicates that this subscription should persist even if + * this subscriber id should die unexpectedly; matching events will be + * queued (up to a limit) and will be delivered if/when we restart again + * with the same subscriber id. + */ +#define EVCH_SUB_KEEP 0x01 + +/* + * Subscriptions may be wildcarded, but we limit the number of + * wildcards permitted. + */ +#define EVCH_WILDCARD_MAX 10 + +/* + * Used in unsubscribe to indicate all subscriber ids for a channel. */ -#define EVCH_SUB_KEEP 0x0001 #define EVCH_ALLSUB "all_subs" /* * Meaning of flags parameter of channel bind function + * + * EVCH_CREAT indicates to create a channel if not already present. 
+ * + * EVCH_HOLD_PEND indicates that events should be published to this + * channel even if there are no matching subscribers present; when + * a subscriber belatedly binds to the channel and registers their + * subscriptions they will receive events that predate their bind. + * If the channel is closed, however, with no remaining bindings then + * the channel is destroyed. + * + * EVCH_HOLD_PEND_INDEF is a stronger version of EVCH_HOLD_PEND - + * even if the channel has no remaining bindings it will not be + * destroyed so long as events remain unconsumed. This is suitable for + * use with short-lived event producers that may bind to (create) the + * channel and exit before the intended consumer has started. */ -#define EVCH_CREAT 0x0001 /* Create a channel if not present */ +#define EVCH_CREAT 0x0001 #define EVCH_HOLD_PEND 0x0002 -#define EVCH_B_FLAGS 0x0003 /* All valid bits */ +#define EVCH_HOLD_PEND_INDEF 0x0004 +#define EVCH_B_FLAGS 0x0007 /* All valid bits */ /* * Meaning of commands of evc_control function @@ -186,37 +216,62 @@ typedef struct sysevent_value { #define EVCH_CMD_LAST EVCH_SET_CHAN_LEN /* Last command */ /* - * Event channel interface definitions + * Shared user/kernel event channel interface definitions */ -int sysevent_evc_bind(const char *, evchan_t **, uint32_t); -void sysevent_evc_unbind(evchan_t *); -int sysevent_evc_subscribe(evchan_t *, const char *, const char *, +extern int sysevent_evc_bind(const char *, evchan_t **, uint32_t); +extern int sysevent_evc_unbind(evchan_t *); +extern int sysevent_evc_subscribe(evchan_t *, const char *, const char *, int (*)(sysevent_t *, void *), void *, uint32_t); -void sysevent_evc_unsubscribe(evchan_t *, const char *); -int sysevent_evc_publish(evchan_t *, const char *, const char *, +extern int sysevent_evc_unsubscribe(evchan_t *, const char *); +extern int sysevent_evc_publish(evchan_t *, const char *, const char *, const char *, const char *, nvlist_t *, uint32_t); -int sysevent_evc_control(evchan_t *, int, ...); +extern int sysevent_evc_control(evchan_t *, int, ...); -#ifdef _KERNEL +#ifndef _KERNEL + +/* + * Userland-only event channel interfaces + */ + +#include + +typedef struct sysevent_subattr sysevent_subattr_t; + +extern sysevent_subattr_t *sysevent_subattr_alloc(void); +extern void sysevent_subattr_free(sysevent_subattr_t *); + +extern void sysevent_subattr_thrattr(sysevent_subattr_t *, pthread_attr_t *); +extern void sysevent_subattr_sigmask(sysevent_subattr_t *, sigset_t *); + +extern void sysevent_subattr_thrcreate(sysevent_subattr_t *, + door_xcreate_server_func_t *, void *); +extern void sysevent_subattr_thrsetup(sysevent_subattr_t *, + door_xcreate_thrsetup_func_t *, void *); + +extern int sysevent_evc_xsubscribe(evchan_t *, const char *, const char *, + int (*)(sysevent_t *, void *), void *, uint32_t, sysevent_subattr_t *); + +#else /* * Kernel log_event interfaces. 
*/ -int log_sysevent(sysevent_t *, int, sysevent_id_t *); - -sysevent_t *sysevent_alloc(char *, char *, char *, int); -void sysevent_free(sysevent_t *); -int sysevent_add_attr(sysevent_attr_list_t **, char *, sysevent_value_t *, int); -void sysevent_free_attr(sysevent_attr_list_t *); -int sysevent_attach_attributes(sysevent_t *, sysevent_attr_list_t *); -void sysevent_detach_attributes(sysevent_t *); -char *sysevent_get_class_name(sysevent_t *); -char *sysevent_get_subclass_name(sysevent_t *); -uint64_t sysevent_get_seq(sysevent_t *); -void sysevent_get_time(sysevent_t *, hrtime_t *); -size_t sysevent_get_size(sysevent_t *); -char *sysevent_get_pub(sysevent_t *); -int sysevent_get_attr_list(sysevent_t *, nvlist_t **); +extern int log_sysevent(sysevent_t *, int, sysevent_id_t *); + +extern sysevent_t *sysevent_alloc(char *, char *, char *, int); +extern void sysevent_free(sysevent_t *); +extern int sysevent_add_attr(sysevent_attr_list_t **, char *, + sysevent_value_t *, int); +extern void sysevent_free_attr(sysevent_attr_list_t *); +extern int sysevent_attach_attributes(sysevent_t *, sysevent_attr_list_t *); +extern void sysevent_detach_attributes(sysevent_t *); +extern char *sysevent_get_class_name(sysevent_t *); +extern char *sysevent_get_subclass_name(sysevent_t *); +extern uint64_t sysevent_get_seq(sysevent_t *); +extern void sysevent_get_time(sysevent_t *, hrtime_t *); +extern size_t sysevent_get_size(sysevent_t *); +extern char *sysevent_get_pub(sysevent_t *); +extern int sysevent_get_attr_list(sysevent_t *, nvlist_t **); #endif /* _KERNEL */ diff --git a/external/cddl/osnet/dist/uts/common/sys/sysevent/eventdefs.h b/external/cddl/osnet/dist/uts/common/sys/sysevent/eventdefs.h index ac21686e84b81..6a93416cc784e 100644 --- a/external/cddl/osnet/dist/uts/common/sys/sysevent/eventdefs.h +++ b/external/cddl/osnet/dist/uts/common/sys/sysevent/eventdefs.h @@ -19,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
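/*
 * Illustrative sketch only: the kernel log_event interfaces declared
 * above (allocate an event, attach a single attribute, log it).  The
 * class/subclass/publisher strings and the attribute name are
 * hypothetical, and the sysevent_value_t field names follow customary
 * sysevent_add_attr(9F) usage -- treat them as an assumption here.
 * Cleanup on the error paths is simplified.
 */
#ifdef _KERNEL
#include <sys/sysevent.h>

static void
demo_log_event(void)
{
        sysevent_t              *ev;
        sysevent_attr_list_t    *attrs = NULL;
        sysevent_value_t        val;
        sysevent_id_t           eid;

        ev = sysevent_alloc("EC_example", "ESC_example_ping", "demo_pub",
            SE_SLEEP);
        if (ev == NULL)
                return;

        val.value_type = SE_DATA_TYPE_STRING;
        val.value.sv_string = "hello";

        if (sysevent_add_attr(&attrs, "message", &val, SE_SLEEP) == 0 &&
            sysevent_attach_attributes(ev, attrs) == 0)
                (void) log_sysevent(ev, SE_SLEEP, &eid);

        sysevent_free(ev);
}
#endif  /* _KERNEL */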
*/ @@ -52,6 +52,7 @@ extern "C" { #define EC_FM "EC_fm" /* FMA error report event */ #define EC_ZFS "EC_zfs" /* ZFS event */ #define EC_DATALINK "EC_datalink" /* datalink event */ +#define EC_VRRP "EC_vrrp" /* VRRP event */ /* * The following event class is reserved for exclusive use @@ -179,6 +180,8 @@ extern "C" { /* Interface within an IPMP group has changed state or type */ #define ESC_IPMP_IF_CHANGE "ESC_ipmp_if_change" +/* IPMP probe has changed state */ +#define ESC_IPMP_PROBE_STATE "ESC_ipmp_probe_state" /* * EC_DEV_ADD and EC_DEV_REMOVE subclass definitions - supporting attributes @@ -200,9 +203,16 @@ extern "C" { /* device tree branch removed */ #define ESC_DEV_BRANCH_REMOVE "ESC_dev_branch_remove" -/* device capacity dynamically changed */ +/* + * EC_DEV_STATUS subclass definitions + * + * device capacity dynamically changed + */ #define ESC_DEV_DLE "ESC_dev_dle" +/* LUN has received an eject request from the user */ +#define ESC_DEV_EJECT_REQUEST "ESC_dev_eject_request" + /* FMA Fault and Error event protocol subclass */ #define ESC_FM_ERROR "ESC_FM_error" #define ESC_FM_ERROR_REPLAY "ESC_FM_error_replay" @@ -223,6 +233,15 @@ extern "C" { #define ESC_PWRCTL_BRIGHTNESS_UP "ESC_pwrctl_brightness_up" #define ESC_PWRCTL_BRIGHTNESS_DOWN "ESC_pwrctl_brightness_down" +/* EC_ACPIEV subclass definitions */ +#define EC_ACPIEV "EC_acpiev" +#define ESC_ACPIEV_DISPLAY_SWITCH "ESC_acpiev_display_switch" +#define ESC_ACPIEV_SCREEN_LOCK "ESC_acpiev_screen_lock" +#define ESC_ACPIEV_SLEEP "ESC_acpiev_sleep" +#define ESC_ACPIEV_AUDIO_MUTE "ESC_acpiev_audio_mute" +#define ESC_ACPIEV_WIFI "ESC_acpiev_wifi" +#define ESC_ACPIEV_TOUCHPAD "ESC_acpiev_touchpad" + /* * ZFS subclass definitions. supporting attributes (name/value paris) are found * in sys/fs/zfs.h @@ -234,12 +253,21 @@ extern "C" { #define ESC_ZFS_VDEV_CLEAR "ESC_ZFS_vdev_clear" #define ESC_ZFS_VDEV_CHECK "ESC_ZFS_vdev_check" #define ESC_ZFS_CONFIG_SYNC "ESC_ZFS_config_sync" +#define ESC_ZFS_SCRUB_START "ESC_ZFS_scrub_start" +#define ESC_ZFS_SCRUB_FINISH "ESC_ZFS_scrub_finish" +#define ESC_ZFS_VDEV_SPARE "ESC_ZFS_vdev_spare" /* * datalink subclass definitions. */ #define ESC_DATALINK_PHYS_ADD "ESC_datalink_phys_add" /* new physical link */ +/* + * VRRP subclass definitions. Supporting attributes (name/value paris) are + * found in sys/sysevent/vrrp.h + */ +#define ESC_VRRP_STATE_CHANGE "ESC_vrrp_state_change" + #ifdef __cplusplus } #endif diff --git a/external/cddl/osnet/dist/uts/common/sys/systeminfo.h b/external/cddl/osnet/dist/uts/common/sys/systeminfo.h index 73a9922f18f35..3f7a465aa51bc 100644 --- a/external/cddl/osnet/dist/uts/common/sys/systeminfo.h +++ b/external/cddl/osnet/dist/uts/common/sys/systeminfo.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,7 +19,7 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
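/*
 * Illustrative sketch only: reacting to the new ZFS scrub subclasses
 * from an event handler such as the one registered with
 * sysevent_evc_subscribe() earlier.  sysevent_get_subclass_name() is
 * assumed to be available to the consumer (it is declared for the
 * kernel above and provided to userland by libsysevent).
 */
#include <string.h>
#include <sys/sysevent.h>
#include <sys/sysevent/eventdefs.h>

static int
zfs_scrub_handler(sysevent_t *ev, void *cookie)
{
        char *sub = sysevent_get_subclass_name(ev);

        if (sub != NULL && strcmp(sub, ESC_ZFS_SCRUB_FINISH) == 0) {
                /* e.g. notify the operator that the scrub completed */
        }
        return (0);
}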
*/ @@ -31,8 +30,6 @@ #ifndef _SYS_SYSTEMINFO_H #define _SYS_SYSTEMINFO_H -#pragma ident "%Z%%M% %I% %E% SMI" /* SVr4.0 1.4 */ - #ifdef __cplusplus extern "C" { #endif @@ -40,7 +37,7 @@ extern "C" { #ifdef _KERNEL extern char architecture[]; extern char architecture_32[]; -extern char hw_serial[]; +extern char hw_serial[]; /* machine's 32-bit hostid; a decimal string */ extern char hw_provider[]; extern char srpc_domain[]; extern char platform[]; @@ -93,7 +90,12 @@ extern char platform[]; /* Solaris defined `set' commands (769-1024) (none currently assigned) */ -#define DOM_NM_LN 64 /* maximum length of domain name */ +#define HW_INVALID_HOSTID 0xFFFFFFFF /* an invalid hostid */ +#define HW_HOSTID_LEN 11 /* minimum buffer size needed */ + /* to hold a decimal or hex */ + /* hostid string */ +#define DOM_NM_LN 64 /* maximum length of domain */ + /* name */ #if !defined(_KERNEL) #if defined(__STDC__) diff --git a/external/cddl/osnet/dist/uts/common/sys/taskq.h b/external/cddl/osnet/dist/uts/common/sys/taskq.h index 1051531d9e4b7..8b601c86a5986 100644 --- a/external/cddl/osnet/dist/uts/common/sys/taskq.h +++ b/external/cddl/osnet/dist/uts/common/sys/taskq.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_TASKQ_H #define _SYS_TASKQ_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #include @@ -42,12 +39,16 @@ typedef struct taskq taskq_t; typedef uintptr_t taskqid_t; typedef void (task_func_t)(void *); +struct proc; + /* * Public flags for taskq_create(): bit range 0-15 */ #define TASKQ_PREPOPULATE 0x0001 /* Prepopulate with threads and data */ #define TASKQ_CPR_SAFE 0x0002 /* Use CPR safe protocol */ #define TASKQ_DYNAMIC 0x0004 /* Use dynamic thread scheduling */ +#define TASKQ_THREADS_CPU_PCT 0x0008 /* number of threads as % of ncpu */ +#define TASKQ_DC_BATCH 0x0010 /* Taskq uses SDC in batch mode */ /* * Flags for taskq_dispatch. 
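/*
 * Illustrative sketch only: the new HW_HOSTID_LEN / HW_INVALID_HOSTID
 * definitions used with sysinfo(2)'s SI_HW_SERIAL command, which
 * reports the machine's 32-bit hostid as a decimal string (see the
 * hw_serial comment above).
 */
#include <sys/systeminfo.h>
#include <stdlib.h>

static unsigned long
demo_hostid(void)
{
        char buf[HW_HOSTID_LEN];        /* minimum size for the string form */

        if (sysinfo(SI_HW_SERIAL, buf, sizeof (buf)) == -1)
                return (HW_INVALID_HOSTID);
        return (strtoul(buf, NULL, 10));
}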
TQ_SLEEP/TQ_NOSLEEP should be same as @@ -57,16 +58,22 @@ typedef void (task_func_t)(void *); #define TQ_NOSLEEP 0x01 /* cannot block for memory; may fail */ #define TQ_NOQUEUE 0x02 /* Do not enqueue if can't dispatch */ #define TQ_NOALLOC 0x04 /* cannot allocate memory; may fail */ +#define TQ_FRONT 0x08 /* Put task at the front of the queue */ #ifdef _KERNEL extern taskq_t *system_taskq; extern void taskq_init(void); +extern void taskq_mp_init(void); extern taskq_t *taskq_create(const char *, int, pri_t, int, int, uint_t); extern taskq_t *taskq_create_instance(const char *, int, int, pri_t, int, int, uint_t); +extern taskq_t *taskq_create_proc(const char *, int, pri_t, int, int, + struct proc *, uint_t); +extern taskq_t *taskq_create_sysdc(const char *, int, int, int, + struct proc *, uint_t, uint_t); extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t); extern void nulltask(void *); extern void taskq_destroy(taskq_t *); diff --git a/external/cddl/osnet/dist/uts/common/sys/taskq_impl.h b/external/cddl/osnet/dist/uts/common/sys/taskq_impl.h index 07b7d1416b83e..a6f99fa3d969b 100644 --- a/external/cddl/osnet/dist/uts/common/sys/taskq_impl.h +++ b/external/cddl/osnet/dist/uts/common/sys/taskq_impl.h @@ -2,9 +2,8 @@ * CDDL HEADER START * * The contents of this file are subject to the terms of the - * Common Development and Distribution License, Version 1.0 only - * (the "License"). You may not use this file except in compliance - * with the License. + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. * * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE * or http://www.opensolaris.org/os/licensing. @@ -20,17 +19,17 @@ * CDDL HEADER END */ /* - * Copyright 2004 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. 
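/*
 * Illustrative sketch only: the TASKQ_THREADS_CPU_PCT creation flag
 * added earlier in this header together with the new TQ_FRONT dispatch
 * flag.  The taskq name and sizing numbers are arbitrary, and
 * minclsyspri is assumed to be in scope via the usual kernel headers.
 */
#ifdef _KERNEL
#include <sys/taskq.h>

static taskq_t *demo_tq;

static void
demo_task(void *arg)
{
        /* work item body */
}

static void
demo_taskq_setup(void)
{
        /* With TASKQ_THREADS_CPU_PCT, "50" means 50 percent of ncpus. */
        demo_tq = taskq_create("demo_tq", 50, minclsyspri, 1, 64,
            TASKQ_THREADS_CPU_PCT);

        /* TQ_FRONT queues the task ahead of already-pending entries. */
        (void) taskq_dispatch(demo_tq, demo_task, NULL,
            TQ_NOSLEEP | TQ_FRONT);
}
#endif  /* _KERNEL */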
*/ #ifndef _SYS_TASKQ_IMPL_H #define _SYS_TASKQ_IMPL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include +#include #include +#include #include #ifdef __cplusplus @@ -84,12 +83,16 @@ struct taskq_bucket { #define TQBUCKET_CLOSE 0x01 #define TQBUCKET_SUSPEND 0x02 +#define TASKQ_INTERFACE_FLAGS 0x0000ffff /* defined in */ + /* * taskq implementation flags: bit range 16-31 */ -#define TASKQ_ACTIVE 0x00010000 -#define TASKQ_SUSPENDED 0x00020000 -#define TASKQ_NOINSTANCE 0x00040000 +#define TASKQ_CHANGING 0x00010000 /* nthreads != target */ +#define TASKQ_SUSPENDED 0x00020000 /* taskq is suspended */ +#define TASKQ_NOINSTANCE 0x00040000 /* no instance number */ +#define TASKQ_THREAD_CREATED 0x00080000 /* a thread has been created */ +#define TASKQ_DUTY_CYCLE 0x00100000 /* using the SDC class */ struct taskq { char tq_name[TASKQ_NAMELEN + 1]; @@ -97,16 +100,20 @@ struct taskq { krwlock_t tq_threadlock; kcondvar_t tq_dispatch_cv; kcondvar_t tq_wait_cv; + kcondvar_t tq_exit_cv; + pri_t tq_pri; /* Scheduling priority */ uint_t tq_flags; int tq_active; int tq_nthreads; + int tq_nthreads_target; + int tq_nthreads_max; + int tq_threads_ncpus_pct; int tq_nalloc; int tq_minalloc; int tq_maxalloc; taskq_ent_t *tq_freelist; taskq_ent_t tq_task; int tq_maxsize; - pri_t tq_pri; /* Scheduling priority */ taskq_bucket_t *tq_buckets; /* Per-cpu array of buckets */ int tq_instance; uint_t tq_nbuckets; /* # of buckets (2^n) */ @@ -114,13 +121,19 @@ struct taskq { kthread_t *_tq_thread; kthread_t **_tq_threadlist; } tq_thr; + + list_node_t tq_cpupct_link; /* linkage for taskq_cpupct_list */ + struct proc *tq_proc; /* process for taskq threads */ + int tq_cpupart; /* cpupart id bound to */ + uint_t tq_DC; /* duty cycle for SDC */ + /* * Statistics. */ kstat_t *tq_kstat; /* Exported statistics */ hrtime_t tq_totaltime; /* Time spent processing tasks */ - int tq_tasks; /* Total # of tasks posted */ - int tq_executed; /* Total # of tasks executed */ + uint64_t tq_tasks; /* Total # of tasks posted */ + uint64_t tq_executed; /* Total # of tasks executed */ int tq_maxtasks; /* Max number of tasks in the queue */ int tq_tcreates; int tq_tdeaths; @@ -129,6 +142,9 @@ struct taskq { #define tq_thread tq_thr._tq_thread #define tq_threadlist tq_thr._tq_threadlist +/* The MAX guarantees we have at least one thread */ +#define TASKQ_THREADS_PCT(ncpus, pct) MAX(((ncpus) * (pct)) / 100, 1) + #ifdef __cplusplus } #endif diff --git a/external/cddl/osnet/dist/uts/common/sys/tsol/label.h b/external/cddl/osnet/dist/uts/common/sys/tsol/label.h index b496737334d61..f88f40973d5bf 100644 --- a/external/cddl/osnet/dist/uts/common/sys/tsol/label.h +++ b/external/cddl/osnet/dist/uts/common/sys/tsol/label.h @@ -19,15 +19,13 @@ * CDDL HEADER END */ /* - * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ #ifndef _SYS_TSOL_LABEL_H #define _SYS_TSOL_LABEL_H -#pragma ident "%Z%%M% %I% %E% SMI" - #include #ifdef _KERNEL #include @@ -45,6 +43,10 @@ extern "C" { #define EQUALITY_CHECK 0 #define DOMINANCE_CHECK 1 +/* Manifest human readable label names */ +#define ADMIN_LOW "ADMIN_LOW" +#define ADMIN_HIGH "ADMIN_HIGH" + /* Binary Label Structure Definitions */ typedef struct _mac_label_impl m_label_t; @@ -105,7 +107,21 @@ typedef struct ts_label_s { #define DEFAULT_DOI 1 -#define TSLF_UNLABELED 0x00000001 /* source was unlabeled */ +/* + * TSLF_UNLABELED is set in tsl_flags for packets with no explicit label + * when the peer is unlabeled. 
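/*
 * Worked examples for the TASKQ_THREADS_PCT() helper defined above:
 * integer division truncates, and the MAX() guarantees at least one
 * thread even for small percentages or single-CPU systems.
 *
 *      TASKQ_THREADS_PCT(8, 50)  == MAX((8 * 50) / 100, 1)  == 4
 *      TASKQ_THREADS_PCT(8, 30)  == MAX((8 * 30) / 100, 1)  == 2
 *      TASKQ_THREADS_PCT(1, 20)  == MAX((1 * 20) / 100, 1)  == 1
 */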
+ * + * TSLF_IMPLICIT_IN is set when a packet is received with no explicit label + * from a peer which is flagged in the tnrhdb as label-aware. + * + * TSLF_IMPLICIT_OUT is set when the packet should be sent without an + * explict label even if the peer or next-hop router is flagged in the + * tnrhdb as label-aware. + */ + +#define TSLF_UNLABELED 0x00000001 /* peer is unlabeled */ +#define TSLF_IMPLICIT_IN 0x00000002 /* inbound implicit */ +#define TSLF_IMPLICIT_OUT 0x00000004 /* outbound implicit */ #define CR_SL(cr) (label2bslabel(crgetlabel(cr))) @@ -116,21 +132,25 @@ extern int sys_labeling; extern void label_init(void); extern ts_label_t *labelalloc(const m_label_t *, uint32_t, int); +extern ts_label_t *labeldup(const ts_label_t *, int); extern void label_hold(ts_label_t *); extern void label_rele(ts_label_t *); extern m_label_t *label2bslabel(ts_label_t *); extern uint32_t label2doi(ts_label_t *); extern boolean_t label_equal(const ts_label_t *, const ts_label_t *); extern cred_t *newcred_from_bslabel(m_label_t *, uint32_t, int); -extern cred_t *copycred_from_bslabel(cred_t *, m_label_t *, +extern cred_t *copycred_from_bslabel(const cred_t *, m_label_t *, uint32_t, int); +extern cred_t *copycred_from_tslabel(const cred_t *, ts_label_t *, + int); extern ts_label_t *getflabel(vnode_t *); extern int getlabel(const char *, m_label_t *); extern int fgetlabel(int, m_label_t *); extern int _blinrange(const m_label_t *, const brange_t *); extern int blinlset(const m_label_t *, const blset_t); -extern ts_label_t *nfs_getflabel(vnode_t *); -extern boolean_t do_rfs_label_check(bslabel_t *, vnode_t *, int); + +extern int l_to_str_internal(const m_label_t *, char **); +extern int hexstr_to_label(const char *, m_label_t *); /* * The use of '!!' here prevents users from referencing this function-like diff --git a/external/cddl/osnet/dist/uts/common/sys/vtoc.h b/external/cddl/osnet/dist/uts/common/sys/vtoc.h index 3600fd85bd6d9..004b49097ae31 100644 --- a/external/cddl/osnet/dist/uts/common/sys/vtoc.h +++ b/external/cddl/osnet/dist/uts/common/sys/vtoc.h @@ -20,7 +20,7 @@ */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Copyright 2009 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ @@ -158,7 +158,7 @@ struct extvtoc { v.v_part[i].p_flag = extv.v_part[i].p_flag; \ v.v_part[i].p_start = (daddr_t)extv.v_part[i].p_start; \ v.v_part[i].p_size = (long)extv.v_part[i].p_size; \ - v.timestamp[i] = (time_t)v.timestamp[i]; \ + v.timestamp[i] = (time_t)extv.timestamp[i]; \ } \ bcopy(extv.v_asciilabel, v.v_asciilabel, LEN_DKL_ASCII); \ }
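/*
 * Illustrative sketch only: the final hunk above makes the extended-VTOC
 * conversion copy each timestamp from the source extvtoc instead of
 * reading back the uninitialized destination.  A hedged usage sketch,
 * assuming the surrounding macro is the customary extvtoctovtoc():
 */
#include <sys/vtoc.h>

static void
demo_shrink_vtoc(struct extvtoc *srcp)
{
        struct extvtoc src = *srcp;     /* macro expects struct lvalues */
        struct vtoc dst;

        /* Copies partition start/size/tag/flag and, now, the timestamps. */
        extvtoctovtoc(src, dst);
}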