Skip to content

Commit

Permalink
4976 zfs should only avoid writing to a failing non-redundant top-lev…
Browse files Browse the repository at this point in the history
…el vdev

4977 mdb error in ::spa_space from space_cb() if a metaslab's ms_sm is NULL
4978 ztest fails in get_metaslab_refcount()
4979 extend free space histogram to device and pool
4980 metaslabs should have a fragmentation metric
4981 remove fragmented ops vector from block allocator
4982 space_map object should proactively upgrade when feature is enabled
4983 need to collect metaslab information via mdb
4984 device selection should use fragmentation metric
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Adam Leventhal <adam.leventhal@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Approved by: Garrett D'Amore <garrett@damore.org>
  • Loading branch information
grwilson authored and Christopher Siden committed Jul 19, 2014
1 parent 1a41ca2 commit 2e4c998
Show file tree
Hide file tree
Showing 17 changed files with 1,036 additions and 348 deletions.
424 changes: 314 additions & 110 deletions usr/src/cmd/mdb/common/modules/zfs/zfs.c

Large diffs are not rendered by default.

79 changes: 62 additions & 17 deletions usr/src/cmd/zdb/zdb.c
Original file line number Diff line number Diff line change
Expand Up @@ -111,11 +111,11 @@ static void
usage(void)
{
(void) fprintf(stderr,
"Usage: %s [-CumdibcsDvhLXFPA] [-t txg] [-e [-p path...]] "
"[-U config] [-M inflight I/Os] [-x dumpdir] poolname [object...]\n"
"Usage: %s [-CumMdibcsDvhLXFPA] [-t txg] [-e [-p path...]] "
"[-U config] [-I inflight I/Os] [-x dumpdir] poolname [object...]\n"
" %s [-divPA] [-e -p path...] [-U config] dataset "
"[object...]\n"
" %s -m [-LXFPA] [-t txg] [-e [-p path...]] [-U config] "
" %s -mM [-LXFPA] [-t txg] [-e [-p path...]] [-U config] "
"poolname [vdev [metaslab...]]\n"
" %s -R [-A] [-e [-p path...]] poolname "
"vdev:offset:size[:flags]\n"
Expand All @@ -138,6 +138,7 @@ usage(void)
(void) fprintf(stderr, " -h pool history\n");
(void) fprintf(stderr, " -b block statistics\n");
(void) fprintf(stderr, " -m metaslabs\n");
(void) fprintf(stderr, " -M metaslab groups\n");
(void) fprintf(stderr, " -c checksum all metadata (twice for "
"all data) blocks\n");
(void) fprintf(stderr, " -s report stats on zdb's I/O\n");
Expand Down Expand Up @@ -168,7 +169,7 @@ usage(void)
(void) fprintf(stderr, " -P print numbers in parseable form\n");
(void) fprintf(stderr, " -t <txg> -- highest txg to use when "
"searching for uberblocks\n");
(void) fprintf(stderr, " -M <number of inflight I/Os> -- "
(void) fprintf(stderr, " -I <number of inflight I/Os> -- "
"specify the maximum number of "
"checksumming I/Os [default is 200]\n");
(void) fprintf(stderr, "Specify an option more than once (e.g. -bb) "
Expand Down Expand Up @@ -548,7 +549,7 @@ get_metaslab_refcount(vdev_t *vd)
{
int refcount = 0;

if (vd->vdev_top == vd) {
if (vd->vdev_top == vd && !vd->vdev_removing) {
for (int m = 0; m < vd->vdev_ms_count; m++) {
space_map_t *sm = vd->vdev_ms[m]->ms_sm;

Expand Down Expand Up @@ -686,9 +687,10 @@ dump_metaslab(metaslab_t *msp)
* The space map histogram represents free space in chunks
* of sm_shift (i.e. bucket 0 refers to 2^sm_shift).
*/
(void) printf("\tOn-disk histogram:\n");
(void) printf("\tOn-disk histogram:\t\tfragmentation %llu\n",
(u_longlong_t)msp->ms_fragmentation);
dump_histogram(sm->sm_phys->smp_histogram,
SPACE_MAP_HISTOGRAM_SIZE(sm), sm->sm_shift);
SPACE_MAP_HISTOGRAM_SIZE, sm->sm_shift);
}

if (dump_opt['d'] > 5 || dump_opt['m'] > 3) {
Expand All @@ -712,6 +714,47 @@ print_vdev_metaslab_header(vdev_t *vd)
"---------------", "-------------");
}

/*
 * Print the metaslab group summary used by "zdb -M".
 *
 * For every top-level vdev whose metaslab group belongs to the pool's
 * normal class, print the vdev id, its metaslab count, the group's
 * fragmentation (or "-" when it is ZFS_FRAG_INVALID) and the group's
 * histogram.  Afterwards print the same fragmentation/histogram pair for
 * the class as a whole.
 *
 * Side effect: mg_fragmentation of each visited group is refreshed with
 * the value returned by metaslab_group_fragmentation().
 */
static void
dump_metaslab_groups(spa_t *spa)
{
	vdev_t *rvd = spa->spa_root_vdev;
	metaslab_class_t *mc = spa_normal_class(spa);
	uint64_t fragmentation;

	metaslab_class_histogram_verify(mc);

	for (int c = 0; c < rvd->vdev_children; c++) {
		vdev_t *tvd = rvd->vdev_child[c];
		metaslab_group_t *mg = tvd->vdev_mg;

		/*
		 * Skip vdevs outside the normal class.  The NULL test is
		 * defensive: a top-level vdev without a metaslab group
		 * has nothing to report and must not be dereferenced.
		 */
		if (mg == NULL || mg->mg_class != mc)
			continue;

		metaslab_group_histogram_verify(mg);

		/* Recompute so the value printed below is current. */
		mg->mg_fragmentation = metaslab_group_fragmentation(mg);

		(void) printf("\tvdev %10llu\t\tmetaslabs%5llu\t\t"
		    "fragmentation",
		    (u_longlong_t)tvd->vdev_id,
		    (u_longlong_t)tvd->vdev_ms_count);
		if (mg->mg_fragmentation == ZFS_FRAG_INVALID) {
			(void) printf("%3s\n", "-");
		} else {
			(void) printf("%3llu%%\n",
			    (u_longlong_t)mg->mg_fragmentation);
		}
		dump_histogram(mg->mg_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
	}

	/* Class-wide (pool) totals follow the per-vdev lines. */
	(void) printf("\tpool %s\tfragmentation", spa_name(spa));
	fragmentation = metaslab_class_fragmentation(mc);
	if (fragmentation == ZFS_FRAG_INVALID)
		(void) printf("\t%3s\n", "-");
	else
		(void) printf("\t%3llu%%\n", (u_longlong_t)fragmentation);
	dump_histogram(mc->mc_histogram, RANGE_TREE_HISTOGRAM_SIZE, 0);
}

static void
dump_metaslabs(spa_t *spa)
{
Expand Down Expand Up @@ -2340,8 +2383,7 @@ zdb_leak(void *arg, uint64_t start, uint64_t size)
}

static metaslab_ops_t zdb_metaslab_ops = {
NULL, /* alloc */
NULL /* fragmented */
NULL /* alloc */
};

static void
Expand Down Expand Up @@ -2836,6 +2878,8 @@ dump_zpool(spa_t *spa)

if (dump_opt['d'] > 2 || dump_opt['m'])
dump_metaslabs(spa);
if (dump_opt['M'])
dump_metaslab_groups(spa);

if (dump_opt['d'] || dump_opt['i']) {
dump_dir(dp->dp_meta_objset);
Expand Down Expand Up @@ -3330,7 +3374,7 @@ main(int argc, char **argv)
dprintf_setup(&argc, argv);

while ((c = getopt(argc, argv,
"bcdhilmM:suCDRSAFLXx:evp:t:U:P")) != -1) {
"bcdhilmMI:suCDRSAFLXx:evp:t:U:P")) != -1) {
switch (c) {
case 'b':
case 'c':
Expand All @@ -3343,6 +3387,7 @@ main(int argc, char **argv)
case 'u':
case 'C':
case 'D':
case 'M':
case 'R':
case 'S':
dump_opt[c]++;
Expand All @@ -3356,10 +3401,7 @@ main(int argc, char **argv)
case 'P':
dump_opt[c]++;
break;
case 'v':
verbose++;
break;
case 'M':
case 'I':
max_inflight = strtoull(optarg, NULL, 0);
if (max_inflight == 0) {
(void) fprintf(stderr, "maximum number "
Expand All @@ -3383,9 +3425,6 @@ main(int argc, char **argv)
}
searchdirs[nsearch++] = optarg;
break;
case 'x':
vn_dumpdir = optarg;
break;
case 't':
max_txg = strtoull(optarg, NULL, 0);
if (max_txg < TXG_INITIAL) {
Expand All @@ -3397,6 +3436,12 @@ main(int argc, char **argv)
case 'U':
spa_config_path = optarg;
break;
case 'v':
verbose++;
break;
case 'x':
vn_dumpdir = optarg;
break;
default:
usage();
break;
Expand Down
17 changes: 12 additions & 5 deletions usr/src/cmd/zpool/zpool_main.c
Original file line number Diff line number Diff line change
Expand Up @@ -2754,10 +2754,15 @@ print_one_column(zpool_prop_t prop, uint64_t value, boolean_t scripted)
boolean_t fixed;
size_t width = zprop_width(prop, &fixed, ZFS_TYPE_POOL);

zfs_nicenum(value, propval, sizeof (propval));

if (prop == ZPOOL_PROP_EXPANDSZ && value == 0)
(void) strlcpy(propval, "-", sizeof (propval));
else if (prop == ZPOOL_PROP_FRAGMENTATION && value == ZFS_FRAG_INVALID)
(void) strlcpy(propval, "-", sizeof (propval));
else if (prop == ZPOOL_PROP_FRAGMENTATION)
(void) snprintf(propval, sizeof (propval), "%llu%%", value);
else
zfs_nicenum(value, propval, sizeof (propval));

if (scripted)
(void) printf("\t%s", propval);
Expand Down Expand Up @@ -2790,16 +2795,18 @@ print_list_stats(zpool_handle_t *zhp, const char *name, nvlist_t *nv,
/* only toplevel vdevs have capacity stats */
if (vs->vs_space == 0) {
if (scripted)
(void) printf("\t-\t-\t-");
(void) printf("\t-\t-\t-\t-");
else
(void) printf(" - - -");
(void) printf(" - - - -");
} else {
print_one_column(ZPOOL_PROP_SIZE, vs->vs_space,
scripted);
print_one_column(ZPOOL_PROP_CAPACITY, vs->vs_alloc,
scripted);
print_one_column(ZPOOL_PROP_FREE,
vs->vs_space - vs->vs_alloc, scripted);
print_one_column(ZPOOL_PROP_FRAGMENTATION,
vs->vs_fragmentation, scripted);
}
print_one_column(ZPOOL_PROP_EXPANDSZ, vs->vs_esize,
scripted);
Expand Down Expand Up @@ -2885,8 +2892,8 @@ zpool_do_list(int argc, char **argv)
int ret;
list_cbdata_t cb = { 0 };
static char default_props[] =
"name,size,allocated,free,expandsize,capacity,dedupratio,"
"health,altroot";
"name,size,allocated,free,fragmentation,expandsize,capacity,"
"dedupratio,health,altroot";
char *props = default_props;
unsigned long interval = 0, count = 0;
zpool_list_t *list;
Expand Down
4 changes: 3 additions & 1 deletion usr/src/common/zfs/zpool_prop.c
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
/*
* Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
*/

#include <sys/zio.h>
Expand Down Expand Up @@ -87,6 +87,8 @@ zpool_prop_init(void)
PROP_READONLY, ZFS_TYPE_POOL, "<size>", "ALLOC");
zprop_register_number(ZPOOL_PROP_EXPANDSZ, "expandsize", 0,
PROP_READONLY, ZFS_TYPE_POOL, "<size>", "EXPANDSZ");
zprop_register_number(ZPOOL_PROP_FRAGMENTATION, "fragmentation", 0,
PROP_READONLY, ZFS_TYPE_POOL, "<percent>", "FRAG");
zprop_register_number(ZPOOL_PROP_CAPACITY, "capacity", 0, PROP_READONLY,
ZFS_TYPE_POOL, "<size>", "CAP");
zprop_register_number(ZPOOL_PROP_GUID, "guid", 0, PROP_READONLY,
Expand Down
8 changes: 8 additions & 0 deletions usr/src/lib/libzfs/common/libzfs_pool.c
Original file line number Diff line number Diff line change
Expand Up @@ -294,6 +294,14 @@ zpool_get_prop(zpool_handle_t *zhp, zpool_prop_t prop, char *buf, size_t len,
(u_longlong_t)intval);
}
break;
case ZPOOL_PROP_FRAGMENTATION:
if (intval == UINT64_MAX) {
(void) strlcpy(buf, "-", len);
} else {
(void) snprintf(buf, len, "%llu%%",
(u_longlong_t)intval);
}
break;

case ZPOOL_PROP_DEDUPRATIO:
(void) snprintf(buf, len, "%llu.%02llux",
Expand Down
23 changes: 19 additions & 4 deletions usr/src/man/man1m/zdb.1m
Original file line number Diff line number Diff line change
Expand Up @@ -19,16 +19,16 @@
\fBzdb\fR - Display zpool debugging and consistency information

.SH "SYNOPSIS"
\fBzdb\fR [-CumdibcsDvhLXFPA] [-e [-p \fIpath\fR...]] [-t \fItxg\fR]
[-U \fIcache\fR] [-M \fIinflight I/Os\fR] [-x \fIdumpdir\fR]
\fBzdb\fR [-CumdibcsDvhLMXFPA] [-e [-p \fIpath\fR...]] [-t \fItxg\fR]
[-U \fIcache\fR] [-I \fIinflight I/Os\fR] [-x \fIdumpdir\fR]
[\fIpoolname\fR [\fIobject\fR ...]]

.P
\fBzdb\fR [-divPA] [-e [-p \fIpath\fR...]] [-U \fIcache\fR]
\fIdataset\fR [\fIobject\fR ...]

.P
\fBzdb\fR -m [-LXFPA] [-t \fItxg\fR] [-e [-p \fIpath\fR...]] [-U \fIcache\fR]
\fBzdb\fR -m [-MLXFPA] [-t \fItxg\fR] [-e [-p \fIpath\fR...]] [-U \fIcache\fR]
\fIpoolname\fR [\fIvdev\fR [\fImetaslab\fR ...]]

.P
Expand Down Expand Up @@ -194,6 +194,21 @@ verifies that all non-free blocks are referenced, which can be very expensive.
.sp .6
.RS 4n
Display the offset, spacemap, and free space of each metaslab.
When specified twice, also display information about the on-disk free
space histogram associated with each metaslab. When specified three times,
display the maximum contiguous free space, the in-core free space histogram,
and the percentage of free space in each space map. When specified
four times display every spacemap record.
.RE

.sp
.ne 2
.na
\fB-M\fR
.ad
.sp .6
.RS 4n
Display metaslab group information for each top-level vdev in the pool's
normal class: the number of metaslabs, the fragmentation percentage, and the
free space histogram. The pool-wide fragmentation and free space histogram
are displayed as well.
Expand Down Expand Up @@ -380,7 +395,7 @@ transactions.
.sp
.ne 2
.na
\fB-M \fIinflight I/Os\fR \fR
\fB-I \fIinflight I/Os\fR \fR
.ad
.sp .6
.RS 4n
Expand Down
35 changes: 22 additions & 13 deletions usr/src/man/man1m/zpool.1m
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
'\" te
.\" Copyright (c) 2007, Sun Microsystems, Inc. All Rights Reserved.
.\" Copyright 2011, Nexenta Systems, Inc. All Rights Reserved.
.\" Copyright (c) 2012 by Delphix. All rights reserved.
.\" Copyright (c) 2013 by Delphix. All rights reserved.
.\" The contents of this file are subject to the terms of the Common Development
.\" and Distribution License (the "License"). You may not use this file except
.\" in compliance with the License. You can obtain a copy of the license at
Expand Down Expand Up @@ -570,6 +570,15 @@ any space on an EFI labeled vdev which has not been brought online
(i.e. zpool online -e). This space occurs when a LUN is dynamically expanded.
.RE

.sp
.ne 2
.na
\fB\fBfragmentation\fR\fR
.ad
.RS 20n
The amount of fragmentation in the pool.
.RE

.sp
.ne 2
.na
Expand Down Expand Up @@ -1648,7 +1657,7 @@ Display numbers in parsable (exact) values.
.RS 12n
Comma-separated list of properties to display. See the "Properties" section for
a list of valid properties. The default list is "name, size, allocated, free,
expandsize, capacity, dedupratio, health, altroot"
fragmentation, expandsize, capacity, dedupratio, health, altroot"
.RE

.sp
Expand Down Expand Up @@ -2035,10 +2044,10 @@ The results from this command are similar to the following:
.in +2
.nf
# \fBzpool list\fR
NAME SIZE ALLOC FREE EXPANDSZ CAP DEDUP HEALTH ALTROOT
rpool 19.9G 8.43G 11.4G - 42% 1.00x ONLINE -
tank 61.5G 20.0G 41.5G - 32% 1.00x ONLINE -
zion - - - - - - FAULTED -
NAME SIZE ALLOC FREE FRAG EXPANDSZ CAP DEDUP HEALTH ALTROOT
rpool 19.9G 8.43G 11.4G 33% - 42% 1.00x ONLINE -
tank 61.5G 20.0G 41.5G 48% - 32% 1.00x ONLINE -
zion - - - - - - - FAULTED -
.fi
.in -2
.sp
Expand Down Expand Up @@ -2259,20 +2268,20 @@ The command to remove the mirrored log \fBmirror-2\fR is:
.LP
The following command displays the detailed information for the \fIdata\fR
pool. This pool is comprised of a single \fIraidz\fR vdev where one of its
devices increased its capacity by 1GB. In this example, the pool will not
devices increased its capacity by 10GB. In this example, the pool will not
be able to utilize this extra capacity until all the devices under the
\fIraidz\fR vdev have been expanded.

.sp
.in +2
.nf
# \fBzpool list -v data\fR
NAME SIZE ALLOC FREE EXPANDSZ CAP DEDUP HEALTH ALTROOT
data 17.9G 174K 17.9G - 0% 1.00x ONLINE -
raidz1 17.9G 174K 17.9G -
c4t2d0 - - - 1G
c4t3d0 - - - -
c4t4d0 - - - -
NAME SIZE ALLOC FREE FRAG EXPANDSZ CAP DEDUP HEALTH ALTROOT
data 23.9G 14.6G 9.30G 48% - 61% 1.00x ONLINE -
raidz1 23.9G 14.6G 9.30G 48% -
c1t1d0 - - - - -
c1t2d0 - - - - 10G
c1t3d0 - - - - -
.fi
.in -2

Expand Down
Loading

0 comments on commit 2e4c998

Please sign in to comment.