Skip to content

Commit

Permalink
3835 zfs need not store 2 copies of all metadata
Browse files Browse the repository at this point in the history
Reviewed by: George Wilson <george.wilson@delphix.com>
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Dan McDonald <danmcd@omniti.com>
Approved by: Richard Lowe <richlowe@richlowe.net>
  • Loading branch information
ahrens authored and Christopher Siden committed May 23, 2014
1 parent e56bd28 commit edf345e
Show file tree
Hide file tree
Showing 7 changed files with 107 additions and 23 deletions.
13 changes: 12 additions & 1 deletion usr/src/common/zfs/zfs_prop.c
Expand Up @@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright (c) 2011, 2014 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
*/
Expand Down Expand Up @@ -198,7 +198,18 @@ zfs_prop_init(void)
{ NULL }
};

/*
 * Index table for the "redundant_metadata" property: pairs each
 * user-visible value name ("all", "most") with its
 * ZFS_REDUNDANT_METADATA_* constant.  Like the other index tables in
 * zfs_prop_init(), it ends with a { NULL } entry.
 */
static zprop_index_t redundant_metadata_table[] = {
{ "all", ZFS_REDUNDANT_METADATA_ALL },
{ "most", ZFS_REDUNDANT_METADATA_MOST },
{ NULL }
};

/* inherit index properties */
zprop_register_index(ZFS_PROP_REDUNDANT_METADATA, "redundant_metadata",
ZFS_REDUNDANT_METADATA_ALL,
PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
"all | most", "REDUND_MD",
redundant_metadata_table);
zprop_register_index(ZFS_PROP_SYNC, "sync", ZFS_SYNC_STANDARD,
PROP_INHERIT, ZFS_TYPE_FILESYSTEM | ZFS_TYPE_VOLUME,
"standard | always | disabled", "SYNC",
Expand Down
33 changes: 32 additions & 1 deletion usr/src/man/man1m/zfs.1m
Expand Up @@ -22,7 +22,7 @@
.\"
.\" Copyright (c) 2009 Sun Microsystems, Inc. All Rights Reserved.
.\" Copyright 2011 Joshua M. Clulow <josh@sysmgr.org>
.\" Copyright (c) 2013 by Delphix. All rights reserved.
.\" Copyright (c) 2014 by Delphix. All rights reserved.
.\" Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
.\" Copyright 2013 Nexenta Systems, Inc. All Rights Reserved.
.\" Copyright (c) 2014, Joyent, Inc. All rights reserved.
Expand Down Expand Up @@ -1253,6 +1253,37 @@ This property can also be referred to by its shortened column name,
\fBrecsize\fR.
.RE

.sp
.ne 2
.na
\fB\fBredundant_metadata\fR=\fBall\fR | \fBmost\fR\fR
.ad
.sp .6
.RS 4n
Controls what types of metadata are stored redundantly. ZFS stores an
extra copy of metadata, so that if a single block is corrupted, the
amount of user data lost is limited. This extra copy is in addition to
any redundancy provided at the pool level (e.g. by mirroring or RAID-Z),
and is in addition to an extra copy specified by the \fBcopies\fR
property (up to a total of 3 copies).  For example, if the pool is
mirrored, \fBcopies\fR=2, and \fBredundant_metadata\fR=\fBmost\fR, then ZFS
stores 6 copies of most metadata, and 4 copies of data and some
metadata.
.sp
When set to \fBall\fR, ZFS stores an extra copy of all metadata. If a
single on-disk block is corrupt, at worst a single block of user data
(which is \fBrecordsize\fR bytes long) can be lost.
.sp
When set to \fBmost\fR, ZFS stores an extra copy of most types of
metadata. This can improve performance of random writes, because less
metadata must be written. In practice, at worst about 100 blocks (of
\fBrecordsize\fR bytes each) of user data can be lost if a single
on-disk block is corrupt. The exact behavior of which metadata blocks
are stored redundantly may change in future releases.
.sp
The default value is \fBall\fR.
.RE

.sp
.ne 2
.na
Expand Down
17 changes: 15 additions & 2 deletions usr/src/uts/common/fs/zfs/dmu.c
Expand Up @@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
*/
/* Copyright (c) 2013 by Saso Kiselkov. All rights reserved. */
/* Copyright (c) 2013, Joyent, Inc. All rights reserved. */
Expand Down Expand Up @@ -1558,6 +1558,12 @@ dmu_object_set_compress(objset_t *os, uint64_t object, uint8_t compress,

int zfs_mdcomp_disable = 0;

/*
 * Level threshold consulted by dmu_write_policy() when the
 * "redundant_metadata" property is set to "most".  In that mode the
 * additional ditto copy is requested only if the block's level is at
 * least this value, or the block's object type satisfies
 * DMU_OT_IS_METADATA(), or the write is for a spill block (WP_SPILL).
 * When the property is "all", this value is not consulted.
 */
int zfs_redundant_metadata_most_ditto_level = 2;

void
dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
{
Expand Down Expand Up @@ -1597,6 +1603,13 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
if (zio_checksum_table[checksum].ci_correctable < 1 ||
zio_checksum_table[checksum].ci_eck)
checksum = ZIO_CHECKSUM_FLETCHER_4;

if (os->os_redundant_metadata == ZFS_REDUNDANT_METADATA_ALL ||
(os->os_redundant_metadata ==
ZFS_REDUNDANT_METADATA_MOST &&
(level >= zfs_redundant_metadata_most_ditto_level ||
DMU_OT_IS_METADATA(type) || (wp & WP_SPILL))))
copies++;
} else if (wp & WP_NOFILL) {
ASSERT(level == 0);

Expand Down Expand Up @@ -1644,7 +1657,7 @@ dmu_write_policy(objset_t *os, dnode_t *dn, int level, int wp, zio_prop_t *zp)
zp->zp_compress = compress;
zp->zp_type = (wp & WP_SPILL) ? dn->dn_bonustype : type;
zp->zp_level = level;
zp->zp_copies = MIN(copies + ismd, spa_max_replication(os->os_spa));
zp->zp_copies = MIN(copies, spa_max_replication(os->os_spa));
zp->zp_dedup = dedup;
zp->zp_dedup_verify = dedup && dedup_verify;
zp->zp_nopwrite = nopwrite;
Expand Down
35 changes: 29 additions & 6 deletions usr/src/uts/common/fs/zfs/dmu_objset.c
Expand Up @@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
*/
Expand Down Expand Up @@ -115,13 +115,13 @@ dmu_objset_id(objset_t *os)
return (ds ? ds->ds_object : 0);
}

uint64_t
zfs_sync_type_t
dmu_objset_syncprop(objset_t *os)
{
return (os->os_sync);
}

uint64_t
zfs_logbias_op_t
dmu_objset_logbias(objset_t *os)
{
return (os->os_logbias);
Expand Down Expand Up @@ -229,6 +229,20 @@ sync_changed_cb(void *arg, uint64_t newval)
zil_set_sync(os->os_zil, newval);
}

/*
 * Property-change callback for "redundant_metadata".  'arg' is the
 * objset the callback was registered for; 'newval' is the new property
 * value, which is recorded in the objset.
 */
static void
redundant_metadata_changed_cb(void *arg, uint64_t newval)
{
	objset_t *objset = arg;

	/*
	 * The value is expected to have been resolved through inheritance
	 * and range-checked before this callback runs, so only the two
	 * defined settings should ever be seen here.
	 */
	ASSERT(newval == ZFS_REDUNDANT_METADATA_ALL ||
	    newval == ZFS_REDUNDANT_METADATA_MOST);

	objset->os_redundant_metadata = newval;
}

static void
logbias_changed_cb(void *arg, uint64_t newval)
{
Expand Down Expand Up @@ -364,6 +378,12 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
zfs_prop_to_name(ZFS_PROP_SYNC),
sync_changed_cb, os);
}
if (err == 0) {
err = dsl_prop_register(ds,
zfs_prop_to_name(
ZFS_PROP_REDUNDANT_METADATA),
redundant_metadata_changed_cb, os);
}
}
if (err != 0) {
VERIFY(arc_buf_remove_ref(os->os_phys_buf,
Expand All @@ -377,9 +397,9 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp,
os->os_compress = ZIO_COMPRESS_LZJB;
os->os_copies = spa_max_replication(spa);
os->os_dedup_checksum = ZIO_CHECKSUM_OFF;
os->os_dedup_verify = 0;
os->os_logbias = 0;
os->os_sync = 0;
os->os_dedup_verify = B_FALSE;
os->os_logbias = ZFS_LOGBIAS_LATENCY;
os->os_sync = ZFS_SYNC_STANDARD;
os->os_primary_cache = ZFS_CACHE_ALL;
os->os_secondary_cache = ZFS_CACHE_ALL;
}
Expand Down Expand Up @@ -622,6 +642,9 @@ dmu_objset_evict(objset_t *os)
VERIFY0(dsl_prop_unregister(ds,
zfs_prop_to_name(ZFS_PROP_SYNC),
sync_changed_cb, os));
VERIFY0(dsl_prop_unregister(ds,
zfs_prop_to_name(ZFS_PROP_REDUNDANT_METADATA),
redundant_metadata_changed_cb, os));
}
VERIFY0(dsl_prop_unregister(ds,
zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),
Expand Down
6 changes: 3 additions & 3 deletions usr/src/uts/common/fs/zfs/sys/dmu.h
Expand Up @@ -21,7 +21,7 @@

/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright (c) 2011, 2014 by Delphix. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2012, Joyent, Inc. All rights reserved.
* Copyright 2013 DEY Storage Systems, Inc.
Expand Down Expand Up @@ -737,8 +737,8 @@ extern struct dsl_dataset *dmu_objset_ds(objset_t *os);
extern void dmu_objset_name(objset_t *os, char *buf);
extern dmu_objset_type_t dmu_objset_type(objset_t *os);
extern uint64_t dmu_objset_id(objset_t *os);
extern uint64_t dmu_objset_syncprop(objset_t *os);
extern uint64_t dmu_objset_logbias(objset_t *os);
extern zfs_sync_type_t dmu_objset_syncprop(objset_t *os);
extern zfs_logbias_op_t dmu_objset_logbias(objset_t *os);
extern int dmu_snapshot_list_next(objset_t *os, int namelen, char *name,
uint64_t *id, uint64_t *offp, boolean_t *case_conflict);
extern int dmu_snapshot_realname(objset_t *os, char *name, char *real,
Expand Down
19 changes: 10 additions & 9 deletions usr/src/uts/common/fs/zfs/sys/dmu_objset.h
Expand Up @@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
*/

Expand Down Expand Up @@ -85,15 +85,16 @@ struct objset {
zilog_t *os_zil;

/* can change, under dsl_dir's locks: */
uint8_t os_checksum;
uint8_t os_compress;
enum zio_checksum os_checksum;
enum zio_compress os_compress;
uint8_t os_copies;
uint8_t os_dedup_checksum;
uint8_t os_dedup_verify;
uint8_t os_logbias;
uint8_t os_primary_cache;
uint8_t os_secondary_cache;
uint8_t os_sync;
enum zio_checksum os_dedup_checksum;
boolean_t os_dedup_verify;
zfs_logbias_op_t os_logbias;
zfs_cache_type_t os_primary_cache;
zfs_cache_type_t os_secondary_cache;
zfs_sync_type_t os_sync;
zfs_redundant_metadata_type_t os_redundant_metadata;

/* no lock needed: */
struct dmu_tx *os_synctx; /* XXX sketchy */
Expand Down
7 changes: 6 additions & 1 deletion usr/src/uts/common/sys/fs/zfs.h
Expand Up @@ -21,7 +21,7 @@

/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright (c) 2011, 2014 by Delphix. All rights reserved.
* Copyright 2011 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2013, Joyent, Inc. All rights reserved.
*/
Expand Down Expand Up @@ -146,6 +146,7 @@ typedef enum {
ZFS_PROP_SNAPSHOT_LIMIT,
ZFS_PROP_FILESYSTEM_COUNT,
ZFS_PROP_SNAPSHOT_COUNT,
ZFS_PROP_REDUNDANT_METADATA,
ZFS_NUM_PROPS
} zfs_prop_t;

Expand Down Expand Up @@ -339,6 +340,10 @@ typedef enum {
ZFS_SYNC_DISABLED = 2
} zfs_sync_type_t;

/*
 * Settings of the "redundant_metadata" property.  The numeric values are
 * spelled out explicitly so they cannot shift if entries are reordered.
 */
typedef enum {
	ZFS_REDUNDANT_METADATA_ALL = 0,		/* extra copy of all metadata */
	ZFS_REDUNDANT_METADATA_MOST = 1		/* extra copy of most metadata */
} zfs_redundant_metadata_type_t;

/*
* On-disk version number.
Expand Down

0 comments on commit edf345e

Please sign in to comment.