Skip to content

Commit

Permalink
4873 zvol unmap calls can take a very long time for larger datasets
Browse files Browse the repository at this point in the history
Reviewed by: George Wilson <george@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Paul Dagnelie <paul.dagnelie@delphix.com>
Reviewed by: Basil Crow <basil.crow@delphix.com>
Reviewed by: Dan McDonald <danmcd@omniti.com>
Approved by: Robert Mustacchi <rm@joyent.com>
  • Loading branch information
Alex Reece authored and Christopher Siden committed Jul 26, 2014
1 parent d690b62 commit 0f6d88a
Show file tree
Hide file tree
Showing 9 changed files with 145 additions and 39 deletions.
13 changes: 13 additions & 0 deletions usr/src/cmd/sgs/rtld/common/util.c
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,10 @@
* Copyright (c) 1990, 2010, Oracle and/or its affiliates. All rights reserved.
*/

/*
* Copyright (c) 2014 by Delphix. All rights reserved.
*/

/*
* Utility routines for run-time linker. Some are duplicated here from libc
* (with different names) to avoid name space collisions.
Expand Down Expand Up @@ -3069,6 +3073,15 @@ assfail(const char *a, const char *f, int l)
(void) _lwp_kill(_lwp_self(), SIGABRT);
return (0);
}

/*
 * Report a failed three-part assertion (as produced by a "left op right"
 * style check) and then raise SIGABRT on the LWP identified by _lwp_self().
 *
 *	msg	text of the assertion expression
 *	a, b	the left and right operand values that were compared
 *	op	textual form of the comparison operator
 *	f, l	source file name and line number of the assertion
 */
void
assfail3(const char *msg, uintmax_t a, const char *op, uintmax_t b,
    const char *f, int l)
{
	/*
	 * The "%llu" conversions require unsigned long long arguments.
	 * uintmax_t is not guaranteed to be that exact type in every
	 * compilation environment, so convert explicitly rather than rely
	 * on the two types happening to share a representation.
	 */
	(void) printf("assertion failed: %s (%llu %s %llu), "
	    "file: %s, line: %d\n", msg, (unsigned long long)a, op,
	    (unsigned long long)b, f, l);
	(void) _lwp_kill(_lwp_self(), SIGABRT);
}
#endif

/*
Expand Down
28 changes: 28 additions & 0 deletions usr/src/common/avl/avl.c
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,10 @@
* Use is subject to license terms.
*/

/*
* Copyright (c) 2014 by Delphix. All rights reserved.
*/

/*
* AVL - generic AVL tree implementation for kernel use
*
Expand Down Expand Up @@ -85,6 +89,12 @@
* is a modified "avl_node_t *". The bottom bit (normally 0 for a
* pointer) is set to indicate that the new node has a value greater
* than the value of the indicated "avl_node_t *".
*
* Note - in addition to userland (e.g. libavl and libutil) and the kernel
* (e.g. genunix), avl.c is compiled into ld.so and kmdb's genunix module,
* which each have their own compilation environments and subsequent
* requirements. Each of these environments must be considered when adding
* dependencies from avl.c.
*/

#include <sys/types.h>
Expand Down Expand Up @@ -864,6 +874,24 @@ avl_update(avl_tree_t *t, void *obj)
return (B_FALSE);
}

/*
 * Exchange the contents of two AVL trees.
 *
 * Only the root pointer and the node count are traded; the comparator,
 * link offset and element size are asserted to already match, so after the
 * call each tree header describes the nodes the other one held before.
 */
void
avl_swap(avl_tree_t *tree1, avl_tree_t *tree2)
{
	avl_node_t *saved_root;
	ulong_t saved_count;

	/* Both trees must describe the same kind of element. */
	ASSERT3P(tree1->avl_compar, ==, tree2->avl_compar);
	ASSERT3U(tree1->avl_offset, ==, tree2->avl_offset);
	ASSERT3U(tree1->avl_size, ==, tree2->avl_size);

	/* Trade the root pointers. */
	saved_root = tree1->avl_root;
	tree1->avl_root = tree2->avl_root;
	tree2->avl_root = saved_root;

	/* Trade the node counts. */
	saved_count = tree1->avl_numnodes;
	tree1->avl_numnodes = tree2->avl_numnodes;
	tree2->avl_numnodes = saved_count;
}

/*
* initialize a new AVL tree
*/
Expand Down
2 changes: 2 additions & 0 deletions usr/src/lib/libavl/mapfile-vers
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
#
#
# Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
# Copyright (c) 2014 by Delphix. All rights reserved.
#

#
Expand Down Expand Up @@ -53,6 +54,7 @@ SYMBOL_VERSION SUNWprivate_1.1 {
avl_nearest;
avl_numnodes;
avl_remove;
avl_swap;
avl_walk;
local:
*;
Expand Down
52 changes: 35 additions & 17 deletions usr/src/uts/common/fs/zfs/dbuf.c
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,9 @@ dbuf_cons(void *vdb, void *unused, int kmflag)
mutex_init(&db->db_mtx, NULL, MUTEX_DEFAULT, NULL);
cv_init(&db->db_changed, NULL, CV_DEFAULT, NULL);
refcount_create(&db->db_holds);

db->db_creation = gethrtime();

return (0);
}

Expand Down Expand Up @@ -330,7 +333,7 @@ dbuf_verify(dmu_buf_impl_t *db)
ASSERT3U(db->db_level, <, dn->dn_nlevels);
ASSERT(db->db_blkid == DMU_BONUS_BLKID ||
db->db_blkid == DMU_SPILL_BLKID ||
!list_is_empty(&dn->dn_dbufs));
!avl_is_empty(&dn->dn_dbufs));
}
if (db->db_blkid == DMU_BONUS_BLKID) {
ASSERT(dn != NULL);
Expand Down Expand Up @@ -803,18 +806,30 @@ dbuf_unoverride(dbuf_dirty_record_t *dr)
* receive; see comment below for details.
*/
void
dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
dbuf_free_range(dnode_t *dn, uint64_t start_blkid, uint64_t end_blkid,
dmu_tx_t *tx)
{
dmu_buf_impl_t *db, *db_next;
dmu_buf_impl_t *db, *db_next, db_search;
uint64_t txg = tx->tx_txg;
avl_index_t where;

if (end > dn->dn_maxblkid && (end != DMU_SPILL_BLKID))
end = dn->dn_maxblkid;
dprintf_dnode(dn, "start=%llu end=%llu\n", start, end);
if (end_blkid > dn->dn_maxblkid && (end_blkid != DMU_SPILL_BLKID))
end_blkid = dn->dn_maxblkid;
dprintf_dnode(dn, "start=%llu end=%llu\n", start_blkid, end_blkid);

db_search.db_level = 0;
db_search.db_blkid = start_blkid;
db_search.db_creation = 0;

mutex_enter(&dn->dn_dbufs_mtx);
if (start >= dn->dn_unlisted_l0_blkid * dn->dn_datablksz) {
if (start_blkid >= dn->dn_unlisted_l0_blkid) {
/* There can't be any dbufs in this range; no need to search. */
#ifdef DEBUG
db = avl_find(&dn->dn_dbufs, &db_search, &where);
ASSERT3P(db, ==, NULL);
db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);
ASSERT(db == NULL || db->db_level > 0);
#endif
mutex_exit(&dn->dn_dbufs_mtx);
return;
} else if (dmu_objset_is_receiving(dn->dn_objset)) {
Expand All @@ -828,14 +843,18 @@ dbuf_free_range(dnode_t *dn, uint64_t start, uint64_t end, dmu_tx_t *tx)
atomic_inc_64(&zfs_free_range_recv_miss);
}

for (db = list_head(&dn->dn_dbufs); db != NULL; db = db_next) {
db_next = list_next(&dn->dn_dbufs, db);
db = avl_find(&dn->dn_dbufs, &db_search, &where);
ASSERT3P(db, ==, NULL);
db = avl_nearest(&dn->dn_dbufs, where, AVL_AFTER);

for (; db != NULL; db = db_next) {
db_next = AVL_NEXT(&dn->dn_dbufs, db);
ASSERT(db->db_blkid != DMU_BONUS_BLKID);

if (db->db_level != 0)
continue;
if (db->db_blkid < start || db->db_blkid > end)
continue;
if (db->db_level != 0 || db->db_blkid > end_blkid) {
break;
}
ASSERT3U(db->db_blkid, >=, start_blkid);

/* found a level 0 buffer in the range */
mutex_enter(&db->db_mtx);
Expand Down Expand Up @@ -1585,7 +1604,7 @@ dbuf_clear(dmu_buf_impl_t *db)
dn = DB_DNODE(db);
dndb = dn->dn_dbuf;
if (db->db_blkid != DMU_BONUS_BLKID && MUTEX_HELD(&dn->dn_dbufs_mtx)) {
list_remove(&dn->dn_dbufs, db);
avl_remove(&dn->dn_dbufs, db);
(void) atomic_dec_32_nv(&dn->dn_dbufs_count);
membar_producer();
DB_DNODE_EXIT(db);
Expand Down Expand Up @@ -1748,7 +1767,7 @@ dbuf_create(dnode_t *dn, uint8_t level, uint64_t blkid,
mutex_exit(&dn->dn_dbufs_mtx);
return (odb);
}
list_insert_head(&dn->dn_dbufs, db);
avl_add(&dn->dn_dbufs, db);
if (db->db_level == 0 && db->db_blkid >=
dn->dn_unlisted_l0_blkid)
dn->dn_unlisted_l0_blkid = db->db_blkid + 1;
Expand Down Expand Up @@ -1807,7 +1826,7 @@ dbuf_destroy(dmu_buf_impl_t *db)
DB_DNODE_ENTER(db);
dn = DB_DNODE(db);
mutex_enter(&dn->dn_dbufs_mtx);
list_remove(&dn->dn_dbufs, db);
avl_remove(&dn->dn_dbufs, db);
(void) atomic_dec_32_nv(&dn->dn_dbufs_count);
mutex_exit(&dn->dn_dbufs_mtx);
DB_DNODE_EXIT(db);
Expand All @@ -1825,7 +1844,6 @@ dbuf_destroy(dmu_buf_impl_t *db)
db->db_parent = NULL;
db->db_buf = NULL;

ASSERT(!list_link_active(&db->db_link));
ASSERT(db->db.db_data == NULL);
ASSERT(db->db_hash_next == NULL);
ASSERT(db->db_blkptr == NULL);
Expand Down
59 changes: 48 additions & 11 deletions usr/src/uts/common/fs/zfs/dnode.c
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,43 @@ int zfs_default_ibs = DN_MAX_INDBLKSHIFT;

static kmem_cbrc_t dnode_move(void *, void *, size_t, void *);

/*
 * AVL comparator for the dbufs hanging off a dnode's dn_dbufs tree.
 *
 * Ordering is by db_level, then db_blkid, then db_creation.  Returns -1, 0
 * or 1 in the usual comparator fashion.
 */
static int
dbuf_compare(const void *x1, const void *x2)
{
	const dmu_buf_impl_t *d1 = x1;
	const dmu_buf_impl_t *d2 = x2;

	/* Primary key: level. */
	if (d1->db_level != d2->db_level)
		return (d1->db_level < d2->db_level ? -1 : 1);

	/* Secondary key: block id. */
	if (d1->db_blkid != d2->db_blkid)
		return (d1->db_blkid < d2->db_blkid ? -1 : 1);

	/*
	 * Tie-breaker: creation time.
	 *
	 * A dbuf that is evicted while dn_dbufs_mtx is not held has its
	 * db_state set to DB_EVICTING but stays in dn_dbufs for a while.
	 * If another thread creates a dbuf with the same blkid before the
	 * old one is removed, dn_dbufs briefly holds two dbufs with equal
	 * level and blkid.  The AVL tree forbids duplicate keys, so the
	 * creation time is used to tell the two apart.
	 */
	if (d1->db_creation != d2->db_creation)
		return (d1->db_creation < d2->db_creation ? -1 : 1);

	/* All keys equal: this can only be the very same dbuf. */
	ASSERT3P(d1, ==, d2);
	return (0);
}

/* ARGSUSED */
static int
dnode_cons(void *arg, void *unused, int kmflag)
Expand Down Expand Up @@ -113,7 +150,7 @@ dnode_cons(void *arg, void *unused, int kmflag)

dn->dn_dbufs_count = 0;
dn->dn_unlisted_l0_blkid = 0;
list_create(&dn->dn_dbufs, sizeof (dmu_buf_impl_t),
avl_create(&dn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
offsetof(dmu_buf_impl_t, db_link));

dn->dn_moved = 0;
Expand Down Expand Up @@ -166,7 +203,7 @@ dnode_dest(void *arg, void *unused)

ASSERT0(dn->dn_dbufs_count);
ASSERT0(dn->dn_unlisted_l0_blkid);
list_destroy(&dn->dn_dbufs);
avl_destroy(&dn->dn_dbufs);
}

void
Expand Down Expand Up @@ -502,7 +539,7 @@ dnode_allocate(dnode_t *dn, dmu_object_type_t ot, int blocksize, int ibs,
ASSERT0(dn->dn_assigned_txg);
ASSERT(refcount_is_zero(&dn->dn_tx_holds));
ASSERT3U(refcount_count(&dn->dn_holds), <=, 1);
ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
ASSERT(avl_is_empty(&dn->dn_dbufs));

for (i = 0; i < TXG_SIZE; i++) {
ASSERT0(dn->dn_next_nblkptr[i]);
Expand Down Expand Up @@ -687,8 +724,8 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn)
ndn->dn_dirtyctx_firstset = odn->dn_dirtyctx_firstset;
ASSERT(refcount_count(&odn->dn_tx_holds) == 0);
refcount_transfer(&ndn->dn_holds, &odn->dn_holds);
ASSERT(list_is_empty(&ndn->dn_dbufs));
list_move_tail(&ndn->dn_dbufs, &odn->dn_dbufs);
ASSERT(avl_is_empty(&ndn->dn_dbufs));
avl_swap(&ndn->dn_dbufs, &odn->dn_dbufs);
ndn->dn_dbufs_count = odn->dn_dbufs_count;
ndn->dn_unlisted_l0_blkid = odn->dn_unlisted_l0_blkid;
ndn->dn_bonus = odn->dn_bonus;
Expand Down Expand Up @@ -722,7 +759,7 @@ dnode_move_impl(dnode_t *odn, dnode_t *ndn)
*/
odn->dn_dbuf = NULL;
odn->dn_handle = NULL;
list_create(&odn->dn_dbufs, sizeof (dmu_buf_impl_t),
avl_create(&odn->dn_dbufs, dbuf_compare, sizeof (dmu_buf_impl_t),
offsetof(dmu_buf_impl_t, db_link));
odn->dn_dbufs_count = 0;
odn->dn_unlisted_l0_blkid = 0;
Expand Down Expand Up @@ -1231,7 +1268,8 @@ dnode_setdirty(dnode_t *dn, dmu_tx_t *tx)
return;
}

ASSERT(!refcount_is_zero(&dn->dn_holds) || list_head(&dn->dn_dbufs));
ASSERT(!refcount_is_zero(&dn->dn_holds) ||
!avl_is_empty(&dn->dn_dbufs));
ASSERT(dn->dn_datablksz != 0);
ASSERT0(dn->dn_next_bonuslen[txg&TXG_MASK]);
ASSERT0(dn->dn_next_blksz[txg&TXG_MASK]);
Expand Down Expand Up @@ -1304,7 +1342,7 @@ dnode_free(dnode_t *dn, dmu_tx_t *tx)
int
dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
{
dmu_buf_impl_t *db, *db_next;
dmu_buf_impl_t *db;
int err;

if (size == 0)
Expand All @@ -1327,9 +1365,8 @@ dnode_set_blksz(dnode_t *dn, uint64_t size, int ibs, dmu_tx_t *tx)
goto fail;

mutex_enter(&dn->dn_dbufs_mtx);
for (db = list_head(&dn->dn_dbufs); db; db = db_next) {
db_next = list_next(&dn->dn_dbufs, db);

for (db = avl_first(&dn->dn_dbufs); db != NULL;
db = AVL_NEXT(&dn->dn_dbufs, db)) {
if (db->db_blkid != 0 && db->db_blkid != DMU_BONUS_BLKID &&
db->db_blkid != DMU_SPILL_BLKID) {
mutex_exit(&dn->dn_dbufs_mtx);
Expand Down
12 changes: 4 additions & 8 deletions usr/src/uts/common/fs/zfs/dnode_sync.c
Original file line number Diff line number Diff line change
Expand Up @@ -400,16 +400,13 @@ dnode_evict_dbufs(dnode_t *dn)
int pass = 0;

do {
dmu_buf_impl_t *db, marker;
dmu_buf_impl_t *db, *db_next;
int evicting = FALSE;

progress = FALSE;
mutex_enter(&dn->dn_dbufs_mtx);
list_insert_tail(&dn->dn_dbufs, &marker);
db = list_head(&dn->dn_dbufs);
for (; db != &marker; db = list_head(&dn->dn_dbufs)) {
list_remove(&dn->dn_dbufs, db);
list_insert_tail(&dn->dn_dbufs, db);
for (db = avl_first(&dn->dn_dbufs); db != NULL; db = db_next) {
db_next = AVL_NEXT(&dn->dn_dbufs, db);
#ifdef DEBUG
DB_DNODE_ENTER(db);
ASSERT3P(DB_DNODE(db), ==, dn);
Expand All @@ -429,7 +426,6 @@ dnode_evict_dbufs(dnode_t *dn)
}

}
list_remove(&dn->dn_dbufs, &marker);
/*
* NB: we need to drop dn_dbufs_mtx between passes so
* that any DB_EVICTING dbufs can make progress.
Expand Down Expand Up @@ -500,7 +496,7 @@ dnode_sync_free(dnode_t *dn, dmu_tx_t *tx)

dnode_undirty_dbufs(&dn->dn_dirty_records[txgoff]);
dnode_evict_dbufs(dn);
ASSERT3P(list_head(&dn->dn_dbufs), ==, NULL);
ASSERT(avl_is_empty(&dn->dn_dbufs));
ASSERT3P(dn->dn_bonus, ==, NULL);

/*
Expand Down
7 changes: 5 additions & 2 deletions usr/src/uts/common/fs/zfs/sys/dbuf.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2013 by Delphix. All rights reserved.
* Copyright (c) 2012, 2014 by Delphix. All rights reserved.
* Copyright (c) 2013 by Saso Kiselkov. All rights reserved.
*/

Expand Down Expand Up @@ -213,11 +213,14 @@ typedef struct dmu_buf_impl {
/* pointer to most recent dirty record for this buffer */
dbuf_dirty_record_t *db_last_dirty;

/* Creation time of dbuf (see comment in dbuf_compare). */
hrtime_t db_creation;

/*
* Our link on the owner dnode's dn_dbufs list.
* Protected by its dn_dbufs_mtx.
*/
list_node_t db_link;
avl_node_t db_link;

/* Data which is unique to data (leaf) blocks: */

Expand Down
Loading

0 comments on commit 0f6d88a

Please sign in to comment.