7090 zfs should improve allocation order and throttle allocations
Reviewed by: Adam Leventhal <ahl@delphix.com>
Reviewed by: Alex Reece <alex@delphix.com>
Reviewed by: Christopher Siden <christopher.siden@delphix.com>
Reviewed by: Dan Kimmel <dan.kimmel@delphix.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Paul Dagnelie <paul.dagnelie@delphix.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Reviewed by: Sebastien Roy <sebastien.roy@delphix.com>
Approved by: Robert Mustacchi <rm@joyent.com>
grwilson authored and ahrens committed Aug 2, 2016
1 parent c79a72d commit 0f7643c
Showing 17 changed files with 1,012 additions and 209 deletions.
329 changes: 278 additions & 51 deletions usr/src/uts/common/fs/zfs/metaslab.c

Large diffs are not rendered by default.

65 changes: 64 additions & 1 deletion usr/src/uts/common/fs/zfs/refcount.c
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2012 by Delphix. All rights reserved.
* Copyright (c) 2012, 2015 by Delphix. All rights reserved.
*/

#include <sys/zfs_context.h>
@@ -68,6 +68,13 @@ refcount_create(refcount_t *rc)
rc->rc_tracked = reference_tracking_enable;
}

void
refcount_create_tracked(refcount_t *rc)
{
refcount_create(rc);
rc->rc_tracked = B_TRUE;
}

void
refcount_create_untracked(refcount_t *rc)
{
@@ -251,4 +258,60 @@ refcount_transfer_ownership(refcount_t *rc, void *current_holder,
ASSERT(found);
mutex_exit(&rc->rc_mtx);
}

/*
* If tracking is enabled, return true if a reference exists that matches
* the "holder" tag. If tracking is disabled, then return true if a reference
* might be held.
*/
boolean_t
refcount_held(refcount_t *rc, void *holder)
{
reference_t *ref;

mutex_enter(&rc->rc_mtx);

if (!rc->rc_tracked) {
mutex_exit(&rc->rc_mtx);
return (rc->rc_count > 0);
}

for (ref = list_head(&rc->rc_list); ref;
ref = list_next(&rc->rc_list, ref)) {
if (ref->ref_holder == holder) {
mutex_exit(&rc->rc_mtx);
return (B_TRUE);
}
}
mutex_exit(&rc->rc_mtx);
return (B_FALSE);
}

/*
* If tracking is enabled, return true if a reference does not exist that
* matches the "holder" tag. If tracking is disabled, always return true
* since the reference might not be held.
*/
boolean_t
refcount_not_held(refcount_t *rc, void *holder)
{
reference_t *ref;

mutex_enter(&rc->rc_mtx);

if (!rc->rc_tracked) {
mutex_exit(&rc->rc_mtx);
return (B_TRUE);
}

for (ref = list_head(&rc->rc_list); ref;
ref = list_next(&rc->rc_list, ref)) {
if (ref->ref_holder == holder) {
mutex_exit(&rc->rc_mtx);
return (B_FALSE);
}
}
mutex_exit(&rc->rc_mtx);
return (B_TRUE);
}
#endif /* ZFS_DEBUG */
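
The two helpers above are meant for assertions: with reference tracking compiled in they answer exactly, and with tracking disabled they err on the side of not firing the assertion. A minimal usage sketch follows; the wrapper functions and the "tag" argument are illustrative only and are not part of this commit:

/*
 * Illustrative only: assert that "tag" does not already hold a reference
 * before taking one, and that it still holds one before releasing it.
 * With tracking disabled, refcount_not_held() always returns B_TRUE and
 * refcount_held() returns B_TRUE whenever the count is nonzero, so
 * neither assertion can fire spuriously.
 */
static void
take_ref_sketch(refcount_t *rc, void *tag)
{
	ASSERT(refcount_not_held(rc, tag));
	(void) refcount_add(rc, tag);
}

static void
release_ref_sketch(refcount_t *rc, void *tag)
{
	ASSERT(refcount_held(rc, tag));
	(void) refcount_remove(rc, tag);
}
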
47 changes: 45 additions & 2 deletions usr/src/uts/common/fs/zfs/spa.c
@@ -1275,7 +1275,6 @@ spa_unload(spa_t *spa)

ddt_unload(spa);


/*
* Drop and purge level 2 cache
*/
@@ -3634,6 +3633,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
spa->spa_uberblock.ub_txg = txg - 1;
spa->spa_uberblock.ub_version = version;
spa->spa_ubsync = spa->spa_uberblock;
spa->spa_load_state = SPA_LOAD_CREATE;

/*
* Create "The Godfather" zio to hold all async IOs
@@ -3818,6 +3818,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props,
*/
spa_evicting_os_wait(spa);
spa->spa_minref = refcount_count(&spa->spa_refcount);
spa->spa_load_state = SPA_LOAD_NONE;

mutex_exit(&spa_namespace_lock);

@@ -5321,7 +5322,7 @@ spa_nvlist_lookup_by_guid(nvlist_t **nvpp, int count, uint64_t target_guid)

static void
spa_vdev_remove_aux(nvlist_t *config, char *name, nvlist_t **dev, int count,
nvlist_t *dev_to_remove)
nvlist_t *dev_to_remove)
{
nvlist_t **newdev = NULL;

@@ -6483,6 +6484,8 @@ spa_sync(spa_t *spa, uint64_t txg)
vdev_t *vd;
dmu_tx_t *tx;
int error;
uint32_t max_queue_depth = zfs_vdev_async_write_max_active *
zfs_vdev_queue_depth_pct / 100;

VERIFY(spa_writeable(spa));

@@ -6494,6 +6497,10 @@ spa_sync(spa_t *spa, uint64_t txg)
spa->spa_syncing_txg = txg;
spa->spa_sync_pass = 0;

mutex_enter(&spa->spa_alloc_lock);
VERIFY0(avl_numnodes(&spa->spa_alloc_tree));
mutex_exit(&spa->spa_alloc_lock);

/*
* If there are any pending vdev state changes, convert them
* into config changes that go out with this transaction group.
@@ -6545,6 +6552,38 @@ spa_sync(spa_t *spa, uint64_t txg)
}
}

/*
* Set the top-level vdev's max queue depth. Evaluate each
* top-level vdev's async write queue depth in case it changed.
* The max queue depth will not change in the middle of syncing
* out this txg.
*/
uint64_t queue_depth_total = 0;
for (int c = 0; c < rvd->vdev_children; c++) {
vdev_t *tvd = rvd->vdev_child[c];
metaslab_group_t *mg = tvd->vdev_mg;

if (mg == NULL || mg->mg_class != spa_normal_class(spa) ||
!metaslab_group_initialized(mg))
continue;

/*
* It is safe to do a lock-free check here because only async
* allocations look at mg_max_alloc_queue_depth, and async
* allocations all happen from spa_sync().
*/
ASSERT0(refcount_count(&mg->mg_alloc_queue_depth));
mg->mg_max_alloc_queue_depth = max_queue_depth;
queue_depth_total += mg->mg_max_alloc_queue_depth;
}
metaslab_class_t *mc = spa_normal_class(spa);
ASSERT0(refcount_count(&mc->mc_alloc_slots));
mc->mc_alloc_max_slots = queue_depth_total;
mc->mc_alloc_throttle_enabled = zio_dva_throttle_enabled;

ASSERT3U(mc->mc_alloc_max_slots, <=,
max_queue_depth * rvd->vdev_children);

/*
* Iterate to convergence.
*/
@@ -6696,6 +6735,10 @@ spa_sync(spa_t *spa, uint64_t txg)

dsl_pool_sync_done(dp, txg);

mutex_enter(&spa->spa_alloc_lock);
VERIFY0(avl_numnodes(&spa->spa_alloc_tree));
mutex_exit(&spa->spa_alloc_lock);

/*
* Update usable space statistics.
*/
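
For reference, the queue-depth arithmetic added at the top of spa_sync() works out as follows. This is a standalone userland sketch, not kernel code; the tunable values and the vdev count are assumed for illustration and are not taken from this diff:

#include <stdio.h>
#include <stdint.h>

int
main(void)
{
	/* Assumed defaults, for illustration only. */
	uint32_t zfs_vdev_async_write_max_active = 10;
	uint32_t zfs_vdev_queue_depth_pct = 1000;
	int top_level_vdevs = 4;		/* hypothetical pool layout */

	/* Same expression as the max_queue_depth computation in spa_sync(). */
	uint32_t max_queue_depth = zfs_vdev_async_write_max_active *
	    zfs_vdev_queue_depth_pct / 100;

	/* Each allocatable top-level vdev contributes its cap to the total. */
	uint64_t queue_depth_total =
	    (uint64_t)top_level_vdevs * max_queue_depth;

	printf("mg_max_alloc_queue_depth per vdev: %u\n", max_queue_depth);
	printf("mc_alloc_max_slots for the pool:   %llu\n",
	    (unsigned long long)queue_depth_total);
	return (0);
}

With those assumed values each top-level vdev gets a cap of 100 queued allocations and the pool-wide total is 400, which is consistent with the ASSERT3U bound in the hunk above.
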
6 changes: 6 additions & 0 deletions usr/src/uts/common/fs/zfs/spa_misc.c
@@ -570,6 +570,7 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
mutex_init(&spa->spa_suspend_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_vdev_top_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_iokstat_lock, NULL, MUTEX_DEFAULT, NULL);
mutex_init(&spa->spa_alloc_lock, NULL, MUTEX_DEFAULT, NULL);

cv_init(&spa->spa_async_cv, NULL, CV_DEFAULT, NULL);
cv_init(&spa->spa_evicting_os_cv, NULL, CV_DEFAULT, NULL);
@@ -619,6 +620,9 @@ spa_add(const char *name, nvlist_t *config, const char *altroot)
spa_active_count++;
}

avl_create(&spa->spa_alloc_tree, zio_timestamp_compare,
sizeof (zio_t), offsetof(zio_t, io_alloc_node));

/*
* Every pool starts with the default cachefile
*/
@@ -704,6 +708,7 @@ spa_remove(spa_t *spa)
kmem_free(dp, sizeof (spa_config_dirent_t));
}

avl_destroy(&spa->spa_alloc_tree);
list_destroy(&spa->spa_config_list);

nvlist_free(spa->spa_label_features);
@@ -734,6 +739,7 @@ spa_remove(spa_t *spa)
cv_destroy(&spa->spa_scrub_io_cv);
cv_destroy(&spa->spa_suspend_cv);

mutex_destroy(&spa->spa_alloc_lock);
mutex_destroy(&spa->spa_async_lock);
mutex_destroy(&spa->spa_errlist_lock);
mutex_destroy(&spa->spa_errlog_lock);
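
spa_alloc_tree holds the throttled allocation zios in issue order; the zio_timestamp_compare comparator it is created with is defined in zio.c, which is not among the hunks shown here. Purely as a sketch of the shape such a comparator takes — assuming the zios are ordered by the time they were queued (io_queued_timestamp is an assumed field name) with pointer identity as the final tiebreaker so the AVL keys stay unique:

/*
 * Sketch only; not the actual zio_timestamp_compare from this commit.
 */
static int
zio_timestamp_compare_sketch(const void *x1, const void *x2)
{
	const zio_t *z1 = x1;
	const zio_t *z2 = x2;

	if (z1->io_queued_timestamp < z2->io_queued_timestamp)
		return (-1);
	if (z1->io_queued_timestamp > z2->io_queued_timestamp)
		return (1);

	/* Equal timestamps: fall back to the pointers themselves. */
	if ((uintptr_t)z1 < (uintptr_t)z2)
		return (-1);
	if ((uintptr_t)z1 > (uintptr_t)z2)
		return (1);
	return (0);
}
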
21 changes: 14 additions & 7 deletions usr/src/uts/common/fs/zfs/sys/metaslab.h
@@ -20,7 +20,7 @@
*/
/*
* Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
* Copyright (c) 2011, 2014 by Delphix. All rights reserved.
* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
*/

#ifndef _SYS_METASLAB_H
@@ -55,14 +55,15 @@ void metaslab_sync_done(metaslab_t *, uint64_t);
void metaslab_sync_reassess(metaslab_group_t *);
uint64_t metaslab_block_maxsize(metaslab_t *);

#define METASLAB_HINTBP_FAVOR 0x0
#define METASLAB_HINTBP_AVOID 0x1
#define METASLAB_GANG_HEADER 0x2
#define METASLAB_GANG_CHILD 0x4
#define METASLAB_GANG_AVOID 0x8
#define METASLAB_HINTBP_FAVOR 0x0
#define METASLAB_HINTBP_AVOID 0x1
#define METASLAB_GANG_HEADER 0x2
#define METASLAB_GANG_CHILD 0x4
#define METASLAB_ASYNC_ALLOC 0x8
#define METASLAB_DONT_THROTTLE 0x10

int metaslab_alloc(spa_t *, metaslab_class_t *, uint64_t,
blkptr_t *, int, uint64_t, blkptr_t *, int);
blkptr_t *, int, uint64_t, blkptr_t *, int, zio_t *);
void metaslab_free(spa_t *, const blkptr_t *, uint64_t, boolean_t);
int metaslab_claim(spa_t *, const blkptr_t *, uint64_t);
void metaslab_check_free(spa_t *, const blkptr_t *);
@@ -73,6 +74,9 @@ int metaslab_class_validate(metaslab_class_t *);
void metaslab_class_histogram_verify(metaslab_class_t *);
uint64_t metaslab_class_fragmentation(metaslab_class_t *);
uint64_t metaslab_class_expandable_space(metaslab_class_t *);
boolean_t metaslab_class_throttle_reserve(metaslab_class_t *, int,
zio_t *, int);
void metaslab_class_throttle_unreserve(metaslab_class_t *, int, zio_t *);

void metaslab_class_space_update(metaslab_class_t *, int64_t, int64_t,
int64_t, int64_t);
@@ -85,10 +89,13 @@ metaslab_group_t *metaslab_group_create(metaslab_class_t *, vdev_t *);
void metaslab_group_destroy(metaslab_group_t *);
void metaslab_group_activate(metaslab_group_t *);
void metaslab_group_passivate(metaslab_group_t *);
boolean_t metaslab_group_initialized(metaslab_group_t *);
uint64_t metaslab_group_get_space(metaslab_group_t *);
void metaslab_group_histogram_verify(metaslab_group_t *);
uint64_t metaslab_group_fragmentation(metaslab_group_t *);
void metaslab_group_histogram_remove(metaslab_group_t *, metaslab_t *);
void metaslab_group_alloc_decrement(spa_t *, uint64_t, void *, int);
void metaslab_group_alloc_verify(spa_t *, const blkptr_t *, void *);

#ifdef __cplusplus
}
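
The new metaslab_class_throttle_reserve()/metaslab_class_throttle_unreserve() entry points, the zio_t * argument added to metaslab_alloc(), and the METASLAB_ASYNC_ALLOC flag are consumed from the zio allocation path in zio.c, which is not shown above. A hedged caller-side sketch, written from these prototypes rather than copied from the commit — the function name, the EAGAIN/requeue convention, and the one-slot-per-DVA reservation are assumptions for illustration:

/*
 * Illustrative fragment only; gang-block handling and the real zio
 * pipeline plumbing are omitted.
 */
static int
dva_allocate_sketch(spa_t *spa, zio_t *zio, blkptr_t *bp, int ndvas,
    uint64_t txg)
{
	metaslab_class_t *mc = spa_normal_class(spa);
	int flags = METASLAB_ASYNC_ALLOC;
	int error;

	/* Reserve one throttle slot per DVA we intend to allocate. */
	if (!metaslab_class_throttle_reserve(mc, ndvas, zio, flags))
		return (SET_ERROR(EAGAIN));	/* caller re-queues the zio */

	error = metaslab_alloc(spa, mc, BP_GET_PSIZE(bp), bp, ndvas, txg,
	    NULL, flags, zio);
	if (error != 0) {
		/* Give the slots back if the allocation did not go through. */
		metaslab_class_throttle_unreserve(mc, ndvas, zio);
	}
	return (error);
}

In the commit itself the reservation is expected to happen as part of the zio pipeline rather than inline like this, but the ordering — reserve, allocate, unreserve on failure — matches the scheme described by the new metaslab_class fields in metaslab_impl.h below.
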
63 changes: 62 additions & 1 deletion usr/src/uts/common/fs/zfs/sys/metaslab_impl.h
@@ -24,7 +24,7 @@
*/

/*
* Copyright (c) 2011, 2014 by Delphix. All rights reserved.
* Copyright (c) 2011, 2015 by Delphix. All rights reserved.
*/

#ifndef _SYS_METASLAB_IMPL_H
@@ -59,11 +59,42 @@ extern "C" {
* to use a block allocator that best suits that class.
*/
struct metaslab_class {
kmutex_t mc_lock;
spa_t *mc_spa;
metaslab_group_t *mc_rotor;
metaslab_ops_t *mc_ops;
uint64_t mc_aliquot;

/*
* Track the number of metaslab groups that have been initialized
* and can accept allocations. An initialized metaslab group is
* one that has been completely added to the config (i.e. we have
* updated the MOS config and the space has been added to the pool).
*/
uint64_t mc_groups;

/*
* Toggle to enable/disable the allocation throttle.
*/
boolean_t mc_alloc_throttle_enabled;

/*
* The allocation throttle works on a reservation system. Whenever
* an asynchronous zio wants to perform an allocation it must
* first reserve the number of blocks that it wants to allocate.
* If there aren't sufficient slots available for the pending zio
* then that I/O is throttled until more slots free up. The current
* number of reserved allocations is maintained by the mc_alloc_slots
* refcount. The mc_alloc_max_slots value determines the maximum
* number of allocations that the system allows. Gang blocks are
* allowed to reserve slots even if we've reached the maximum
* number of allocations allowed.
*/
uint64_t mc_alloc_max_slots;
refcount_t mc_alloc_slots;

uint64_t mc_alloc_groups; /* # of allocatable groups */

uint64_t mc_alloc; /* total allocated space */
uint64_t mc_deferred; /* total deferred frees */
uint64_t mc_space; /* total space (alloc + free) */
@@ -85,6 +116,15 @@ struct metaslab_group {
avl_tree_t mg_metaslab_tree;
uint64_t mg_aliquot;
boolean_t mg_allocatable; /* can we allocate? */

/*
* A metaslab group is considered to be initialized only after
* we have updated the MOS config and added the space to the pool.
* We only allow allocation attempts to a metaslab group if it
* has been initialized.
*/
boolean_t mg_initialized;

uint64_t mg_free_capacity; /* percentage free */
int64_t mg_bias;
int64_t mg_activation_count;
@@ -93,6 +133,27 @@ struct metaslab_group {
taskq_t *mg_taskq;
metaslab_group_t *mg_prev;
metaslab_group_t *mg_next;

/*
* Each metaslab group can handle mg_max_alloc_queue_depth allocations
* which are tracked by mg_alloc_queue_depth. It's possible for a
* metaslab group to handle more allocations than its max. This
* can occur when gang blocks are required or when other groups
* are unable to handle their share of allocations.
*/
uint64_t mg_max_alloc_queue_depth;
refcount_t mg_alloc_queue_depth;

/*
* A metaslab group that can no longer allocate the minimum block
* size will set mg_no_free_space. Once a metaslab group is out
* of space, its share of work must be distributed to other
* groups.
*/
boolean_t mg_no_free_space;

uint64_t mg_allocations;
uint64_t mg_failed_allocations;
uint64_t mg_fragmentation;
uint64_t mg_histogram[RANGE_TREE_HISTOGRAM_SIZE];
};
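
The reservation scheme the mc_alloc_slots comment describes can be summarized in a few lines. The sketch below is reconstructed from that comment — metaslab.c is one of the diffs not rendered above — so everything beyond the struct fields shown here is an assumption:

/*
 * Reconstructed sketch, not copied from metaslab.c.  Gang allocations
 * are allowed to exceed the maximum so a gang header can always make
 * forward progress.
 */
static boolean_t
throttle_reserve_sketch(metaslab_class_t *mc, int slots, zio_t *zio, int flags)
{
	boolean_t reserved = B_FALSE;

	mutex_enter(&mc->mc_lock);
	uint64_t used = refcount_count(&mc->mc_alloc_slots);

	if (used + slots <= mc->mc_alloc_max_slots ||
	    (flags & (METASLAB_GANG_HEADER | METASLAB_GANG_CHILD))) {
		/* Track each reserved slot against the reserving zio. */
		for (int d = 0; d < slots; d++)
			(void) refcount_add(&mc->mc_alloc_slots, zio);
		reserved = B_TRUE;
	}
	mutex_exit(&mc->mc_lock);
	return (reserved);
}

The metaslab_group fields work the same way one level down: each allocation charged to a group bumps mg_alloc_queue_depth, groups past mg_max_alloc_queue_depth (or with mg_no_free_space set) are passed over in favor of other groups, and metaslab_group_alloc_decrement() gives the slot back as the I/O completes.
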