Skip to content

Commit

Permalink
5313 Allow I/Os to be aggregated across ZIO priority classes
Browse files Browse the repository at this point in the history
Reviewed by: Andriy Gapon <avg@FreeBSD.org>
Reviewed by: Will Andrews <willa@SpectraLogic.com>
Reviewed by: Matt Ahrens <mahrens@delphix.com>
Reviewed by: George Wilson <george@delphix.com>
Approved by: Robert Mustacchi <rm@joyent.com>
  • Loading branch information
Justin T. Gibbs authored and Christopher Siden committed Jan 9, 2015
1 parent f093add commit fe31923
Show file tree
Hide file tree
Showing 3 changed files with 51 additions and 30 deletions.
2 changes: 2 additions & 0 deletions usr/src/uts/common/fs/zfs/sys/vdev_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,8 @@ struct vdev_queue {
vdev_t *vq_vdev;
vdev_queue_class_t vq_class[ZIO_PRIORITY_NUM_QUEUEABLE];
avl_tree_t vq_active_tree;
avl_tree_t vq_read_offset_tree;
avl_tree_t vq_write_offset_tree;
uint64_t vq_last_offset;
hrtime_t vq_io_complete_ts; /* time last i/o completed */
kmutex_t vq_lock;
Expand Down
1 change: 1 addition & 0 deletions usr/src/uts/common/fs/zfs/sys/zio.h
Original file line number Diff line number Diff line change
Expand Up @@ -419,6 +419,7 @@ struct zio {
uint64_t io_offset;
hrtime_t io_timestamp;
avl_node_t io_queue_node;
avl_node_t io_offset_node;

/* Internal pipeline state */
enum zio_flag io_flags;
Expand Down
78 changes: 48 additions & 30 deletions usr/src/uts/common/fs/zfs/vdev_queue.c
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,22 @@ vdev_queue_offset_compare(const void *x1, const void *x2)
return (0);
}

/*
 * Look up the tree of queued i/os for one priority class: the
 * vqc_queued_tree member of the vq_class[] entry indexed by p.
 * No range check is done here; p is used directly as the array index.
 */
static inline avl_tree_t *
vdev_queue_class_tree(vdev_queue_t *vq, zio_priority_t p)
{
	avl_tree_t *tree = &vq->vq_class[p].vqc_queued_tree;

	return (tree);
}

/*
 * Select the per-i/o-type offset tree of a vdev queue:
 * vq_read_offset_tree for ZIO_TYPE_READ, vq_write_offset_tree for
 * ZIO_TYPE_WRITE. Any other i/o type trips the assertion.
 */
static inline avl_tree_t *
vdev_queue_type_tree(vdev_queue_t *vq, zio_type_t t)
{
	avl_tree_t *tree;

	ASSERT(t == ZIO_TYPE_READ || t == ZIO_TYPE_WRITE);
	tree = (t == ZIO_TYPE_READ) ? &vq->vq_read_offset_tree :
	    &vq->vq_write_offset_tree;
	return (tree);
}

int
vdev_queue_timestamp_compare(const void *x1, const void *x2)
{
Expand Down Expand Up @@ -214,19 +230,27 @@ vdev_queue_init(vdev_t *vd)

avl_create(&vq->vq_active_tree, vdev_queue_offset_compare,
sizeof (zio_t), offsetof(struct zio, io_queue_node));
avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_READ),
vdev_queue_offset_compare, sizeof (zio_t),
offsetof(struct zio, io_offset_node));
avl_create(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE),
vdev_queue_offset_compare, sizeof (zio_t),
offsetof(struct zio, io_offset_node));

for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
int (*compfn) (const void *, const void *);

/*
* The synchronous i/o queues are FIFO rather than LBA ordered.
* This provides more consistent latency for these i/os, and
* they tend to not be tightly clustered anyway so there is
* little to no throughput loss.
* The synchronous i/o queues are dispatched in FIFO rather
* than LBA order. This provides more consistent latency for
* these i/os.
*/
boolean_t fifo = (p == ZIO_PRIORITY_SYNC_READ ||
p == ZIO_PRIORITY_SYNC_WRITE);
avl_create(&vq->vq_class[p].vqc_queued_tree,
fifo ? vdev_queue_timestamp_compare :
vdev_queue_offset_compare,
if (p == ZIO_PRIORITY_SYNC_READ || p == ZIO_PRIORITY_SYNC_WRITE)
compfn = vdev_queue_timestamp_compare;
else
compfn = vdev_queue_offset_compare;

avl_create(vdev_queue_class_tree(vq, p), compfn,
sizeof (zio_t), offsetof(struct zio, io_queue_node));
}
}
Expand All @@ -237,8 +261,10 @@ vdev_queue_fini(vdev_t *vd)
vdev_queue_t *vq = &vd->vdev_queue;

for (zio_priority_t p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++)
avl_destroy(&vq->vq_class[p].vqc_queued_tree);
avl_destroy(vdev_queue_class_tree(vq, p));
avl_destroy(&vq->vq_active_tree);
avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_READ));
avl_destroy(vdev_queue_type_tree(vq, ZIO_TYPE_WRITE));

mutex_destroy(&vq->vq_lock);
}
Expand All @@ -248,7 +274,8 @@ vdev_queue_io_add(vdev_queue_t *vq, zio_t *zio)
{
spa_t *spa = zio->io_spa;
ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
avl_add(&vq->vq_class[zio->io_priority].vqc_queued_tree, zio);
avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio);
avl_add(vdev_queue_type_tree(vq, zio->io_type), zio);

mutex_enter(&spa->spa_iokstat_lock);
spa->spa_queue_stats[zio->io_priority].spa_queued++;
Expand All @@ -262,7 +289,8 @@ vdev_queue_io_remove(vdev_queue_t *vq, zio_t *zio)
{
spa_t *spa = zio->io_spa;
ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE);
avl_remove(&vq->vq_class[zio->io_priority].vqc_queued_tree, zio);
avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio);
avl_remove(vdev_queue_type_tree(vq, zio->io_type), zio);

mutex_enter(&spa->spa_iokstat_lock);
ASSERT3U(spa->spa_queue_stats[zio->io_priority].spa_queued, >, 0);
Expand Down Expand Up @@ -423,7 +451,7 @@ vdev_queue_class_to_issue(vdev_queue_t *vq)

/* find a queue that has not reached its minimum # outstanding i/os */
for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
if (avl_numnodes(&vq->vq_class[p].vqc_queued_tree) > 0 &&
if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
vq->vq_class[p].vqc_active <
vdev_queue_class_min_active(p))
return (p);
Expand All @@ -434,7 +462,7 @@ vdev_queue_class_to_issue(vdev_queue_t *vq)
* maximum # outstanding i/os.
*/
for (p = 0; p < ZIO_PRIORITY_NUM_QUEUEABLE; p++) {
if (avl_numnodes(&vq->vq_class[p].vqc_queued_tree) > 0 &&
if (avl_numnodes(vdev_queue_class_tree(vq, p)) > 0 &&
vq->vq_class[p].vqc_active <
vdev_queue_class_max_active(spa, p))
return (p);
Expand All @@ -460,22 +488,12 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio)
uint64_t maxgap = 0;
uint64_t size;
boolean_t stretch = B_FALSE;
vdev_queue_class_t *vqc = &vq->vq_class[zio->io_priority];
avl_tree_t *t = &vqc->vqc_queued_tree;
avl_tree_t *t = vdev_queue_type_tree(vq, zio->io_type);
enum zio_flag flags = zio->io_flags & ZIO_FLAG_AGG_INHERIT;

if (zio->io_flags & ZIO_FLAG_DONT_AGGREGATE)
return (NULL);

/*
* The synchronous i/o queues are not sorted by LBA, so we can't
* find adjacent i/os. These i/os tend to not be tightly clustered,
* or too large to aggregate, so this has little impact on performance.
*/
if (zio->io_priority == ZIO_PRIORITY_SYNC_READ ||
zio->io_priority == ZIO_PRIORITY_SYNC_WRITE)
return (NULL);

first = last = zio;

if (zio->io_type == ZIO_TYPE_READ)
Expand Down Expand Up @@ -607,7 +625,7 @@ vdev_queue_io_to_issue(vdev_queue_t *vq)
zio_t *zio, *aio;
zio_priority_t p;
avl_index_t idx;
vdev_queue_class_t *vqc;
avl_tree_t *tree;
zio_t search;

again:
Expand All @@ -626,13 +644,13 @@ vdev_queue_io_to_issue(vdev_queue_t *vq)
*
* For FIFO queues (sync), issue the i/o with the lowest timestamp.
*/
vqc = &vq->vq_class[p];
tree = vdev_queue_class_tree(vq, p);
search.io_timestamp = 0;
search.io_offset = vq->vq_last_offset + 1;
VERIFY3P(avl_find(&vqc->vqc_queued_tree, &search, &idx), ==, NULL);
zio = avl_nearest(&vqc->vqc_queued_tree, idx, AVL_AFTER);
VERIFY3P(avl_find(tree, &search, &idx), ==, NULL);
zio = avl_nearest(tree, idx, AVL_AFTER);
if (zio == NULL)
zio = avl_first(&vqc->vqc_queued_tree);
zio = avl_first(tree);
ASSERT3U(zio->io_priority, ==, p);

aio = vdev_queue_aggregate(vq, zio);
Expand Down

0 comments on commit fe31923

Please sign in to comment.