Skip to content

Commit 01f55e4

Browse files
grwilsonChristopher Siden
authored andcommitted
3329 spa_sync() spends 10-20% of its time in spa_free_sync_cb()
3330 space_seg_t should have its own kmem_cache 3331 deferred frees should happen after sync_pass 1 3335 make SYNC_PASS_* constants tunable Reviewed by: Adam Leventhal <ahl@delphix.com> Reviewed by: Matt Ahrens <matthew.ahrens@delphix.com> Reviewed by: Christopher Siden <chris.siden@delphix.com> Reviewed by: Eric Schrock <eric.schrock@delphix.com> Reviewed by: Richard Lowe <richlowe@richlowe.net> Reviewed by: Dan McDonald <danmcd@nexenta.com> Approved by: Eric Schrock <eric.schrock@delphix.com>
1 parent c523716 commit 01f55e4

File tree

9 files changed

+75
-23
lines changed

9 files changed

+75
-23
lines changed

usr/src/uts/common/fs/zfs/metaslab.c

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -875,8 +875,9 @@ metaslab_activate(metaslab_t *msp, uint64_t activation_weight)
875875
if ((msp->ms_weight & METASLAB_ACTIVE_MASK) == 0) {
876876
space_map_load_wait(sm);
877877
if (!sm->sm_loaded) {
878-
int error = space_map_load(sm, sm_ops, SM_FREE,
879-
&msp->ms_smo,
878+
space_map_obj_t *smo = &msp->ms_smo;
879+
880+
int error = space_map_load(sm, sm_ops, SM_FREE, smo,
880881
spa_meta_objset(msp->ms_group->mg_vd->vdev_spa));
881882
if (error) {
882883
metaslab_group_sort(msp->ms_group, msp, 0);

usr/src/uts/common/fs/zfs/spa.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,7 @@ boolean_t zio_taskq_sysdc = B_TRUE; /* use SDC scheduling class */
130130
uint_t zio_taskq_basedc = 80; /* base duty cycle */
131131

132132
boolean_t spa_create_process = B_TRUE; /* no process ==> no sysdc */
133+
extern int zfs_sync_pass_deferred_free;
133134

134135
/*
135136
* This (illegal) pool name is used when temporarily importing a spa_t in order
@@ -6040,7 +6041,7 @@ spa_sync(spa_t *spa, uint64_t txg)
60406041
spa_errlog_sync(spa, txg);
60416042
dsl_pool_sync(dp, txg);
60426043

6043-
if (pass <= SYNC_PASS_DEFERRED_FREE) {
6044+
if (pass < zfs_sync_pass_deferred_free) {
60446045
zio_t *zio = zio_root(spa, NULL, NULL, 0);
60456046
bplist_iterate(free_bpl, spa_free_sync_cb,
60466047
zio, tx);

usr/src/uts/common/fs/zfs/spa_misc.c

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1687,6 +1687,7 @@ spa_init(int mode)
16871687

16881688
refcount_init();
16891689
unique_init();
1690+
space_map_init();
16901691
zio_init();
16911692
dmu_init();
16921693
zil_init();
@@ -1709,6 +1710,7 @@ spa_fini(void)
17091710
zil_fini();
17101711
dmu_fini();
17111712
zio_fini();
1713+
space_map_fini();
17121714
unique_fini();
17131715
refcount_fini();
17141716

usr/src/uts/common/fs/zfs/space_map.c

Lines changed: 34 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,23 @@
3232
#include <sys/zio.h>
3333
#include <sys/space_map.h>
3434

35+
static kmem_cache_t *space_seg_cache;
36+
37+
void
38+
space_map_init(void)
39+
{
40+
ASSERT(space_seg_cache == NULL);
41+
space_seg_cache = kmem_cache_create("space_seg_cache",
42+
sizeof (space_seg_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
43+
}
44+
45+
void
46+
space_map_fini(void)
47+
{
48+
kmem_cache_destroy(space_seg_cache);
49+
space_seg_cache = NULL;
50+
}
51+
3552
/*
3653
* Space map routines.
3754
* NOTE: caller is responsible for all locking.
@@ -124,7 +141,7 @@ space_map_add(space_map_t *sm, uint64_t start, uint64_t size)
124141
avl_remove(sm->sm_pp_root, ss_after);
125142
}
126143
ss_after->ss_start = ss_before->ss_start;
127-
kmem_free(ss_before, sizeof (*ss_before));
144+
kmem_cache_free(space_seg_cache, ss_before);
128145
ss = ss_after;
129146
} else if (merge_before) {
130147
ss_before->ss_end = end;
@@ -137,7 +154,7 @@ space_map_add(space_map_t *sm, uint64_t start, uint64_t size)
137154
avl_remove(sm->sm_pp_root, ss_after);
138155
ss = ss_after;
139156
} else {
140-
ss = kmem_alloc(sizeof (*ss), KM_SLEEP);
157+
ss = kmem_cache_alloc(space_seg_cache, KM_SLEEP);
141158
ss->ss_start = start;
142159
ss->ss_end = end;
143160
avl_insert(&sm->sm_root, ss, where);
@@ -184,7 +201,7 @@ space_map_remove(space_map_t *sm, uint64_t start, uint64_t size)
184201
avl_remove(sm->sm_pp_root, ss);
185202

186203
if (left_over && right_over) {
187-
newseg = kmem_alloc(sizeof (*newseg), KM_SLEEP);
204+
newseg = kmem_cache_alloc(space_seg_cache, KM_SLEEP);
188205
newseg->ss_start = end;
189206
newseg->ss_end = ss->ss_end;
190207
ss->ss_end = start;
@@ -197,7 +214,7 @@ space_map_remove(space_map_t *sm, uint64_t start, uint64_t size)
197214
ss->ss_start = end;
198215
} else {
199216
avl_remove(&sm->sm_root, ss);
200-
kmem_free(ss, sizeof (*ss));
217+
kmem_cache_free(space_seg_cache, ss);
201218
ss = NULL;
202219
}
203220

@@ -237,7 +254,7 @@ space_map_vacate(space_map_t *sm, space_map_func_t *func, space_map_t *mdest)
237254
while ((ss = avl_destroy_nodes(&sm->sm_root, &cookie)) != NULL) {
238255
if (func != NULL)
239256
func(mdest, ss->ss_start, ss->ss_end - ss->ss_start);
240-
kmem_free(ss, sizeof (*ss));
257+
kmem_cache_free(space_seg_cache, ss);
241258
}
242259
sm->sm_space = 0;
243260
}
@@ -409,7 +426,7 @@ space_map_sync(space_map_t *sm, uint8_t maptype,
409426
spa_t *spa = dmu_objset_spa(os);
410427
void *cookie = NULL;
411428
space_seg_t *ss;
412-
uint64_t bufsize, start, size, run_len;
429+
uint64_t bufsize, start, size, run_len, delta, sm_space;
413430
uint64_t *entry, *entry_map, *entry_map_end;
414431

415432
ASSERT(MUTEX_HELD(sm->sm_lock));
@@ -438,11 +455,13 @@ space_map_sync(space_map_t *sm, uint8_t maptype,
438455
SM_DEBUG_SYNCPASS_ENCODE(spa_sync_pass(spa)) |
439456
SM_DEBUG_TXG_ENCODE(dmu_tx_get_txg(tx));
440457

458+
delta = 0;
459+
sm_space = sm->sm_space;
441460
while ((ss = avl_destroy_nodes(&sm->sm_root, &cookie)) != NULL) {
442461
size = ss->ss_end - ss->ss_start;
443462
start = (ss->ss_start - sm->sm_start) >> sm->sm_shift;
444463

445-
sm->sm_space -= size;
464+
delta += size;
446465
size >>= sm->sm_shift;
447466

448467
while (size) {
@@ -464,7 +483,7 @@ space_map_sync(space_map_t *sm, uint8_t maptype,
464483
start += run_len;
465484
size -= run_len;
466485
}
467-
kmem_free(ss, sizeof (*ss));
486+
kmem_cache_free(space_seg_cache, ss);
468487
}
469488

470489
if (entry != entry_map) {
@@ -476,8 +495,15 @@ space_map_sync(space_map_t *sm, uint8_t maptype,
476495
smo->smo_objsize += size;
477496
}
478497

498+
/*
499+
* Ensure that the space_map's accounting wasn't changed
500+
* while we were in the middle of writing it out.
501+
*/
502+
VERIFY3U(sm->sm_space, ==, sm_space);
503+
479504
zio_buf_free(entry_map, bufsize);
480505

506+
sm->sm_space -= delta;
481507
VERIFY0(sm->sm_space);
482508
}
483509

usr/src/uts/common/fs/zfs/sys/metaslab_impl.h

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,10 @@
2121
/*
2222
* Copyright 2009 Sun Microsystems, Inc. All rights reserved.
2323
* Use is subject to license terms.
24-
* Copyright (c) 2011 by Delphix. All rights reserved.
24+
*/
25+
26+
/*
27+
* Copyright (c) 2012 by Delphix. All rights reserved.
2528
*/
2629

2730
#ifndef _SYS_METASLAB_IMPL_H

usr/src/uts/common/fs/zfs/sys/spa.h

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -486,14 +486,6 @@ extern int spa_scan_stop(spa_t *spa);
486486
extern void spa_sync(spa_t *spa, uint64_t txg); /* only for DMU use */
487487
extern void spa_sync_allpools(void);
488488

489-
/*
490-
* DEFERRED_FREE must be large enough that regular blocks are not
491-
* deferred. XXX so can't we change it back to 1?
492-
*/
493-
#define SYNC_PASS_DEFERRED_FREE 2 /* defer frees after this pass */
494-
#define SYNC_PASS_DONT_COMPRESS 4 /* don't compress after this pass */
495-
#define SYNC_PASS_REWRITE 1 /* rewrite new bps after this pass */
496-
497489
/* spa namespace global mutex */
498490
extern kmutex_t spa_namespace_lock;
499491

usr/src/uts/common/fs/zfs/sys/space_map.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,10 @@
2323
* Use is subject to license terms.
2424
*/
2525

26+
/*
27+
* Copyright (c) 2012 by Delphix. All rights reserved.
28+
*/
29+
2630
#ifndef _SYS_SPACE_MAP_H
2731
#define _SYS_SPACE_MAP_H
2832

@@ -136,6 +140,8 @@ struct space_map_ops {
136140

137141
typedef void space_map_func_t(space_map_t *sm, uint64_t start, uint64_t size);
138142

143+
extern void space_map_init(void);
144+
extern void space_map_fini(void);
139145
extern void space_map_create(space_map_t *sm, uint64_t start, uint64_t size,
140146
uint8_t shift, kmutex_t *lp);
141147
extern void space_map_destroy(space_map_t *sm);

usr/src/uts/common/fs/zfs/sys/zio_impl.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,10 @@
2323
* Use is subject to license terms.
2424
*/
2525

26+
/*
27+
* Copyright (c) 2012 by Delphix. All rights reserved.
28+
*/
29+
2630
#ifndef _ZIO_IMPL_H
2731
#define _ZIO_IMPL_H
2832

@@ -143,6 +147,7 @@ enum zio_stage {
143147
#define ZIO_FREE_PIPELINE \
144148
(ZIO_INTERLOCK_STAGES | \
145149
ZIO_STAGE_FREE_BP_INIT | \
150+
ZIO_STAGE_ISSUE_ASYNC | \
146151
ZIO_STAGE_DVA_FREE)
147152

148153
#define ZIO_DDT_FREE_PIPELINE \

usr/src/uts/common/fs/zfs/zio.c

Lines changed: 19 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,22 @@ extern vmem_t *zio_alloc_arena;
8282
#endif
8383
extern int zfs_mg_alloc_failures;
8484

85+
/*
86+
* The following actions directly effect the spa's sync-to-convergence logic.
87+
* The values below define the sync pass when we start performing the action.
88+
* Care should be taken when changing these values as they directly impact
89+
* spa_sync() performance. Tuning these values may introduce subtle performance
90+
* pathologies and should only be done in the context of performance analysis.
91+
* These tunables will eventually be removed and replaced with #defines once
92+
* enough analysis has been done to determine optimal values.
93+
*
94+
* The 'zfs_sync_pass_deferred_free' pass must be greater than 1 to ensure that
95+
* regular blocks are not deferred.
96+
*/
97+
int zfs_sync_pass_deferred_free = 2; /* defer frees starting in this pass */
98+
int zfs_sync_pass_dont_compress = 5; /* don't compress starting in this pass */
99+
int zfs_sync_pass_rewrite = 2; /* rewrite new bps starting in this pass */
100+
85101
/*
86102
* An allocating zio is one that either currently has the DVA allocate
87103
* stage set or will have it later in its lifetime.
@@ -690,7 +706,7 @@ zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp,
690706

691707
ASSERT(!BP_IS_HOLE(bp));
692708
ASSERT(spa_syncing_txg(spa) == txg);
693-
ASSERT(spa_sync_pass(spa) <= SYNC_PASS_DEFERRED_FREE);
709+
ASSERT(spa_sync_pass(spa) < zfs_sync_pass_deferred_free);
694710

695711
zio = zio_create(pio, spa, txg, bp, NULL, BP_GET_PSIZE(bp),
696712
NULL, NULL, ZIO_TYPE_FREE, ZIO_PRIORITY_FREE, flags,
@@ -987,7 +1003,7 @@ zio_write_bp_init(zio_t *zio)
9871003
ASSERT(zio->io_child_type == ZIO_CHILD_LOGICAL);
9881004
ASSERT(!BP_GET_DEDUP(bp));
9891005

990-
if (pass > SYNC_PASS_DONT_COMPRESS)
1006+
if (pass >= zfs_sync_pass_dont_compress)
9911007
compress = ZIO_COMPRESS_OFF;
9921008

9931009
/* Make sure someone doesn't change their mind on overwrites */
@@ -1016,7 +1032,7 @@ zio_write_bp_init(zio_t *zio)
10161032
* There should only be a handful of blocks after pass 1 in any case.
10171033
*/
10181034
if (bp->blk_birth == zio->io_txg && BP_GET_PSIZE(bp) == psize &&
1019-
pass > SYNC_PASS_REWRITE) {
1035+
pass >= zfs_sync_pass_rewrite) {
10201036
ASSERT(psize != 0);
10211037
enum zio_stage gang_stages = zio->io_pipeline & ZIO_GANG_STAGES;
10221038
zio->io_pipeline = ZIO_REWRITE_PIPELINE | gang_stages;

0 commit comments

Comments
 (0)