Skip to content

Commit c9030f6

Browse files
amotin authored and Christopher Siden committed
5008 lock contention (rrw_exit) while running a read only load
Reviewed by: Matthew Ahrens <matthew.ahrens@delphix.com> Reviewed by: George Wilson <george.wilson@delphix.com> Reviewed by: Alex Reece <alex.reece@delphix.com> Reviewed by: Christopher Siden <christopher.siden@delphix.com> Reviewed by: Richard Yao <ryao@gentoo.org> Reviewed by: Saso Kiselkov <skiselkov.ml@gmail.com> Approved by: Garrett D'Amore <garrett@damore.org>
1 parent 73527f4 commit c9030f6

File tree

7 files changed

+127
-14
lines changed

7 files changed

+127
-14
lines changed

usr/src/uts/common/fs/zfs/rrwlock.c

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -286,3 +286,91 @@ rrw_tsd_destroy(void *arg)
286286
(void *)curthread, (void *)rn->rn_rrl);
287287
}
288288
}
289+
290+
/*
291+
* A reader-mostly lock implementation, tuning above reader-writer locks
292+
* for highly parallel read acquisitions, while pessimizing writes.
293+
*
294+
* The idea is to split a single busy lock into an array of locks, so that
295+
* each reader can lock only one of them for read, depending on result
296+
* of simple hash function. That proportionally reduces lock congestion.
297+
* A writer, at the same time, has to sequentially acquire write on all the locks.
298+
* That makes write acquisition proportionally slower, but in places where
299+
* it is used (filesystem unmount) performance is not critical.
300+
*
301+
* All the functions below are direct wrappers around functions above.
302+
*/
303+
void
304+
rrm_init(rrmlock_t *rrl, boolean_t track_all)
305+
{
306+
int i;
307+
308+
for (i = 0; i < RRM_NUM_LOCKS; i++)
309+
rrw_init(&rrl->locks[i], track_all);
310+
}
311+
312+
void
313+
rrm_destroy(rrmlock_t *rrl)
314+
{
315+
int i;
316+
317+
for (i = 0; i < RRM_NUM_LOCKS; i++)
318+
rrw_destroy(&rrl->locks[i]);
319+
}
320+
321+
void
322+
rrm_enter(rrmlock_t *rrl, krw_t rw, void *tag)
323+
{
324+
if (rw == RW_READER)
325+
rrm_enter_read(rrl, tag);
326+
else
327+
rrm_enter_write(rrl);
328+
}
329+
330+
/*
331+
* This maps the current thread to a specific lock. Note that the lock
332+
* must be released by the same thread that acquired it. We do this
333+
* mapping by taking the thread pointer mod a prime number. We examine
334+
* only the low 32 bits of the thread pointer, because 32-bit division
335+
* is faster than 64-bit division, and the high 32 bits have little
336+
* entropy anyway.
337+
*/
338+
#define RRM_TD_LOCK() (((uint32_t)(uintptr_t)(curthread)) % RRM_NUM_LOCKS)
339+
340+
void
341+
rrm_enter_read(rrmlock_t *rrl, void *tag)
342+
{
343+
rrw_enter_read(&rrl->locks[RRM_TD_LOCK()], tag);
344+
}
345+
346+
void
347+
rrm_enter_write(rrmlock_t *rrl)
348+
{
349+
int i;
350+
351+
for (i = 0; i < RRM_NUM_LOCKS; i++)
352+
rrw_enter_write(&rrl->locks[i]);
353+
}
354+
355+
void
356+
rrm_exit(rrmlock_t *rrl, void *tag)
357+
{
358+
int i;
359+
360+
if (rrl->locks[0].rr_writer == curthread) {
361+
for (i = 0; i < RRM_NUM_LOCKS; i++)
362+
rrw_exit(&rrl->locks[i], tag);
363+
} else {
364+
rrw_exit(&rrl->locks[RRM_TD_LOCK()], tag);
365+
}
366+
}
367+
368+
boolean_t
369+
rrm_held(rrmlock_t *rrl, krw_t rw)
370+
{
371+
if (rw == RW_WRITER) {
372+
return (rrw_held(&rrl->locks[0], rw));
373+
} else {
374+
return (rrw_held(&rrl->locks[RRM_TD_LOCK()], rw));
375+
}
376+
}

usr/src/uts/common/fs/zfs/sys/rrwlock.h

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,31 @@ void rrw_tsd_destroy(void *arg);
8080
#define RRW_LOCK_HELD(x) \
8181
(rrw_held(x, RW_WRITER) || rrw_held(x, RW_READER))
8282

83+
/*
84+
* A reader-mostly lock implementation, tuning above reader-writer locks
85+
* for highly parallel read acquisitions, pessimizing write acquisitions.
86+
*
87+
* This should be a prime number. See comment in rrwlock.c near
88+
* RRM_TD_LOCK() for details.
89+
*/
90+
#define RRM_NUM_LOCKS 17
91+
typedef struct rrmlock {
92+
rrwlock_t locks[RRM_NUM_LOCKS];
93+
} rrmlock_t;
94+
95+
void rrm_init(rrmlock_t *rrl, boolean_t track_all);
96+
void rrm_destroy(rrmlock_t *rrl);
97+
void rrm_enter(rrmlock_t *rrl, krw_t rw, void *tag);
98+
void rrm_enter_read(rrmlock_t *rrl, void *tag);
99+
void rrm_enter_write(rrmlock_t *rrl);
100+
void rrm_exit(rrmlock_t *rrl, void *tag);
101+
boolean_t rrm_held(rrmlock_t *rrl, krw_t rw);
102+
103+
#define RRM_READ_HELD(x) rrm_held(x, RW_READER)
104+
#define RRM_WRITE_HELD(x) rrm_held(x, RW_WRITER)
105+
#define RRM_LOCK_HELD(x) \
106+
(rrm_held(x, RW_WRITER) || rrm_held(x, RW_READER))
107+
83108
#ifdef __cplusplus
84109
}
85110
#endif

usr/src/uts/common/fs/zfs/sys/zfs_vfsops.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ struct zfsvfs {
6464
int z_norm; /* normalization flags */
6565
boolean_t z_atime; /* enable atimes mount option */
6666
boolean_t z_unmounted; /* unmounted */
67-
rrwlock_t z_teardown_lock;
67+
rrmlock_t z_teardown_lock;
6868
krwlock_t z_teardown_inactive_lock;
6969
list_t z_all_znodes; /* all vnodes in the fs */
7070
kmutex_t z_znodes_lock; /* lock for z_all_znodes */

usr/src/uts/common/fs/zfs/sys/zfs_znode.h

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -238,15 +238,15 @@ typedef struct znode {
238238
/* Called on entry to each ZFS vnode and vfs operation */
239239
#define ZFS_ENTER(zfsvfs) \
240240
{ \
241-
rrw_enter_read(&(zfsvfs)->z_teardown_lock, FTAG); \
241+
rrm_enter_read(&(zfsvfs)->z_teardown_lock, FTAG); \
242242
if ((zfsvfs)->z_unmounted) { \
243243
ZFS_EXIT(zfsvfs); \
244244
return (EIO); \
245245
} \
246246
}
247247

248248
/* Must be called before exiting the vop */
249-
#define ZFS_EXIT(zfsvfs) rrw_exit(&(zfsvfs)->z_teardown_lock, FTAG)
249+
#define ZFS_EXIT(zfsvfs) rrm_exit(&(zfsvfs)->z_teardown_lock, FTAG)
250250

251251
/* Verifies the znode is valid */
252252
#define ZFS_VERIFY_ZP(zp) \

usr/src/uts/common/fs/zfs/zfs_ioctl.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1420,15 +1420,15 @@ zfsvfs_hold(const char *name, void *tag, zfsvfs_t **zfvp, boolean_t writer)
14201420
if (getzfsvfs(name, zfvp) != 0)
14211421
error = zfsvfs_create(name, zfvp);
14221422
if (error == 0) {
1423-
rrw_enter(&(*zfvp)->z_teardown_lock, (writer) ? RW_WRITER :
1423+
rrm_enter(&(*zfvp)->z_teardown_lock, (writer) ? RW_WRITER :
14241424
RW_READER, tag);
14251425
if ((*zfvp)->z_unmounted) {
14261426
/*
14271427
* XXX we could probably try again, since the unmounting
14281428
* thread should be just about to disassociate the
14291429
* objset from the zfsvfs.
14301430
*/
1431-
rrw_exit(&(*zfvp)->z_teardown_lock, tag);
1431+
rrm_exit(&(*zfvp)->z_teardown_lock, tag);
14321432
return (SET_ERROR(EBUSY));
14331433
}
14341434
}
@@ -1438,7 +1438,7 @@ zfsvfs_hold(const char *name, void *tag, zfsvfs_t **zfvp, boolean_t writer)
14381438
static void
14391439
zfsvfs_rele(zfsvfs_t *zfsvfs, void *tag)
14401440
{
1441-
rrw_exit(&zfsvfs->z_teardown_lock, tag);
1441+
rrm_exit(&zfsvfs->z_teardown_lock, tag);
14421442

14431443
if (zfsvfs->z_vfs) {
14441444
VFS_RELE(zfsvfs->z_vfs);

usr/src/uts/common/fs/zfs/zfs_vfsops.c

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1004,7 +1004,7 @@ zfsvfs_create(const char *osname, zfsvfs_t **zfvp)
10041004
mutex_init(&zfsvfs->z_lock, NULL, MUTEX_DEFAULT, NULL);
10051005
list_create(&zfsvfs->z_all_znodes, sizeof (znode_t),
10061006
offsetof(znode_t, z_link_node));
1007-
rrw_init(&zfsvfs->z_teardown_lock, B_FALSE);
1007+
rrm_init(&zfsvfs->z_teardown_lock, B_FALSE);
10081008
rw_init(&zfsvfs->z_teardown_inactive_lock, NULL, RW_DEFAULT, NULL);
10091009
rw_init(&zfsvfs->z_fuid_lock, NULL, RW_DEFAULT, NULL);
10101010
for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
@@ -1119,7 +1119,7 @@ zfsvfs_free(zfsvfs_t *zfsvfs)
11191119
mutex_destroy(&zfsvfs->z_znodes_lock);
11201120
mutex_destroy(&zfsvfs->z_lock);
11211121
list_destroy(&zfsvfs->z_all_znodes);
1122-
rrw_destroy(&zfsvfs->z_teardown_lock);
1122+
rrm_destroy(&zfsvfs->z_teardown_lock);
11231123
rw_destroy(&zfsvfs->z_teardown_inactive_lock);
11241124
rw_destroy(&zfsvfs->z_fuid_lock);
11251125
for (i = 0; i != ZFS_OBJ_MTX_SZ; i++)
@@ -1784,7 +1784,7 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
17841784
{
17851785
znode_t *zp;
17861786

1787-
rrw_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
1787+
rrm_enter(&zfsvfs->z_teardown_lock, RW_WRITER, FTAG);
17881788

17891789
if (!unmounting) {
17901790
/*
@@ -1814,7 +1814,7 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
18141814
*/
18151815
if (!unmounting && (zfsvfs->z_unmounted || zfsvfs->z_os == NULL)) {
18161816
rw_exit(&zfsvfs->z_teardown_inactive_lock);
1817-
rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
1817+
rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
18181818
return (SET_ERROR(EIO));
18191819
}
18201820

@@ -1841,7 +1841,7 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting)
18411841
*/
18421842
if (unmounting) {
18431843
zfsvfs->z_unmounted = B_TRUE;
1844-
rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
1844+
rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
18451845
rw_exit(&zfsvfs->z_teardown_inactive_lock);
18461846
}
18471847

@@ -2073,7 +2073,7 @@ zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname)
20732073
znode_t *zp;
20742074
uint64_t sa_obj = 0;
20752075

2076-
ASSERT(RRW_WRITE_HELD(&zfsvfs->z_teardown_lock));
2076+
ASSERT(RRM_WRITE_HELD(&zfsvfs->z_teardown_lock));
20772077
ASSERT(RW_WRITE_HELD(&zfsvfs->z_teardown_inactive_lock));
20782078

20792079
/*
@@ -2129,7 +2129,7 @@ zfs_resume_fs(zfsvfs_t *zfsvfs, const char *osname)
21292129
bail:
21302130
/* release the VOPs */
21312131
rw_exit(&zfsvfs->z_teardown_inactive_lock);
2132-
rrw_exit(&zfsvfs->z_teardown_lock, FTAG);
2132+
rrm_exit(&zfsvfs->z_teardown_lock, FTAG);
21332133

21342134
if (err) {
21352135
/*

usr/src/uts/common/fs/zfs/zfs_znode.c

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -276,7 +276,7 @@ zfs_znode_move(void *buf, void *newbuf, size_t size, void *arg)
276276
* can safely ensure that the filesystem is not and will not be
277277
* unmounted. The next statement is equivalent to ZFS_ENTER().
278278
*/
279-
rrw_enter(&zfsvfs->z_teardown_lock, RW_READER, FTAG);
279+
rrm_enter(&zfsvfs->z_teardown_lock, RW_READER, FTAG);
280280
if (zfsvfs->z_unmounted) {
281281
ZFS_EXIT(zfsvfs);
282282
rw_exit(&zfsvfs_lock);

0 commit comments

Comments
 (0)