Commit cb92f41

amotin authored and ahrens committed
6322 ZFS indirect block predictive prefetch
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Paul Dagnelie <pcd@delphix.com>
Approved by: Robert Mustacchi <rm@joyent.com>
1 parent f63cc15 commit cb92f41

5 files changed: +90 -23 lines changed

usr/src/uts/common/fs/zfs/dbuf.c

Lines changed: 3 additions & 3 deletions

@@ -721,7 +721,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
     if (db->db_state == DB_CACHED) {
         mutex_exit(&db->db_mtx);
         if (prefetch)
-            dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1);
+            dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);
         if ((flags & DB_RF_HAVESTRUCT) == 0)
             rw_exit(&dn->dn_struct_rwlock);
         DB_DNODE_EXIT(db);

@@ -735,7 +735,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
         /* dbuf_read_impl has dropped db_mtx for us */

         if (prefetch)
-            dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1);
+            dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);

         if ((flags & DB_RF_HAVESTRUCT) == 0)
             rw_exit(&dn->dn_struct_rwlock);

@@ -754,7 +754,7 @@ dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags)
          */
         mutex_exit(&db->db_mtx);
         if (prefetch)
-            dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1);
+            dmu_zfetch(&dn->dn_zfetch, db->db_blkid, 1, B_TRUE);
         if ((flags & DB_RF_HAVESTRUCT) == 0)
             rw_exit(&dn->dn_struct_rwlock);
         DB_DNODE_EXIT(db);

usr/src/uts/common/fs/zfs/dmu.c

Lines changed: 4 additions & 3 deletions

@@ -441,9 +441,10 @@ dmu_buf_hold_array_by_dnode(dnode_t *dn, uint64_t offset, uint64_t length,
         dbp[i] = &db->db;
     }

-    if ((flags & DMU_READ_NO_PREFETCH) == 0 && read &&
-        length <= zfetch_array_rd_sz) {
-        dmu_zfetch(&dn->dn_zfetch, blkid, nblks);
+    if ((flags & DMU_READ_NO_PREFETCH) == 0 &&
+        DNODE_META_IS_CACHEABLE(dn) && length <= zfetch_array_rd_sz) {
+        dmu_zfetch(&dn->dn_zfetch, blkid, nblks,
+            read && DNODE_IS_CACHEABLE(dn));
     }
     rw_exit(&dn->dn_struct_rwlock);

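Taken together with the DNODE_IS_CACHEABLE and DNODE_META_IS_CACHEABLE macros added in dnode.h below, this call site now skips the prefetcher entirely when the dataset's primarycache policy would not cache metadata, and requests data prefetch only when this is a read and the object's data is itself cacheable. The sketch below is a hypothetical user-space restatement of that decision (the enum, helper name, and parameters are illustrative only; the DMU_READ_NO_PREFETCH flag and the length <= zfetch_array_rd_sz check still apply on top of it):

#include <stdbool.h>

/*
 * Hypothetical restatement of the gating above: should dmu_zfetch() be
 * called at all, and if so, should it prefetch data blocks (fetch_data)
 * or only the indirect blocks that point at them?
 */
typedef enum { CACHE_NONE, CACHE_METADATA, CACHE_ALL } primarycache_t;

static void
zfetch_policy(primarycache_t pc, bool read, bool dnode_is_metadata,
    bool *call_zfetch, bool *fetch_data)
{
	/* DNODE_META_IS_CACHEABLE(dn): indirect blocks may live in the ARC. */
	*call_zfetch = (pc == CACHE_ALL || pc == CACHE_METADATA);

	/* DNODE_IS_CACHEABLE(dn): this object's own blocks may be cached. */
	*fetch_data = read && (pc == CACHE_ALL ||
	    (dnode_is_metadata && pc == CACHE_METADATA));
}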

usr/src/uts/common/fs/zfs/dmu_zfetch.c

Lines changed: 66 additions & 16 deletions

@@ -49,6 +49,8 @@ uint32_t zfetch_max_streams = 8;
 uint32_t zfetch_min_sec_reap = 2;
 /* max bytes to prefetch per stream (default 8MB) */
 uint32_t zfetch_max_distance = 8 * 1024 * 1024;
+/* max bytes to prefetch indirects for per stream (default 64MB) */
+uint32_t zfetch_max_idistance = 64 * 1024 * 1024;
 /* max number of bytes in an array_read in which we allow prefetching (1MB) */
 uint64_t zfetch_array_rd_sz = 1024 * 1024;


@@ -186,20 +188,29 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
     zstream_t *zs = kmem_zalloc(sizeof (*zs), KM_SLEEP);
     zs->zs_blkid = blkid;
     zs->zs_pf_blkid = blkid;
+    zs->zs_ipf_blkid = blkid;
     zs->zs_atime = gethrtime();
     mutex_init(&zs->zs_lock, NULL, MUTEX_DEFAULT, NULL);

     list_insert_head(&zf->zf_stream, zs);
 }

 /*
- * This is the prefetch entry point.  It calls all of the other dmu_zfetch
- * routines to create, delete, find, or operate upon prefetch streams.
+ * This is the predictive prefetch entry point.  It associates dnode access
+ * specified with blkid and nblks arguments with prefetch stream, predicts
+ * further accesses based on that stats and initiates speculative prefetch.
+ * fetch_data argument specifies whether actual data blocks should be fetched:
+ *   FALSE -- prefetch only indirect blocks for predicted data blocks;
+ *   TRUE -- prefetch predicted data blocks plus following indirect blocks.
  */
 void
-dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks)
+dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data)
 {
     zstream_t *zs;
+    int64_t pf_start, ipf_start, ipf_istart, ipf_iend;
+    int64_t pf_ahead_blks, max_blks;
+    int epbs, max_dist_blks, pf_nblks, ipf_nblks;
+    uint64_t end_of_access_blkid = blkid + nblks;

     if (zfs_prefetch_disable)
         return;

@@ -236,7 +247,7 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks)
          */
         ZFETCHSTAT_BUMP(zfetchstat_misses);
         if (rw_tryupgrade(&zf->zf_rwlock))
-            dmu_zfetch_stream_create(zf, blkid + nblks);
+            dmu_zfetch_stream_create(zf, end_of_access_blkid);
         rw_exit(&zf->zf_rwlock);
         return;
     }

@@ -248,35 +259,74 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks)
      * Normally, we start prefetching where we stopped
      * prefetching last (zs_pf_blkid).  But when we get our first
      * hit on this stream, zs_pf_blkid == zs_blkid, we don't
-     * want to prefetch to block we just accessed.  In this case,
+     * want to prefetch the block we just accessed.  In this case,
      * start just after the block we just accessed.
      */
-    int64_t pf_start = MAX(zs->zs_pf_blkid, blkid + nblks);
+    pf_start = MAX(zs->zs_pf_blkid, end_of_access_blkid);

     /*
      * Double our amount of prefetched data, but don't let the
      * prefetch get further ahead than zfetch_max_distance.
      */
-    int pf_nblks =
-        MIN((int64_t)zs->zs_pf_blkid - zs->zs_blkid + nblks,
-        zs->zs_blkid + nblks +
-        (zfetch_max_distance >> zf->zf_dnode->dn_datablkshift) - pf_start);
+    if (fetch_data) {
+        max_dist_blks =
+            zfetch_max_distance >> zf->zf_dnode->dn_datablkshift;
+        /*
+         * Previously, we were (zs_pf_blkid - blkid) ahead.  We
+         * want to now be double that, so read that amount again,
+         * plus the amount we are catching up by (i.e. the amount
+         * read just now).
+         */
+        pf_ahead_blks = zs->zs_pf_blkid - blkid + nblks;
+        max_blks = max_dist_blks - (pf_start - end_of_access_blkid);
+        pf_nblks = MIN(pf_ahead_blks, max_blks);
+    } else {
+        pf_nblks = 0;
+    }

     zs->zs_pf_blkid = pf_start + pf_nblks;
-    zs->zs_atime = gethrtime();
-    zs->zs_blkid = blkid + nblks;

     /*
-     * dbuf_prefetch() issues the prefetch i/o
-     * asynchronously, but it may need to wait for an
-     * indirect block to be read from disk.  Therefore
-     * we do not want to hold any locks while we call it.
+     * Do the same for indirects, starting from where we stopped last,
+     * or where we will stop reading data blocks (and the indirects
+     * that point to them).
      */
+    ipf_start = MAX(zs->zs_ipf_blkid, zs->zs_pf_blkid);
+    max_dist_blks = zfetch_max_idistance >> zf->zf_dnode->dn_datablkshift;
+    /*
+     * We want to double our distance ahead of the data prefetch
+     * (or reader, if we are not prefetching data).  Previously, we
+     * were (zs_ipf_blkid - blkid) ahead.  To double that, we read
+     * that amount again, plus the amount we are catching up by
+     * (i.e. the amount read now + the amount of data prefetched now).
+     */
+    pf_ahead_blks = zs->zs_ipf_blkid - blkid + nblks + pf_nblks;
+    max_blks = max_dist_blks - (ipf_start - end_of_access_blkid);
+    ipf_nblks = MIN(pf_ahead_blks, max_blks);
+    zs->zs_ipf_blkid = ipf_start + ipf_nblks;
+
+    epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
+    ipf_istart = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs;
+    ipf_iend = P2ROUNDUP(zs->zs_ipf_blkid, 1 << epbs) >> epbs;
+
+    zs->zs_atime = gethrtime();
+    zs->zs_blkid = end_of_access_blkid;
     mutex_exit(&zs->zs_lock);
     rw_exit(&zf->zf_rwlock);
+
+    /*
+     * dbuf_prefetch() is asynchronous (even when it needs to read
+     * indirect blocks), but we still prefer to drop our locks before
+     * calling it to reduce the time we hold them.
+     */
+
     for (int i = 0; i < pf_nblks; i++) {
         dbuf_prefetch(zf->zf_dnode, 0, pf_start + i,
             ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH);
     }
+    for (int64_t iblk = ipf_istart; iblk < ipf_iend; iblk++) {
+        dbuf_prefetch(zf->zf_dnode, 1, iblk,
+            ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH);
+    }
     ZFETCHSTAT_BUMP(zfetchstat_hits);
 }
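
To see how the new hit-path arithmetic behaves, the following sketch replays it in user space for a purely sequential reader. All constants are assumptions chosen for illustration: 128 KB data blocks (dn_datablkshift = 17), 128 KB indirect blocks so that epbs = dn_indblkshift - SPA_BLKPTRSHIFT = 17 - 7 = 10 (1024 block pointers per L1 indirect), one block read per access, and fetch_data = B_TRUE. Each hit roughly doubles how far ahead the stream prefetches, capped at zfetch_max_distance (64 data blocks here) for data and zfetch_max_idistance (512 blocks' worth) for indirects.

#include <stdio.h>
#include <stdint.h>

/*
 * User-space replay of the dmu_zfetch() hit-path arithmetic above (an
 * illustration only, not the kernel code).  MIN/MAX/P2ROUNDUP mirror the
 * kernel macros for power-of-two alignments.
 */
#define MAX(a, b)	((a) > (b) ? (a) : (b))
#define MIN(a, b)	((a) < (b) ? (a) : (b))
#define P2ROUNDUP(x, a)	((((x) - 1) | ((a) - 1)) + 1)

int
main(void)
{
	const int64_t max_dist_blks = (8 * 1024 * 1024) >> 17;	  /* 64 */
	const int64_t max_idist_blks = (64 * 1024 * 1024) >> 17; /* 512 */
	const int epbs = 17 - 7;	/* 1024 blkptrs per L1 indirect */
	int64_t zs_blkid = 1, zs_pf_blkid = 1, zs_ipf_blkid = 1;

	for (int hit = 0; hit < 520; hit++) {
		int64_t blkid = zs_blkid, nblks = 1;
		int64_t end = blkid + nblks;

		/* Data prefetch window: double it, capped at 8 MB ahead. */
		int64_t pf_start = MAX(zs_pf_blkid, end);
		int64_t pf_nblks = MIN(zs_pf_blkid - blkid + nblks,
		    max_dist_blks - (pf_start - end));
		zs_pf_blkid = pf_start + pf_nblks;

		/* Indirect prefetch window: double it, capped at 64 MB. */
		int64_t ipf_start = MAX(zs_ipf_blkid, zs_pf_blkid);
		int64_t ipf_nblks = MIN(zs_ipf_blkid - blkid + nblks + pf_nblks,
		    max_idist_blks - (ipf_start - end));
		zs_ipf_blkid = ipf_start + ipf_nblks;

		/* Level-0 range -> range of L1 indirect block ids to fetch. */
		int64_t ipf_istart = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs;
		int64_t ipf_iend = P2ROUNDUP(zs_ipf_blkid, 1 << epbs) >> epbs;

		if (hit < 8 || ipf_iend > ipf_istart) {
			printf("hit %3d: data prefetch [%lld, %lld), "
			    "L1 prefetch [%lld, %lld)\n", hit,
			    (long long)pf_start, (long long)zs_pf_blkid,
			    (long long)ipf_istart, (long long)ipf_iend);
		}
		zs_blkid = end;		/* the reader moves on */
	}
	return (0);
}

With these inputs the per-hit data prefetch grows 1, 2, 3, 5, 9, ... blocks until the window saturates at 64 blocks ahead; the empty L1 ranges early on reflect that L1 block 0 was already read to service the reads themselves, and the first L1 prefetch (of L1 block 1) appears only once zs_ipf_blkid crosses block 1024, roughly when the reader nears block 512.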

usr/src/uts/common/fs/zfs/sys/dmu_zfetch.h

Lines changed: 8 additions & 1 deletion

@@ -43,6 +43,13 @@ struct dnode;	/* so we can reference dnode */
 typedef struct zstream {
     uint64_t    zs_blkid;	/* expect next access at this blkid */
     uint64_t    zs_pf_blkid;	/* next block to prefetch */
+
+    /*
+     * We will next prefetch the L1 indirect block of this level-0
+     * block id.
+     */
+    uint64_t    zs_ipf_blkid;
+
     kmutex_t    zs_lock;	/* protects stream */
     hrtime_t    zs_atime;	/* time last prefetch issued */
     list_node_t zs_node;	/* link for zf_stream */

@@ -59,7 +66,7 @@ void zfetch_fini(void);

 void	dmu_zfetch_init(zfetch_t *, struct dnode *);
 void	dmu_zfetch_fini(zfetch_t *);
-void	dmu_zfetch(zfetch_t *, uint64_t, uint64_t);
+void	dmu_zfetch(zfetch_t *, uint64_t, uint64_t, boolean_t);


 #ifdef	__cplusplus
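
For orientation: zs_blkid is where the reader is expected next, zs_pf_blkid is where data prefetch resumes, and the new zs_ipf_blkid is where indirect prefetch resumes. The hit path in dmu_zfetch.c above only ever moves each cursor forward from the common starting point set in dmu_zfetch_stream_create(), so with the default tunables (zfetch_max_idistance larger than zfetch_max_distance) the cursors stay ordered. A hypothetical debug helper, not part of the commit, stating that ordering:

/*
 * Hypothetical check (illustration only): the stream cursors stay ordered
 * reader position <= data-prefetch cursor <= indirect-prefetch cursor.
 */
static void
zstream_verify(const zstream_t *zs)
{
	ASSERT3U(zs->zs_blkid, <=, zs->zs_pf_blkid);
	ASSERT3U(zs->zs_pf_blkid, <=, zs->zs_ipf_blkid);
}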

usr/src/uts/common/fs/zfs/sys/dnode.h

Lines changed: 9 additions & 0 deletions

@@ -305,6 +305,15 @@ int dnode_next_offset(dnode_t *dn, int flags, uint64_t *off,
 void dnode_evict_dbufs(dnode_t *dn);
 void dnode_evict_bonus(dnode_t *dn);

+#define DNODE_IS_CACHEABLE(_dn) \
+    ((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL || \
+    (DMU_OT_IS_METADATA((_dn)->dn_type) && \
+    (_dn)->dn_objset->os_primary_cache == ZFS_CACHE_METADATA))
+
+#define DNODE_META_IS_CACHEABLE(_dn) \
+    ((_dn)->dn_objset->os_primary_cache == ZFS_CACHE_ALL || \
+    (_dn)->dn_objset->os_primary_cache == ZFS_CACHE_METADATA)
+
 #ifdef ZFS_DEBUG

 /*
