@@ -49,6 +49,8 @@ uint32_t zfetch_max_streams = 8;
 uint32_t zfetch_min_sec_reap = 2;
 /* max bytes to prefetch per stream (default 8MB) */
 uint32_t zfetch_max_distance = 8 * 1024 * 1024;
+/* max bytes to prefetch indirect blocks for, per stream (default 64MB) */
+uint32_t zfetch_max_idistance = 64 * 1024 * 1024;
 /* max number of bytes in an array_read in which we allow prefetching (1MB) */
 uint64_t zfetch_array_rd_sz = 1024 * 1024;
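Both byte limits above are converted to block counts inside dmu_zfetch() by shifting by the dnode's data block shift, so the distance they allow depends on the dataset's record size. As a rough illustration (not part of the patch), here is a minimal user-space sketch of that conversion; the tunable names mirror the diff, but the shift value and everything else are assumptions for the example (128 KB data blocks):

/*
 * Illustrative user-space sketch only; dn_datablkshift == 17 is an
 * assumed 128 KB recordsize, not something this patch sets.
 */
#include <stdio.h>
#include <stdint.h>

int
main(void)
{
        uint32_t zfetch_max_distance = 8 * 1024 * 1024;        /* 8 MB of data */
        uint32_t zfetch_max_idistance = 64 * 1024 * 1024;      /* 64 MB covered by indirects */
        int dn_datablkshift = 17;                               /* assumed 128 KB data blocks */

        /* Same conversion dmu_zfetch() performs. */
        printf("data prefetch cap:     %u blocks\n",
            zfetch_max_distance >> dn_datablkshift);            /* 64 */
        printf("indirect prefetch cap: %u blocks\n",
            zfetch_max_idistance >> dn_datablkshift);           /* 512 */
        return (0);
}

With those assumed values, data prefetch is capped at 64 blocks ahead of the reader, while indirect-block prefetch may cover pointers for up to 512 data blocks ahead.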
@@ -186,20 +188,29 @@ dmu_zfetch_stream_create(zfetch_t *zf, uint64_t blkid)
 	zstream_t *zs = kmem_zalloc(sizeof (*zs), KM_SLEEP);
 	zs->zs_blkid = blkid;
 	zs->zs_pf_blkid = blkid;
+	zs->zs_ipf_blkid = blkid;
 	zs->zs_atime = gethrtime();
 	mutex_init(&zs->zs_lock, NULL, MUTEX_DEFAULT, NULL);
 
 	list_insert_head(&zf->zf_stream, zs);
 }
 
 /*
- * This is the prefetch entry point.  It calls all of the other dmu_zfetch
- * routines to create, delete, find, or operate upon prefetch streams.
+ * This is the predictive prefetch entry point.  It associates the dnode access
+ * specified by blkid and nblks with a prefetch stream, predicts further
+ * accesses based on the stream's statistics, and initiates speculative
+ * prefetch.  The fetch_data argument specifies whether actual data blocks
+ * should be fetched:
+ *   FALSE -- prefetch only indirect blocks for predicted data blocks;
+ *   TRUE  -- prefetch predicted data blocks plus following indirect blocks.
 */
 void
-dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks)
+dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks, boolean_t fetch_data)
 {
 	zstream_t *zs;
+	int64_t pf_start, ipf_start, ipf_istart, ipf_iend;
+	int64_t pf_ahead_blks, max_blks;
+	int epbs, max_dist_blks, pf_nblks, ipf_nblks;
+	uint64_t end_of_access_blkid = blkid + nblks;
 
 	if (zfs_prefetch_disable)
 		return;
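The hunk above adds a third cursor to each prefetch stream. As a reading aid, here is a hypothetical stand-alone model (not the real zstream_t definition) of the per-stream bookkeeping: zs_blkid is the next access the stream expects, zs_pf_blkid marks how far data prefetch has already been issued, and the new zs_ipf_blkid marks how far indirect-block prefetch has been issued. dmu_zfetch_stream_create() starts all three at the same block, so on the first hit both prefetch cursors sit on zs_blkid.

/* Hypothetical stand-alone model of the per-stream cursors (not ZFS code). */
#include <stdio.h>
#include <stdint.h>

typedef struct model_zstream {
        uint64_t zs_blkid;      /* next access this stream expects */
        uint64_t zs_pf_blkid;   /* data prefetch issued up to (excl.) here */
        uint64_t zs_ipf_blkid;  /* indirect prefetch issued up to here (new) */
} model_zstream_t;

static void
model_stream_create(model_zstream_t *zs, uint64_t blkid)
{
        /* Mirrors the initialization in the hunk above. */
        zs->zs_blkid = blkid;
        zs->zs_pf_blkid = blkid;
        zs->zs_ipf_blkid = blkid;
}

int
main(void)
{
        model_zstream_t zs;

        model_stream_create(&zs, 100);  /* hypothetical stream at block 100 */
        printf("blkid=%llu pf=%llu ipf=%llu\n",
            (unsigned long long)zs.zs_blkid,
            (unsigned long long)zs.zs_pf_blkid,
            (unsigned long long)zs.zs_ipf_blkid);
        return (0);
}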
@@ -236,7 +247,7 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks)
 		 */
 		ZFETCHSTAT_BUMP(zfetchstat_misses);
 		if (rw_tryupgrade(&zf->zf_rwlock))
-			dmu_zfetch_stream_create(zf, blkid + nblks);
+			dmu_zfetch_stream_create(zf, end_of_access_blkid);
 		rw_exit(&zf->zf_rwlock);
 		return;
 	}
@@ -248,35 +259,74 @@ dmu_zfetch(zfetch_t *zf, uint64_t blkid, uint64_t nblks)
 	 * Normally, we start prefetching where we stopped
 	 * prefetching last (zs_pf_blkid).  But when we get our first
 	 * hit on this stream, zs_pf_blkid == zs_blkid, we don't
-	 * want to prefetch to block we just accessed.  In this case,
+	 * want to prefetch the block we just accessed.  In this case,
 	 * start just after the block we just accessed.
 	 */
-	int64_t pf_start = MAX(zs->zs_pf_blkid, blkid + nblks);
+	pf_start = MAX(zs->zs_pf_blkid, end_of_access_blkid);
 
 	/*
 	 * Double our amount of prefetched data, but don't let the
 	 * prefetch get further ahead than zfetch_max_distance.
 	 */
-	int pf_nblks =
-	    MIN((int64_t)zs->zs_pf_blkid - zs->zs_blkid + nblks,
-	    zs->zs_blkid + nblks +
-	    (zfetch_max_distance >> zf->zf_dnode->dn_datablkshift) - pf_start);
+	if (fetch_data) {
+		max_dist_blks =
+		    zfetch_max_distance >> zf->zf_dnode->dn_datablkshift;
+		/*
+		 * Previously, we were (zs_pf_blkid - blkid) ahead.  We
+		 * want to now be double that, so read that amount again,
+		 * plus the amount we are catching up by (i.e. the amount
+		 * read just now).
+		 */
+		pf_ahead_blks = zs->zs_pf_blkid - blkid + nblks;
+		max_blks = max_dist_blks - (pf_start - end_of_access_blkid);
+		pf_nblks = MIN(pf_ahead_blks, max_blks);
+	} else {
+		pf_nblks = 0;
+	}
 
 	zs->zs_pf_blkid = pf_start + pf_nblks;
-	zs->zs_atime = gethrtime();
-	zs->zs_blkid = blkid + nblks;
 
 	/*
-	 * dbuf_prefetch() issues the prefetch i/o
-	 * asynchronously, but it may need to wait for an
-	 * indirect block to be read from disk.  Therefore
-	 * we do not want to hold any locks while we call it.
+	 * Do the same for indirects, starting from where we stopped last,
+	 * or where we will stop reading data blocks (and the indirects
+	 * that point to them).
 	 */
+	ipf_start = MAX(zs->zs_ipf_blkid, zs->zs_pf_blkid);
+	max_dist_blks = zfetch_max_idistance >> zf->zf_dnode->dn_datablkshift;
+	/*
+	 * We want to double our distance ahead of the data prefetch
+	 * (or reader, if we are not prefetching data).  Previously, we
+	 * were (zs_ipf_blkid - blkid) ahead.  To double that, we read
+	 * that amount again, plus the amount we are catching up by
+	 * (i.e. the amount read now + the amount of data prefetched now).
+	 */
+	pf_ahead_blks = zs->zs_ipf_blkid - blkid + nblks + pf_nblks;
+	max_blks = max_dist_blks - (ipf_start - end_of_access_blkid);
+	ipf_nblks = MIN(pf_ahead_blks, max_blks);
+	zs->zs_ipf_blkid = ipf_start + ipf_nblks;
+
+	epbs = zf->zf_dnode->dn_indblkshift - SPA_BLKPTRSHIFT;
+	ipf_istart = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs;
+	ipf_iend = P2ROUNDUP(zs->zs_ipf_blkid, 1 << epbs) >> epbs;
+
+	zs->zs_atime = gethrtime();
+	zs->zs_blkid = end_of_access_blkid;
 	mutex_exit(&zs->zs_lock);
 	rw_exit(&zf->zf_rwlock);
+
+	/*
+	 * dbuf_prefetch() is asynchronous (even when it needs to read
+	 * indirect blocks), but we still prefer to drop our locks before
+	 * calling it to reduce the time we hold them.
+	 */
+
 	for (int i = 0; i < pf_nblks; i++) {
 		dbuf_prefetch(zf->zf_dnode, 0, pf_start + i,
 		    ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH);
 	}
+	for (int64_t iblk = ipf_istart; iblk < ipf_iend; iblk++) {
+		dbuf_prefetch(zf->zf_dnode, 1, iblk,
+		    ZIO_PRIORITY_ASYNC_READ, ARC_FLAG_PREDICTIVE_PREFETCH);
+	}
 	ZFETCHSTAT_BUMP(zfetchstat_hits);
 }
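Finally, to see the hit-path arithmetic end to end, here is a stand-alone sketch (an illustration only, not the kernel code) that replays the doubling logic for one hypothetical hit. The block-size geometry, the local P2ROUNDUP/MIN/MAX definitions, and the mid-stream state are all assumptions chosen for the example: 128 KB data blocks, 16 KB indirect blocks, and 128-byte block pointers, so one level-1 indirect block covers 128 data blocks.

/*
 * Stand-alone illustration of the hit path above (not the kernel code).
 * All geometry, tunable, and stream-state values are assumptions.
 */
#include <stdio.h>
#include <stdint.h>

#define	P2ROUNDUP(x, align)	(-(-(x) & -(align)))
#define	MAX(a, b)		((a) > (b) ? (a) : (b))
#define	MIN(a, b)		((a) < (b) ? (a) : (b))

int
main(void)
{
        /* Assumed geometry: 128 KB data blocks, 16 KB indirects, 128 B blkptrs. */
        int dn_datablkshift = 17, dn_indblkshift = 14, spa_blkptrshift = 7;
        int64_t max_dist_blks = (8 * 1024 * 1024) >> dn_datablkshift;   /* 64 */
        int64_t max_idist_blks = (64 * 1024 * 1024) >> dn_datablkshift; /* 512 */

        /* Hypothetical mid-stream state and the access that just hit it. */
        int64_t zs_pf_blkid = 160, zs_ipf_blkid = 288;
        int64_t blkid = 128, nblks = 32;
        int64_t end_of_access_blkid = blkid + nblks;                    /* 160 */

        /* Data prefetch: double the current lead, capped at max_dist_blks. */
        int64_t pf_start = MAX(zs_pf_blkid, end_of_access_blkid);       /* 160 */
        int64_t pf_ahead_blks = zs_pf_blkid - blkid + nblks;            /* 64 */
        int64_t max_blks = max_dist_blks - (pf_start - end_of_access_blkid);
        int64_t pf_nblks = MIN(pf_ahead_blks, max_blks);                /* 64 */
        zs_pf_blkid = pf_start + pf_nblks;                              /* 224 */

        /* Indirect prefetch: the same doubling, against the larger cap. */
        int64_t ipf_start = MAX(zs_ipf_blkid, zs_pf_blkid);             /* 288 */
        pf_ahead_blks = zs_ipf_blkid - blkid + nblks + pf_nblks;        /* 256 */
        max_blks = max_idist_blks - (ipf_start - end_of_access_blkid);  /* 384 */
        int64_t ipf_nblks = MIN(pf_ahead_blks, max_blks);               /* 256 */
        zs_ipf_blkid = ipf_start + ipf_nblks;                           /* 544 */

        /* Convert the data-block range into level-1 indirect block numbers. */
        int epbs = dn_indblkshift - spa_blkptrshift;                    /* 7 */
        int64_t ipf_istart = P2ROUNDUP(ipf_start, 1 << epbs) >> epbs;   /* 3 */
        int64_t ipf_iend = P2ROUNDUP(zs_ipf_blkid, 1 << epbs) >> epbs;  /* 5 */

        printf("data blocks to prefetch:  [%lld, %lld)\n",
            (long long)pf_start, (long long)zs_pf_blkid);
        printf("L1 indirects to prefetch: [%lld, %lld)\n",
            (long long)ipf_istart, (long long)ipf_iend);
        return (0);
}

Note that ipf_istart and ipf_iend are level-1 block numbers, which is why the second dbuf_prefetch() loop in the diff passes 1 (the indirect level) rather than 0.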