From a3874b8b1fe5103fc1f961609557c0587435fec0 Mon Sep 17 00:00:00 2001 From: Toomas Soome Date: Thu, 4 Oct 2018 00:00:49 +0300 Subject: [PATCH] 10405 Implement ZFS sorted scans Reviewed by: Jerry Jelinek Reviewed by: Kody Kantor Approved by: Dan McDonald --- usr/src/cmd/zdb/zdb.c | 10 +- usr/src/cmd/zpool/zpool_main.c | 131 +- usr/src/cmd/ztest/ztest.c | 6 +- usr/src/lib/libfakekernel/common/mapfile-vers | 1 + usr/src/lib/libfakekernel/common/taskq.c | 6 + usr/src/lib/libzfs/common/libzfs_status.c | 2 +- .../cli_root/zpool_scrub/cleanup.ksh | 2 + .../functional/cli_root/zpool_scrub/setup.ksh | 9 +- .../cli_root/zpool_scrub/zpool_scrub.cfg | 5 +- .../zpool_scrub/zpool_scrub_002_pos.ksh | 16 +- .../zpool_scrub/zpool_scrub_003_pos.ksh | 9 +- .../zpool_scrub/zpool_scrub_004_pos.ksh | 7 +- .../zpool_scrub/zpool_scrub_005_pos.ksh | 4 + .../tests/perf/scripts/prefetch_io.d | 4 +- usr/src/uts/common/fs/zfs/arc.c | 213 +- usr/src/uts/common/fs/zfs/dbuf.c | 18 +- usr/src/uts/common/fs/zfs/ddt.c | 17 +- usr/src/uts/common/fs/zfs/dmu_objset.c | 2 + usr/src/uts/common/fs/zfs/dmu_traverse.c | 3 +- usr/src/uts/common/fs/zfs/dsl_scan.c | 2681 ++++++++++++++--- usr/src/uts/common/fs/zfs/metaslab.c | 82 +- usr/src/uts/common/fs/zfs/range_tree.c | 307 +- usr/src/uts/common/fs/zfs/spa.c | 6 +- usr/src/uts/common/fs/zfs/spa_misc.c | 9 +- usr/src/uts/common/fs/zfs/sys/arc.h | 43 +- usr/src/uts/common/fs/zfs/sys/dsl_pool.h | 1 + usr/src/uts/common/fs/zfs/sys/dsl_scan.h | 49 +- usr/src/uts/common/fs/zfs/sys/range_tree.h | 27 + usr/src/uts/common/fs/zfs/sys/spa_impl.h | 5 +- usr/src/uts/common/fs/zfs/sys/vdev.h | 2 + usr/src/uts/common/fs/zfs/sys/vdev_impl.h | 9 + usr/src/uts/common/fs/zfs/sys/zio.h | 2 + usr/src/uts/common/fs/zfs/vdev.c | 33 + usr/src/uts/common/fs/zfs/vdev_disk.c | 25 +- usr/src/uts/common/fs/zfs/vdev_file.c | 50 +- usr/src/uts/common/fs/zfs/vdev_indirect.c | 25 +- usr/src/uts/common/fs/zfs/vdev_mirror.c | 75 +- usr/src/uts/common/fs/zfs/vdev_missing.c | 50 +- usr/src/uts/common/fs/zfs/vdev_queue.c | 65 +- usr/src/uts/common/fs/zfs/vdev_raidz.c | 63 +- usr/src/uts/common/fs/zfs/vdev_root.c | 25 +- usr/src/uts/common/fs/zfs/zio.c | 73 +- usr/src/uts/common/os/taskq.c | 6 + usr/src/uts/common/sys/fs/zfs.h | 7 +- usr/src/uts/common/sys/taskq.h | 1 + 45 files changed, 3260 insertions(+), 926 deletions(-) diff --git a/usr/src/cmd/zdb/zdb.c b/usr/src/cmd/zdb/zdb.c index 3d0490b73e35..16c8c60df1f9 100644 --- a/usr/src/cmd/zdb/zdb.c +++ b/usr/src/cmd/zdb/zdb.c @@ -2386,8 +2386,6 @@ dump_dir(objset_t *os) max_slot_used = object + dnode_slots - 1; } - ASSERT3U(object_count, ==, usedobjs); - (void) printf("\n"); (void) printf(" Dnode slots:\n"); @@ -2410,6 +2408,8 @@ dump_dir(objset_t *os) leaked_objects); leaked_objects = 0; } + + ASSERT3U(object_count, ==, usedobjs); } static void @@ -2964,7 +2964,7 @@ zdb_blkptr_done(zio_t *zio) abd_free(zio->io_abd); mutex_enter(&spa->spa_scrub_lock); - spa->spa_scrub_inflight--; + spa->spa_load_verify_ios--; cv_broadcast(&spa->spa_scrub_io_cv); if (ioerr && !(zio->io_flags & ZIO_FLAG_SPECULATIVE)) { @@ -3035,9 +3035,9 @@ zdb_blkptr_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, flags |= ZIO_FLAG_SPECULATIVE; mutex_enter(&spa->spa_scrub_lock); - while (spa->spa_scrub_inflight > max_inflight) + while (spa->spa_load_verify_ios > max_inflight) cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); - spa->spa_scrub_inflight++; + spa->spa_load_verify_ios++; mutex_exit(&spa->spa_scrub_lock); zio_nowait(zio_read(NULL, spa, bp, abd, size, diff --git 
a/usr/src/cmd/zpool/zpool_main.c b/usr/src/cmd/zpool/zpool_main.c index 240e74f6da11..013ef1c69935 100644 --- a/usr/src/cmd/zpool/zpool_main.c +++ b/usr/src/cmd/zpool/zpool_main.c @@ -1692,7 +1692,7 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, (void) nvlist_lookup_uint64_array(root, ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &c); - if (ps && ps->pss_state == DSS_SCANNING && + if (ps != NULL && ps->pss_state == DSS_SCANNING && vs->vs_scan_processed != 0 && children == 0) { (void) printf(gettext(" (%s)"), (ps->pss_func == POOL_SCAN_RESILVER) ? @@ -4707,11 +4707,13 @@ static void print_scan_status(pool_scan_stat_t *ps) { time_t start, end, pause; - uint64_t elapsed, mins_left, hours_left; - uint64_t pass_exam, examined, total; - uint_t rate; + uint64_t total_secs_left; + uint64_t elapsed, secs_left, mins_left, hours_left, days_left; + uint64_t pass_scanned, scanned, pass_issued, issued, total; + uint_t scan_rate, issue_rate; double fraction_done; - char processed_buf[7], examined_buf[7], total_buf[7], rate_buf[7]; + char processed_buf[7], scanned_buf[7], issued_buf[7], total_buf[7]; + char srate_buf[7], irate_buf[7]; (void) printf(gettext(" scan: ")); @@ -4725,30 +4727,37 @@ print_scan_status(pool_scan_stat_t *ps) start = ps->pss_start_time; end = ps->pss_end_time; pause = ps->pss_pass_scrub_pause; + zfs_nicenum(ps->pss_processed, processed_buf, sizeof (processed_buf)); assert(ps->pss_func == POOL_SCAN_SCRUB || ps->pss_func == POOL_SCAN_RESILVER); + /* * Scan is finished or canceled. */ if (ps->pss_state == DSS_FINISHED) { - uint64_t minutes_taken = (end - start) / 60; - char *fmt = NULL; + total_secs_left = end - start; + days_left = total_secs_left / 60 / 60 / 24; + hours_left = (total_secs_left / 60 / 60) % 24; + mins_left = (total_secs_left / 60) % 60; + secs_left = (total_secs_left % 60); if (ps->pss_func == POOL_SCAN_SCRUB) { - fmt = gettext("scrub repaired %s in %lluh%um with " - "%llu errors on %s"); + (void) printf(gettext("scrub repaired %s " + "in %llu days %02llu:%02llu:%02llu " + "with %llu errors on %s"), processed_buf, + (u_longlong_t)days_left, (u_longlong_t)hours_left, + (u_longlong_t)mins_left, (u_longlong_t)secs_left, + (u_longlong_t)ps->pss_errors, ctime(&end)); } else if (ps->pss_func == POOL_SCAN_RESILVER) { - fmt = gettext("resilvered %s in %lluh%um with " - "%llu errors on %s"); + (void) printf(gettext("resilvered %s " + "in %llu days %02llu:%02llu:%02llu " + "with %llu errors on %s"), processed_buf, + (u_longlong_t)days_left, (u_longlong_t)hours_left, + (u_longlong_t)mins_left, (u_longlong_t)secs_left, + (u_longlong_t)ps->pss_errors, ctime(&end)); } - /* LINTED */ - (void) printf(fmt, processed_buf, - (u_longlong_t)(minutes_taken / 60), - (uint_t)(minutes_taken % 60), - (u_longlong_t)ps->pss_errors, - ctime((time_t *)&end)); return; } else if (ps->pss_state == DSS_CANCELED) { if (ps->pss_func == POOL_SCAN_SCRUB) { @@ -4763,19 +4772,15 @@ print_scan_status(pool_scan_stat_t *ps) assert(ps->pss_state == DSS_SCANNING); - /* - * Scan is in progress. - */ + /* Scan is in progress. Resilvers can't be paused. 
*/ if (ps->pss_func == POOL_SCAN_SCRUB) { if (pause == 0) { (void) printf(gettext("scrub in progress since %s"), ctime(&start)); } else { - char buf[32]; - struct tm *p = localtime(&pause); - (void) strftime(buf, sizeof (buf), "%a %b %e %T %Y", p); - (void) printf(gettext("scrub paused since %s\n"), buf); - (void) printf(gettext("\tscrub started on %s"), + (void) printf(gettext("scrub paused since %s"), + ctime(&pause)); + (void) printf(gettext("\tscrub started on %s"), ctime(&start)); } } else if (ps->pss_func == POOL_SCAN_RESILVER) { @@ -4783,50 +4788,68 @@ print_scan_status(pool_scan_stat_t *ps) ctime(&start)); } - examined = ps->pss_examined ? ps->pss_examined : 1; + scanned = ps->pss_examined; + pass_scanned = ps->pss_pass_exam; + issued = ps->pss_issued; + pass_issued = ps->pss_pass_issued; total = ps->pss_to_examine; - fraction_done = (double)examined / total; - /* elapsed time for this pass */ + /* we are only done with a block once we have issued the IO for it */ + fraction_done = (double)issued / total; + + /* elapsed time for this pass, rounding up to 1 if it's 0 */ elapsed = time(NULL) - ps->pss_pass_start; elapsed -= ps->pss_pass_scrub_spent_paused; - elapsed = elapsed ? elapsed : 1; - pass_exam = ps->pss_pass_exam ? ps->pss_pass_exam : 1; - rate = pass_exam / elapsed; - rate = rate ? rate : 1; - mins_left = ((total - examined) / rate) / 60; - hours_left = mins_left / 60; - - zfs_nicenum(examined, examined_buf, sizeof (examined_buf)); + elapsed = (elapsed != 0) ? elapsed : 1; + + scan_rate = pass_scanned / elapsed; + issue_rate = pass_issued / elapsed; + total_secs_left = (issue_rate != 0) ? + ((total - issued) / issue_rate) : UINT64_MAX; + + days_left = total_secs_left / 60 / 60 / 24; + hours_left = (total_secs_left / 60 / 60) % 24; + mins_left = (total_secs_left / 60) % 60; + secs_left = (total_secs_left % 60); + + /* format all of the numbers we will be reporting */ + zfs_nicenum(scanned, scanned_buf, sizeof (scanned_buf)); + zfs_nicenum(issued, issued_buf, sizeof (issued_buf)); zfs_nicenum(total, total_buf, sizeof (total_buf)); + zfs_nicenum(scan_rate, srate_buf, sizeof (srate_buf)); + zfs_nicenum(issue_rate, irate_buf, sizeof (irate_buf)); - /* - * do not print estimated time if hours_left is more than 30 days - * or we have a paused scrub - */ + /* do not print estimated time if we have a paused scrub */ if (pause == 0) { - zfs_nicenum(rate, rate_buf, sizeof (rate_buf)); - (void) printf(gettext("\t%s scanned out of %s at %s/s"), - examined_buf, total_buf, rate_buf); - if (hours_left < (30 * 24)) { - (void) printf(gettext(", %lluh%um to go\n"), - (u_longlong_t)hours_left, (uint_t)(mins_left % 60)); - } else { - (void) printf(gettext( - ", (scan is slow, no estimated time)\n")); - } + (void) printf(gettext("\t%s scanned at %s/s, " + "%s issued at %s/s, %s total\n"), + scanned_buf, srate_buf, issued_buf, irate_buf, total_buf); } else { - (void) printf(gettext("\t%s scanned out of %s\n"), - examined_buf, total_buf); + (void) printf(gettext("\t%s scanned, %s issued, %s total\n"), + scanned_buf, issued_buf, total_buf); } if (ps->pss_func == POOL_SCAN_RESILVER) { - (void) printf(gettext(" %s resilvered, %.2f%% done\n"), + (void) printf(gettext("\t%s resilvered, %.2f%% done"), processed_buf, 100 * fraction_done); } else if (ps->pss_func == POOL_SCAN_SCRUB) { - (void) printf(gettext(" %s repaired, %.2f%% done\n"), + (void) printf(gettext("\t%s repaired, %.2f%% done"), processed_buf, 100 * fraction_done); } + + if (pause == 0) { + if (issue_rate >= 10 * 1024 * 1024) { + (void) 
printf(gettext(", %llu days " + "%02llu:%02llu:%02llu to go\n"), + (u_longlong_t)days_left, (u_longlong_t)hours_left, + (u_longlong_t)mins_left, (u_longlong_t)secs_left); + } else { + (void) printf(gettext(", no estimated " + "completion time\n")); + } + } else { + (void) printf(gettext("\n")); + } } /* diff --git a/usr/src/cmd/ztest/ztest.c b/usr/src/cmd/ztest/ztest.c index ef62671b3f76..c0054cddbe07 100644 --- a/usr/src/cmd/ztest/ztest.c +++ b/usr/src/cmd/ztest/ztest.c @@ -397,15 +397,15 @@ ztest_info_t ztest_info[] = { { ztest_fzap, 1, &zopt_sometimes }, { ztest_dmu_snapshot_create_destroy, 1, &zopt_sometimes }, { ztest_spa_create_destroy, 1, &zopt_sometimes }, - { ztest_fault_inject, 1, &zopt_sometimes }, + { ztest_fault_inject, 1, &zopt_incessant }, { ztest_ddt_repair, 1, &zopt_sometimes }, { ztest_dmu_snapshot_hold, 1, &zopt_sometimes }, { ztest_mmp_enable_disable, 1, &zopt_sometimes }, { ztest_reguid, 1, &zopt_rarely }, - { ztest_scrub, 1, &zopt_rarely }, + { ztest_scrub, 1, &zopt_often }, { ztest_spa_upgrade, 1, &zopt_rarely }, { ztest_dsl_dataset_promote_busy, 1, &zopt_rarely }, - { ztest_vdev_attach_detach, 1, &zopt_sometimes }, + { ztest_vdev_attach_detach, 1, &zopt_incessant }, { ztest_vdev_LUN_growth, 1, &zopt_rarely }, { ztest_vdev_add_remove, 1, &ztest_opts.zo_vdevtime }, diff --git a/usr/src/lib/libfakekernel/common/mapfile-vers b/usr/src/lib/libfakekernel/common/mapfile-vers index 69d0055a273b..2a307c7a884f 100644 --- a/usr/src/lib/libfakekernel/common/mapfile-vers +++ b/usr/src/lib/libfakekernel/common/mapfile-vers @@ -216,6 +216,7 @@ SYMBOL_VERSION SUNWprivate_1.1 { taskq_empty; taskq_member; taskq_wait; + taskq_wait_id; thread_create; thread_join; diff --git a/usr/src/lib/libfakekernel/common/taskq.c b/usr/src/lib/libfakekernel/common/taskq.c index 76962f5a64f2..e3922f6c372a 100644 --- a/usr/src/lib/libfakekernel/common/taskq.c +++ b/usr/src/lib/libfakekernel/common/taskq.c @@ -234,6 +234,12 @@ taskq_wait(taskq_t *tq) mutex_exit(&tq->tq_lock); } +void +taskq_wait_id(taskq_t *tq, taskqid_t id __unused) +{ + taskq_wait(tq); +} + static void * taskq_thread(void *arg) { diff --git a/usr/src/lib/libzfs/common/libzfs_status.c b/usr/src/lib/libzfs/common/libzfs_status.c index 7a701b78c6f9..975309c423ca 100644 --- a/usr/src/lib/libzfs/common/libzfs_status.c +++ b/usr/src/lib/libzfs/common/libzfs_status.c @@ -225,7 +225,7 @@ check_status(nvlist_t *config, boolean_t isimport) */ (void) nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_SCAN_STATS, (uint64_t **)&ps, &psc); - if (ps && ps->pss_func == POOL_SCAN_RESILVER && + if (ps != NULL && ps->pss_func == POOL_SCAN_RESILVER && ps->pss_state == DSS_SCANNING) return (ZPOOL_STATUS_RESILVERING); diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_scrub/cleanup.ksh b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_scrub/cleanup.ksh index 80af13ba67c9..b8674764bcdb 100644 --- a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_scrub/cleanup.ksh +++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_scrub/cleanup.ksh @@ -26,7 +26,9 @@ # . $STF_SUITE/include/libtest.shlib +. 
$STF_SUITE/tests/functional/cli_root/zpool_scrub/zpool_scrub.cfg verify_runnable "global" +log_must set_tunable64 zfs_scan_vdev_limit $ZFS_SCAN_VDEV_LIMIT_DEFAULT destroy_mirrors diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_scrub/setup.ksh b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_scrub/setup.ksh index 5db12ba164bd..abf118828eaf 100644 --- a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_scrub/setup.ksh +++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_scrub/setup.ksh @@ -37,11 +37,8 @@ verify_disk_count "$DISKS" 2 default_mirror_setup_noexit $DISK1 $DISK2 -mntpnt=$(get_prop mountpoint $TESTPOOL) -typeset -i i=0 -while ((i < 10)); do - log_must mkfile 500M $mntpnt/bigfile.$i - ((i += 1)) -done +mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS) +# Create 256M of data +log_must file_write -b 1048576 -c 256 -o create -d 0 -f $mntpnt/bigfile log_pass diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub.cfg b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub.cfg index 93e4e171c38c..fdf2f428477f 100644 --- a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub.cfg +++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub.cfg @@ -28,5 +28,8 @@ # Copyright (c) 2012, 2016 by Delphix. All rights reserved. # -export DISK1=$(echo $DISKS | awk '{print $1}') +export DISK1=${DISKS%% *} export DISK2=$(echo $DISKS | awk '{print $2}') + +export ZFS_SCAN_VDEV_LIMIT_SLOW=$((128*1024)) +export ZFS_SCAN_VDEV_LIMIT_DEFAULT=$((4*1024*1024)) diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_002_pos.ksh b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_002_pos.ksh index 26c22fd98a83..712097bb1ca8 100644 --- a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_002_pos.ksh +++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_002_pos.ksh @@ -46,9 +46,9 @@ # 6. Verify zpool scrub -s succeed when the system is scrubbing. # # NOTES: -# A 10ms delay is added to the ZIOs in order to ensure that the -# scrub does not complete before it has a chance to be cancelled. -# This can occur when testing with small pools or very fast hardware. +# Artificially limit the scrub speed by setting the zfs_scan_vdev_limit +# low and adding a 50ms zio delay in order to ensure that the scrub does +# not complete early. # verify_runnable "global" @@ -56,13 +56,21 @@ verify_runnable "global" function cleanup { log_must zinject -c all + log_must set_tunable64 zfs_scan_vdev_limit $ZFS_SCAN_VDEV_LIMIT_DEFAULT + log_must rm -f $mntpnt/biggerfile } log_onexit cleanup log_assert "Verify scrub, scrub -p, and scrub -s show the right status." 
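For reference, the reworked print_scan_status() earlier in this patch bases its estimate on the issue rate rather than the scan rate, breaks the remaining time into days/hh:mm:ss, and suppresses the estimate while the scrub is paused or while the issue rate is below 10 MB/s. The standalone sketch below only restates that arithmetic; the function and parameter names are illustrative, not the zpool code itself.

#include <stdint.h>
#include <stdio.h>

/*
 * Sketch of the ETA math used by the new scan status output: only issued
 * bytes count as "done", and no estimate is printed until at least one
 * byte has been issued in the current pass.
 */
static void
print_scan_eta(uint64_t total, uint64_t issued, uint64_t pass_issued,
    uint64_t elapsed_secs)
{
	uint64_t elapsed = (elapsed_secs != 0) ? elapsed_secs : 1;
	uint64_t issue_rate = pass_issued / elapsed;
	uint64_t remaining = (total > issued) ? total - issued : 0;
	uint64_t secs_left = (issue_rate != 0) ?
	    (remaining / issue_rate) : UINT64_MAX;

	if (secs_left == UINT64_MAX) {
		(void) printf("no estimated completion time\n");
		return;
	}

	(void) printf("%llu days %02llu:%02llu:%02llu to go\n",
	    (unsigned long long)(secs_left / 60 / 60 / 24),
	    (unsigned long long)((secs_left / 60 / 60) % 24),
	    (unsigned long long)((secs_left / 60) % 60),
	    (unsigned long long)(secs_left % 60));
}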
-log_must zinject -d $DISK1 -D20:1 $TESTPOOL +# Create 1G of additional data +mntpnt=$(get_prop mountpoint $TESTPOOL/$TESTFS) +log_must file_write -b 1048576 -c 1024 -o create -d 0 -f $mntpnt/biggerfile +log_must sync + +log_must zinject -d $DISK1 -D50:1 $TESTPOOL +log_must set_tunable64 zfs_scan_vdev_limit $ZFS_SCAN_VDEV_LIMIT_SLOW log_must zpool scrub $TESTPOOL log_must is_pool_scrubbing $TESTPOOL true log_must zpool scrub -p $TESTPOOL diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_003_pos.ksh b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_003_pos.ksh index 2af3efcff727..c52ad84bc513 100644 --- a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_003_pos.ksh +++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_003_pos.ksh @@ -43,23 +43,22 @@ # 2. Kick off a second scrub and verify it fails # # NOTES: -# A 10ms delay is added to the ZIOs in order to ensure that the -# scrub does not complete before it has a chance to be restarted. -# This can occur when testing with small pools or very fast hardware. +# Artificially limit the scrub speed by setting the zfs_scan_vdev_limit +# low in order to ensure that the scrub does not complete early. # verify_runnable "global" function cleanup { - log_must zinject -c all + log_must set_tunable64 zfs_scan_vdev_limit $ZFS_SCAN_VDEV_LIMIT_DEFAULT } log_onexit cleanup log_assert "Scrub command fails when there is already a scrub in progress" -log_must zinject -d $DISK1 -D10:1 $TESTPOOL +log_must set_tunable64 zfs_scan_vdev_limit $ZFS_SCAN_VDEV_LIMIT_SLOW log_must zpool scrub $TESTPOOL log_must is_pool_scrubbing $TESTPOOL true log_mustnot zpool scrub $TESTPOOL diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_004_pos.ksh b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_004_pos.ksh index a2065d477b72..492c7b20aa81 100644 --- a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_004_pos.ksh +++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_004_pos.ksh @@ -42,16 +42,21 @@ # 3. Verify scrub failed until the resilver completed # +function cleanup +{ + log_must set_tunable64 zfs_scan_vdev_limit $ZFS_SCAN_VDEV_LIMIT_DEFAULT +} + verify_runnable "global" log_assert "Resilver prevent scrub from starting until the resilver completes" +log_must set_tunable64 zfs_scan_vdev_limit $ZFS_SCAN_VDEV_LIMIT_SLOW log_must zpool detach $TESTPOOL $DISK2 log_must zpool attach $TESTPOOL $DISK1 $DISK2 log_must is_pool_resilvering $TESTPOOL log_mustnot zpool scrub $TESTPOOL -# Allow the resilver to finish, or it will interfere with the next test. while ! is_pool_resilvered $TESTPOOL; do sleep 1 done diff --git a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_005_pos.ksh b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_005_pos.ksh index 5bc5eedb64dc..8db6ae980208 100644 --- a/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_005_pos.ksh +++ b/usr/src/test/zfs-tests/tests/functional/cli_root/zpool_scrub/zpool_scrub_005_pos.ksh @@ -58,4 +58,8 @@ log_must zpool scrub $TESTPOOL log_must zpool detach $TESTPOOL $DISK1 log_must zpool attach $TESTPOOL $DISK2 $DISK1 +while ! is_pool_resilvered $TESTPOOL; do + sleep 1 +done + log_pass "When scrubbing, detach device should not break system." 
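The zdb hunk near the top of this patch renames spa_scrub_inflight to spa_load_verify_ios but keeps the same throttling idiom: a counter protected by a mutex, with a condition variable that blocks new issuers once the limit is reached and is broadcast from the completion callback. A generic, self-contained sketch of that idiom using POSIX threads follows; the type, field names, and limit semantics are illustrative, not the zdb code.

#include <pthread.h>
#include <stdint.h>

/* Illustrative bounded in-flight I/O counter, mirroring the zdb idiom. */
typedef struct io_throttle {
	pthread_mutex_t	it_lock;
	pthread_cond_t	it_cv;
	uint64_t	it_inflight;	/* I/Os currently outstanding */
	uint64_t	it_max;		/* cap, like zdb's max_inflight */
} io_throttle_t;

/* Called before issuing an I/O: wait for room, then claim a slot. */
static void
throttle_enter(io_throttle_t *it)
{
	(void) pthread_mutex_lock(&it->it_lock);
	while (it->it_inflight >= it->it_max)
		(void) pthread_cond_wait(&it->it_cv, &it->it_lock);
	it->it_inflight++;
	(void) pthread_mutex_unlock(&it->it_lock);
}

/* Called from the I/O completion path: release the slot and wake waiters. */
static void
throttle_exit(io_throttle_t *it)
{
	(void) pthread_mutex_lock(&it->it_lock);
	it->it_inflight--;
	(void) pthread_cond_broadcast(&it->it_cv);
	(void) pthread_mutex_unlock(&it->it_lock);
}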
diff --git a/usr/src/test/zfs-tests/tests/perf/scripts/prefetch_io.d b/usr/src/test/zfs-tests/tests/perf/scripts/prefetch_io.d index 03e6c4111e38..5f49e4046e4c 100644 --- a/usr/src/test/zfs-tests/tests/perf/scripts/prefetch_io.d +++ b/usr/src/test/zfs-tests/tests/perf/scripts/prefetch_io.d @@ -48,9 +48,9 @@ arc_read:arc-demand-hit-predictive-prefetch @c["prefetched_demand_reads"] = count(); } -arc_read:arc-sync-wait-for-async +arc_read:arc-upgrade-sync { - @c["sync_wait_for_async"] = count(); + @c["async_upgrade_sync"] = count(); } arc_read_done:entry diff --git a/usr/src/uts/common/fs/zfs/arc.c b/usr/src/uts/common/fs/zfs/arc.c index 844abbcd5dff..8cd31e7c7caa 100644 --- a/usr/src/uts/common/fs/zfs/arc.c +++ b/usr/src/uts/common/fs/zfs/arc.c @@ -348,7 +348,8 @@ int arc_no_grow_shift = 5; * minimum lifespan of a prefetch block in clock ticks * (initialized in arc_init()) */ -static int arc_min_prefetch_lifespan; +static int zfs_arc_min_prefetch_ms = 1; +static int zfs_arc_min_prescient_prefetch_ms = 6; /* * If this percent of memory is free, don't throttle. @@ -695,8 +696,9 @@ typedef struct arc_stats { kstat_named_t arcstat_meta_limit; kstat_named_t arcstat_meta_max; kstat_named_t arcstat_meta_min; - kstat_named_t arcstat_sync_wait_for_async; + kstat_named_t arcstat_async_upgrade_sync; kstat_named_t arcstat_demand_hit_predictive_prefetch; + kstat_named_t arcstat_demand_hit_prescient_prefetch; } arc_stats_t; static arc_stats_t arc_stats = { @@ -780,8 +782,9 @@ static arc_stats_t arc_stats = { { "arc_meta_limit", KSTAT_DATA_UINT64 }, { "arc_meta_max", KSTAT_DATA_UINT64 }, { "arc_meta_min", KSTAT_DATA_UINT64 }, - { "sync_wait_for_async", KSTAT_DATA_UINT64 }, + { "async_upgrade_sync", KSTAT_DATA_UINT64 }, { "demand_hit_predictive_prefetch", KSTAT_DATA_UINT64 }, + { "demand_hit_prescient_prefetch", KSTAT_DATA_UINT64 }, }; #define ARCSTAT(stat) (arc_stats.stat.value.ui64) @@ -878,22 +881,23 @@ typedef struct arc_callback arc_callback_t; struct arc_callback { void *acb_private; - arc_done_func_t *acb_done; + arc_read_done_func_t *acb_done; arc_buf_t *acb_buf; boolean_t acb_compressed; zio_t *acb_zio_dummy; + zio_t *acb_zio_head; arc_callback_t *acb_next; }; typedef struct arc_write_callback arc_write_callback_t; struct arc_write_callback { - void *awcb_private; - arc_done_func_t *awcb_ready; - arc_done_func_t *awcb_children_ready; - arc_done_func_t *awcb_physdone; - arc_done_func_t *awcb_done; - arc_buf_t *awcb_buf; + void *awcb_private; + arc_write_done_func_t *awcb_ready; + arc_write_done_func_t *awcb_children_ready; + arc_write_done_func_t *awcb_physdone; + arc_write_done_func_t *awcb_done; + arc_buf_t *awcb_buf; }; /* @@ -1019,6 +1023,8 @@ struct arc_buf_hdr { #define HDR_IO_IN_PROGRESS(hdr) ((hdr)->b_flags & ARC_FLAG_IO_IN_PROGRESS) #define HDR_IO_ERROR(hdr) ((hdr)->b_flags & ARC_FLAG_IO_ERROR) #define HDR_PREFETCH(hdr) ((hdr)->b_flags & ARC_FLAG_PREFETCH) +#define HDR_PRESCIENT_PREFETCH(hdr) \ + ((hdr)->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) #define HDR_COMPRESSION_ENABLED(hdr) \ ((hdr)->b_flags & ARC_FLAG_COMPRESSED_ARC) @@ -3249,6 +3255,8 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) { arc_state_t *evicted_state, *state; int64_t bytes_evicted = 0; + int min_lifetime = HDR_PRESCIENT_PREFETCH(hdr) ? 
+ zfs_arc_min_prescient_prefetch_ms : zfs_arc_min_prefetch_ms; ASSERT(MUTEX_HELD(hash_lock)); ASSERT(HDR_HAS_L1HDR(hdr)); @@ -3301,8 +3309,7 @@ arc_evict_hdr(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) /* prefetch buffers have a minimum lifespan */ if (HDR_IO_IN_PROGRESS(hdr) || ((hdr->b_flags & (ARC_FLAG_PREFETCH | ARC_FLAG_INDIRECT)) && - ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < - arc_min_prefetch_lifespan)) { + ddi_get_lbolt() - hdr->b_l1hdr.b_arc_access < min_lifetime * hz)) { ARCSTAT_BUMP(arcstat_evict_skip); return (bytes_evicted); } @@ -4613,13 +4620,15 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) * - move the buffer to the head of the list if this is * another prefetch (to make it less likely to be evicted). */ - if (HDR_PREFETCH(hdr)) { + if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { /* link protected by hash lock */ ASSERT(multilist_link_active( &hdr->b_l1hdr.b_arc_node)); } else { - arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); + arc_hdr_clear_flags(hdr, + ARC_FLAG_PREFETCH | + ARC_FLAG_PRESCIENT_PREFETCH); ARCSTAT_BUMP(arcstat_mru_hits); } hdr->b_l1hdr.b_arc_access = now; @@ -4650,10 +4659,13 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) * MFU state. */ - if (HDR_PREFETCH(hdr)) { + if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { new_state = arc_mru; - if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) - arc_hdr_clear_flags(hdr, ARC_FLAG_PREFETCH); + if (zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) > 0) { + arc_hdr_clear_flags(hdr, + ARC_FLAG_PREFETCH | + ARC_FLAG_PRESCIENT_PREFETCH); + } DTRACE_PROBE1(new_state__mru, arc_buf_hdr_t *, hdr); } else { new_state = arc_mfu; @@ -4674,11 +4686,6 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) * If it was a prefetch, we will explicitly move it to * the head of the list now. */ - if ((HDR_PREFETCH(hdr)) != 0) { - ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt)); - /* link protected by hash_lock */ - ASSERT(multilist_link_active(&hdr->b_l1hdr.b_arc_node)); - } ARCSTAT_BUMP(arcstat_mfu_hits); hdr->b_l1hdr.b_arc_access = ddi_get_lbolt(); } else if (hdr->b_l1hdr.b_state == arc_mfu_ghost) { @@ -4689,12 +4696,11 @@ arc_access(arc_buf_hdr_t *hdr, kmutex_t *hash_lock) * MFU state. */ - if (HDR_PREFETCH(hdr)) { + if (HDR_PREFETCH(hdr) || HDR_PRESCIENT_PREFETCH(hdr)) { /* * This is a prefetch access... * move this block back to the MRU state. 
*/ - ASSERT0(zfs_refcount_count(&hdr->b_l1hdr.b_refcnt)); new_state = arc_mru; } @@ -4760,21 +4766,26 @@ arc_buf_access(arc_buf_t *buf) demand, prefetch, !HDR_ISTYPE_METADATA(hdr), data, metadata, hits); } -/* a generic arc_done_func_t which you can use */ +/* a generic arc_read_done_func_t which you can use */ /* ARGSUSED */ void -arc_bcopy_func(zio_t *zio, arc_buf_t *buf, void *arg) +arc_bcopy_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, + arc_buf_t *buf, void *arg) { - if (zio == NULL || zio->io_error == 0) - bcopy(buf->b_data, arg, arc_buf_size(buf)); + if (buf == NULL) + return; + + bcopy(buf->b_data, arg, arc_buf_size(buf)); arc_buf_destroy(buf, arg); } -/* a generic arc_done_func_t */ +/* a generic arc_read_done_func_t */ void -arc_getbuf_func(zio_t *zio, arc_buf_t *buf, void *arg) +arc_getbuf_func(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, + arc_buf_t *buf, void *arg) { arc_buf_t **bufp = arg; + if (buf == NULL) { ASSERT(zio == NULL || zio->io_error != 0); *bufp = NULL; @@ -4809,7 +4820,6 @@ arc_read_done(zio_t *zio) arc_callback_t *callback_list; arc_callback_t *acb; boolean_t freeable = B_FALSE; - boolean_t no_zio_error = (zio->io_error == 0); /* * The hdr was inserted into hash-table and removed from lists @@ -4835,7 +4845,7 @@ arc_read_done(zio_t *zio) ASSERT3P(hash_lock, !=, NULL); } - if (no_zio_error) { + if (zio->io_error == 0) { /* byteswap if necessary */ if (BP_SHOULD_BYTESWAP(zio->io_bp)) { if (BP_GET_LEVEL(zio->io_bp) > 0) { @@ -4856,7 +4866,8 @@ arc_read_done(zio_t *zio) callback_list = hdr->b_l1hdr.b_acb; ASSERT3P(callback_list, !=, NULL); - if (hash_lock && no_zio_error && hdr->b_l1hdr.b_state == arc_anon) { + if (hash_lock && zio->io_error == 0 && + hdr->b_l1hdr.b_state == arc_anon) { /* * Only call arc_access on anonymous buffers. This is because * if we've issued an I/O for an evicted buffer, we've already @@ -4877,30 +4888,29 @@ arc_read_done(zio_t *zio) if (!acb->acb_done) continue; - /* This is a demand read since prefetches don't use callbacks */ callback_cnt++; - if (no_zio_error) { - int error = arc_buf_alloc_impl(hdr, acb->acb_private, - acb->acb_compressed, zio->io_error == 0, - &acb->acb_buf); - if (error != 0) { - /* - * Decompression failed. Set io_error - * so that when we call acb_done (below), - * we will indicate that the read failed. - * Note that in the unusual case where one - * callback is compressed and another - * uncompressed, we will mark all of them - * as failed, even though the uncompressed - * one can't actually fail. In this case, - * the hdr will not be anonymous, because - * if there are multiple callbacks, it's - * because multiple threads found the same - * arc buf in the hash table. - */ - zio->io_error = error; - } + if (zio->io_error != 0) + continue; + + int error = arc_buf_alloc_impl(hdr, acb->acb_private, + acb->acb_compressed, B_TRUE, &acb->acb_buf); + if (error != 0) { + /* + * Decompression failed. Set io_error + * so that when we call acb_done (below), + * we will indicate that the read failed. + * Note that in the unusual case where one + * callback is compressed and another + * uncompressed, we will mark all of them + * as failed, even though the uncompressed + * one can't actually fail. In this case, + * the hdr will not be anonymous, because + * if there are multiple callbacks, it's + * because multiple threads found the same + * arc buf in the hash table. 
+ */ + zio->io_error = error; } } /* @@ -4923,7 +4933,7 @@ arc_read_done(zio_t *zio) ASSERT(zfs_refcount_is_zero(&hdr->b_l1hdr.b_refcnt) || callback_list != NULL); - if (no_zio_error) { + if (zio->io_error == 0) { arc_hdr_verify(hdr, zio->io_bp); } else { arc_hdr_set_flags(hdr, ARC_FLAG_IO_ERROR); @@ -4966,7 +4976,8 @@ arc_read_done(zio_t *zio) arc_buf_destroy(acb->acb_buf, acb->acb_private); acb->acb_buf = NULL; } - acb->acb_done(zio, acb->acb_buf, acb->acb_private); + acb->acb_done(zio, &zio->io_bookmark, zio->io_bp, + acb->acb_buf, acb->acb_private); } if (acb->acb_zio_dummy != NULL) { @@ -5001,7 +5012,7 @@ arc_read_done(zio_t *zio) * for readers of this block. */ int -arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, +arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_read_done_func_t *done, void *private, zio_priority_t priority, int zio_flags, arc_flags_t *arc_flags, const zbookmark_phys_t *zb) { @@ -5010,6 +5021,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, zio_t *rzio; uint64_t guid = spa_load_guid(spa); boolean_t compressed_read = (zio_flags & ZIO_FLAG_RAW) != 0; + int rc = 0; ASSERT(!BP_IS_EMBEDDED(bp) || BPE_GET_ETYPE(bp) == BP_EMBEDDED_TYPE_DATA); @@ -5028,32 +5040,20 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, *arc_flags |= ARC_FLAG_CACHED; if (HDR_IO_IN_PROGRESS(hdr)) { + zio_t *head_zio = hdr->b_l1hdr.b_acb->acb_zio_head; + ASSERT3P(head_zio, !=, NULL); if ((hdr->b_flags & ARC_FLAG_PRIO_ASYNC_READ) && priority == ZIO_PRIORITY_SYNC_READ) { /* - * This sync read must wait for an - * in-progress async read (e.g. a predictive - * prefetch). Async reads are queued - * separately at the vdev_queue layer, so - * this is a form of priority inversion. - * Ideally, we would "inherit" the demand - * i/o's priority by moving the i/o from - * the async queue to the synchronous queue, - * but there is currently no mechanism to do - * so. Track this so that we can evaluate - * the magnitude of this potential performance - * problem. - * - * Note that if the prefetch i/o is already - * active (has been issued to the device), - * the prefetch improved performance, because - * we issued it sooner than we would have - * without the prefetch. + * This is a sync read that needs to wait for + * an in-flight async read. Request that the + * zio have its priority upgraded. */ - DTRACE_PROBE1(arc__sync__wait__for__async, + zio_change_priority(head_zio, priority); + DTRACE_PROBE1(arc__async__upgrade__sync, arc_buf_hdr_t *, hdr); - ARCSTAT_BUMP(arcstat_sync_wait_for_async); + ARCSTAT_BUMP(arcstat_async_upgrade_sync); } if (hdr->b_flags & ARC_FLAG_PREDICTIVE_PREFETCH) { arc_hdr_clear_flags(hdr, @@ -5080,6 +5080,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, spa, NULL, NULL, NULL, zio_flags); ASSERT3P(acb->acb_done, !=, NULL); + acb->acb_zio_head = head_zio; acb->acb_next = hdr->b_l1hdr.b_acb; hdr->b_l1hdr.b_acb = acb; mutex_exit(hash_lock); @@ -5107,17 +5108,33 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, arc_hdr_clear_flags(hdr, ARC_FLAG_PREDICTIVE_PREFETCH); } + + if (hdr->b_flags & ARC_FLAG_PRESCIENT_PREFETCH) { + ARCSTAT_BUMP( + arcstat_demand_hit_prescient_prefetch); + arc_hdr_clear_flags(hdr, + ARC_FLAG_PRESCIENT_PREFETCH); + } + ASSERT(!BP_IS_EMBEDDED(bp) || !BP_IS_HOLE(bp)); /* Get a buf with the desired data in it. 
*/ - VERIFY0(arc_buf_alloc_impl(hdr, private, - compressed_read, B_TRUE, &buf)); + rc = arc_buf_alloc_impl(hdr, private, + compressed_read, B_TRUE, &buf); + if (rc != 0) { + arc_buf_destroy(buf, private); + buf = NULL; + } + ASSERT((zio_flags & ZIO_FLAG_SPECULATIVE) || + rc == 0 || rc != ENOENT); } else if (*arc_flags & ARC_FLAG_PREFETCH && zfs_refcount_count(&hdr->b_l1hdr.b_refcnt) == 0) { arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); } DTRACE_PROBE1(arc__hit, arc_buf_hdr_t *, hdr); arc_access(hdr, hash_lock); + if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) + arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); if (*arc_flags & ARC_FLAG_L2CACHE) arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); mutex_exit(hash_lock); @@ -5127,7 +5144,7 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, data, metadata, hits); if (done) - done(NULL, buf, private); + done(NULL, zb, bp, buf, private); } else { uint64_t lsize = BP_GET_LSIZE(bp); uint64_t psize = BP_GET_PSIZE(bp); @@ -5201,6 +5218,9 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, if (*arc_flags & ARC_FLAG_PREFETCH) arc_hdr_set_flags(hdr, ARC_FLAG_PREFETCH); + if (*arc_flags & ARC_FLAG_PRESCIENT_PREFETCH) + arc_hdr_set_flags(hdr, ARC_FLAG_PRESCIENT_PREFETCH); + if (*arc_flags & ARC_FLAG_L2CACHE) arc_hdr_set_flags(hdr, ARC_FLAG_L2CACHE); if (BP_GET_LEVEL(bp) > 0) @@ -5230,14 +5250,17 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, vd = NULL; } - if (priority == ZIO_PRIORITY_ASYNC_READ) + /* + * We count both async reads and scrub IOs as asynchronous so + * that both can be upgraded in the event of a cache hit while + * the read IO is still in-flight. + */ + if (priority == ZIO_PRIORITY_ASYNC_READ || + priority == ZIO_PRIORITY_SCRUB) arc_hdr_set_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); else arc_hdr_clear_flags(hdr, ARC_FLAG_PRIO_ASYNC_READ); - if (hash_lock != NULL) - mutex_exit(hash_lock); - /* * At this point, we have a level 1 cache miss. Try again in * L2ARC if possible. 
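The arc_read() changes above make the head zio of an in-progress read reachable through the callback list (acb_zio_head), so a synchronous reader that finds an async or scrub read already in flight can upgrade that zio's priority via zio_change_priority() instead of merely recording the priority inversion. A self-contained sketch of the general pattern follows; the request type, enum, and names are illustrative stand-ins, not the ARC or zio structures.

#include <stdint.h>

/* Higher enum value means more urgent; ordering here is illustrative. */
typedef enum { PRIO_SCRUB, PRIO_ASYNC_READ, PRIO_SYNC_READ } req_prio_t;

/* Illustrative in-flight request; in the ARC this role is played by the zio. */
typedef struct request {
	req_prio_t	r_prio;
	int		r_inflight;	/* nonzero once handed to the device */
} request_t;

/*
 * When a waiter with a more urgent priority attaches to a request that is
 * already in flight, raise the request's priority so lower layers can
 * reorder it (the patch does this with zio_change_priority() when a sync
 * read hits a header flagged ARC_FLAG_PRIO_ASYNC_READ).
 */
static void
request_attach_waiter(request_t *req, req_prio_t waiter_prio)
{
	if (req->r_inflight && waiter_prio > req->r_prio)
		req->r_prio = waiter_prio;
}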
@@ -5307,6 +5330,11 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, ZIO_FLAG_CANFAIL | ZIO_FLAG_DONT_PROPAGATE | ZIO_FLAG_DONT_RETRY, B_FALSE); + acb->acb_zio_head = rzio; + + if (hash_lock != NULL) + mutex_exit(hash_lock); + DTRACE_PROBE2(l2arc__read, vdev_t *, vd, zio_t *, rzio); ARCSTAT_INCR(arcstat_l2_read_bytes, size); @@ -5321,6 +5349,8 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, return (0); /* l2arc read error; goto zio_read() */ + if (hash_lock != NULL) + mutex_enter(hash_lock); } else { DTRACE_PROBE1(l2arc__miss, arc_buf_hdr_t *, hdr); @@ -5341,6 +5371,10 @@ arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, arc_done_func_t *done, rzio = zio_read(pio, spa, bp, hdr->b_l1hdr.b_pabd, size, arc_read_done, hdr, priority, zio_flags, zb); + acb->acb_zio_head = rzio; + + if (hash_lock != NULL) + mutex_exit(hash_lock); if (*arc_flags & ARC_FLAG_WAIT) return (zio_wait(rzio)); @@ -5828,9 +5862,9 @@ arc_write_done(zio_t *zio) zio_t * arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, - boolean_t l2arc, const zio_prop_t *zp, arc_done_func_t *ready, - arc_done_func_t *children_ready, arc_done_func_t *physdone, - arc_done_func_t *done, void *private, zio_priority_t priority, + boolean_t l2arc, const zio_prop_t *zp, arc_write_done_func_t *ready, + arc_write_done_func_t *children_ready, arc_write_done_func_t *physdone, + arc_write_done_func_t *done, void *private, zio_priority_t priority, int zio_flags, const zbookmark_phys_t *zb) { arc_buf_hdr_t *hdr = buf->b_hdr; @@ -6242,9 +6276,6 @@ arc_init(void) mutex_init(&arc_adjust_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&arc_adjust_waiters_cv, NULL, CV_DEFAULT, NULL); - /* Convert seconds to clock ticks */ - arc_min_prefetch_lifespan = 1 * hz; - /* set min cache to 1/32 of all memory, or 64MB, whichever is more */ arc_c_min = MAX(allmem / 32, 64 << 20); /* set max to 3/4 of all memory, or all but 1GB, whichever is more */ diff --git a/usr/src/uts/common/fs/zfs/dbuf.c b/usr/src/uts/common/fs/zfs/dbuf.c index 7421ea291b71..b9d6ca26fe31 100644 --- a/usr/src/uts/common/fs/zfs/dbuf.c +++ b/usr/src/uts/common/fs/zfs/dbuf.c @@ -942,7 +942,8 @@ dbuf_whichblock(dnode_t *dn, int64_t level, uint64_t offset) } static void -dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) +dbuf_read_done(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, + arc_buf_t *buf, void *vdb) { dmu_buf_impl_t *db = vdb; @@ -961,15 +962,19 @@ dbuf_read_done(zio_t *zio, arc_buf_t *buf, void *vdb) ASSERT3P(db->db_buf, ==, NULL); db->db_state = DB_UNCACHED; } else if (db->db_level == 0 && db->db_freed_in_flight) { - /* freed in flight */ + /* we were freed in flight; disregard any error */ ASSERT(zio == NULL || zio->io_error == 0); + if (buf == NULL) { + buf = arc_alloc_buf(db->db_objset->os_spa, + db, DBUF_GET_BUFC_TYPE(db), db->db.db_size); + } arc_release(buf, db); bzero(buf->b_data, db->db.db_size); arc_buf_freeze(buf); db->db_freed_in_flight = FALSE; dbuf_set_data(db, buf); db->db_state = DB_CACHED; - } else { + } else if (buf != NULL) { /* success */ ASSERT(zio == NULL || zio->io_error == 0); dbuf_set_data(db, buf); @@ -2395,7 +2400,8 @@ dbuf_issue_final_prefetch(dbuf_prefetch_arg_t *dpa, blkptr_t *bp) * prefetch if the next block down is our target. 
*/ static void -dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private) +dbuf_prefetch_indirect_done(zio_t *zio, const zbookmark_phys_t *zb, + const blkptr_t *iobp, arc_buf_t *abuf, void *private) { dbuf_prefetch_arg_t *dpa = private; @@ -2442,11 +2448,11 @@ dbuf_prefetch_indirect_done(zio_t *zio, arc_buf_t *abuf, void *private) } dpa->dpa_curlevel--; - uint64_t nextblkid = dpa->dpa_zb.zb_blkid >> (dpa->dpa_epbs * (dpa->dpa_curlevel - dpa->dpa_zb.zb_level)); blkptr_t *bp = ((blkptr_t *)abuf->b_data) + P2PHASE(nextblkid, 1ULL << dpa->dpa_epbs); + if (BP_IS_HOLE(bp)) { kmem_free(dpa, sizeof (*dpa)); } else if (dpa->dpa_curlevel == dpa->dpa_zb.zb_level) { @@ -3854,7 +3860,7 @@ dbuf_write(dbuf_dirty_record_t *dr, arc_buf_t *data, dmu_tx_t *tx) * ready callback so that we can properly handle an indirect * block that only contains holes. */ - arc_done_func_t *children_ready_cb = NULL; + arc_write_done_func_t *children_ready_cb = NULL; if (db->db_level != 0) children_ready_cb = dbuf_write_children_ready; diff --git a/usr/src/uts/common/fs/zfs/ddt.c b/usr/src/uts/common/fs/zfs/ddt.c index 963ecbd58fa7..1d51329511c5 100644 --- a/usr/src/uts/common/fs/zfs/ddt.c +++ b/usr/src/uts/common/fs/zfs/ddt.c @@ -1106,14 +1106,26 @@ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg) void ddt_sync(spa_t *spa, uint64_t txg) { + dsl_scan_t *scn = spa->spa_dsl_pool->dp_scan; dmu_tx_t *tx; - zio_t *rio = zio_root(spa, NULL, NULL, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SELF_HEAL); + zio_t *rio; ASSERT(spa_syncing_txg(spa) == txg); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); + rio = zio_root(spa, NULL, NULL, + ZIO_FLAG_CANFAIL | ZIO_FLAG_SPECULATIVE | ZIO_FLAG_SELF_HEAL); + + /* + * This function may cause an immediate scan of ddt blocks (see + * the comment above dsl_scan_ddt() for details). We set the + * scan's root zio here so that we can wait for any scan IOs in + * addition to the regular ddt IOs. + */ + ASSERT3P(scn->scn_zio_root, ==, NULL); + scn->scn_zio_root = rio; + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { ddt_t *ddt = spa->spa_ddt[c]; if (ddt == NULL) @@ -1123,6 +1135,7 @@ ddt_sync(spa_t *spa, uint64_t txg) } (void) zio_wait(rio); + scn->scn_zio_root = NULL; dmu_tx_commit(tx); } diff --git a/usr/src/uts/common/fs/zfs/dmu_objset.c b/usr/src/uts/common/fs/zfs/dmu_objset.c index d83153935f18..771b803973f2 100644 --- a/usr/src/uts/common/fs/zfs/dmu_objset.c +++ b/usr/src/uts/common/fs/zfs/dmu_objset.c @@ -398,6 +398,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, ASSERT(ds == NULL || MUTEX_HELD(&ds->ds_opening_lock)); +#if 0 /* * The $ORIGIN dataset (if it exists) doesn't have an associated * objset, so there's no reason to open it. 
The $ORIGIN dataset @@ -408,6 +409,7 @@ dmu_objset_open_impl(spa_t *spa, dsl_dataset_t *ds, blkptr_t *bp, ASSERT3P(ds->ds_dir, !=, spa_get_dsl(spa)->dp_origin_snap->ds_dir); } +#endif os = kmem_zalloc(sizeof (objset_t), KM_SLEEP); os->os_dsl_dataset = ds; diff --git a/usr/src/uts/common/fs/zfs/dmu_traverse.c b/usr/src/uts/common/fs/zfs/dmu_traverse.c index a95bf44fefe1..f57e51053060 100644 --- a/usr/src/uts/common/fs/zfs/dmu_traverse.c +++ b/usr/src/uts/common/fs/zfs/dmu_traverse.c @@ -492,7 +492,8 @@ traverse_prefetcher(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, const zbookmark_phys_t *zb, const dnode_phys_t *dnp, void *arg) { prefetch_data_t *pfd = arg; - arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; + arc_flags_t aflags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH | + ARC_FLAG_PRESCIENT_PREFETCH; ASSERT(pfd->pd_bytes_fetched >= 0); if (bp == NULL) diff --git a/usr/src/uts/common/fs/zfs/dsl_scan.c b/usr/src/uts/common/fs/zfs/dsl_scan.c index c19e43bd9fa7..00bd1498a2fd 100644 --- a/usr/src/uts/common/fs/zfs/dsl_scan.c +++ b/usr/src/uts/common/fs/zfs/dsl_scan.c @@ -51,34 +51,157 @@ #include #include #include +#include #ifdef _KERNEL #include #endif +/* + * Grand theory statement on scan queue sorting + * + * Scanning is implemented by recursively traversing all indirection levels + * in an object and reading all blocks referenced from said objects. This + * results in us approximately traversing the object from lowest logical + * offset to the highest. For best performance, we would want the logical + * blocks to be physically contiguous. However, this is frequently not the + * case with pools given the allocation patterns of copy-on-write filesystems. + * So instead, we put the I/Os into a reordering queue and issue them in a + * way that will most benefit physical disks (LBA-order). + * + * Queue management: + * + * Ideally, we would want to scan all metadata and queue up all block I/O + * prior to starting to issue it, because that allows us to do an optimal + * sorting job. This can however consume large amounts of memory. Therefore + * we continuously monitor the size of the queues and constrain them to 5% + * (zfs_scan_mem_lim_fact) of physmem. If the queues grow larger than this + * limit, we clear out a few of the largest extents at the head of the queues + * to make room for more scanning. Hopefully, these extents will be fairly + * large and contiguous, allowing us to approach sequential I/O throughput + * even without a fully sorted tree. + * + * Metadata scanning takes place in dsl_scan_visit(), which is called from + * dsl_scan_sync() every spa_sync(). If we have either fully scanned all + * metadata on the pool, or we need to make room in memory because our + * queues are too large, dsl_scan_visit() is postponed and + * scan_io_queues_run() is called from dsl_scan_sync() instead. This implies + * that metadata scanning and queued I/O issuing are mutually exclusive. This + * allows us to provide maximum sequential I/O throughput for the majority of + * I/O's issued since sequential I/O performance is significantly negatively + * impacted if it is interleaved with random I/O. + * + * Implementation Notes + * + * One side effect of the queued scanning algorithm is that the scanning code + * needs to be notified whenever a block is freed. This is needed to allow + * the scanning code to remove these I/Os from the issuing queue. 
Additionally, + * we do not attempt to queue gang blocks to be issued sequentially since this + * is very hard to do and would have an extremely limited performance benefit. + * Instead, we simply issue gang I/Os as soon as we find them using the legacy + * algorithm. + * + * Backwards compatibility + * + * This new algorithm is backwards compatible with the legacy on-disk data + * structures (and therefore does not require a new feature flag). + * Periodically during scanning (see zfs_scan_checkpoint_intval), the scan + * will stop scanning metadata (in logical order) and wait for all outstanding + * sorted I/O to complete. Once this is done, we write out a checkpoint + * bookmark, indicating that we have scanned everything logically before it. + * If the pool is imported on a machine without the new sorting algorithm, + * the scan simply resumes from the last checkpoint using the legacy algorithm. + */ + typedef int (scan_cb_t)(dsl_pool_t *, const blkptr_t *, const zbookmark_phys_t *); static scan_cb_t dsl_scan_scrub_cb; -static void dsl_scan_cancel_sync(void *, dmu_tx_t *); -static void dsl_scan_sync_state(dsl_scan_t *, dmu_tx_t *); -static boolean_t dsl_scan_restarting(dsl_scan_t *, dmu_tx_t *); +static int scan_ds_queue_compare(const void *a, const void *b); +static int scan_prefetch_queue_compare(const void *a, const void *b); +static void scan_ds_queue_clear(dsl_scan_t *scn); +static boolean_t scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj, + uint64_t *txg); +static void scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg); +static void scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj); +static void scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx); + +extern int zfs_vdev_async_write_active_min_dirty_percent; + +/* + * By default zfs will check to ensure it is not over the hard memory + * limit before each txg. If finer-grained control of this is needed + * this value can be set to 1 to enable checking before scanning each + * block. + */ +int zfs_scan_strict_mem_lim = B_FALSE; + +/* + * Maximum number of parallelly executing I/Os per top-level vdev. + * Tune with care. Very high settings (hundreds) are known to trigger + * some firmware bugs and resets on certain SSDs. + */ int zfs_top_maxinflight = 32; /* maximum I/Os per top-level */ -int zfs_resilver_delay = 2; /* number of ticks to delay resilver */ -int zfs_scrub_delay = 4; /* number of ticks to delay scrub */ -int zfs_scan_idle = 50; /* idle window in clock ticks */ - -int zfs_scan_min_time_ms = 1000; /* min millisecs to scrub per txg */ -int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */ -int zfs_obsolete_min_time_ms = 500; /* min millisecs to obsolete per txg */ -int zfs_resilver_min_time_ms = 3000; /* min millisecs to resilver per txg */ +unsigned int zfs_resilver_delay = 2; /* number of ticks to delay resilver */ +unsigned int zfs_scrub_delay = 4; /* number of ticks to delay scrub */ +unsigned int zfs_scan_idle = 50; /* idle window in clock ticks */ + +/* + * Maximum number of parallelly executed bytes per leaf vdev. We attempt + * to strike a balance here between keeping the vdev queues full of I/Os + * at all times and not overflowing the queues to cause long latency, + * which would cause long txg sync times. No matter what, we will not + * overload the drives with I/O, since that is protected by + * zfs_vdev_scrub_max_active. 
+ */ +unsigned long zfs_scan_vdev_limit = 4 << 20; + +int zfs_scan_issue_strategy = 0; +int zfs_scan_legacy = B_FALSE; /* don't queue & sort zios, go direct */ +uint64_t zfs_scan_max_ext_gap = 2 << 20; /* in bytes */ + +unsigned int zfs_scan_checkpoint_intval = 7200; /* seconds */ +#define ZFS_SCAN_CHECKPOINT_INTVAL SEC_TO_TICK(zfs_scan_checkpoint_intval) + +/* + * fill_weight is non-tunable at runtime, so we copy it at module init from + * zfs_scan_fill_weight. Runtime adjustments to zfs_scan_fill_weight would + * break queue sorting. + */ +uint64_t zfs_scan_fill_weight = 3; +static uint64_t fill_weight; + +/* See dsl_scan_should_clear() for details on the memory limit tunables */ +uint64_t zfs_scan_mem_lim_min = 16 << 20; /* bytes */ +uint64_t zfs_scan_mem_lim_soft_max = 128 << 20; /* bytes */ +int zfs_scan_mem_lim_fact = 20; /* fraction of physmem */ +int zfs_scan_mem_lim_soft_fact = 20; /* fraction of mem lim above */ + +unsigned int zfs_scrub_min_time_ms = 1000; /* min millisecs to scrub per txg */ +unsigned int zfs_free_min_time_ms = 1000; /* min millisecs to free per txg */ +/* min millisecs to obsolete per txg */ +unsigned int zfs_obsolete_min_time_ms = 500; +/* min millisecs to resilver per txg */ +unsigned int zfs_resilver_min_time_ms = 3000; boolean_t zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */ boolean_t zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */ enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE; -int dsl_scan_delay_completion = B_FALSE; /* set to delay scan completion */ /* max number of blocks to free in a single TXG */ uint64_t zfs_async_block_max_blocks = UINT64_MAX; +/* + * We wait a few txgs after importing a pool to begin scanning so that + * the import / mounting code isn't held up by scrub / resilver IO. + * Unfortunately, it is a bit difficult to determine exactly how long + * this will take since userspace will trigger fs mounts asynchronously + * and the kernel will create zvol minors asynchronously. As a result, + * the value provided here is a bit arbitrary, but represents a + * reasonable estimate of how many txgs it will take to finish fully + * importing a pool + */ +#define SCAN_IMPORT_WAIT_TXGS 5 + + #define DSL_SCAN_IS_SCRUB_RESILVER(scn) \ ((scn)->scn_phys.scn_func == POOL_SCAN_SCRUB || \ (scn)->scn_phys.scn_func == POOL_SCAN_RESILVER) @@ -97,6 +220,163 @@ static scan_cb_t *scan_funcs[POOL_SCAN_FUNCS] = { dsl_scan_scrub_cb, /* POOL_SCAN_RESILVER */ }; +/* In core node for the scn->scn_queue. Represents a dataset to be scanned */ +typedef struct { + uint64_t sds_dsobj; + uint64_t sds_txg; + avl_node_t sds_node; +} scan_ds_t; + +/* + * This controls what conditions are placed on dsl_scan_sync_state(): + * SYNC_OPTIONAL) write out scn_phys iff scn_bytes_pending == 0 + * SYNC_MANDATORY) write out scn_phys always. scn_bytes_pending must be 0. + * SYNC_CACHED) if scn_bytes_pending == 0, write out scn_phys. Otherwise + * write out the scn_phys_cached version. + * See dsl_scan_sync_state for details. + */ +typedef enum { + SYNC_OPTIONAL, + SYNC_MANDATORY, + SYNC_CACHED +} state_sync_type_t; + +/* + * This struct represents the minimum information needed to reconstruct a + * zio for sequential scanning. This is useful because many of these will + * accumulate in the sequential IO queues before being issued, so saving + * memory matters here. 
+ */ +typedef struct scan_io { + /* fields from blkptr_t */ + uint64_t sio_offset; + uint64_t sio_blk_prop; + uint64_t sio_phys_birth; + uint64_t sio_birth; + zio_cksum_t sio_cksum; + uint32_t sio_asize; + + /* fields from zio_t */ + int sio_flags; + zbookmark_phys_t sio_zb; + + /* members for queue sorting */ + union { + avl_node_t sio_addr_node; /* link into issueing queue */ + list_node_t sio_list_node; /* link for issuing to disk */ + } sio_nodes; +} scan_io_t; + +struct dsl_scan_io_queue { + dsl_scan_t *q_scn; /* associated dsl_scan_t */ + vdev_t *q_vd; /* top-level vdev that this queue represents */ + + /* trees used for sorting I/Os and extents of I/Os */ + range_tree_t *q_exts_by_addr; + avl_tree_t q_exts_by_size; + avl_tree_t q_sios_by_addr; + + /* members for zio rate limiting */ + uint64_t q_maxinflight_bytes; + uint64_t q_inflight_bytes; + kcondvar_t q_zio_cv; /* used under vd->vdev_scan_io_queue_lock */ + + /* per txg statistics */ + uint64_t q_total_seg_size_this_txg; + uint64_t q_segs_this_txg; + uint64_t q_total_zio_size_this_txg; + uint64_t q_zios_this_txg; +}; + +/* private data for dsl_scan_prefetch_cb() */ +typedef struct scan_prefetch_ctx { + zfs_refcount_t spc_refcnt; /* refcount for memory management */ + dsl_scan_t *spc_scn; /* dsl_scan_t for the pool */ + boolean_t spc_root; /* is this prefetch for an objset? */ + uint8_t spc_indblkshift; /* dn_indblkshift of current dnode */ + uint16_t spc_datablkszsec; /* dn_idatablkszsec of current dnode */ +} scan_prefetch_ctx_t; + +/* private data for dsl_scan_prefetch() */ +typedef struct scan_prefetch_issue_ctx { + avl_node_t spic_avl_node; /* link into scn->scn_prefetch_queue */ + scan_prefetch_ctx_t *spic_spc; /* spc for the callback */ + blkptr_t spic_bp; /* bp to prefetch */ + zbookmark_phys_t spic_zb; /* bookmark to prefetch */ +} scan_prefetch_issue_ctx_t; + +static void scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags, + const zbookmark_phys_t *zb, dsl_scan_io_queue_t *queue); +static void scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, + scan_io_t *sio); + +static dsl_scan_io_queue_t *scan_io_queue_create(vdev_t *vd); +static void scan_io_queues_destroy(dsl_scan_t *scn); + +static kmem_cache_t *sio_cache; + +void +scan_init(void) +{ + /* + * This is used in ext_size_compare() to weight segments + * based on how sparse they are. This cannot be changed + * mid-scan and the tree comparison functions don't currently + * have a mechansim for passing additional context to the + * compare functions. 
Thus we store this value globally and + * we only allow it to be set at module intiailization time + */ + fill_weight = zfs_scan_fill_weight; + + sio_cache = kmem_cache_create("sio_cache", + sizeof (scan_io_t), 0, NULL, NULL, NULL, NULL, NULL, 0); +} + +void +scan_fini(void) +{ + kmem_cache_destroy(sio_cache); +} + +static inline boolean_t +dsl_scan_is_running(const dsl_scan_t *scn) +{ + return (scn->scn_phys.scn_state == DSS_SCANNING); +} + +boolean_t +dsl_scan_resilvering(dsl_pool_t *dp) +{ + return (dsl_scan_is_running(dp->dp_scan) && + dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER); +} + +static inline void +sio2bp(const scan_io_t *sio, blkptr_t *bp, uint64_t vdev_id) +{ + bzero(bp, sizeof (*bp)); + DVA_SET_ASIZE(&bp->blk_dva[0], sio->sio_asize); + DVA_SET_VDEV(&bp->blk_dva[0], vdev_id); + DVA_SET_OFFSET(&bp->blk_dva[0], sio->sio_offset); + bp->blk_prop = sio->sio_blk_prop; + bp->blk_phys_birth = sio->sio_phys_birth; + bp->blk_birth = sio->sio_birth; + bp->blk_fill = 1; /* we always only work with data pointers */ + bp->blk_cksum = sio->sio_cksum; +} + +static inline void +bp2sio(const blkptr_t *bp, scan_io_t *sio, int dva_i) +{ + /* we discard the vdev id, since we can deduce it from the queue */ + sio->sio_offset = DVA_GET_OFFSET(&bp->blk_dva[dva_i]); + sio->sio_asize = DVA_GET_ASIZE(&bp->blk_dva[dva_i]); + sio->sio_blk_prop = bp->blk_prop; + sio->sio_phys_birth = bp->blk_phys_birth; + sio->sio_birth = bp->blk_birth; + sio->sio_cksum = bp->blk_cksum; +} + int dsl_scan_init(dsl_pool_t *dp, uint64_t txg) { @@ -117,6 +397,13 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg) scn->scn_async_destroying = spa_feature_is_active(dp->dp_spa, SPA_FEATURE_ASYNC_DESTROY); + bcopy(&scn->scn_phys, &scn->scn_phys_cached, sizeof (scn->scn_phys)); + avl_create(&scn->scn_queue, scan_ds_queue_compare, sizeof (scan_ds_t), + offsetof(scan_ds_t, sds_node)); + avl_create(&scn->scn_prefetch_queue, scan_prefetch_queue_compare, + sizeof (scan_prefetch_issue_ctx_t), + offsetof(scan_prefetch_issue_ctx_t, spic_avl_node)); + err = zap_lookup(dp->dp_meta_objset, DMU_POOL_DIRECTORY_OBJECT, "scrub_func", sizeof (uint64_t), 1, &f); if (err == 0) { @@ -127,7 +414,7 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg) scn->scn_restart_txg = txg; zfs_dbgmsg("old-style scrub was in progress; " "restarting new-style scrub in txg %llu", - scn->scn_restart_txg); + (longlong_t)scn->scn_restart_txg); /* * Load the queue obj from the old location so that it @@ -145,7 +432,14 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg) else if (err) return (err); - if (scn->scn_phys.scn_state == DSS_SCANNING && + /* + * We might be restarting after a reboot, so jump the issued + * counter to how far we've scanned. We know we're consistent + * up to here. 
+ */ + scn->scn_issued_before_pass = scn->scn_phys.scn_examined; + + if (dsl_scan_is_running(scn) && spa_prev_software_version(dp->dp_spa) < SPA_VERSION_SCAN) { /* * A new-type scrub was in progress on an old @@ -157,8 +451,24 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg) scn->scn_restart_txg = txg; zfs_dbgmsg("new-style scrub was modified " "by old software; restarting in txg %llu", - scn->scn_restart_txg); + (longlong_t)scn->scn_restart_txg); + } + } + + /* reload the queue into the in-core state */ + if (scn->scn_phys.scn_queue_obj != 0) { + zap_cursor_t zc; + zap_attribute_t za; + + for (zap_cursor_init(&zc, dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj); + zap_cursor_retrieve(&zc, &za) == 0; + (void) zap_cursor_advance(&zc)) { + scan_ds_queue_insert(scn, + zfs_strtonum(za.za_name, NULL), + za.za_first_integer); } + zap_cursor_fini(&zc); } spa_scan_stat_init(spa); @@ -168,19 +478,116 @@ dsl_scan_init(dsl_pool_t *dp, uint64_t txg) void dsl_scan_fini(dsl_pool_t *dp) { - if (dp->dp_scan) { + if (dp->dp_scan != NULL) { + dsl_scan_t *scn = dp->dp_scan; + + if (scn->scn_taskq != NULL) + taskq_destroy(scn->scn_taskq); + scan_ds_queue_clear(scn); + avl_destroy(&scn->scn_queue); + avl_destroy(&scn->scn_prefetch_queue); + kmem_free(dp->dp_scan, sizeof (dsl_scan_t)); dp->dp_scan = NULL; } } +static boolean_t +dsl_scan_restarting(dsl_scan_t *scn, dmu_tx_t *tx) +{ + return (scn->scn_restart_txg != 0 && + scn->scn_restart_txg <= tx->tx_txg); +} + +boolean_t +dsl_scan_scrubbing(const dsl_pool_t *dp) +{ + dsl_scan_phys_t *scn_phys = &dp->dp_scan->scn_phys; + + return (scn_phys->scn_state == DSS_SCANNING && + scn_phys->scn_func == POOL_SCAN_SCRUB); +} + +boolean_t +dsl_scan_is_paused_scrub(const dsl_scan_t *scn) +{ + return (dsl_scan_scrubbing(scn->scn_dp) && + scn->scn_phys.scn_flags & DSF_SCRUB_PAUSED); +} + +/* + * Writes out a persistent dsl_scan_phys_t record to the pool directory. + * Because we can be running in the block sorting algorithm, we do not always + * want to write out the record, only when it is "safe" to do so. This safety + * condition is achieved by making sure that the sorting queues are empty + * (scn_bytes_pending == 0). When this condition is not true, the sync'd state + * is inconsistent with how much actual scanning progress has been made. The + * kind of sync to be performed is specified by the sync_type argument. If the + * sync is optional, we only sync if the queues are empty. If the sync is + * mandatory, we do a hard ASSERT to make sure that the queues are empty. The + * third possible state is a "cached" sync. This is done in response to: + * 1) The dataset that was in the last sync'd dsl_scan_phys_t having been + * destroyed, so we wouldn't be able to restart scanning from it. + * 2) The snapshot that was in the last sync'd dsl_scan_phys_t having been + * superseded by a newer snapshot. + * 3) The dataset that was in the last sync'd dsl_scan_phys_t having been + * swapped with its clone. + * In all cases, a cached sync simply rewrites the last record we've written, + * just slightly modified. For the modifications that are performed to the + * last written dsl_scan_phys_t, see dsl_scan_ds_destroyed, + * dsl_scan_ds_snapshotted and dsl_scan_ds_clone_swapped. 
+ */ +static void +dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx, state_sync_type_t sync_type) +{ + int i; + spa_t *spa = scn->scn_dp->dp_spa; + + ASSERT(sync_type != SYNC_MANDATORY || scn->scn_bytes_pending == 0); + if (scn->scn_bytes_pending == 0) { + for (i = 0; i < spa->spa_root_vdev->vdev_children; i++) { + vdev_t *vd = spa->spa_root_vdev->vdev_child[i]; + dsl_scan_io_queue_t *q = vd->vdev_scan_io_queue; + + if (q == NULL) + continue; + + mutex_enter(&vd->vdev_scan_io_queue_lock); + ASSERT3P(avl_first(&q->q_sios_by_addr), ==, NULL); + ASSERT3P(avl_first(&q->q_exts_by_size), ==, NULL); + ASSERT3P(range_tree_first(q->q_exts_by_addr), ==, NULL); + mutex_exit(&vd->vdev_scan_io_queue_lock); + } + + if (scn->scn_phys.scn_queue_obj != 0) + scan_ds_queue_sync(scn, tx); + VERIFY0(zap_update(scn->scn_dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, + &scn->scn_phys, tx)); + bcopy(&scn->scn_phys, &scn->scn_phys_cached, + sizeof (scn->scn_phys)); + + if (scn->scn_checkpointing) + zfs_dbgmsg("finish scan checkpoint"); + + scn->scn_checkpointing = B_FALSE; + scn->scn_last_checkpoint = ddi_get_lbolt(); + } else if (sync_type == SYNC_CACHED) { + VERIFY0(zap_update(scn->scn_dp->dp_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, + DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, + &scn->scn_phys_cached, tx)); + } +} + /* ARGSUSED */ static int dsl_scan_setup_check(void *arg, dmu_tx_t *tx) { dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; - if (scn->scn_phys.scn_state == DSS_SCANNING) + if (dsl_scan_is_running(scn)) return (SET_ERROR(EBUSY)); return (0); @@ -195,7 +602,7 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) dsl_pool_t *dp = scn->scn_dp; spa_t *spa = dp->dp_spa; - ASSERT(scn->scn_phys.scn_state != DSS_SCANNING); + ASSERT(!dsl_scan_is_running(scn)); ASSERT(*funcp > POOL_SCAN_NONE && *funcp < POOL_SCAN_FUNCS); bzero(&scn->scn_phys, sizeof (scn->scn_phys)); scn->scn_phys.scn_func = *funcp; @@ -206,8 +613,11 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) scn->scn_phys.scn_start_time = gethrestime_sec(); scn->scn_phys.scn_errors = 0; scn->scn_phys.scn_to_examine = spa->spa_root_vdev->vdev_stat.vs_alloc; + scn->scn_issued_before_pass = 0; scn->scn_restart_txg = 0; scn->scn_done_txg = 0; + scn->scn_last_checkpoint = 0; + scn->scn_checkpointing = B_FALSE; spa_scan_stat_init(spa); if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { @@ -240,8 +650,10 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) if (dp->dp_blkstats == NULL) { dp->dp_blkstats = kmem_alloc(sizeof (zfs_all_blkstats_t), KM_SLEEP); + mutex_init(&dp->dp_blkstats->zab_lock, NULL, + MUTEX_DEFAULT, NULL); } - bzero(dp->dp_blkstats, sizeof (zfs_all_blkstats_t)); + bzero(&dp->dp_blkstats->zab_type, sizeof (dp->dp_blkstats->zab_type)); if (spa_version(spa) < SPA_VERSION_DSL_SCRUB) ot = DMU_OT_ZAP_OTHER; @@ -249,13 +661,53 @@ dsl_scan_setup_sync(void *arg, dmu_tx_t *tx) scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset, ot ? ot : DMU_OT_SCAN_QUEUE, DMU_OT_NONE, 0, tx); - dsl_scan_sync_state(scn, tx); + bcopy(&scn->scn_phys, &scn->scn_phys_cached, sizeof (scn->scn_phys)); + + dsl_scan_sync_state(scn, tx, SYNC_MANDATORY); spa_history_log_internal(spa, "scan setup", tx, "func=%u mintxg=%llu maxtxg=%llu", *funcp, scn->scn_phys.scn_min_txg, scn->scn_phys.scn_max_txg); } +/* + * Called by the ZFS_IOC_POOL_SCAN ioctl to start a scrub or resilver. + * Can also be called to resume a paused scrub. 
+ */ +int +dsl_scan(dsl_pool_t *dp, pool_scan_func_t func) +{ + spa_t *spa = dp->dp_spa; + dsl_scan_t *scn = dp->dp_scan; + + /* + * Purge all vdev caches and probe all devices. We do this here + * rather than in sync context because this requires a writer lock + * on the spa_config lock, which we can't do from sync context. The + * spa_scrub_reopen flag indicates that vdev_open() should not + * attempt to start another scrub. + */ + spa_vdev_state_enter(spa, SCL_NONE); + spa->spa_scrub_reopen = B_TRUE; + vdev_reopen(spa->spa_root_vdev); + spa->spa_scrub_reopen = B_FALSE; + (void) spa_vdev_state_exit(spa, NULL, 0); + + if (func == POOL_SCAN_SCRUB && dsl_scan_is_paused_scrub(scn)) { + /* got scrub start cmd, resume paused scrub */ + int err = dsl_scrub_set_pause_resume(scn->scn_dp, + POOL_SCRUB_NORMAL); + if (err == 0) { + spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_RESUME); + return (ECANCELED); + } + return (SET_ERROR(err)); + } + + return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check, + dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED)); +} + /* ARGSUSED */ static void dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) @@ -283,10 +735,11 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) } if (scn->scn_phys.scn_queue_obj != 0) { - VERIFY(0 == dmu_object_free(dp->dp_meta_objset, + VERIFY0(dmu_object_free(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, tx)); scn->scn_phys.scn_queue_obj = 0; } + scan_ds_queue_clear(scn); scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED; @@ -294,13 +747,22 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) * If we were "restarted" from a stopped state, don't bother * with anything else. */ - if (scn->scn_phys.scn_state != DSS_SCANNING) + if (!dsl_scan_is_running(scn)) { + ASSERT(!scn->scn_is_sorted); return; + } - if (complete) - scn->scn_phys.scn_state = DSS_FINISHED; - else - scn->scn_phys.scn_state = DSS_CANCELED; + if (scn->scn_is_sorted) { + scan_io_queues_destroy(scn); + scn->scn_is_sorted = B_FALSE; + + if (scn->scn_taskq != NULL) { + taskq_destroy(scn->scn_taskq); + scn->scn_taskq = NULL; + } + } + + scn->scn_phys.scn_state = complete ? 
DSS_FINISHED : DSS_CANCELED; if (dsl_scan_restarting(scn, tx)) spa_history_log_internal(spa, "scan aborted, restarting", tx, @@ -313,12 +775,6 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) "errors=%llu", spa_get_errlog_size(spa)); if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { - mutex_enter(&spa->spa_scrub_lock); - while (spa->spa_scrub_inflight > 0) { - cv_wait(&spa->spa_scrub_io_cv, - &spa->spa_scrub_lock); - } - mutex_exit(&spa->spa_scrub_lock); spa->spa_scrub_started = B_FALSE; spa->spa_scrub_active = B_FALSE; @@ -354,6 +810,8 @@ dsl_scan_done(dsl_scan_t *scn, boolean_t complete, dmu_tx_t *tx) } scn->scn_phys.scn_end_time = gethrestime_sec(); + + ASSERT(!dsl_scan_is_running(scn)); } /* ARGSUSED */ @@ -362,7 +820,7 @@ dsl_scan_cancel_check(void *arg, dmu_tx_t *tx) { dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; - if (scn->scn_phys.scn_state != DSS_SCANNING) + if (!dsl_scan_is_running(scn)) return (SET_ERROR(ENOENT)); return (0); } @@ -374,7 +832,7 @@ dsl_scan_cancel_sync(void *arg, dmu_tx_t *tx) dsl_scan_t *scn = dmu_tx_pool(tx)->dp_scan; dsl_scan_done(scn, B_FALSE, tx); - dsl_scan_sync_state(scn, tx); + dsl_scan_sync_state(scn, tx, SYNC_MANDATORY); spa_event_notify(scn->scn_dp->dp_spa, NULL, NULL, ESC_ZFS_SCRUB_ABORT); } @@ -385,16 +843,6 @@ dsl_scan_cancel(dsl_pool_t *dp) dsl_scan_cancel_sync, NULL, 3, ZFS_SPACE_CHECK_RESERVED)); } -boolean_t -dsl_scan_is_paused_scrub(const dsl_scan_t *scn) -{ - if (dsl_scan_scrubbing(scn->scn_dp) && - scn->scn_phys.scn_flags & DSF_SCRUB_PAUSED) - return (B_TRUE); - - return (B_FALSE); -} - static int dsl_scrub_pause_resume_check(void *arg, dmu_tx_t *tx) { @@ -429,7 +877,7 @@ dsl_scrub_pause_resume_sync(void *arg, dmu_tx_t *tx) /* can't pause a scrub when there is no in-progress scrub */ spa->spa_scan_pass_scrub_pause = gethrestime_sec(); scn->scn_phys.scn_flags |= DSF_SCRUB_PAUSED; - dsl_scan_sync_state(scn, tx); + dsl_scan_sync_state(scn, tx, SYNC_CACHED); spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_PAUSED); } else { ASSERT3U(*cmd, ==, POOL_SCRUB_NORMAL); @@ -443,7 +891,7 @@ dsl_scrub_pause_resume_sync(void *arg, dmu_tx_t *tx) gethrestime_sec() - spa->spa_scan_pass_scrub_pause; spa->spa_scan_pass_scrub_pause = 0; scn->scn_phys.scn_flags &= ~DSF_SCRUB_PAUSED; - dsl_scan_sync_state(scn, tx); + dsl_scan_sync_state(scn, tx, SYNC_CACHED); } } } @@ -459,25 +907,25 @@ dsl_scrub_set_pause_resume(const dsl_pool_t *dp, pool_scrub_cmd_t cmd) ZFS_SPACE_CHECK_RESERVED)); } -boolean_t -dsl_scan_scrubbing(const dsl_pool_t *dp) -{ - dsl_scan_t *scn = dp->dp_scan; - if (scn->scn_phys.scn_state == DSS_SCANNING && - scn->scn_phys.scn_func == POOL_SCAN_SCRUB) - return (B_TRUE); +/* start a new scan, or restart an existing one. 
*/ +void +dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg) +{ + if (txg == 0) { + dmu_tx_t *tx; + tx = dmu_tx_create_dd(dp->dp_mos_dir); + VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT)); - return (B_FALSE); + txg = dmu_tx_get_txg(tx); + dp->dp_scan->scn_restart_txg = txg; + dmu_tx_commit(tx); + } else { + dp->dp_scan->scn_restart_txg = txg; + } + zfs_dbgmsg("restarting resilver txg=%llu", txg); } -static void dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb, - dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn, - dmu_objset_type_t ostype, dmu_tx_t *tx); -static void dsl_scan_visitdnode(dsl_scan_t *, dsl_dataset_t *ds, - dmu_objset_type_t ostype, - dnode_phys_t *dnp, uint64_t object, dmu_tx_t *tx); - void dsl_free(dsl_pool_t *dp, uint64_t txg, const blkptr_t *bp) { @@ -491,25 +939,169 @@ dsl_free_sync(zio_t *pio, dsl_pool_t *dp, uint64_t txg, const blkptr_t *bpp) zio_nowait(zio_free_sync(pio, dp->dp_spa, txg, bpp, pio->io_flags)); } -static uint64_t -dsl_scan_ds_maxtxg(dsl_dataset_t *ds) +static int +scan_ds_queue_compare(const void *a, const void *b) { - uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg; - if (ds->ds_is_snapshot) - return (MIN(smt, dsl_dataset_phys(ds)->ds_creation_txg)); - return (smt); + const scan_ds_t *sds_a = a, *sds_b = b; + + if (sds_a->sds_dsobj < sds_b->sds_dsobj) + return (-1); + if (sds_a->sds_dsobj == sds_b->sds_dsobj) + return (0); + return (1); } static void -dsl_scan_sync_state(dsl_scan_t *scn, dmu_tx_t *tx) +scan_ds_queue_clear(dsl_scan_t *scn) { - VERIFY0(zap_update(scn->scn_dp->dp_meta_objset, - DMU_POOL_DIRECTORY_OBJECT, - DMU_POOL_SCAN, sizeof (uint64_t), SCAN_PHYS_NUMINTS, - &scn->scn_phys, tx)); + void *cookie = NULL; + scan_ds_t *sds; + while ((sds = avl_destroy_nodes(&scn->scn_queue, &cookie)) != NULL) { + kmem_free(sds, sizeof (*sds)); + } } -extern int zfs_vdev_async_write_active_min_dirty_percent; +static boolean_t +scan_ds_queue_contains(dsl_scan_t *scn, uint64_t dsobj, uint64_t *txg) +{ + scan_ds_t srch, *sds; + + srch.sds_dsobj = dsobj; + sds = avl_find(&scn->scn_queue, &srch, NULL); + if (sds != NULL && txg != NULL) + *txg = sds->sds_txg; + return (sds != NULL); +} + +static void +scan_ds_queue_insert(dsl_scan_t *scn, uint64_t dsobj, uint64_t txg) +{ + scan_ds_t *sds; + avl_index_t where; + + sds = kmem_zalloc(sizeof (*sds), KM_SLEEP); + sds->sds_dsobj = dsobj; + sds->sds_txg = txg; + + VERIFY3P(avl_find(&scn->scn_queue, sds, &where), ==, NULL); + avl_insert(&scn->scn_queue, sds, where); +} + +static void +scan_ds_queue_remove(dsl_scan_t *scn, uint64_t dsobj) +{ + scan_ds_t srch, *sds; + + srch.sds_dsobj = dsobj; + + sds = avl_find(&scn->scn_queue, &srch, NULL); + VERIFY(sds != NULL); + avl_remove(&scn->scn_queue, sds); + kmem_free(sds, sizeof (*sds)); +} + +static void +scan_ds_queue_sync(dsl_scan_t *scn, dmu_tx_t *tx) +{ + dsl_pool_t *dp = scn->scn_dp; + spa_t *spa = dp->dp_spa; + dmu_object_type_t ot = (spa_version(spa) >= SPA_VERSION_DSL_SCRUB) ? 
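/*
 * [Editor's note: illustrative sketch, not part of the patch.]
 * The per-dataset work queue now lives in core (an AVL tree keyed by
 * dataset object number, as in scan_ds_queue_insert/remove above) and is
 * only flushed back to the on-disk ZAP at checkpoints.  This stand-in
 * uses a sorted linked list; types and names are invented.
 */
#include <stdint.h>
#include <stdlib.h>

typedef struct model_ds {
	uint64_t dsobj;		/* sort key, like sds_dsobj */
	uint64_t txg;		/* like sds_txg */
	struct model_ds *next;
} model_ds_t;

static void
model_queue_insert(model_ds_t **head, uint64_t dsobj, uint64_t txg)
{
	model_ds_t **pp = head;
	model_ds_t *n = calloc(1, sizeof (*n));

	n->dsobj = dsobj;
	n->txg = txg;
	while (*pp != NULL && (*pp)->dsobj < dsobj)
		pp = &(*pp)->next;
	n->next = *pp;
	*pp = n;
}

static int
model_queue_remove(model_ds_t **head, uint64_t dsobj, uint64_t *txg)
{
	model_ds_t **pp = head;

	while (*pp != NULL && (*pp)->dsobj != dsobj)
		pp = &(*pp)->next;
	if (*pp == NULL)
		return (0);
	if (txg != NULL)
		*txg = (*pp)->txg;
	model_ds_t *dead = *pp;
	*pp = dead->next;
	free(dead);
	return (1);
}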
+ DMU_OT_SCAN_QUEUE : DMU_OT_ZAP_OTHER; + + ASSERT0(scn->scn_bytes_pending); + ASSERT(scn->scn_phys.scn_queue_obj != 0); + + VERIFY0(dmu_object_free(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, tx)); + scn->scn_phys.scn_queue_obj = zap_create(dp->dp_meta_objset, ot, + DMU_OT_NONE, 0, tx); + for (scan_ds_t *sds = avl_first(&scn->scn_queue); + sds != NULL; sds = AVL_NEXT(&scn->scn_queue, sds)) { + VERIFY0(zap_add_int_key(dp->dp_meta_objset, + scn->scn_phys.scn_queue_obj, sds->sds_dsobj, + sds->sds_txg, tx)); + } +} + +/* + * Computes the memory limit state that we're currently in. A sorted scan + * needs quite a bit of memory to hold the sorting queue, so we need to + * reasonably constrain the size so it doesn't impact overall system + * performance. We compute two limits: + * 1) Hard memory limit: if the amount of memory used by the sorting + * queues on a pool gets above this value, we stop the metadata + * scanning portion and start issuing the queued up and sorted + * I/Os to reduce memory usage. + * This limit is calculated as a fraction of physmem (by default 5%). + * We constrain the lower bound of the hard limit to an absolute + * minimum of zfs_scan_mem_lim_min (default: 16 MiB). We also constrain + * the upper bound to 5% of the total pool size - no chance we'll + * ever need that much memory, but just to keep the value in check. + * 2) Soft memory limit: once we hit the hard memory limit, we start + * issuing I/O to reduce queue memory usage, but we don't want to + * completely empty out the queues, since we might be able to find I/Os + * that will fill in the gaps of our non-sequential IOs at some point + * in the future. So we stop the issuing of I/Os once the amount of + * memory used drops below the soft limit (at which point we stop issuing + * I/O and start scanning metadata again). + * + * This limit is calculated by subtracting a fraction of the hard + * limit from the hard limit. By default this fraction is 5%, so + * the soft limit is 95% of the hard limit. We cap the size of the + * difference between the hard and soft limits at an absolute + * maximum of zfs_scan_mem_lim_soft_max (default: 128 MiB) - this is + * sufficient to not cause too frequent switching between the + * metadata scan and I/O issue (even at 2k recordsize, 128 MiB's + * worth of queues is about 1.2 GiB of on-pool data, so scanning + * that should take at least a decent fraction of a second). 
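/*
 * [Editor's note: illustrative sketch, not part of the patch.]
 * Standalone arithmetic for the hard/soft limits described in the comment
 * above, using the default tunables it mentions: 5% of physical memory
 * with a 16 MiB floor, capped at 5% of allocated pool space, and a soft
 * limit 5% below the hard limit but by at most 128 MiB.  The hysteresis
 * at the end mirrors the clearing decision: above hard -> drain queues,
 * below soft -> resume scanning, in between -> keep the current mode.
 * (The real code works in pages times PAGESIZE; this takes bytes.)
 */
#include <stdint.h>

#define	MIB		(1024ULL * 1024ULL)
#define	MODEL_MIN(a, b)	((a) < (b) ? (a) : (b))
#define	MODEL_MAX(a, b)	((a) > (b) ? (a) : (b))

typedef struct mem_lim {
	uint64_t hard;
	uint64_t soft;
} mem_lim_t;

static mem_lim_t
model_scan_mem_limits(uint64_t physmem_bytes, uint64_t pool_alloc_bytes)
{
	mem_lim_t l;

	l.hard = MODEL_MAX(physmem_bytes / 20, 16 * MIB);   /* 5%, >= 16 MiB */
	l.hard = MODEL_MIN(l.hard, pool_alloc_bytes / 20);  /* <= 5% of pool */
	l.soft = l.hard - MODEL_MIN(l.hard / 20, 128 * MIB);
	return (l);
}

/* returns nonzero if the queues should be drained this txg */
static int
model_should_clear(uint64_t mem_used, mem_lim_t l, int currently_clearing)
{
	if (mem_used >= l.hard)
		return (1);
	if (mem_used < l.soft)
		return (0);
	return (currently_clearing);
}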
+ */ +static boolean_t +dsl_scan_should_clear(dsl_scan_t *scn) +{ + vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev; + uint64_t mlim_hard, mlim_soft, mused; + uint64_t alloc = metaslab_class_get_alloc(spa_normal_class( + scn->scn_dp->dp_spa)); + + mlim_hard = MAX((physmem / zfs_scan_mem_lim_fact) * PAGESIZE, + zfs_scan_mem_lim_min); + mlim_hard = MIN(mlim_hard, alloc / 20); + mlim_soft = mlim_hard - MIN(mlim_hard / zfs_scan_mem_lim_soft_fact, + zfs_scan_mem_lim_soft_max); + mused = 0; + for (uint64_t i = 0; i < rvd->vdev_children; i++) { + vdev_t *tvd = rvd->vdev_child[i]; + dsl_scan_io_queue_t *queue; + + mutex_enter(&tvd->vdev_scan_io_queue_lock); + queue = tvd->vdev_scan_io_queue; + if (queue != NULL) { + /* #extents in exts_by_size = # in exts_by_addr */ + mused += avl_numnodes(&queue->q_exts_by_size) * + sizeof (range_seg_t) + + avl_numnodes(&queue->q_sios_by_addr) * + sizeof (scan_io_t); + } + mutex_exit(&tvd->vdev_scan_io_queue_lock); + } + + dprintf("current scan memory usage: %llu bytes\n", (longlong_t)mused); + + if (mused == 0) + ASSERT0(scn->scn_bytes_pending); + + /* + * If we are above our hard limit, we need to clear out memory. + * If we are below our soft limit, we need to accumulate sequential IOs. + * Otherwise, we should keep doing whatever we are currently doing. + */ + if (mused >= mlim_hard) + return (B_TRUE); + else if (mused < mlim_soft) + return (B_FALSE); + else + return (scn->scn_clearing); +} static boolean_t dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb) @@ -530,9 +1122,6 @@ dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb) /* * We suspend if: - * - we have scanned for the maximum time: an entire txg - * timeout (default 5 sec) - * or * - we have scanned for at least the minimum time (default 1 sec * for scrub, 3 sec for resilver), and either we have sufficient * dirty data that we are starting to write more quickly @@ -541,16 +1130,24 @@ dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb) * or * - the spa is shutting down because this pool is being exported * or the machine is rebooting. + * or + * - the scan queue has reached its memory use limit */ - int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ? - zfs_resilver_min_time_ms : zfs_scan_min_time_ms; - uint64_t elapsed_nanosecs = gethrtime() - scn->scn_sync_start_time; + hrtime_t curr_time_ns = gethrtime(); + uint64_t scan_time_ns = curr_time_ns - scn->scn_sync_start_time; + uint64_t sync_time_ns = curr_time_ns - + scn->scn_dp->dp_spa->spa_sync_starttime; + int dirty_pct = scn->scn_dp->dp_dirty_total * 100 / zfs_dirty_data_max; - if (elapsed_nanosecs / NANOSEC >= zfs_txg_timeout || - (NSEC2MSEC(elapsed_nanosecs) > mintime && - (txg_sync_waiting(scn->scn_dp) || - dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent)) || - spa_shutting_down(scn->scn_dp->dp_spa)) { + int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ? 
+ zfs_resilver_min_time_ms : zfs_scrub_min_time_ms; + + if ((NSEC2MSEC(scan_time_ns) > mintime && + (dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent || + txg_sync_waiting(scn->scn_dp) || + NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) || + spa_shutting_down(scn->scn_dp->dp_spa) || + (zfs_scan_strict_mem_lim && dsl_scan_should_clear(scn))) { if (zb) { dprintf("suspending at bookmark %llx/%llx/%llx/%llx\n", (longlong_t)zb->zb_objset, @@ -558,12 +1155,16 @@ dsl_scan_check_suspend(dsl_scan_t *scn, const zbookmark_phys_t *zb) (longlong_t)zb->zb_level, (longlong_t)zb->zb_blkid); scn->scn_phys.scn_bookmark = *zb; + } else { + dsl_scan_phys_t *scnp = &scn->scn_phys; + + dprintf("suspending at DDT bookmark " + "%llx/%llx/%llx/%llx\n", + (longlong_t)scnp->scn_ddt_bookmark.ddb_class, + (longlong_t)scnp->scn_ddt_bookmark.ddb_type, + (longlong_t)scnp->scn_ddt_bookmark.ddb_checksum, + (longlong_t)scnp->scn_ddt_bookmark.ddb_cursor); } - dprintf("suspending at DDT bookmark %llx/%llx/%llx/%llx\n", - (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class, - (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type, - (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum, - (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor); scn->scn_suspending = B_TRUE; return (B_TRUE); } @@ -662,26 +1263,278 @@ dsl_scan_zil(dsl_pool_t *dp, zil_header_t *zh) zil_free(zilog); } -/* ARGSUSED */ -static void -dsl_scan_prefetch(dsl_scan_t *scn, arc_buf_t *buf, blkptr_t *bp, - uint64_t objset, uint64_t object, uint64_t blkid) +/* + * We compare scan_prefetch_issue_ctx_t's based on their bookmarks. The idea + * here is to sort the AVL tree by the order each block will be needed. + */ +static int +scan_prefetch_queue_compare(const void *a, const void *b) +{ + const scan_prefetch_issue_ctx_t *spic_a = a, *spic_b = b; + const scan_prefetch_ctx_t *spc_a = spic_a->spic_spc; + const scan_prefetch_ctx_t *spc_b = spic_b->spic_spc; + + return (zbookmark_compare(spc_a->spc_datablkszsec, + spc_a->spc_indblkshift, spc_b->spc_datablkszsec, + spc_b->spc_indblkshift, &spic_a->spic_zb, &spic_b->spic_zb)); +} + +static void +scan_prefetch_ctx_rele(scan_prefetch_ctx_t *spc, void *tag) +{ + if (zfs_refcount_remove(&spc->spc_refcnt, tag) == 0) { + zfs_refcount_destroy(&spc->spc_refcnt); + kmem_free(spc, sizeof (scan_prefetch_ctx_t)); + } +} + +static scan_prefetch_ctx_t * +scan_prefetch_ctx_create(dsl_scan_t *scn, dnode_phys_t *dnp, void *tag) +{ + scan_prefetch_ctx_t *spc; + + spc = kmem_alloc(sizeof (scan_prefetch_ctx_t), KM_SLEEP); + zfs_refcount_create(&spc->spc_refcnt); + zfs_refcount_add(&spc->spc_refcnt, tag); + spc->spc_scn = scn; + if (dnp != NULL) { + spc->spc_datablkszsec = dnp->dn_datablkszsec; + spc->spc_indblkshift = dnp->dn_indblkshift; + spc->spc_root = B_FALSE; + } else { + spc->spc_datablkszsec = 0; + spc->spc_indblkshift = 0; + spc->spc_root = B_TRUE; + } + + return (spc); +} + +static void +scan_prefetch_ctx_add_ref(scan_prefetch_ctx_t *spc, void *tag) +{ + zfs_refcount_add(&spc->spc_refcnt, tag); +} + +static boolean_t +dsl_scan_check_prefetch_resume(scan_prefetch_ctx_t *spc, + const zbookmark_phys_t *zb) +{ + zbookmark_phys_t *last_zb = &spc->spc_scn->scn_prefetch_bookmark; + dnode_phys_t tmp_dnp; + dnode_phys_t *dnp = (spc->spc_root) ? 
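/*
 * [Editor's note: illustrative sketch, not part of the patch.]
 * The suspend test above reduces to a handful of booleans; this restates
 * the predicate in isolation with invented parameter names so the policy
 * is easier to read: suspend once the minimum per-txg scan time has been
 * spent and the pool is under write pressure (or the txg has run long),
 * or immediately on shutdown or when the strict memory limit is hit.
 */
#include <stdint.h>

typedef struct suspend_in {
	uint64_t scan_time_ms;	/* time spent scanning this txg */
	int	 min_time_ms;	/* 1000 for scrub, 3000 for resilver */
	int	 dirty_pct;	/* dirty data as % of zfs_dirty_data_max */
	int	 dirty_thresh;	/* async-write active min dirty percent */
	uint64_t sync_time_s;	/* seconds since spa_sync() started */
	int	 txg_timeout_s;
	int	 txg_sync_waiting;
	int	 shutting_down;
	int	 over_mem_limit;	/* strict limit in force && should clear */
} suspend_in_t;

static int
model_should_suspend(const suspend_in_t *in)
{
	int pressure = in->dirty_pct >= in->dirty_thresh ||
	    in->txg_sync_waiting ||
	    in->sync_time_s >= (uint64_t)in->txg_timeout_s;

	return ((in->scan_time_ms > (uint64_t)in->min_time_ms && pressure) ||
	    in->shutting_down || in->over_mem_limit);
}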
NULL : &tmp_dnp; + + if (zb->zb_objset != last_zb->zb_objset) + return (B_TRUE); + if ((int64_t)zb->zb_object < 0) + return (B_FALSE); + + tmp_dnp.dn_datablkszsec = spc->spc_datablkszsec; + tmp_dnp.dn_indblkshift = spc->spc_indblkshift; + + if (zbookmark_subtree_completed(dnp, zb, last_zb)) + return (B_TRUE); + + return (B_FALSE); +} + +static void +dsl_scan_prefetch(scan_prefetch_ctx_t *spc, blkptr_t *bp, zbookmark_phys_t *zb) { - zbookmark_phys_t czb; - arc_flags_t flags = ARC_FLAG_NOWAIT | ARC_FLAG_PREFETCH; + avl_index_t idx; + dsl_scan_t *scn = spc->spc_scn; + spa_t *spa = scn->scn_dp->dp_spa; + scan_prefetch_issue_ctx_t *spic; if (zfs_no_scrub_prefetch) return; - if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_min_txg || - (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE)) + if (BP_IS_HOLE(bp) || bp->blk_birth <= scn->scn_phys.scn_cur_min_txg || + (BP_GET_LEVEL(bp) == 0 && BP_GET_TYPE(bp) != DMU_OT_DNODE && + BP_GET_TYPE(bp) != DMU_OT_OBJSET)) + return; + + if (dsl_scan_check_prefetch_resume(spc, zb)) + return; + + scan_prefetch_ctx_add_ref(spc, scn); + spic = kmem_alloc(sizeof (scan_prefetch_issue_ctx_t), KM_SLEEP); + spic->spic_spc = spc; + spic->spic_bp = *bp; + spic->spic_zb = *zb; + + /* + * Add the IO to the queue of blocks to prefetch. This allows us to + * prioritize blocks that we will need first for the main traversal + * thread. + */ + mutex_enter(&spa->spa_scrub_lock); + if (avl_find(&scn->scn_prefetch_queue, spic, &idx) != NULL) { + /* this block is already queued for prefetch */ + kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t)); + scan_prefetch_ctx_rele(spc, scn); + mutex_exit(&spa->spa_scrub_lock); + return; + } + + avl_insert(&scn->scn_prefetch_queue, spic, idx); + cv_broadcast(&spa->spa_scrub_io_cv); + mutex_exit(&spa->spa_scrub_lock); +} + +static void +dsl_scan_prefetch_dnode(dsl_scan_t *scn, dnode_phys_t *dnp, + uint64_t objset, uint64_t object) +{ + int i; + zbookmark_phys_t zb; + scan_prefetch_ctx_t *spc; + + if (dnp->dn_nblkptr == 0 && !(dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR)) return; - SET_BOOKMARK(&czb, objset, object, BP_GET_LEVEL(bp), blkid); + SET_BOOKMARK(&zb, objset, object, 0, 0); + + spc = scan_prefetch_ctx_create(scn, dnp, FTAG); + + for (i = 0; i < dnp->dn_nblkptr; i++) { + zb.zb_level = BP_GET_LEVEL(&dnp->dn_blkptr[i]); + zb.zb_blkid = i; + dsl_scan_prefetch(spc, &dnp->dn_blkptr[i], &zb); + } + + if (dnp->dn_flags & DNODE_FLAG_SPILL_BLKPTR) { + zb.zb_level = 0; + zb.zb_blkid = DMU_SPILL_BLKID; + dsl_scan_prefetch(spc, &dnp->dn_spill, &zb); + } + + scan_prefetch_ctx_rele(spc, FTAG); +} + +void +dsl_scan_prefetch_cb(zio_t *zio, const zbookmark_phys_t *zb, const blkptr_t *bp, + arc_buf_t *buf, void *private) +{ + scan_prefetch_ctx_t *spc = private; + dsl_scan_t *scn = spc->spc_scn; + spa_t *spa = scn->scn_dp->dp_spa; + + /* broadcast that the IO has completed for rate limitting purposes */ + mutex_enter(&spa->spa_scrub_lock); + ASSERT3U(spa->spa_scrub_inflight, >=, BP_GET_PSIZE(bp)); + spa->spa_scrub_inflight -= BP_GET_PSIZE(bp); + cv_broadcast(&spa->spa_scrub_io_cv); + mutex_exit(&spa->spa_scrub_lock); + + /* if there was an error or we are done prefetching, just cleanup */ + if (buf == NULL || scn->scn_suspending) + goto out; + + if (BP_GET_LEVEL(bp) > 0) { + int i; + blkptr_t *cbp; + int epb = BP_GET_LSIZE(bp) >> SPA_BLKPTRSHIFT; + zbookmark_phys_t czb; + + for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) { + SET_BOOKMARK(&czb, zb->zb_objset, zb->zb_object, + zb->zb_level - 1, zb->zb_blkid * epb + i); + 
dsl_scan_prefetch(spc, cbp, &czb); + } + } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { + dnode_phys_t *cdnp = buf->b_data; + int i; + int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; + + for (i = 0, cdnp = buf->b_data; i < epb; + i += cdnp->dn_extra_slots + 1, + cdnp += cdnp->dn_extra_slots + 1) { + dsl_scan_prefetch_dnode(scn, cdnp, + zb->zb_objset, zb->zb_blkid * epb + i); + } + } else if (BP_GET_TYPE(bp) == DMU_OT_OBJSET) { + objset_phys_t *osp = buf->b_data; + + dsl_scan_prefetch_dnode(scn, &osp->os_meta_dnode, + zb->zb_objset, DMU_META_DNODE_OBJECT); - (void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, bp, - NULL, NULL, ZIO_PRIORITY_ASYNC_READ, - ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD, &flags, &czb); + if (OBJSET_BUF_HAS_USERUSED(buf)) { + dsl_scan_prefetch_dnode(scn, + &osp->os_groupused_dnode, zb->zb_objset, + DMU_GROUPUSED_OBJECT); + dsl_scan_prefetch_dnode(scn, + &osp->os_userused_dnode, zb->zb_objset, + DMU_USERUSED_OBJECT); + } + } + +out: + if (buf != NULL) + arc_buf_destroy(buf, private); + scan_prefetch_ctx_rele(spc, scn); +} + +/* ARGSUSED */ +static void +dsl_scan_prefetch_thread(void *arg) +{ + dsl_scan_t *scn = arg; + spa_t *spa = scn->scn_dp->dp_spa; + vdev_t *rvd = spa->spa_root_vdev; + uint64_t maxinflight = rvd->vdev_children * zfs_top_maxinflight; + scan_prefetch_issue_ctx_t *spic; + + /* loop until we are told to stop */ + while (!scn->scn_prefetch_stop) { + arc_flags_t flags = ARC_FLAG_NOWAIT | + ARC_FLAG_PRESCIENT_PREFETCH | ARC_FLAG_PREFETCH; + int zio_flags = ZIO_FLAG_CANFAIL | ZIO_FLAG_SCAN_THREAD; + + mutex_enter(&spa->spa_scrub_lock); + + /* + * Wait until we have an IO to issue and are not above our + * maximum in flight limit. + */ + while (!scn->scn_prefetch_stop && + (avl_numnodes(&scn->scn_prefetch_queue) == 0 || + spa->spa_scrub_inflight >= scn->scn_maxinflight_bytes)) { + cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); + } + + /* recheck if we should stop since we waited for the cv */ + if (scn->scn_prefetch_stop) { + mutex_exit(&spa->spa_scrub_lock); + break; + } + + /* remove the prefetch IO from the tree */ + spic = avl_first(&scn->scn_prefetch_queue); + spa->spa_scrub_inflight += BP_GET_PSIZE(&spic->spic_bp); + avl_remove(&scn->scn_prefetch_queue, spic); + + mutex_exit(&spa->spa_scrub_lock); + + /* issue the prefetch asynchronously */ + (void) arc_read(scn->scn_zio_root, scn->scn_dp->dp_spa, + &spic->spic_bp, dsl_scan_prefetch_cb, spic->spic_spc, + ZIO_PRIORITY_SCRUB, zio_flags, &flags, &spic->spic_zb); + + kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t)); + } + + ASSERT(scn->scn_prefetch_stop); + + /* free any prefetches we didn't get to complete */ + mutex_enter(&spa->spa_scrub_lock); + while ((spic = avl_first(&scn->scn_prefetch_queue)) != NULL) { + avl_remove(&scn->scn_prefetch_queue, spic); + scan_prefetch_ctx_rele(spic->spic_spc, scn); + kmem_free(spic, sizeof (scan_prefetch_issue_ctx_t)); + } + ASSERT0(avl_numnodes(&scn->scn_prefetch_queue)); + mutex_exit(&spa->spa_scrub_lock); } static boolean_t @@ -720,6 +1573,13 @@ dsl_scan_check_resume(dsl_scan_t *scn, const dnode_phys_t *dnp, return (B_FALSE); } +static void dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb, + dnode_phys_t *dnp, dsl_dataset_t *ds, dsl_scan_t *scn, + dmu_objset_type_t ostype, dmu_tx_t *tx); +static void dsl_scan_visitdnode( + dsl_scan_t *, dsl_dataset_t *ds, dmu_objset_type_t ostype, + dnode_phys_t *dnp, uint64_t object, dmu_tx_t *tx); + /* * Return nonzero on i/o error. * Return new buf to write out in *bufp. 
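/*
 * [Editor's note: illustrative sketch, not part of the patch.]
 * The prefetch thread above throttles itself on a byte count: it sleeps
 * on the scrub cv while the queue is empty or too many prefetch bytes are
 * already in flight, and the read-completion callback returns the bytes
 * and broadcasts.  A plain pthread restatement of that throttle; all
 * names here are invented.
 */
#include <pthread.h>
#include <stdint.h>

typedef struct model_throttle {
	pthread_mutex_t	lock;
	pthread_cond_t	cv;
	uint64_t	inflight;	/* prefetch bytes currently issued */
	uint64_t	limit;		/* cap, like scn_maxinflight_bytes */
	int		stop;		/* like scn_prefetch_stop */
} model_throttle_t;

/* called by the prefetch thread before issuing a read of 'size' bytes */
static int
model_throttle_enter(model_throttle_t *t, uint64_t size)
{
	pthread_mutex_lock(&t->lock);
	while (!t->stop && t->inflight >= t->limit)
		pthread_cond_wait(&t->cv, &t->lock);
	if (t->stop) {
		pthread_mutex_unlock(&t->lock);
		return (-1);
	}
	t->inflight += size;
	pthread_mutex_unlock(&t->lock);
	return (0);
}

/* called from the read-completion callback */
static void
model_throttle_exit(model_throttle_t *t, uint64_t size)
{
	pthread_mutex_lock(&t->lock);
	t->inflight -= size;
	pthread_cond_broadcast(&t->cv);
	pthread_mutex_unlock(&t->lock);
}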
@@ -741,15 +1601,11 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, arc_buf_t *buf; err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf, - ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); + ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb); if (err) { scn->scn_phys.scn_errors++; return (err); } - for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) { - dsl_scan_prefetch(scn, buf, cbp, zb->zb_objset, - zb->zb_object, zb->zb_blkid * epb + i); - } for (i = 0, cbp = buf->b_data; i < epb; i++, cbp++) { zbookmark_phys_t czb; @@ -763,25 +1619,16 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, } else if (BP_GET_TYPE(bp) == DMU_OT_DNODE) { arc_flags_t flags = ARC_FLAG_WAIT; dnode_phys_t *cdnp; - int i, j; + int i; int epb = BP_GET_LSIZE(bp) >> DNODE_SHIFT; arc_buf_t *buf; err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf, - ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); + ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb); if (err) { scn->scn_phys.scn_errors++; return (err); } - for (i = 0, cdnp = buf->b_data; i < epb; - i += cdnp->dn_extra_slots + 1, - cdnp += cdnp->dn_extra_slots + 1) { - for (j = 0; j < cdnp->dn_nblkptr; j++) { - blkptr_t *cbp = &cdnp->dn_blkptr[j]; - dsl_scan_prefetch(scn, buf, cbp, - zb->zb_objset, zb->zb_blkid * epb + i, j); - } - } for (i = 0, cdnp = buf->b_data; i < epb; i += cdnp->dn_extra_slots + 1, cdnp += cdnp->dn_extra_slots + 1) { @@ -796,7 +1643,7 @@ dsl_scan_recurse(dsl_scan_t *scn, dsl_dataset_t *ds, dmu_objset_type_t ostype, arc_buf_t *buf; err = arc_read(NULL, dp->dp_spa, bp, arc_getbuf_func, &buf, - ZIO_PRIORITY_ASYNC_READ, zio_flags, &flags, zb); + ZIO_PRIORITY_SCRUB, zio_flags, &flags, zb); if (err) { scn->scn_phys.scn_errors++; return (err); @@ -862,10 +1709,7 @@ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb, dmu_objset_type_t ostype, dmu_tx_t *tx) { dsl_pool_t *dp = scn->scn_dp; - arc_buf_t *buf = NULL; - blkptr_t bp_toread = *bp; - - /* ASSERT(pbuf == NULL || arc_released(pbuf)); */ + blkptr_t *bp_toread = NULL; if (dsl_scan_check_suspend(scn, zb)) return; @@ -873,22 +1717,36 @@ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb, if (dsl_scan_check_resume(scn, dnp, zb)) return; - if (BP_IS_HOLE(bp)) - return; - scn->scn_visited_this_txg++; - dprintf_bp(bp, - "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx bp=%p", - ds, ds ? ds->ds_object : 0, - zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid, - bp); + /* + * This debugging is commented out to conserve stack space. This + * function is called recursively and the debugging addes several + * bytes to the stack for each call. It can be commented back in + * if required to debug an issue in dsl_scan_visitbp(). + * + * dprintf_bp(bp, + * "visiting ds=%p/%llu zb=%llx/%llx/%llx/%llx bp=%p", + * ds, ds ? 
ds->ds_object : 0, + * zb->zb_objset, zb->zb_object, zb->zb_level, zb->zb_blkid, + * bp); + */ - if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) + if (BP_IS_HOLE(bp)) { + scn->scn_holes_this_txg++; return; + } - if (dsl_scan_recurse(scn, ds, ostype, dnp, &bp_toread, zb, tx) != 0) + if (bp->blk_birth <= scn->scn_phys.scn_cur_min_txg) { + scn->scn_lt_min_this_txg++; return; + } + + bp_toread = kmem_alloc(sizeof (blkptr_t), KM_SLEEP); + *bp_toread = *bp; + + if (dsl_scan_recurse(scn, ds, ostype, dnp, bp_toread, zb, tx) != 0) + goto out; /* * If dsl_scan_ddt() has already visited this block, it will have @@ -897,8 +1755,8 @@ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb, */ if (ddt_class_contains(dp->dp_spa, scn->scn_phys.scn_ddt_class_max, bp)) { - ASSERT(buf == NULL); - return; + scn->scn_ddt_contained_this_txg++; + goto out; } /* @@ -908,9 +1766,15 @@ dsl_scan_visitbp(blkptr_t *bp, const zbookmark_phys_t *zb, * Don't scan it now unless we need to because something * under it was modified. */ - if (BP_PHYSICAL_BIRTH(bp) <= scn->scn_phys.scn_cur_max_txg) { - scan_funcs[scn->scn_phys.scn_func](dp, bp, zb); + if (BP_PHYSICAL_BIRTH(bp) > scn->scn_phys.scn_cur_max_txg) { + scn->scn_gt_max_this_txg++; + goto out; } + + scan_funcs[scn->scn_phys.scn_func](dp, bp, zb); + +out: + kmem_free(bp_toread, sizeof (blkptr_t)); } static void @@ -918,26 +1782,33 @@ dsl_scan_visit_rootbp(dsl_scan_t *scn, dsl_dataset_t *ds, blkptr_t *bp, dmu_tx_t *tx) { zbookmark_phys_t zb; + scan_prefetch_ctx_t *spc; SET_BOOKMARK(&zb, ds ? ds->ds_object : DMU_META_OBJSET, ZB_ROOT_OBJECT, ZB_ROOT_LEVEL, ZB_ROOT_BLKID); - dsl_scan_visitbp(bp, &zb, NULL, - ds, scn, DMU_OST_NONE, tx); + + if (ZB_IS_ZERO(&scn->scn_phys.scn_bookmark)) { + SET_BOOKMARK(&scn->scn_prefetch_bookmark, + zb.zb_objset, 0, 0, 0); + } else { + scn->scn_prefetch_bookmark = scn->scn_phys.scn_bookmark; + } + + scn->scn_objsets_visited_this_txg++; + + spc = scan_prefetch_ctx_create(scn, NULL, FTAG); + dsl_scan_prefetch(spc, bp, &zb); + scan_prefetch_ctx_rele(spc, FTAG); + + dsl_scan_visitbp(bp, &zb, NULL, ds, scn, DMU_OST_NONE, tx); dprintf_ds(ds, "finished scan%s", ""); } -void -dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx) +static void +ds_destroyed_scn_phys(dsl_dataset_t *ds, dsl_scan_phys_t *scn_phys) { - dsl_pool_t *dp = ds->ds_dir->dd_pool; - dsl_scan_t *scn = dp->dp_scan; - uint64_t mintxg; - - if (scn->scn_phys.scn_state != DSS_SCANNING) - return; - - if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) { + if (scn_phys->scn_bookmark.zb_objset == ds->ds_object) { if (ds->ds_is_snapshot) { /* * Note: @@ -949,23 +1820,57 @@ dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx) * ignore it when we retraverse it in * dsl_scan_visitds(). */ - scn->scn_phys.scn_bookmark.zb_objset = + scn_phys->scn_bookmark.zb_objset = dsl_dataset_phys(ds)->ds_next_snap_obj; zfs_dbgmsg("destroying ds %llu; currently traversing; " "reset zb_objset to %llu", (u_longlong_t)ds->ds_object, (u_longlong_t)dsl_dataset_phys(ds)-> ds_next_snap_obj); - scn->scn_phys.scn_flags |= DSF_VISIT_DS_AGAIN; + scn_phys->scn_flags |= DSF_VISIT_DS_AGAIN; } else { - SET_BOOKMARK(&scn->scn_phys.scn_bookmark, + SET_BOOKMARK(&scn_phys->scn_bookmark, ZB_DESTROYED_OBJSET, 0, 0, 0); zfs_dbgmsg("destroying ds %llu; currently traversing; " "reset bookmark to -1,0,0,0", (u_longlong_t)ds->ds_object); } - } else if (zap_lookup_int_key(dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) { + } +} + +/* + * Invoked when a dataset is destroyed. 
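/*
 * [Editor's note: illustrative sketch, not part of the patch.]
 * dsl_scan_visitbp() recurses once per indirection level, so the patch
 * keeps its per-call scratch copy of the block pointer on the heap rather
 * than the stack (and drops the dprintf_bp for the same reason).  A toy
 * recursive visitor showing the same stack-conservation trick; the types
 * are invented.
 */
#include <stdlib.h>
#include <string.h>

typedef struct model_node {
	struct model_node *child[2];
	int payload[32];	/* too big to copy onto every stack frame */
} model_node_t;

static void
model_visit(const model_node_t *n, int depth)
{
	if (n == NULL)
		return;

	/* heap copy: each recursion level costs a pointer, not 128+ bytes */
	model_node_t *scratch = malloc(sizeof (*scratch));
	memcpy(scratch, n, sizeof (*scratch));

	model_visit(scratch->child[0], depth + 1);
	model_visit(scratch->child[1], depth + 1);

	free(scratch);
}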
We need to make sure that: + * + * 1) If it is the dataset that was currently being scanned, we write + * a new dsl_scan_phys_t and marking the objset reference in it + * as destroyed. + * 2) Remove it from the work queue, if it was present. + * + * If the dataset was actually a snapshot, instead of marking the dataset + * as destroyed, we instead substitute the next snapshot in line. + */ +void +dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx) +{ + dsl_pool_t *dp = ds->ds_dir->dd_pool; + dsl_scan_t *scn = dp->dp_scan; + uint64_t mintxg; + + if (!dsl_scan_is_running(scn)) + return; + + ds_destroyed_scn_phys(ds, &scn->scn_phys); + ds_destroyed_scn_phys(ds, &scn->scn_phys_cached); + + if (scan_ds_queue_contains(scn, ds->ds_object, &mintxg)) { + scan_ds_queue_remove(scn, ds->ds_object); + if (ds->ds_is_snapshot) + scan_ds_queue_insert(scn, + dsl_dataset_phys(ds)->ds_next_snap_obj, mintxg); + } + + if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, + ds->ds_object, &mintxg) == 0) { ASSERT3U(dsl_dataset_phys(ds)->ds_num_children, <=, 1); VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds->ds_object, tx)); @@ -994,9 +1899,28 @@ dsl_scan_ds_destroyed(dsl_dataset_t *ds, dmu_tx_t *tx) * dsl_scan_sync() should be called after this, and should sync * out our changed state, but just to be safe, do it here. */ - dsl_scan_sync_state(scn, tx); + dsl_scan_sync_state(scn, tx, SYNC_CACHED); +} + +static void +ds_snapshotted_bookmark(dsl_dataset_t *ds, zbookmark_phys_t *scn_bookmark) +{ + if (scn_bookmark->zb_objset == ds->ds_object) { + scn_bookmark->zb_objset = + dsl_dataset_phys(ds)->ds_prev_snap_obj; + zfs_dbgmsg("snapshotting ds %llu; currently traversing; " + "reset zb_objset to %llu", + (u_longlong_t)ds->ds_object, + (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj); + } } +/* + * Called when a dataset is snapshotted. If we were currently traversing + * this snapshot, we reset our bookmark to point at the newly created + * snapshot. We also modify our work queue to remove the old snapshot and + * replace with the new one. 
+ */ void dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx) { @@ -1004,20 +1928,22 @@ dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx) dsl_scan_t *scn = dp->dp_scan; uint64_t mintxg; - if (scn->scn_phys.scn_state != DSS_SCANNING) + if (!dsl_scan_is_running(scn)) return; ASSERT(dsl_dataset_phys(ds)->ds_prev_snap_obj != 0); - if (scn->scn_phys.scn_bookmark.zb_objset == ds->ds_object) { - scn->scn_phys.scn_bookmark.zb_objset = - dsl_dataset_phys(ds)->ds_prev_snap_obj; - zfs_dbgmsg("snapshotting ds %llu; currently traversing; " - "reset zb_objset to %llu", - (u_longlong_t)ds->ds_object, - (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj); - } else if (zap_lookup_int_key(dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj, ds->ds_object, &mintxg) == 0) { + ds_snapshotted_bookmark(ds, &scn->scn_phys.scn_bookmark); + ds_snapshotted_bookmark(ds, &scn->scn_phys_cached.scn_bookmark); + + if (scan_ds_queue_contains(scn, ds->ds_object, &mintxg)) { + scan_ds_queue_remove(scn, ds->ds_object); + scan_ds_queue_insert(scn, + dsl_dataset_phys(ds)->ds_prev_snap_obj, mintxg); + } + + if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, + ds->ds_object, &mintxg) == 0) { VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds->ds_object, tx)); VERIFY(zap_add_int_key(dp->dp_meta_objset, @@ -1028,37 +1954,59 @@ dsl_scan_ds_snapshotted(dsl_dataset_t *ds, dmu_tx_t *tx) (u_longlong_t)ds->ds_object, (u_longlong_t)dsl_dataset_phys(ds)->ds_prev_snap_obj); } - dsl_scan_sync_state(scn, tx); + + dsl_scan_sync_state(scn, tx, SYNC_CACHED); } -void -dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx) +static void +ds_clone_swapped_bookmark(dsl_dataset_t *ds1, dsl_dataset_t *ds2, + zbookmark_phys_t *scn_bookmark) { - dsl_pool_t *dp = ds1->ds_dir->dd_pool; - dsl_scan_t *scn = dp->dp_scan; - uint64_t mintxg; - - if (scn->scn_phys.scn_state != DSS_SCANNING) - return; - - if (scn->scn_phys.scn_bookmark.zb_objset == ds1->ds_object) { - scn->scn_phys.scn_bookmark.zb_objset = ds2->ds_object; + if (scn_bookmark->zb_objset == ds1->ds_object) { + scn_bookmark->zb_objset = ds2->ds_object; zfs_dbgmsg("clone_swap ds %llu; currently traversing; " "reset zb_objset to %llu", (u_longlong_t)ds1->ds_object, (u_longlong_t)ds2->ds_object); - } else if (scn->scn_phys.scn_bookmark.zb_objset == ds2->ds_object) { - scn->scn_phys.scn_bookmark.zb_objset = ds1->ds_object; + } else if (scn_bookmark->zb_objset == ds2->ds_object) { + scn_bookmark->zb_objset = ds1->ds_object; zfs_dbgmsg("clone_swap ds %llu; currently traversing; " "reset zb_objset to %llu", (u_longlong_t)ds2->ds_object, (u_longlong_t)ds1->ds_object); } +} + +/* + * Called when a parent dataset and its clone are swapped. If we were + * currently traversing the dataset, we need to switch to traversing the + * newly promoted parent. 
+ */ +void +dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx) +{ + dsl_pool_t *dp = ds1->ds_dir->dd_pool; + dsl_scan_t *scn = dp->dp_scan; + uint64_t mintxg; + + if (!dsl_scan_is_running(scn)) + return; + + ds_clone_swapped_bookmark(ds1, ds2, &scn->scn_phys.scn_bookmark); + ds_clone_swapped_bookmark(ds1, ds2, &scn->scn_phys_cached.scn_bookmark); + + if (scan_ds_queue_contains(scn, ds1->ds_object, &mintxg)) { + scan_ds_queue_remove(scn, ds1->ds_object); + scan_ds_queue_insert(scn, ds2->ds_object, mintxg); + } + if (scan_ds_queue_contains(scn, ds2->ds_object, &mintxg)) { + scan_ds_queue_remove(scn, ds2->ds_object); + scan_ds_queue_insert(scn, ds1->ds_object, mintxg); + } if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, ds1->ds_object, &mintxg) == 0) { int err; - ASSERT3U(mintxg, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); ASSERT3U(mintxg, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, @@ -1076,8 +2024,9 @@ dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx) "replacing with %llu", (u_longlong_t)ds1->ds_object, (u_longlong_t)ds2->ds_object); - } else if (zap_lookup_int_key(dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj, ds2->ds_object, &mintxg) == 0) { + } + if (zap_lookup_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, + ds2->ds_object, &mintxg) == 0) { ASSERT3U(mintxg, ==, dsl_dataset_phys(ds1)->ds_prev_snap_txg); ASSERT3U(mintxg, ==, dsl_dataset_phys(ds2)->ds_prev_snap_txg); VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, @@ -1090,31 +2039,26 @@ dsl_scan_ds_clone_swapped(dsl_dataset_t *ds1, dsl_dataset_t *ds2, dmu_tx_t *tx) (u_longlong_t)ds1->ds_object); } - dsl_scan_sync_state(scn, tx); + dsl_scan_sync_state(scn, tx, SYNC_CACHED); } -struct enqueue_clones_arg { - dmu_tx_t *tx; - uint64_t originobj; -}; - /* ARGSUSED */ static int enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) { - struct enqueue_clones_arg *eca = arg; + uint64_t originobj = *(uint64_t *)arg; dsl_dataset_t *ds; int err; dsl_scan_t *scn = dp->dp_scan; - if (dsl_dir_phys(hds->ds_dir)->dd_origin_obj != eca->originobj) + if (dsl_dir_phys(hds->ds_dir)->dd_origin_obj != originobj) return (0); err = dsl_dataset_hold_obj(dp, hds->ds_object, FTAG, &ds); if (err) return (err); - while (dsl_dataset_phys(ds)->ds_prev_snap_obj != eca->originobj) { + while (dsl_dataset_phys(ds)->ds_prev_snap_obj != originobj) { dsl_dataset_t *prev; err = dsl_dataset_hold_obj(dp, dsl_dataset_phys(ds)->ds_prev_snap_obj, FTAG, &prev); @@ -1124,9 +2068,8 @@ enqueue_clones_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) return (err); ds = prev; } - VERIFY(zap_add_int_key(dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj, ds->ds_object, - dsl_dataset_phys(ds)->ds_prev_snap_txg, eca->tx) == 0); + scan_ds_queue_insert(scn, ds->ds_object, + dsl_dataset_phys(ds)->ds_prev_snap_txg); dsl_dataset_rele(ds, FTAG); return (0); } @@ -1171,9 +2114,9 @@ dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx) dsl_dataset_name(ds, dsname); zfs_dbgmsg("scanning dataset %llu (%s) is unnecessary because " "cur_min_txg (%llu) >= max_txg (%llu)", - dsobj, dsname, - scn->scn_phys.scn_cur_min_txg, - scn->scn_phys.scn_max_txg); + (longlong_t)dsobj, dsname, + (longlong_t)scn->scn_phys.scn_cur_min_txg, + (longlong_t)scn->scn_phys.scn_max_txg); kmem_free(dsname, MAXNAMELEN); goto out; @@ -1229,9 +2172,8 @@ dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx) if (scn->scn_phys.scn_flags & 
DSF_VISIT_DS_AGAIN) { zfs_dbgmsg("incomplete pass; visiting again"); scn->scn_phys.scn_flags &= ~DSF_VISIT_DS_AGAIN; - VERIFY(zap_add_int_key(dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj, ds->ds_object, - scn->scn_phys.scn_cur_max_txg, tx) == 0); + scan_ds_queue_insert(scn, ds->ds_object, + scn->scn_phys.scn_cur_max_txg); goto out; } @@ -1239,10 +2181,9 @@ dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx) * Add descendent datasets to work queue. */ if (dsl_dataset_phys(ds)->ds_next_snap_obj != 0) { - VERIFY(zap_add_int_key(dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj, + scan_ds_queue_insert(scn, dsl_dataset_phys(ds)->ds_next_snap_obj, - dsl_dataset_phys(ds)->ds_creation_txg, tx) == 0); + dsl_dataset_phys(ds)->ds_creation_txg); } if (dsl_dataset_phys(ds)->ds_num_children > 1) { boolean_t usenext = B_FALSE; @@ -1263,17 +2204,21 @@ dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx) } if (usenext) { - VERIFY0(zap_join_key(dp->dp_meta_objset, - dsl_dataset_phys(ds)->ds_next_clones_obj, - scn->scn_phys.scn_queue_obj, - dsl_dataset_phys(ds)->ds_creation_txg, tx)); + zap_cursor_t zc; + zap_attribute_t za; + for (zap_cursor_init(&zc, dp->dp_meta_objset, + dsl_dataset_phys(ds)->ds_next_clones_obj); + zap_cursor_retrieve(&zc, &za) == 0; + (void) zap_cursor_advance(&zc)) { + scan_ds_queue_insert(scn, + zfs_strtonum(za.za_name, NULL), + dsl_dataset_phys(ds)->ds_creation_txg); + } + zap_cursor_fini(&zc); } else { - struct enqueue_clones_arg eca; - eca.tx = tx; - eca.originobj = ds->ds_object; - VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, - enqueue_clones_cb, &eca, DS_FIND_CHILDREN)); + enqueue_clones_cb, &ds->ds_object, + DS_FIND_CHILDREN)); } } @@ -1285,7 +2230,6 @@ dsl_scan_visitds(dsl_scan_t *scn, uint64_t dsobj, dmu_tx_t *tx) static int enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) { - dmu_tx_t *tx = arg; dsl_dataset_t *ds; int err; dsl_scan_t *scn = dp->dp_scan; @@ -1315,13 +2259,38 @@ enqueue_cb(dsl_pool_t *dp, dsl_dataset_t *hds, void *arg) ds = prev; } - VERIFY(zap_add_int_key(dp->dp_meta_objset, scn->scn_phys.scn_queue_obj, - ds->ds_object, dsl_dataset_phys(ds)->ds_prev_snap_txg, tx) == 0); + scan_ds_queue_insert(scn, ds->ds_object, + dsl_dataset_phys(ds)->ds_prev_snap_txg); dsl_dataset_rele(ds, FTAG); return (0); } -/* +/* ARGSUSED */ +void +dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, + ddt_entry_t *dde, dmu_tx_t *tx) +{ + const ddt_key_t *ddk = &dde->dde_key; + ddt_phys_t *ddp = dde->dde_phys; + blkptr_t bp; + zbookmark_phys_t zb = { 0 }; + int p; + + if (scn->scn_phys.scn_state != DSS_SCANNING) + return; + + for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { + if (ddp->ddp_phys_birth == 0 || + ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg) + continue; + ddt_bp_create(checksum, ddk, ddp, &bp); + + scn->scn_visited_this_txg++; + scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb); + } +} + +/* * Scrub/dedup interaction. 
* * If there are N references to a deduped block, we don't want to scrub it @@ -1393,36 +2362,20 @@ dsl_scan_ddt(dsl_scan_t *scn, dmu_tx_t *tx) ddb->ddb_class > scn->scn_phys.scn_ddt_class_max); } -/* ARGSUSED */ -void -dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, - ddt_entry_t *dde, dmu_tx_t *tx) +static uint64_t +dsl_scan_ds_maxtxg(dsl_dataset_t *ds) { - const ddt_key_t *ddk = &dde->dde_key; - ddt_phys_t *ddp = dde->dde_phys; - blkptr_t bp; - zbookmark_phys_t zb = { 0 }; - - if (scn->scn_phys.scn_state != DSS_SCANNING) - return; - - for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { - if (ddp->ddp_phys_birth == 0 || - ddp->ddp_phys_birth > scn->scn_phys.scn_max_txg) - continue; - ddt_bp_create(checksum, ddk, ddp, &bp); - - scn->scn_visited_this_txg++; - scan_funcs[scn->scn_phys.scn_func](scn->scn_dp, &bp, &zb); - } + uint64_t smt = ds->ds_dir->dd_pool->dp_scan->scn_phys.scn_max_txg; + if (ds->ds_is_snapshot) + return (MIN(smt, dsl_dataset_phys(ds)->ds_creation_txg)); + return (smt); } static void dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx) { + scan_ds_t *sds; dsl_pool_t *dp = scn->scn_dp; - zap_cursor_t zc; - zap_attribute_t za; if (scn->scn_phys.scn_ddt_bookmark.ddb_class <= scn->scn_phys.scn_ddt_class_max) { @@ -1446,7 +2399,7 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx) if (spa_version(dp->dp_spa) < SPA_VERSION_DSL_SCRUB) { VERIFY0(dmu_objset_find_dp(dp, dp->dp_root_dir_obj, - enqueue_cb, tx, DS_FIND_CHILDREN)); + enqueue_cb, NULL, DS_FIND_CHILDREN)); } else { dsl_scan_visitds(scn, dp->dp_origin_snap->ds_object, tx); @@ -1454,40 +2407,42 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx) ASSERT(!scn->scn_suspending); } else if (scn->scn_phys.scn_bookmark.zb_objset != ZB_DESTROYED_OBJSET) { + uint64_t dsobj = scn->scn_phys.scn_bookmark.zb_objset; /* - * If we were suspended, continue from here. Note if the + * If we were suspended, continue from here. Note if the * ds we were suspended on was deleted, the zb_objset may * be -1, so we will skip this and find a new objset * below. */ - dsl_scan_visitds(scn, scn->scn_phys.scn_bookmark.zb_objset, tx); + dsl_scan_visitds(scn, dsobj, tx); if (scn->scn_suspending) return; } /* - * In case we were suspended right at the end of the ds, zero the + * In case we suspended right at the end of the ds, zero the * bookmark so we don't think that we're still trying to resume. */ bzero(&scn->scn_phys.scn_bookmark, sizeof (zbookmark_phys_t)); - /* keep pulling things out of the zap-object-as-queue */ - while (zap_cursor_init(&zc, dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj), - zap_cursor_retrieve(&zc, &za) == 0) { + /* + * Keep pulling things out of the dataset avl queue. Updates to the + * persistent zap-object-as-queue happen only at checkpoints. 
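/*
 * [Editor's note: illustrative sketch, not part of the patch.]
 * The scrub/dedup rule above, restated: dedup table classes up to
 * scn_ddt_class_max are walked once via the DDT, and the later pool
 * traversal skips any block whose entry falls in one of those classes, so
 * each deduped block is read only once.  The enum values and names below
 * are invented for illustration.
 */
#include <stdint.h>

enum { CLASS_DITTO = 0, CLASS_DUPLICATE = 1, CLASS_UNIQUE = 2 };

typedef struct model_blk {
	int	 is_dedup;	/* D bit set in the block pointer */
	int	 ddt_class;	/* class its DDT entry currently sits in */
	uint64_t birth;
} model_blk_t;

/* phase 1: should this DDT entry be issued while walking the table? */
static int
model_ddt_entry_in_scan(int entry_class, int class_max,
    uint64_t birth, uint64_t max_txg)
{
	return (entry_class <= class_max && birth != 0 && birth <= max_txg);
}

/* phase 2: should the pool traversal skip this block? */
static int
model_traversal_skips(const model_blk_t *b, int class_max)
{
	return (b->is_dedup && b->ddt_class <= class_max);
}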
+ */ + while ((sds = avl_first(&scn->scn_queue)) != NULL) { dsl_dataset_t *ds; - uint64_t dsobj; + uint64_t dsobj = sds->sds_dsobj; + uint64_t txg = sds->sds_txg; - dsobj = zfs_strtonum(za.za_name, NULL); - VERIFY3U(0, ==, zap_remove_int(dp->dp_meta_objset, - scn->scn_phys.scn_queue_obj, dsobj, tx)); + /* dequeue and free the ds from the queue */ + scan_ds_queue_remove(scn, dsobj); + sds = NULL; /* must not be touched after removal */ - /* Set up min/max txg */ + /* Set up min / max txg */ VERIFY3U(0, ==, dsl_dataset_hold_obj(dp, dsobj, FTAG, &ds)); - if (za.za_first_integer != 0) { + if (txg != 0) { scn->scn_phys.scn_cur_min_txg = - MAX(scn->scn_phys.scn_min_txg, - za.za_first_integer); + MAX(scn->scn_phys.scn_min_txg, txg); } else { scn->scn_phys.scn_cur_min_txg = MAX(scn->scn_phys.scn_min_txg, @@ -1497,11 +2452,365 @@ dsl_scan_visit(dsl_scan_t *scn, dmu_tx_t *tx) dsl_dataset_rele(ds, FTAG); dsl_scan_visitds(scn, dsobj, tx); - zap_cursor_fini(&zc); if (scn->scn_suspending) return; } - zap_cursor_fini(&zc); + /* No more objsets to fetch, we're done */ + scn->scn_phys.scn_bookmark.zb_objset = ZB_DESTROYED_OBJSET; + ASSERT0(scn->scn_suspending); +} + +static uint64_t +dsl_scan_count_leaves(vdev_t *vd) +{ + uint64_t i, leaves = 0; + + /* we only count leaves that belong to the main pool and are readable */ + if (vd->vdev_islog || vd->vdev_isspare || + vd->vdev_isl2cache || !vdev_readable(vd)) + return (0); + + if (vd->vdev_ops->vdev_op_leaf) + return (1); + + for (i = 0; i < vd->vdev_children; i++) { + leaves += dsl_scan_count_leaves(vd->vdev_child[i]); + } + + return (leaves); +} + + +static void +scan_io_queues_update_zio_stats(dsl_scan_io_queue_t *q, const blkptr_t *bp) +{ + int i; + uint64_t cur_size = 0; + + for (i = 0; i < BP_GET_NDVAS(bp); i++) { + cur_size += DVA_GET_ASIZE(&bp->blk_dva[i]); + } + + q->q_total_zio_size_this_txg += cur_size; + q->q_zios_this_txg++; +} + +static void +scan_io_queues_update_seg_stats(dsl_scan_io_queue_t *q, uint64_t start, + uint64_t end) +{ + q->q_total_seg_size_this_txg += end - start; + q->q_segs_this_txg++; +} + +static boolean_t +scan_io_queue_check_suspend(dsl_scan_t *scn) +{ + /* See comment in dsl_scan_check_suspend() */ + uint64_t curr_time_ns = gethrtime(); + uint64_t scan_time_ns = curr_time_ns - scn->scn_sync_start_time; + uint64_t sync_time_ns = curr_time_ns - + scn->scn_dp->dp_spa->spa_sync_starttime; + int dirty_pct = scn->scn_dp->dp_dirty_total * 100 / zfs_dirty_data_max; + int mintime = (scn->scn_phys.scn_func == POOL_SCAN_RESILVER) ? + zfs_resilver_min_time_ms : zfs_scrub_min_time_ms; + + return ((NSEC2MSEC(scan_time_ns) > mintime && + (dirty_pct >= zfs_vdev_async_write_active_min_dirty_percent || + txg_sync_waiting(scn->scn_dp) || + NSEC2SEC(sync_time_ns) >= zfs_txg_timeout)) || + spa_shutting_down(scn->scn_dp->dp_spa)); +} + +/* + * Given a list of scan_io_t's in io_list, this issues the io's out to + * disk. This consumes the io_list and frees the scan_io_t's. This is + * called when emptying queues, either when we're up against the memory + * limit or when we have finished scanning. Returns B_TRUE if we stopped + * processing the list before we finished. Any zios that were not issued + * will remain in the io_list. 
+ */ +static boolean_t +scan_io_queue_issue(dsl_scan_io_queue_t *queue, list_t *io_list) +{ + dsl_scan_t *scn = queue->q_scn; + scan_io_t *sio; + int64_t bytes_issued = 0; + boolean_t suspended = B_FALSE; + + while ((sio = list_head(io_list)) != NULL) { + blkptr_t bp; + + if (scan_io_queue_check_suspend(scn)) { + suspended = B_TRUE; + break; + } + + sio2bp(sio, &bp, queue->q_vd->vdev_id); + bytes_issued += sio->sio_asize; + scan_exec_io(scn->scn_dp, &bp, sio->sio_flags, + &sio->sio_zb, queue); + (void) list_remove_head(io_list); + scan_io_queues_update_zio_stats(queue, &bp); + kmem_free(sio, sizeof (*sio)); + } + + atomic_add_64(&scn->scn_bytes_pending, -bytes_issued); + + return (suspended); +} + +/* + * Given a range_seg_t (extent) and a list, this function passes over a + * scan queue and gathers up the appropriate ios which fit into that + * scan seg (starting from lowest LBA). At the end, we remove the segment + * from the q_exts_by_addr range tree. + */ +static boolean_t +scan_io_queue_gather(dsl_scan_io_queue_t *queue, range_seg_t *rs, list_t *list) +{ + scan_io_t srch_sio, *sio, *next_sio; + avl_index_t idx; + uint_t num_sios = 0; + int64_t bytes_issued = 0; + + ASSERT(rs != NULL); + ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); + + srch_sio.sio_offset = rs->rs_start; + + /* + * The exact start of the extent might not contain any matching zios, + * so if that's the case, examine the next one in the tree. + */ + sio = avl_find(&queue->q_sios_by_addr, &srch_sio, &idx); + if (sio == NULL) + sio = avl_nearest(&queue->q_sios_by_addr, idx, AVL_AFTER); + + while (sio != NULL && sio->sio_offset < rs->rs_end && num_sios <= 32) { + ASSERT3U(sio->sio_offset, >=, rs->rs_start); + ASSERT3U(sio->sio_offset + sio->sio_asize, <=, rs->rs_end); + + next_sio = AVL_NEXT(&queue->q_sios_by_addr, sio); + avl_remove(&queue->q_sios_by_addr, sio); + + bytes_issued += sio->sio_asize; + num_sios++; + list_insert_tail(list, sio); + sio = next_sio; + } + + /* + * We limit the number of sios we process at once to 32 to avoid + * biting off more than we can chew. If we didn't take everything + * in the segment we update it to reflect the work we were able to + * complete. Otherwise, we remove it from the range tree entirely. + */ + if (sio != NULL && sio->sio_offset < rs->rs_end) { + range_tree_adjust_fill(queue->q_exts_by_addr, rs, + -bytes_issued); + range_tree_resize_segment(queue->q_exts_by_addr, rs, + sio->sio_offset, rs->rs_end - sio->sio_offset); + + return (B_TRUE); + } else { + range_tree_remove(queue->q_exts_by_addr, rs->rs_start, + rs->rs_end - rs->rs_start); + return (B_FALSE); + } +} + + +/* + * This is called from the queue emptying thread and selects the next + * extent from which we are to issue io's. The behavior of this function + * depends on the state of the scan, the current memory consumption and + * whether or not we are performing a scan shutdown. + * 1) We select extents in an elevator algorithm (LBA-order) if the scan + * needs to perform a checkpoint + * 2) We select the largest available extent if we are up against the + * memory limit. + * 3) Otherwise we don't select any extents. 
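/*
 * [Editor's note: illustrative sketch, not part of the patch.]
 * The extent selection policy above, restated: while checkpointing, take
 * extents in LBA order (elevator style); while clearing against the
 * memory limit, take the largest extent first; otherwise issue nothing
 * and keep gathering.  A tunable can force either strategy while issuing.
 * The enum and function names are invented.
 */
typedef enum {
	PICK_NONE,		/* keep sorting, don't issue */
	PICK_BY_ADDR,		/* lowest LBA first */
	PICK_BY_SIZE		/* largest extent first */
} pick_t;

static pick_t
model_pick_strategy(int checkpointing, int clearing, int strategy_override)
{
	if (checkpointing || clearing) {
		if (strategy_override == 1)
			return (PICK_BY_ADDR);
		if (strategy_override == 2)
			return (PICK_BY_SIZE);
	}
	if (checkpointing)
		return (PICK_BY_ADDR);
	if (clearing)
		return (PICK_BY_SIZE);
	return (PICK_NONE);
}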
+ */ +static const range_seg_t * +scan_io_queue_fetch_ext(dsl_scan_io_queue_t *queue) +{ + dsl_scan_t *scn = queue->q_scn; + + ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); + ASSERT(scn->scn_is_sorted); + + /* handle tunable overrides */ + if (scn->scn_checkpointing || scn->scn_clearing) { + if (zfs_scan_issue_strategy == 1) { + return (range_tree_first(queue->q_exts_by_addr)); + } else if (zfs_scan_issue_strategy == 2) { + return (avl_first(&queue->q_exts_by_size)); + } + } + + /* + * During normal clearing, we want to issue our largest segments + * first, keeping IO as sequential as possible, and leaving the + * smaller extents for later with the hope that they might eventually + * grow to larger sequential segments. However, when the scan is + * checkpointing, no new extents will be added to the sorting queue, + * so the way we are sorted now is as good as it will ever get. + * In this case, we instead switch to issuing extents in LBA order. + */ + if (scn->scn_checkpointing) { + return (range_tree_first(queue->q_exts_by_addr)); + } else if (scn->scn_clearing) { + return (avl_first(&queue->q_exts_by_size)); + } else { + return (NULL); + } +} + +static void +scan_io_queues_run_one(void *arg) +{ + dsl_scan_io_queue_t *queue = arg; + kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock; + boolean_t suspended = B_FALSE; + range_seg_t *rs = NULL; + scan_io_t *sio = NULL; + list_t sio_list; + uint64_t bytes_per_leaf = zfs_scan_vdev_limit; + uint64_t nr_leaves = dsl_scan_count_leaves(queue->q_vd); + + ASSERT(queue->q_scn->scn_is_sorted); + + list_create(&sio_list, sizeof (scan_io_t), + offsetof(scan_io_t, sio_nodes.sio_list_node)); + mutex_enter(q_lock); + + /* calculate maximum in-flight bytes for this txg (min 1MB) */ + queue->q_maxinflight_bytes = + MAX(nr_leaves * bytes_per_leaf, 1ULL << 20); + + /* reset per-queue scan statistics for this txg */ + queue->q_total_seg_size_this_txg = 0; + queue->q_segs_this_txg = 0; + queue->q_total_zio_size_this_txg = 0; + queue->q_zios_this_txg = 0; + + /* loop until we have run out of time or sios */ + while ((rs = (range_seg_t *)scan_io_queue_fetch_ext(queue)) != NULL) { + uint64_t seg_start = 0, seg_end = 0; + boolean_t more_left = B_TRUE; + + ASSERT(list_is_empty(&sio_list)); + + /* loop while we still have sios left to process in this rs */ + while (more_left) { + scan_io_t *first_sio, *last_sio; + + /* + * We have selected which extent needs to be + * processed next. Gather up the corresponding sios. + */ + more_left = scan_io_queue_gather(queue, rs, &sio_list); + ASSERT(!list_is_empty(&sio_list)); + first_sio = list_head(&sio_list); + last_sio = list_tail(&sio_list); + + seg_end = last_sio->sio_offset + last_sio->sio_asize; + if (seg_start == 0) + seg_start = first_sio->sio_offset; + + /* + * Issuing sios can take a long time so drop the + * queue lock. The sio queue won't be updated by + * other threads since we're in syncing context so + * we can be sure that our trees will remain exactly + * as we left them. + */ + mutex_exit(q_lock); + suspended = scan_io_queue_issue(queue, &sio_list); + mutex_enter(q_lock); + + if (suspended) + break; + } + /* update statistics for debugging purposes */ + scan_io_queues_update_seg_stats(queue, seg_start, seg_end); + + if (suspended) + break; + } + + + /* + * If we were suspended in the middle of processing, + * requeue any unfinished sios and exit. 
+ */ + while ((sio = list_head(&sio_list)) != NULL) { + list_remove(&sio_list, sio); + scan_io_queue_insert_impl(queue, sio); + } + + mutex_exit(q_lock); + list_destroy(&sio_list); +} + +/* + * Performs an emptying run on all scan queues in the pool. This just + * punches out one thread per top-level vdev, each of which processes + * only that vdev's scan queue. We can parallelize the I/O here because + * we know that each queue's io's only affect its own top-level vdev. + * + * This function waits for the queue runs to complete, and must be + * called from dsl_scan_sync (or in general, syncing context). + */ +static void +scan_io_queues_run(dsl_scan_t *scn) +{ + spa_t *spa = scn->scn_dp->dp_spa; + + ASSERT(scn->scn_is_sorted); + ASSERT(spa_config_held(spa, SCL_CONFIG, RW_READER)); + + if (scn->scn_bytes_pending == 0) + return; + + if (scn->scn_taskq == NULL) { + char *tq_name = kmem_zalloc(ZFS_MAX_DATASET_NAME_LEN + 16, + KM_SLEEP); + int nthreads = spa->spa_root_vdev->vdev_children; + + /* + * We need to make this taskq *always* execute as many + * threads in parallel as we have top-level vdevs and no + * less, otherwise strange serialization of the calls to + * scan_io_queues_run_one can occur during spa_sync runs + * and that significantly impacts performance. + */ + (void) snprintf(tq_name, ZFS_MAX_DATASET_NAME_LEN + 16, + "dsl_scan_tq_%s", spa->spa_name); + scn->scn_taskq = taskq_create(tq_name, nthreads, minclsyspri, + nthreads, nthreads, TASKQ_PREPOPULATE); + kmem_free(tq_name, ZFS_MAX_DATASET_NAME_LEN + 16); + } + + for (uint64_t i = 0; i < spa->spa_root_vdev->vdev_children; i++) { + vdev_t *vd = spa->spa_root_vdev->vdev_child[i]; + + mutex_enter(&vd->vdev_scan_io_queue_lock); + if (vd->vdev_scan_io_queue != NULL) { + VERIFY(taskq_dispatch(scn->scn_taskq, + scan_io_queues_run_one, vd->vdev_scan_io_queue, + TQ_SLEEP) != TASKQID_INVALID); + } + mutex_exit(&vd->vdev_scan_io_queue_lock); + } + + /* + * Wait for the queues to finish issuing thir IOs for this run + * before we return. There may still be IOs in flight at this + * point. 
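+	 * Those remaining in-flight bytes are tracked per queue in
+	 * q_inflight_bytes and are drained by dsl_scan_scrub_done() as the
+	 * individual zios complete.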
+ */ + taskq_wait(scn->scn_taskq); } static boolean_t @@ -1542,6 +2851,41 @@ dsl_scan_free_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) return (0); } +static void +dsl_scan_update_stats(dsl_scan_t *scn) +{ + spa_t *spa = scn->scn_dp->dp_spa; + uint64_t i; + uint64_t seg_size_total = 0, zio_size_total = 0; + uint64_t seg_count_total = 0, zio_count_total = 0; + + for (i = 0; i < spa->spa_root_vdev->vdev_children; i++) { + vdev_t *vd = spa->spa_root_vdev->vdev_child[i]; + dsl_scan_io_queue_t *queue = vd->vdev_scan_io_queue; + + if (queue == NULL) + continue; + + seg_size_total += queue->q_total_seg_size_this_txg; + zio_size_total += queue->q_total_zio_size_this_txg; + seg_count_total += queue->q_segs_this_txg; + zio_count_total += queue->q_zios_this_txg; + } + + if (seg_count_total == 0 || zio_count_total == 0) { + scn->scn_avg_seg_size_this_txg = 0; + scn->scn_avg_zio_size_this_txg = 0; + scn->scn_segs_this_txg = 0; + scn->scn_zios_this_txg = 0; + return; + } + + scn->scn_avg_seg_size_this_txg = seg_size_total / seg_count_total; + scn->scn_avg_zio_size_this_txg = zio_size_total / zio_count_total; + scn->scn_segs_this_txg = seg_count_total; + scn->scn_zios_this_txg = zio_count_total; +} + static int dsl_scan_obsolete_block_cb(void *arg, const blkptr_t *bp, dmu_tx_t *tx) { @@ -1568,8 +2912,7 @@ dsl_scan_active(dsl_scan_t *scn) return (B_FALSE); if (spa_shutting_down(spa)) return (B_FALSE); - if ((scn->scn_phys.scn_state == DSS_SCANNING && - !dsl_scan_is_paused_scrub(scn)) || + if ((dsl_scan_is_running(scn) && !dsl_scan_is_paused_scrub(scn)) || (scn->scn_async_destroying && !scn->scn_async_stalled)) return (B_TRUE); @@ -1580,25 +2923,76 @@ dsl_scan_active(dsl_scan_t *scn) return (used != 0); } +static boolean_t +dsl_scan_need_resilver(spa_t *spa, const dva_t *dva, size_t psize, + uint64_t phys_birth) +{ + vdev_t *vd; + + vd = vdev_lookup_top(spa, DVA_GET_VDEV(dva)); + + if (vd->vdev_ops == &vdev_indirect_ops) { + /* + * The indirect vdev can point to multiple + * vdevs. For simplicity, always create + * the resilver zio_t. zio_vdev_io_start() + * will bypass the child resilver i/o's if + * they are on vdevs that don't have DTL's. + */ + return (B_TRUE); + } + + if (DVA_GET_GANG(dva)) { + /* + * Gang members may be spread across multiple + * vdevs, so the best estimate we have is the + * scrub range, which has already been checked. + * XXX -- it would be better to change our + * allocation policy to ensure that all + * gang members reside on the same vdev. + */ + return (B_TRUE); + } + + /* + * Check if the txg falls within the range which must be + * resilvered. DVAs outside this range can always be skipped. + */ + if (!vdev_dtl_contains(vd, DTL_PARTIAL, phys_birth, 1)) + return (B_FALSE); + + /* + * Check if the top-level vdev must resilver this offset. + * When the offset does not intersect with a dirty leaf DTL + * then it may be possible to skip the resilver IO. The psize + * is provided instead of asize to simplify the check for RAIDZ. 
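+	 *
+	 * To recap the whole decision made by this function:
+	 *	indirect vdev or gang DVA		-> resilver
+	 *	birth txg outside DTL_PARTIAL		-> skip
+	 *	offset misses every dirty leaf DTL	-> skip
+	 *	otherwise				-> resilver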
+ */ + if (!vdev_dtl_need_resilver(vd, DVA_GET_OFFSET(dva), psize)) + return (B_FALSE); + + return (B_TRUE); +} + static int dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx) { + int err = 0; dsl_scan_t *scn = dp->dp_scan; spa_t *spa = dp->dp_spa; - int err = 0; if (spa_suspend_async_destroy(spa)) return (0); if (zfs_free_bpobj_enabled && - spa_version(dp->dp_spa) >= SPA_VERSION_DEADLISTS) { + spa_version(spa) >= SPA_VERSION_DEADLISTS) { scn->scn_is_bptree = B_FALSE; scn->scn_async_block_min_time_ms = zfs_free_min_time_ms; - scn->scn_zio_root = zio_root(dp->dp_spa, NULL, + scn->scn_zio_root = zio_root(spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); err = bpobj_iterate(&dp->dp_free_bpobj, dsl_scan_free_block_cb, scn, tx); - VERIFY3U(0, ==, zio_wait(scn->scn_zio_root)); + VERIFY0(zio_wait(scn->scn_zio_root)); + scn->scn_zio_root = NULL; if (err != 0 && err != ERESTART) zfs_panic_recover("error %u from bpobj_iterate()", err); @@ -1607,11 +3001,12 @@ dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx) if (err == 0 && spa_feature_is_active(spa, SPA_FEATURE_ASYNC_DESTROY)) { ASSERT(scn->scn_async_destroying); scn->scn_is_bptree = B_TRUE; - scn->scn_zio_root = zio_root(dp->dp_spa, NULL, + scn->scn_zio_root = zio_root(spa, NULL, NULL, ZIO_FLAG_MUSTSUCCEED); err = bptree_iterate(dp->dp_meta_objset, dp->dp_bptree_obj, B_TRUE, dsl_scan_free_block_cb, scn, tx); VERIFY0(zio_wait(scn->scn_zio_root)); + scn->scn_zio_root = NULL; if (err == EIO || err == ECKSUM) { err = 0; @@ -1648,7 +3043,7 @@ dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx) } if (scn->scn_visited_this_txg) { zfs_dbgmsg("freed %llu blocks in %llums from " - "free_bpobj/bptree txg %llu; err=%u", + "free_bpobj/bptree txg %llu; err=%d", (longlong_t)scn->scn_visited_this_txg, (longlong_t) NSEC2MSEC(gethrtime() - scn->scn_sync_start_time), @@ -1720,12 +3115,21 @@ dsl_process_async_destroys(dsl_pool_t *dp, dmu_tx_t *tx) return (0); } +/* + * This is the primary entry point for scans that is called from syncing + * context. Scans must happen entirely during syncing context so that we + * cna guarantee that blocks we are currently scanning will not change out + * from under us. While a scan is active, this funciton controls how quickly + * transaction groups proceed, instead of the normal handling provided by + * txg_sync_thread(). 
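+ *
+ * Broadly, each pass below either continues the metadata traversal,
+ * feeding discovered blocks into the per-vdev sorting queues, or it
+ * drains those queues by issuing the queued scrub/resilver I/O via
+ * scan_io_queues_run(); once both phases are finished the scan is
+ * marked complete.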
+ */ void dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) { dsl_scan_t *scn = dp->dp_scan; spa_t *spa = dp->dp_spa; int err = 0; + state_sync_type_t sync_type = SYNC_OPTIONAL; /* * Check for scn_restart_txg before checking spa_load_state, so @@ -1738,7 +3142,7 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) if (vdev_resilver_needed(spa->spa_root_vdev, NULL, NULL)) func = POOL_SCAN_RESILVER; zfs_dbgmsg("restarting scan func=%u txg=%llu", - func, tx->tx_txg); + func, (longlong_t)tx->tx_txg); dsl_scan_setup_sync(&func, tx); } @@ -1762,7 +3166,17 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) if (!scn->scn_async_stalled && !dsl_scan_active(scn)) return; + /* reset scan statistics */ scn->scn_visited_this_txg = 0; + scn->scn_holes_this_txg = 0; + scn->scn_lt_min_this_txg = 0; + scn->scn_gt_max_this_txg = 0; + scn->scn_ddt_contained_this_txg = 0; + scn->scn_objsets_visited_this_txg = 0; + scn->scn_avg_seg_size_this_txg = 0; + scn->scn_segs_this_txg = 0; + scn->scn_avg_zio_size_this_txg = 0; + scn->scn_zios_this_txg = 0; scn->scn_suspending = B_FALSE; scn->scn_sync_start_time = gethrtime(); spa->spa_scrub_active = B_TRUE; @@ -1778,110 +3192,189 @@ dsl_scan_sync(dsl_pool_t *dp, dmu_tx_t *tx) if (err != 0) return; - if (scn->scn_phys.scn_state != DSS_SCANNING) + if (!dsl_scan_is_running(scn) || dsl_scan_is_paused_scrub(scn)) return; - if (scn->scn_done_txg == tx->tx_txg) { - ASSERT(!scn->scn_suspending); - /* finished with scan. */ - zfs_dbgmsg("txg %llu scan complete", tx->tx_txg); - dsl_scan_done(scn, B_TRUE, tx); - ASSERT3U(spa->spa_scrub_inflight, ==, 0); - dsl_scan_sync_state(scn, tx); + /* + * Wait a few txgs after importing to begin scanning so that + * we can get the pool imported quickly. + */ + if (spa->spa_syncing_txg < spa->spa_first_txg + SCAN_IMPORT_WAIT_TXGS) return; - } - if (dsl_scan_is_paused_scrub(scn)) - return; + /* + * It is possible to switch from unsorted to sorted at any time, + * but afterwards the scan will remain sorted unless reloaded from + * a checkpoint after a reboot. + */ + if (!zfs_scan_legacy) { + scn->scn_is_sorted = B_TRUE; + if (scn->scn_last_checkpoint == 0) + scn->scn_last_checkpoint = ddi_get_lbolt(); + } - if (scn->scn_phys.scn_ddt_bookmark.ddb_class <= - scn->scn_phys.scn_ddt_class_max) { - zfs_dbgmsg("doing scan sync txg %llu; " - "ddt bm=%llu/%llu/%llu/%llx", - (longlong_t)tx->tx_txg, - (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_class, - (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_type, - (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_checksum, - (longlong_t)scn->scn_phys.scn_ddt_bookmark.ddb_cursor); - ASSERT(scn->scn_phys.scn_bookmark.zb_objset == 0); - ASSERT(scn->scn_phys.scn_bookmark.zb_object == 0); - ASSERT(scn->scn_phys.scn_bookmark.zb_level == 0); - ASSERT(scn->scn_phys.scn_bookmark.zb_blkid == 0); + /* + * For sorted scans, determine what kind of work we will be doing + * this txg based on our memory limitations and whether or not we + * need to perform a checkpoint. + */ + if (scn->scn_is_sorted) { + /* + * If we are over our checkpoint interval, set scn_clearing + * so that we can begin checkpointing immediately. The + * checkpoint allows us to save a consisent bookmark + * representing how much data we have scrubbed so far. + * Otherwise, use the memory limit to determine if we should + * scan for metadata or start issue scrub IOs. We accumulate + * metadata until we hit our hard memory limit at which point + * we issue scrub IOs until we are at our soft memory limit. 
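+		 * The effect is a simple hysteresis: scn_clearing is set once
+		 * dsl_scan_should_clear() reports we are over the hard limit
+		 * and is only dropped again after enough queued I/O has been
+		 * issued to bring us back under the (lower) soft limit, which
+		 * keeps us from flip-flopping between the two modes.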
+ */ + if (scn->scn_checkpointing || + ddi_get_lbolt() - scn->scn_last_checkpoint > + SEC_TO_TICK(zfs_scan_checkpoint_intval)) { + if (!scn->scn_checkpointing) + zfs_dbgmsg("begin scan checkpoint"); + + scn->scn_checkpointing = B_TRUE; + scn->scn_clearing = B_TRUE; + } else { + boolean_t should_clear = dsl_scan_should_clear(scn); + if (should_clear && !scn->scn_clearing) { + zfs_dbgmsg("begin scan clearing"); + scn->scn_clearing = B_TRUE; + } else if (!should_clear && scn->scn_clearing) { + zfs_dbgmsg("finish scan clearing"); + scn->scn_clearing = B_FALSE; + } + } } else { - zfs_dbgmsg("doing scan sync txg %llu; bm=%llu/%llu/%llu/%llu", - (longlong_t)tx->tx_txg, - (longlong_t)scn->scn_phys.scn_bookmark.zb_objset, - (longlong_t)scn->scn_phys.scn_bookmark.zb_object, - (longlong_t)scn->scn_phys.scn_bookmark.zb_level, - (longlong_t)scn->scn_phys.scn_bookmark.zb_blkid); + ASSERT0(scn->scn_checkpointing); + ASSERT0(scn->scn_clearing); } - scn->scn_zio_root = zio_root(dp->dp_spa, NULL, - NULL, ZIO_FLAG_CANFAIL); - dsl_pool_config_enter(dp, FTAG); - dsl_scan_visit(scn, tx); - dsl_pool_config_exit(dp, FTAG); - (void) zio_wait(scn->scn_zio_root); - scn->scn_zio_root = NULL; + if (!scn->scn_clearing && scn->scn_done_txg == 0) { + /* Need to scan metadata for more blocks to scrub */ + dsl_scan_phys_t *scnp = &scn->scn_phys; + taskqid_t prefetch_tqid; + uint64_t bytes_per_leaf = zfs_scan_vdev_limit; + uint64_t nr_leaves = dsl_scan_count_leaves(spa->spa_root_vdev); - zfs_dbgmsg("visited %llu blocks in %llums", - (longlong_t)scn->scn_visited_this_txg, - (longlong_t)NSEC2MSEC(gethrtime() - scn->scn_sync_start_time)); + /* + * Calculate the max number of in-flight bytes for pool-wide + * scanning operations (minimum 1MB). Limits for the issuing + * phase are done per top-level vdev and are handled separately. + */ + scn->scn_maxinflight_bytes = + MAX(nr_leaves * bytes_per_leaf, 1ULL << 20); + + if (scnp->scn_ddt_bookmark.ddb_class <= + scnp->scn_ddt_class_max) { + ASSERT(ZB_IS_ZERO(&scnp->scn_bookmark)); + zfs_dbgmsg("doing scan sync txg %llu; " + "ddt bm=%llu/%llu/%llu/%llx", + (longlong_t)tx->tx_txg, + (longlong_t)scnp->scn_ddt_bookmark.ddb_class, + (longlong_t)scnp->scn_ddt_bookmark.ddb_type, + (longlong_t)scnp->scn_ddt_bookmark.ddb_checksum, + (longlong_t)scnp->scn_ddt_bookmark.ddb_cursor); + } else { + zfs_dbgmsg("doing scan sync txg %llu; " + "bm=%llu/%llu/%llu/%llu", + (longlong_t)tx->tx_txg, + (longlong_t)scnp->scn_bookmark.zb_objset, + (longlong_t)scnp->scn_bookmark.zb_object, + (longlong_t)scnp->scn_bookmark.zb_level, + (longlong_t)scnp->scn_bookmark.zb_blkid); + } - if (!scn->scn_suspending) { - scn->scn_done_txg = tx->tx_txg + 1; - zfs_dbgmsg("txg %llu traversal complete, waiting till txg %llu", - tx->tx_txg, scn->scn_done_txg); - } + scn->scn_zio_root = zio_root(dp->dp_spa, NULL, + NULL, ZIO_FLAG_CANFAIL); - if (DSL_SCAN_IS_SCRUB_RESILVER(scn)) { - mutex_enter(&spa->spa_scrub_lock); - while (spa->spa_scrub_inflight > 0) { - cv_wait(&spa->spa_scrub_io_cv, - &spa->spa_scrub_lock); - } - mutex_exit(&spa->spa_scrub_lock); - } + scn->scn_prefetch_stop = B_FALSE; + prefetch_tqid = taskq_dispatch(dp->dp_sync_taskq, + dsl_scan_prefetch_thread, scn, TQ_SLEEP); + ASSERT(prefetch_tqid != TASKQID_INVALID); - dsl_scan_sync_state(scn, tx); -} + dsl_pool_config_enter(dp, FTAG); + dsl_scan_visit(scn, tx); + dsl_pool_config_exit(dp, FTAG); -/* - * This will start a new scan, or restart an existing one. 
- */ -void -dsl_resilver_restart(dsl_pool_t *dp, uint64_t txg) -{ - if (txg == 0) { - dmu_tx_t *tx; - tx = dmu_tx_create_dd(dp->dp_mos_dir); - VERIFY(0 == dmu_tx_assign(tx, TXG_WAIT)); + mutex_enter(&dp->dp_spa->spa_scrub_lock); + scn->scn_prefetch_stop = B_TRUE; + cv_broadcast(&spa->spa_scrub_io_cv); + mutex_exit(&dp->dp_spa->spa_scrub_lock); - txg = dmu_tx_get_txg(tx); - dp->dp_scan->scn_restart_txg = txg; - dmu_tx_commit(tx); - } else { - dp->dp_scan->scn_restart_txg = txg; + taskq_wait_id(dp->dp_sync_taskq, prefetch_tqid); + (void) zio_wait(scn->scn_zio_root); + scn->scn_zio_root = NULL; + + zfs_dbgmsg("scan visited %llu blocks in %llums " + "(%llu os's, %llu holes, %llu < mintxg, " + "%llu in ddt, %llu > maxtxg)", + (longlong_t)scn->scn_visited_this_txg, + (longlong_t)NSEC2MSEC(gethrtime() - + scn->scn_sync_start_time), + (longlong_t)scn->scn_objsets_visited_this_txg, + (longlong_t)scn->scn_holes_this_txg, + (longlong_t)scn->scn_lt_min_this_txg, + (longlong_t)scn->scn_ddt_contained_this_txg, + (longlong_t)scn->scn_gt_max_this_txg); + + if (!scn->scn_suspending) { + ASSERT0(avl_numnodes(&scn->scn_queue)); + scn->scn_done_txg = tx->tx_txg + 1; + if (scn->scn_is_sorted) { + scn->scn_checkpointing = B_TRUE; + scn->scn_clearing = B_TRUE; + } + zfs_dbgmsg("scan complete txg %llu", + (longlong_t)tx->tx_txg); + } + } else if (scn->scn_is_sorted && scn->scn_bytes_pending != 0) { + /* need to issue scrubbing IOs from per-vdev queues */ + scn->scn_zio_root = zio_root(dp->dp_spa, NULL, + NULL, ZIO_FLAG_CANFAIL); + scan_io_queues_run(scn); + (void) zio_wait(scn->scn_zio_root); + scn->scn_zio_root = NULL; + + /* calculate and dprintf the current memory usage */ + (void) dsl_scan_should_clear(scn); + dsl_scan_update_stats(scn); + + zfs_dbgmsg("scrubbed %llu blocks (%llu segs) in %llums " + "(avg_block_size = %llu, avg_seg_size = %llu)", + (longlong_t)scn->scn_zios_this_txg, + (longlong_t)scn->scn_segs_this_txg, + (longlong_t)NSEC2MSEC(gethrtime() - + scn->scn_sync_start_time), + (longlong_t)scn->scn_avg_zio_size_this_txg, + (longlong_t)scn->scn_avg_seg_size_this_txg); + } else if (scn->scn_done_txg != 0 && scn->scn_done_txg <= tx->tx_txg) { + /* Finished with everything. Mark the scrub as complete */ + zfs_dbgmsg("scan issuing complete txg %llu", + (longlong_t)tx->tx_txg); + ASSERT3U(scn->scn_done_txg, !=, 0); + ASSERT0(spa->spa_scrub_inflight); + ASSERT0(scn->scn_bytes_pending); + dsl_scan_done(scn, B_TRUE, tx); + sync_type = SYNC_MANDATORY; } - zfs_dbgmsg("restarting resilver txg=%llu", txg); -} -boolean_t -dsl_scan_resilvering(dsl_pool_t *dp) -{ - return (dp->dp_scan->scn_phys.scn_state == DSS_SCANNING && - dp->dp_scan->scn_phys.scn_func == POOL_SCAN_RESILVER); + dsl_scan_sync_state(scn, tx, sync_type); } -/* - * scrub consumers - */ - static void -count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp) +count_block(dsl_scan_t *scn, zfs_all_blkstats_t *zab, const blkptr_t *bp) { int i; + /* update the spa's stats on how many bytes we have issued */ + for (i = 0; i < BP_GET_NDVAS(bp); i++) { + atomic_add_64(&scn->scn_dp->dp_spa->spa_scan_pass_issued, + DVA_GET_ASIZE(&bp->blk_dva[i])); + } + /* * If we resume after a reboot, zab will be NULL; don't record * incomplete stats in that case. @@ -1889,6 +3382,8 @@ count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp) if (zab == NULL) return; + mutex_enter(&zab->zab_lock); + for (i = 0; i < 4; i++) { int l = (i < 2) ? BP_GET_LEVEL(bp) : DN_MAX_LEVELS; int t = (i & 1) ? 
BP_GET_TYPE(bp) : DMU_OT_TOTAL; @@ -1923,24 +3418,96 @@ count_block(zfs_all_blkstats_t *zab, const blkptr_t *bp) break; } } + + mutex_exit(&zab->zab_lock); } static void -dsl_scan_scrub_done(zio_t *zio) +scan_io_queue_insert_impl(dsl_scan_io_queue_t *queue, scan_io_t *sio) { - spa_t *spa = zio->io_spa; + avl_index_t idx; + int64_t asize = sio->sio_asize; + dsl_scan_t *scn = queue->q_scn; - abd_free(zio->io_abd); + ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); - mutex_enter(&spa->spa_scrub_lock); - spa->spa_scrub_inflight--; - cv_broadcast(&spa->spa_scrub_io_cv); + if (avl_find(&queue->q_sios_by_addr, sio, &idx) != NULL) { + /* block is already scheduled for reading */ + atomic_add_64(&scn->scn_bytes_pending, -asize); + kmem_free(sio, sizeof (*sio)); + return; + } + avl_insert(&queue->q_sios_by_addr, sio, idx); + range_tree_add(queue->q_exts_by_addr, sio->sio_offset, asize); +} - if (zio->io_error && (zio->io_error != ECKSUM || - !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) { - spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors++; +/* + * Given all the info we got from our metadata scanning process, we + * construct a scan_io_t and insert it into the scan sorting queue. The + * I/O must already be suitable for us to process. This is controlled + * by dsl_scan_enqueue(). + */ +static void +scan_io_queue_insert(dsl_scan_io_queue_t *queue, const blkptr_t *bp, int dva_i, + int zio_flags, const zbookmark_phys_t *zb) +{ + dsl_scan_t *scn = queue->q_scn; + scan_io_t *sio = kmem_zalloc(sizeof (*sio), KM_SLEEP); + + ASSERT0(BP_IS_GANG(bp)); + ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); + + bp2sio(bp, sio, dva_i); + sio->sio_flags = zio_flags; + sio->sio_zb = *zb; + + /* + * Increment the bytes pending counter now so that we can't + * get an integer underflow in case the worker processes the + * zio before we get to incrementing this counter. + */ + atomic_add_64(&scn->scn_bytes_pending, sio->sio_asize); + + scan_io_queue_insert_impl(queue, sio); +} + +/* + * Given a set of I/O parameters as discovered by the metadata traversal + * process, attempts to place the I/O into the sorted queues (if allowed), + * or immediately executes the I/O. + */ +static void +dsl_scan_enqueue(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags, + const zbookmark_phys_t *zb) +{ + spa_t *spa = dp->dp_spa; + + ASSERT(!BP_IS_EMBEDDED(bp)); + + /* + * Gang blocks are hard to issue sequentially, so we just issue them + * here immediately instead of queuing them. 
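+	 * (The same applies when the scan is not sorted at all.)  Everything
+	 * else is queued per DVA: each copy of the block goes onto the scan
+	 * queue of the top-level vdev holding that DVA, so the copies can
+	 * later be issued independently, in LBA order, per vdev.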
+ */ + if (!dp->dp_scan->scn_is_sorted || BP_IS_GANG(bp)) { + scan_exec_io(dp, bp, zio_flags, zb, NULL); + return; + } + for (int i = 0; i < BP_GET_NDVAS(bp); i++) { + dva_t dva; + vdev_t *vdev; + + dva = bp->blk_dva[i]; + vdev = vdev_lookup_top(spa, DVA_GET_VDEV(&dva)); + ASSERT(vdev != NULL); + + mutex_enter(&vdev->vdev_scan_io_queue_lock); + if (vdev->vdev_scan_io_queue == NULL) + vdev->vdev_scan_io_queue = scan_io_queue_create(vdev); + ASSERT(dp->dp_scan != NULL); + scan_io_queue_insert(vdev->vdev_scan_io_queue, bp, + i, zio_flags, zb); + mutex_exit(&vdev->vdev_scan_io_queue_lock); } - mutex_exit(&spa->spa_scrub_lock); } static int @@ -1948,18 +3515,18 @@ dsl_scan_scrub_cb(dsl_pool_t *dp, const blkptr_t *bp, const zbookmark_phys_t *zb) { dsl_scan_t *scn = dp->dp_scan; - size_t size = BP_GET_PSIZE(bp); spa_t *spa = dp->dp_spa; uint64_t phys_birth = BP_PHYSICAL_BIRTH(bp); + size_t psize = BP_GET_PSIZE(bp); boolean_t needs_io; int zio_flags = ZIO_FLAG_SCAN_THREAD | ZIO_FLAG_RAW | ZIO_FLAG_CANFAIL; - int scan_delay = 0; - - count_block(dp->dp_blkstats, bp); + int d; if (phys_birth <= scn->scn_phys.scn_min_txg || - phys_birth >= scn->scn_phys.scn_max_txg) + phys_birth >= scn->scn_phys.scn_max_txg) { + count_block(scn, dp->dp_blkstats, bp); return (0); + } /* Embedded BP's have phys_birth==0, so we reject them above. */ ASSERT(!BP_IS_EMBEDDED(bp)); @@ -1968,125 +3535,359 @@ dsl_scan_scrub_cb(dsl_pool_t *dp, if (scn->scn_phys.scn_func == POOL_SCAN_SCRUB) { zio_flags |= ZIO_FLAG_SCRUB; needs_io = B_TRUE; - scan_delay = zfs_scrub_delay; } else { ASSERT3U(scn->scn_phys.scn_func, ==, POOL_SCAN_RESILVER); zio_flags |= ZIO_FLAG_RESILVER; needs_io = B_FALSE; - scan_delay = zfs_resilver_delay; } /* If it's an intent log block, failure is expected. */ if (zb->zb_level == ZB_ZIL_LEVEL) zio_flags |= ZIO_FLAG_SPECULATIVE; - for (int d = 0; d < BP_GET_NDVAS(bp); d++) { - vdev_t *vd = vdev_lookup_top(spa, - DVA_GET_VDEV(&bp->blk_dva[d])); + for (d = 0; d < BP_GET_NDVAS(bp); d++) { + const dva_t *dva = &bp->blk_dva[d]; /* * Keep track of how much data we've examined so that * zpool(1M) status can make useful progress reports. */ - scn->scn_phys.scn_examined += DVA_GET_ASIZE(&bp->blk_dva[d]); - spa->spa_scan_pass_exam += DVA_GET_ASIZE(&bp->blk_dva[d]); + scn->scn_phys.scn_examined += DVA_GET_ASIZE(dva); + spa->spa_scan_pass_exam += DVA_GET_ASIZE(dva); /* if it's a resilver, this may not be in the target range */ - if (!needs_io) { - if (vd->vdev_ops == &vdev_indirect_ops) { - /* - * The indirect vdev can point to multiple - * vdevs. For simplicity, always create - * the resilver zio_t. zio_vdev_io_start() - * will bypass the child resilver i/o's if - * they are on vdevs that don't have DTL's. - */ - needs_io = B_TRUE; - } else if (DVA_GET_GANG(&bp->blk_dva[d])) { - /* - * Gang members may be spread across multiple - * vdevs, so the best estimate we have is the - * scrub range, which has already been checked. - * XXX -- it would be better to change our - * allocation policy to ensure that all - * gang members reside on the same vdev. 
- */ - needs_io = B_TRUE; - } else { - needs_io = vdev_dtl_contains(vd, DTL_PARTIAL, - phys_birth, 1); - } - } + if (!needs_io) + needs_io = dsl_scan_need_resilver(spa, dva, psize, + phys_birth); } if (needs_io && !zfs_no_scrub_io) { - vdev_t *rvd = spa->spa_root_vdev; - uint64_t maxinflight = rvd->vdev_children * zfs_top_maxinflight; + dsl_scan_enqueue(dp, bp, zio_flags, zb); + } else { + count_block(scn, dp->dp_blkstats, bp); + } + + /* do not relocate this block */ + return (0); +} +static void +dsl_scan_scrub_done(zio_t *zio) +{ + spa_t *spa = zio->io_spa; + blkptr_t *bp = zio->io_bp; + dsl_scan_io_queue_t *queue = zio->io_private; + + abd_free(zio->io_abd); + + if (queue == NULL) { mutex_enter(&spa->spa_scrub_lock); - while (spa->spa_scrub_inflight >= maxinflight) + ASSERT3U(spa->spa_scrub_inflight, >=, BP_GET_PSIZE(bp)); + spa->spa_scrub_inflight -= BP_GET_PSIZE(bp); + cv_broadcast(&spa->spa_scrub_io_cv); + mutex_exit(&spa->spa_scrub_lock); + } else { + mutex_enter(&queue->q_vd->vdev_scan_io_queue_lock); + ASSERT3U(queue->q_inflight_bytes, >=, BP_GET_PSIZE(bp)); + queue->q_inflight_bytes -= BP_GET_PSIZE(bp); + cv_broadcast(&queue->q_zio_cv); + mutex_exit(&queue->q_vd->vdev_scan_io_queue_lock); + } + + if (zio->io_error && (zio->io_error != ECKSUM || + !(zio->io_flags & ZIO_FLAG_SPECULATIVE))) { + atomic_inc_64(&spa->spa_dsl_pool->dp_scan->scn_phys.scn_errors); + } +} + +/* + * Given a scanning zio's information, executes the zio. The zio need + * not necessarily be only sortable, this function simply executes the + * zio, no matter what it is. The optional queue argument allows the + * caller to specify that they want per top level vdev IO rate limiting + * instead of the legacy global limiting. + */ +static void +scan_exec_io(dsl_pool_t *dp, const blkptr_t *bp, int zio_flags, + const zbookmark_phys_t *zb, dsl_scan_io_queue_t *queue) +{ + spa_t *spa = dp->dp_spa; + dsl_scan_t *scn = dp->dp_scan; + size_t size = BP_GET_PSIZE(bp); + abd_t *data = abd_alloc_for_io(size, B_FALSE); + + if (queue == NULL) { + mutex_enter(&spa->spa_scrub_lock); + while (spa->spa_scrub_inflight >= scn->scn_maxinflight_bytes) cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); - spa->spa_scrub_inflight++; + spa->spa_scrub_inflight += BP_GET_PSIZE(bp); mutex_exit(&spa->spa_scrub_lock); + } else { + kmutex_t *q_lock = &queue->q_vd->vdev_scan_io_queue_lock; - /* - * If we're seeing recent (zfs_scan_idle) "important" I/Os - * then throttle our workload to limit the impact of a scan. - */ - if (ddi_get_lbolt64() - spa->spa_last_io <= zfs_scan_idle) - delay(scan_delay); + mutex_enter(q_lock); + while (queue->q_inflight_bytes >= queue->q_maxinflight_bytes) + cv_wait(&queue->q_zio_cv, q_lock); + queue->q_inflight_bytes += BP_GET_PSIZE(bp); + mutex_exit(q_lock); + } + + count_block(dp->dp_scan, dp->dp_blkstats, bp); + zio_nowait(zio_read(dp->dp_scan->scn_zio_root, spa, bp, data, size, + dsl_scan_scrub_done, queue, ZIO_PRIORITY_SCRUB, zio_flags, zb)); +} - zio_nowait(zio_read(NULL, spa, bp, - abd_alloc_for_io(size, B_FALSE), size, dsl_scan_scrub_done, - NULL, ZIO_PRIORITY_SCRUB, zio_flags, zb)); +/* + * This is the primary extent sorting algorithm. 
We balance two parameters: + * 1) how many bytes of I/O are in an extent + * 2) how well the extent is filled with I/O (as a fraction of its total size) + * Since we allow extents to have gaps between their constituent I/Os, it's + * possible to have a fairly large extent that contains the same amount of + * I/O bytes than a much smaller extent, which just packs the I/O more tightly. + * The algorithm sorts based on a score calculated from the extent's size, + * the relative fill volume (in %) and a "fill weight" parameter that controls + * the split between whether we prefer larger extents or more well populated + * extents: + * + * SCORE = FILL_IN_BYTES + (FILL_IN_PERCENT * FILL_IN_BYTES * FILL_WEIGHT) + * + * Example: + * 1) assume extsz = 64 MiB + * 2) assume fill = 32 MiB (extent is half full) + * 3) assume fill_weight = 3 + * 4) SCORE = 32M + (((32M * 100) / 64M) * 3 * 32M) / 100 + * SCORE = 32M + (50 * 3 * 32M) / 100 + * SCORE = 32M + (4800M / 100) + * SCORE = 32M + 48M + * ^ ^ + * | +--- final total relative fill-based score + * +--------- final total fill-based score + * SCORE = 80M + * + * As can be seen, at fill_ratio=3, the algorithm is slightly biased towards + * extents that are more completely filled (in a 3:2 ratio) vs just larger. + * Note that as an optimization, we replace multiplication and division by + * 100 with bitshifting by 7 (which effecitvely multiplies and divides by 128). + */ +static int +ext_size_compare(const void *x, const void *y) +{ + const range_seg_t *rsa = x, *rsb = y; + uint64_t sa = rsa->rs_end - rsa->rs_start, + sb = rsb->rs_end - rsb->rs_start; + uint64_t score_a, score_b; + + score_a = rsa->rs_fill + ((((rsa->rs_fill << 7) / sa) * + fill_weight * rsa->rs_fill) >> 7); + score_b = rsb->rs_fill + ((((rsb->rs_fill << 7) / sb) * + fill_weight * rsb->rs_fill) >> 7); + + if (score_a > score_b) + return (-1); + if (score_a == score_b) { + if (rsa->rs_start < rsb->rs_start) + return (-1); + if (rsa->rs_start == rsb->rs_start) + return (0); + return (1); } + return (1); +} - /* do not relocate this block */ - return (0); +/* + * Comparator for the q_sios_by_addr tree. Sorting is simply performed + * based on LBA-order (from lowest to highest). + */ +static int +io_addr_compare(const void *x, const void *y) +{ + const scan_io_t *a = x, *b = y; + + if (a->sio_offset < b->sio_offset) + return (-1); + if (a->sio_offset == b->sio_offset) + return (0); + return (1); +} + +/* IO queues are created on demand when they are needed. */ +static dsl_scan_io_queue_t * +scan_io_queue_create(vdev_t *vd) +{ + dsl_scan_t *scn = vd->vdev_spa->spa_dsl_pool->dp_scan; + dsl_scan_io_queue_t *q = kmem_zalloc(sizeof (*q), KM_SLEEP); + + q->q_scn = scn; + q->q_vd = vd; + cv_init(&q->q_zio_cv, NULL, CV_DEFAULT, NULL); + q->q_exts_by_addr = range_tree_create_impl(&rt_avl_ops, + &q->q_exts_by_size, ext_size_compare, zfs_scan_max_ext_gap); + avl_create(&q->q_sios_by_addr, io_addr_compare, + sizeof (scan_io_t), offsetof(scan_io_t, sio_nodes.sio_addr_node)); + + return (q); } /* - * Called by the ZFS_IOC_POOL_SCAN ioctl to start a scrub or resilver. - * Can also be called to resume a paused scrub. + * Destroys a scan queue and all segments and scan_io_t's contained in it. + * No further execution of I/O occurs, anything pending in the queue is + * simply freed without being executed. 
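+ * The scn_bytes_pending accounting is adjusted for every sio freed
+ * here, so the pool-wide count of outstanding scan bytes remains
+ * consistent with what will actually be issued.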
*/ -int -dsl_scan(dsl_pool_t *dp, pool_scan_func_t func) +void +dsl_scan_io_queue_destroy(dsl_scan_io_queue_t *queue) { - spa_t *spa = dp->dp_spa; + dsl_scan_t *scn = queue->q_scn; + scan_io_t *sio; + void *cookie = NULL; + int64_t bytes_dequeued = 0; + + ASSERT(MUTEX_HELD(&queue->q_vd->vdev_scan_io_queue_lock)); + + while ((sio = avl_destroy_nodes(&queue->q_sios_by_addr, &cookie)) != + NULL) { + ASSERT(range_tree_contains(queue->q_exts_by_addr, + sio->sio_offset, sio->sio_asize)); + bytes_dequeued += sio->sio_asize; + kmem_free(sio, sizeof (*sio)); + } + + atomic_add_64(&scn->scn_bytes_pending, -bytes_dequeued); + range_tree_vacate(queue->q_exts_by_addr, NULL, queue); + range_tree_destroy(queue->q_exts_by_addr); + avl_destroy(&queue->q_sios_by_addr); + cv_destroy(&queue->q_zio_cv); + + kmem_free(queue, sizeof (*queue)); +} + +/* + * Properly transfers a dsl_scan_queue_t from `svd' to `tvd'. This is + * called on behalf of vdev_top_transfer when creating or destroying + * a mirror vdev due to zpool attach/detach. + */ +void +dsl_scan_io_queue_vdev_xfer(vdev_t *svd, vdev_t *tvd) +{ + mutex_enter(&svd->vdev_scan_io_queue_lock); + mutex_enter(&tvd->vdev_scan_io_queue_lock); + + VERIFY3P(tvd->vdev_scan_io_queue, ==, NULL); + tvd->vdev_scan_io_queue = svd->vdev_scan_io_queue; + svd->vdev_scan_io_queue = NULL; + if (tvd->vdev_scan_io_queue != NULL) + tvd->vdev_scan_io_queue->q_vd = tvd; + + mutex_exit(&tvd->vdev_scan_io_queue_lock); + mutex_exit(&svd->vdev_scan_io_queue_lock); +} + +static void +scan_io_queues_destroy(dsl_scan_t *scn) +{ + vdev_t *rvd = scn->scn_dp->dp_spa->spa_root_vdev; + + for (uint64_t i = 0; i < rvd->vdev_children; i++) { + vdev_t *tvd = rvd->vdev_child[i]; + + mutex_enter(&tvd->vdev_scan_io_queue_lock); + if (tvd->vdev_scan_io_queue != NULL) + dsl_scan_io_queue_destroy(tvd->vdev_scan_io_queue); + tvd->vdev_scan_io_queue = NULL; + mutex_exit(&tvd->vdev_scan_io_queue_lock); + } +} + +static void +dsl_scan_freed_dva(spa_t *spa, const blkptr_t *bp, int dva_i) +{ + dsl_pool_t *dp = spa->spa_dsl_pool; dsl_scan_t *scn = dp->dp_scan; + vdev_t *vdev; + kmutex_t *q_lock; + dsl_scan_io_queue_t *queue; + scan_io_t srch, *sio; + avl_index_t idx; + uint64_t start, size; + + vdev = vdev_lookup_top(spa, DVA_GET_VDEV(&bp->blk_dva[dva_i])); + ASSERT(vdev != NULL); + q_lock = &vdev->vdev_scan_io_queue_lock; + queue = vdev->vdev_scan_io_queue; + + mutex_enter(q_lock); + if (queue == NULL) { + mutex_exit(q_lock); + return; + } + + bp2sio(bp, &srch, dva_i); + start = srch.sio_offset; + size = srch.sio_asize; /* - * Purge all vdev caches and probe all devices. We do this here - * rather than in sync context because this requires a writer lock - * on the spa_config lock, which we can't do from sync context. The - * spa_scrub_reopen flag indicates that vdev_open() should not - * attempt to start another scrub. + * We can find the zio in two states: + * 1) Cold, just sitting in the queue of zio's to be issued at + * some point in the future. In this case, all we do is + * remove the zio from the q_sios_by_addr tree, decrement + * its data volume from the containing range_seg_t and + * resort the q_exts_by_size tree to reflect that the + * range_seg_t has lost some of its 'fill'. We don't shorten + * the range_seg_t - this is usually rare enough not to be + * worth the extra hassle of trying keep track of precise + * extent boundaries. + * 2) Hot, where the zio is currently in-flight in + * dsl_scan_issue_ios. 
In this case, we can't simply + * reach in and stop the in-flight zio's, so we instead + * block the caller. Eventually, dsl_scan_issue_ios will + * be done with issuing the zio's it gathered and will + * signal us. */ - spa_vdev_state_enter(spa, SCL_NONE); - spa->spa_scrub_reopen = B_TRUE; - vdev_reopen(spa->spa_root_vdev); - spa->spa_scrub_reopen = B_FALSE; - (void) spa_vdev_state_exit(spa, NULL, 0); + sio = avl_find(&queue->q_sios_by_addr, &srch, &idx); + if (sio != NULL) { + int64_t asize = sio->sio_asize; + blkptr_t tmpbp; - if (func == POOL_SCAN_SCRUB && dsl_scan_is_paused_scrub(scn)) { - /* got scrub start cmd, resume paused scrub */ - int err = dsl_scrub_set_pause_resume(scn->scn_dp, - POOL_SCRUB_NORMAL); - if (err == 0) { - spa_event_notify(spa, NULL, NULL, ESC_ZFS_SCRUB_RESUME); - return (ECANCELED); - } + /* Got it while it was cold in the queue */ + ASSERT3U(start, ==, sio->sio_offset); + ASSERT3U(size, ==, asize); + avl_remove(&queue->q_sios_by_addr, sio); - return (SET_ERROR(err)); - } + ASSERT(range_tree_contains(queue->q_exts_by_addr, start, size)); + range_tree_remove_fill(queue->q_exts_by_addr, start, size); - return (dsl_sync_task(spa_name(spa), dsl_scan_setup_check, - dsl_scan_setup_sync, &func, 0, ZFS_SPACE_CHECK_EXTRA_RESERVED)); + /* + * We only update scn_bytes_pending in the cold path, + * otherwise it will already have been accounted for as + * part of the zio's execution. + */ + atomic_add_64(&scn->scn_bytes_pending, -asize); + + /* count the block as though we issued it */ + sio2bp(sio, &tmpbp, dva_i); + count_block(scn, dp->dp_blkstats, &tmpbp); + + kmem_free(sio, sizeof (*sio)); + } + mutex_exit(q_lock); } -static boolean_t -dsl_scan_restarting(dsl_scan_t *scn, dmu_tx_t *tx) +/* + * Callback invoked when a zio_free() zio is executing. This needs to be + * intercepted to prevent the zio from deallocating a particular portion + * of disk space and it then getting reallocated and written to, while we + * still have it queued up for processing. + */ +void +dsl_scan_freed(spa_t *spa, const blkptr_t *bp) { - return (scn->scn_restart_txg != 0 && - scn->scn_restart_txg <= tx->tx_txg); + dsl_pool_t *dp = spa->spa_dsl_pool; + dsl_scan_t *scn = dp->dp_scan; + + ASSERT(!BP_IS_EMBEDDED(bp)); + ASSERT(scn != NULL); + if (!dsl_scan_is_running(scn)) + return; + + for (int i = 0; i < BP_GET_NDVAS(bp); i++) + dsl_scan_freed_dva(spa, bp, i); } diff --git a/usr/src/uts/common/fs/zfs/metaslab.c b/usr/src/uts/common/fs/zfs/metaslab.c index 0044f379643d..1c004f87f31a 100644 --- a/usr/src/uts/common/fs/zfs/metaslab.c +++ b/usr/src/uts/common/fs/zfs/metaslab.c @@ -1128,85 +1128,6 @@ metaslab_rangesize_compare(const void *x1, const void *x2) return (AVL_CMP(r1->rs_start, r2->rs_start)); } -/* - * Create any block allocator specific components. The current allocators - * rely on using both a size-ordered range_tree_t and an array of uint64_t's. - */ -static void -metaslab_rt_create(range_tree_t *rt, void *arg) -{ - metaslab_t *msp = arg; - - ASSERT3P(rt->rt_arg, ==, msp); - ASSERT(msp->ms_allocatable == NULL); - - avl_create(&msp->ms_allocatable_by_size, metaslab_rangesize_compare, - sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node)); -} - -/* - * Destroy the block allocator specific components. 
- */ -static void -metaslab_rt_destroy(range_tree_t *rt, void *arg) -{ - metaslab_t *msp = arg; - - ASSERT3P(rt->rt_arg, ==, msp); - ASSERT3P(msp->ms_allocatable, ==, rt); - ASSERT0(avl_numnodes(&msp->ms_allocatable_by_size)); - - avl_destroy(&msp->ms_allocatable_by_size); -} - -static void -metaslab_rt_add(range_tree_t *rt, range_seg_t *rs, void *arg) -{ - metaslab_t *msp = arg; - - ASSERT3P(rt->rt_arg, ==, msp); - ASSERT3P(msp->ms_allocatable, ==, rt); - VERIFY(!msp->ms_condensing); - avl_add(&msp->ms_allocatable_by_size, rs); -} - -static void -metaslab_rt_remove(range_tree_t *rt, range_seg_t *rs, void *arg) -{ - metaslab_t *msp = arg; - - ASSERT3P(rt->rt_arg, ==, msp); - ASSERT3P(msp->ms_allocatable, ==, rt); - VERIFY(!msp->ms_condensing); - avl_remove(&msp->ms_allocatable_by_size, rs); -} - -static void -metaslab_rt_vacate(range_tree_t *rt, void *arg) -{ - metaslab_t *msp = arg; - - ASSERT3P(rt->rt_arg, ==, msp); - ASSERT3P(msp->ms_allocatable, ==, rt); - - /* - * Normally one would walk the tree freeing nodes along the way. - * Since the nodes are shared with the range trees we can avoid - * walking all nodes and just reinitialize the avl tree. The nodes - * will be freed by the range tree, so we don't want to free them here. - */ - avl_create(&msp->ms_allocatable_by_size, metaslab_rangesize_compare, - sizeof (range_seg_t), offsetof(range_seg_t, rs_pp_node)); -} - -static range_tree_ops_t metaslab_rt_ops = { - metaslab_rt_create, - metaslab_rt_destroy, - metaslab_rt_add, - metaslab_rt_remove, - metaslab_rt_vacate -}; - /* * ========================================================================== * Common allocator routines @@ -1891,7 +1812,8 @@ metaslab_init(metaslab_group_t *mg, uint64_t id, uint64_t object, uint64_t txg, * we'd data fault on any attempt to use this metaslab before * it's ready. */ - ms->ms_allocatable = range_tree_create(&metaslab_rt_ops, ms); + ms->ms_allocatable = range_tree_create_impl(&rt_avl_ops, + &ms->ms_allocatable_by_size, metaslab_rangesize_compare, 0); metaslab_group_add(mg, ms); metaslab_set_fragmentation(ms); diff --git a/usr/src/uts/common/fs/zfs/range_tree.c b/usr/src/uts/common/fs/zfs/range_tree.c index 0a852a9c8da7..fc705e37964d 100644 --- a/usr/src/uts/common/fs/zfs/range_tree.c +++ b/usr/src/uts/common/fs/zfs/range_tree.c @@ -33,8 +33,58 @@ #include #include +/* + * Range trees are tree-based data structures that can be used to + * track free space or generally any space allocation information. + * A range tree keeps track of individual segments and automatically + * provides facilities such as adjacent extent merging and extent + * splitting in response to range add/remove requests. + * + * A range tree starts out completely empty, with no segments in it. + * Adding an allocation via range_tree_add to the range tree can either: + * 1) create a new extent + * 2) extend an adjacent extent + * 3) merge two adjacent extents + * Conversely, removing an allocation via range_tree_remove can: + * 1) completely remove an extent + * 2) shorten an extent (if the allocation was near one of its ends) + * 3) split an extent into two extents, in effect punching a hole + * + * A range tree is also capable of 'bridging' gaps when adding + * allocations. This is useful for cases when close proximity of + * allocations is an important detail that needs to be represented + * in the range tree. See range_tree_set_gap(). The default behavior + * is not to bridge gaps (i.e. the maximum allowed gap size is 0). 
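+ * As an example, with a gap of 100 a tree holding the segment
+ * [0, 100) to which [200, 300) is then added ends up with a single
+ * bridged segment [0, 300): its rs_fill is 200 (only the real data)
+ * while rt_space becomes 300 (data plus the bridged hole). With a
+ * gap of 0 the same two adds would simply produce two segments.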
+ * + * In order to traverse a range tree, use either the range_tree_walk() + * or range_tree_vacate() functions. + * + * To obtain more accurate information on individual segment + * operations that the range tree performs "under the hood", you can + * specify a set of callbacks by passing a range_tree_ops_t structure + * to the range_tree_create function. Any callbacks that are non-NULL + * are then called at the appropriate times. + * + * The range tree code also supports a special variant of range trees + * that can bridge small gaps between segments. This kind of tree is used + * by the dsl scanning code to group I/Os into mostly sequential chunks to + * optimize disk performance. The code here attempts to do this with as + * little memory and computational overhead as possible. One limitation of + * this implementation is that segments of range trees with gaps can only + * support removing complete segments. + */ + kmem_cache_t *range_seg_cache; +/* Generic ops for managing an AVL tree alongside a range tree */ +struct range_tree_ops rt_avl_ops = { + .rtop_create = rt_avl_create, + .rtop_destroy = rt_avl_destroy, + .rtop_add = rt_avl_add, + .rtop_remove = rt_avl_remove, + .rtop_vacate = rt_avl_vacate, +}; + void range_tree_init(void) { @@ -119,30 +169,37 @@ range_tree_seg_compare(const void *x1, const void *x2) } range_tree_t * -range_tree_create(range_tree_ops_t *ops, void *arg) +range_tree_create_impl(range_tree_ops_t *ops, void *arg, + int (*avl_compare) (const void *, const void *), uint64_t gap) { - range_tree_t *rt; - - rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP); + range_tree_t *rt = kmem_zalloc(sizeof (range_tree_t), KM_SLEEP); avl_create(&rt->rt_root, range_tree_seg_compare, sizeof (range_seg_t), offsetof(range_seg_t, rs_node)); rt->rt_ops = ops; rt->rt_arg = arg; + rt->rt_gap = gap; + rt->rt_avl_compare = avl_compare; - if (rt->rt_ops != NULL) + if (rt->rt_ops != NULL && rt->rt_ops->rtop_create != NULL) rt->rt_ops->rtop_create(rt, rt->rt_arg); return (rt); } +range_tree_t * +range_tree_create(range_tree_ops_t *ops, void *arg) +{ + return (range_tree_create_impl(ops, arg, NULL, 0)); +} + void range_tree_destroy(range_tree_t *rt) { VERIFY0(rt->rt_space); - if (rt->rt_ops != NULL) + if (rt->rt_ops != NULL && rt->rt_ops->rtop_destroy != NULL) rt->rt_ops->rtop_destroy(rt, rt->rt_arg); avl_destroy(&rt->rt_root); @@ -150,39 +207,99 @@ range_tree_destroy(range_tree_t *rt) } void -range_tree_add(void *arg, uint64_t start, uint64_t size) +range_tree_adjust_fill(range_tree_t *rt, range_seg_t *rs, int64_t delta) +{ + ASSERT3U(rs->rs_fill + delta, !=, 0); + ASSERT3U(rs->rs_fill + delta, <=, rs->rs_end - rs->rs_start); + + if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) + rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); + rs->rs_fill += delta; + if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) + rt->rt_ops->rtop_add(rt, rs, rt->rt_arg); +} + +static void +range_tree_add_impl(void *arg, uint64_t start, uint64_t size, uint64_t fill) { range_tree_t *rt = arg; avl_index_t where; range_seg_t rsearch, *rs_before, *rs_after, *rs; - uint64_t end = start + size; + uint64_t end = start + size, gap = rt->rt_gap; + uint64_t bridge_size = 0; boolean_t merge_before, merge_after; - VERIFY(size != 0); + ASSERT3U(size, !=, 0); + ASSERT3U(fill, <=, size); rsearch.rs_start = start; rsearch.rs_end = end; rs = avl_find(&rt->rt_root, &rsearch, &where); - if (rs != NULL && rs->rs_start <= start && rs->rs_end >= end) { + if (gap == 0 && rs != NULL && + rs->rs_start <= start && rs->rs_end >= 
end) { zfs_panic_recover("zfs: allocating allocated segment" - "(offset=%llu size=%llu)\n", - (longlong_t)start, (longlong_t)size); + "(offset=%llu size=%llu) of (offset=%llu size=%llu)\n", + (longlong_t)start, (longlong_t)size, + (longlong_t)rs->rs_start, + (longlong_t)rs->rs_end - rs->rs_start); return; } - /* Make sure we don't overlap with either of our neighbors */ - VERIFY3P(rs, ==, NULL); + /* + * If this is a gap-supporting range tree, it is possible that we + * are inserting into an existing segment. In this case simply + * bump the fill count and call the remove / add callbacks. If the + * new range will extend an existing segment, we remove the + * existing one, apply the new extent to it and re-insert it using + * the normal code paths. + */ + if (rs != NULL) { + ASSERT3U(gap, !=, 0); + if (rs->rs_start <= start && rs->rs_end >= end) { + range_tree_adjust_fill(rt, rs, fill); + return; + } + + avl_remove(&rt->rt_root, rs); + if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) + rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); + + range_tree_stat_decr(rt, rs); + rt->rt_space -= rs->rs_end - rs->rs_start; + + fill += rs->rs_fill; + start = MIN(start, rs->rs_start); + end = MAX(end, rs->rs_end); + size = end - start; + + range_tree_add_impl(rt, start, size, fill); + + kmem_cache_free(range_seg_cache, rs); + return; + } + ASSERT3P(rs, ==, NULL); + + /* + * Determine whether or not we will have to merge with our neighbors. + * If gap != 0, we might need to merge with our neighbors even if we + * aren't directly touching. + */ rs_before = avl_nearest(&rt->rt_root, where, AVL_BEFORE); rs_after = avl_nearest(&rt->rt_root, where, AVL_AFTER); - merge_before = (rs_before != NULL && rs_before->rs_end == start); - merge_after = (rs_after != NULL && rs_after->rs_start == end); + merge_before = (rs_before != NULL && rs_before->rs_end >= start - gap); + merge_after = (rs_after != NULL && rs_after->rs_start <= end + gap); + + if (merge_before && gap != 0) + bridge_size += start - rs_before->rs_end; + if (merge_after && gap != 0) + bridge_size += rs_after->rs_start - end; if (merge_before && merge_after) { avl_remove(&rt->rt_root, rs_before); - if (rt->rt_ops != NULL) { + if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) { rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg); rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg); } @@ -190,43 +307,59 @@ range_tree_add(void *arg, uint64_t start, uint64_t size) range_tree_stat_decr(rt, rs_before); range_tree_stat_decr(rt, rs_after); + rs_after->rs_fill += rs_before->rs_fill + fill; rs_after->rs_start = rs_before->rs_start; kmem_cache_free(range_seg_cache, rs_before); rs = rs_after; } else if (merge_before) { - if (rt->rt_ops != NULL) + if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) rt->rt_ops->rtop_remove(rt, rs_before, rt->rt_arg); range_tree_stat_decr(rt, rs_before); + rs_before->rs_fill += fill; rs_before->rs_end = end; rs = rs_before; } else if (merge_after) { - if (rt->rt_ops != NULL) + if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) rt->rt_ops->rtop_remove(rt, rs_after, rt->rt_arg); range_tree_stat_decr(rt, rs_after); + rs_after->rs_fill += fill; rs_after->rs_start = start; rs = rs_after; } else { rs = kmem_cache_alloc(range_seg_cache, KM_SLEEP); + + rs->rs_fill = fill; rs->rs_start = start; rs->rs_end = end; avl_insert(&rt->rt_root, rs, where); } - if (rt->rt_ops != NULL) + if (gap != 0) + ASSERT3U(rs->rs_fill, <=, rs->rs_end - rs->rs_start); + else + ASSERT3U(rs->rs_fill, ==, rs->rs_end - rs->rs_start); + + if 
(rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) rt->rt_ops->rtop_add(rt, rs, rt->rt_arg); range_tree_stat_incr(rt, rs); - rt->rt_space += size; + rt->rt_space += size + bridge_size; } void -range_tree_remove(void *arg, uint64_t start, uint64_t size) +range_tree_add(void *arg, uint64_t start, uint64_t size) +{ + range_tree_add_impl(arg, start, size, size); +} + +static void +range_tree_remove_impl(range_tree_t *rt, uint64_t start, uint64_t size, + boolean_t do_fill) { - range_tree_t *rt = arg; avl_index_t where; range_seg_t rsearch, *rs, *newseg; uint64_t end = start + size; @@ -246,6 +379,34 @@ range_tree_remove(void *arg, uint64_t start, uint64_t size) (longlong_t)start, (longlong_t)size); return; } + + /* + * Range trees with gap support must only remove complete segments + * from the tree. This allows us to maintain accurate fill accounting + * and to ensure that bridged sections are not leaked. If we need to + * remove less than the full segment, we can only adjust the fill count. + */ + if (rt->rt_gap != 0) { + if (do_fill) { + if (rs->rs_fill == size) { + start = rs->rs_start; + end = rs->rs_end; + size = end - start; + } else { + range_tree_adjust_fill(rt, rs, -size); + return; + } + } else if (rs->rs_start != start || rs->rs_end != end) { + zfs_panic_recover("zfs: freeing partial segment of " + "gap tree (offset=%llu size=%llu) of " + "(offset=%llu size=%llu)", + (longlong_t)start, (longlong_t)size, + (longlong_t)rs->rs_start, + (longlong_t)rs->rs_end - rs->rs_start); + return; + } + } + VERIFY3U(rs->rs_start, <=, start); VERIFY3U(rs->rs_end, >=, end); @@ -254,19 +415,20 @@ range_tree_remove(void *arg, uint64_t start, uint64_t size) range_tree_stat_decr(rt, rs); - if (rt->rt_ops != NULL) + if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); if (left_over && right_over) { newseg = kmem_cache_alloc(range_seg_cache, KM_SLEEP); newseg->rs_start = end; newseg->rs_end = rs->rs_end; + newseg->rs_fill = newseg->rs_end - newseg->rs_start; range_tree_stat_incr(rt, newseg); rs->rs_end = start; avl_insert_here(&rt->rt_root, newseg, rs, AVL_AFTER); - if (rt->rt_ops != NULL) + if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) rt->rt_ops->rtop_add(rt, newseg, rt->rt_arg); } else if (left_over) { rs->rs_end = start; @@ -279,15 +441,53 @@ range_tree_remove(void *arg, uint64_t start, uint64_t size) } if (rs != NULL) { + /* + * The fill of the leftover segment will always be equal to + * the size, since we do not support removing partial segments + * of range trees with gaps. 
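+		 * (A gap tree never gets here with a leftover: it only ever
+		 * removes exact, complete segments, which leave rs == NULL
+		 * above, so this fix-up matters only for gap == 0 trees.)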
+ */ + rs->rs_fill = rs->rs_end - rs->rs_start; range_tree_stat_incr(rt, rs); - if (rt->rt_ops != NULL) + if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) rt->rt_ops->rtop_add(rt, rs, rt->rt_arg); } rt->rt_space -= size; } +void +range_tree_remove(void *arg, uint64_t start, uint64_t size) +{ + range_tree_remove_impl(arg, start, size, B_FALSE); +} + +void +range_tree_remove_fill(range_tree_t *rt, uint64_t start, uint64_t size) +{ + range_tree_remove_impl(rt, start, size, B_TRUE); +} + +void +range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs, + uint64_t newstart, uint64_t newsize) +{ + int64_t delta = newsize - (rs->rs_end - rs->rs_start); + + range_tree_stat_decr(rt, rs); + if (rt->rt_ops != NULL && rt->rt_ops->rtop_remove != NULL) + rt->rt_ops->rtop_remove(rt, rs, rt->rt_arg); + + rs->rs_start = newstart; + rs->rs_end = newstart + newsize; + + range_tree_stat_incr(rt, rs); + if (rt->rt_ops != NULL && rt->rt_ops->rtop_add != NULL) + rt->rt_ops->rtop_add(rt, rs, rt->rt_arg); + + rt->rt_space += delta; +} + static range_seg_t * range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size) { @@ -301,7 +501,7 @@ range_tree_find_impl(range_tree_t *rt, uint64_t start, uint64_t size) return (avl_find(&rt->rt_root, &rsearch, NULL)); } -static range_seg_t * +range_seg_t * range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size) { range_seg_t *rs = range_tree_find_impl(rt, start, size); @@ -363,7 +563,7 @@ range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg) void *cookie = NULL; - if (rt->rt_ops != NULL) + if (rt->rt_ops != NULL && rt->rt_ops->rtop_vacate != NULL) rt->rt_ops->rtop_vacate(rt, rt->rt_arg); while ((rs = avl_destroy_nodes(&rt->rt_root, &cookie)) != NULL) { @@ -385,12 +585,63 @@ range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg) func(arg, rs->rs_start, rs->rs_end - rs->rs_start); } +range_seg_t * +range_tree_first(range_tree_t *rt) +{ + return (avl_first(&rt->rt_root)); +} + uint64_t range_tree_space(range_tree_t *rt) { return (rt->rt_space); } +/* Generic range tree functions for maintaining segments in an AVL tree. */ +void +rt_avl_create(range_tree_t *rt, void *arg) +{ + avl_tree_t *tree = arg; + + avl_create(tree, rt->rt_avl_compare, sizeof (range_seg_t), + offsetof(range_seg_t, rs_pp_node)); +} + +void +rt_avl_destroy(range_tree_t *rt, void *arg) +{ + avl_tree_t *tree = arg; + + ASSERT0(avl_numnodes(tree)); + avl_destroy(tree); +} + +void +rt_avl_add(range_tree_t *rt, range_seg_t *rs, void *arg) +{ + avl_tree_t *tree = arg; + avl_add(tree, rs); +} + +void +rt_avl_remove(range_tree_t *rt, range_seg_t *rs, void *arg) +{ + avl_tree_t *tree = arg; + avl_remove(tree, rs); +} + +void +rt_avl_vacate(range_tree_t *rt, void *arg) +{ + /* + * Normally one would walk the tree freeing nodes along the way. + * Since the nodes are shared with the range trees we can avoid + * walking all nodes and just reinitialize the avl tree. The nodes + * will be freed by the range tree, so we don't want to free them here. 
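+	 *
+	 * This is the same trick the private metaslab_rt_vacate() used
+	 * before these ops were generalized; metaslab_init() now points its
+	 * ms_allocatable_by_size tree at these generic AVL ops instead.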
+ */ + rt_avl_create(rt, arg); +} + boolean_t range_tree_is_empty(range_tree_t *rt) { diff --git a/usr/src/uts/common/fs/zfs/spa.c b/usr/src/uts/common/fs/zfs/spa.c index 403ace2d9da0..68abdf114ca4 100644 --- a/usr/src/uts/common/fs/zfs/spa.c +++ b/usr/src/uts/common/fs/zfs/spa.c @@ -1999,7 +1999,7 @@ spa_load_verify_done(zio_t *zio) } mutex_enter(&spa->spa_scrub_lock); - spa->spa_scrub_inflight--; + spa->spa_load_verify_ios--; cv_broadcast(&spa->spa_scrub_io_cv); mutex_exit(&spa->spa_scrub_lock); } @@ -2033,9 +2033,9 @@ spa_load_verify_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, size_t size = BP_GET_PSIZE(bp); mutex_enter(&spa->spa_scrub_lock); - while (spa->spa_scrub_inflight >= spa_load_verify_maxinflight) + while (spa->spa_load_verify_ios >= spa_load_verify_maxinflight) cv_wait(&spa->spa_scrub_io_cv, &spa->spa_scrub_lock); - spa->spa_scrub_inflight++; + spa->spa_load_verify_ios++; mutex_exit(&spa->spa_scrub_lock); zio_nowait(zio_read(rio, spa, bp, abd_alloc_for_io(size, B_FALSE), size, diff --git a/usr/src/uts/common/fs/zfs/spa_misc.c b/usr/src/uts/common/fs/zfs/spa_misc.c index 9a80f89a8ac3..7a44ac86b003 100644 --- a/usr/src/uts/common/fs/zfs/spa_misc.c +++ b/usr/src/uts/common/fs/zfs/spa_misc.c @@ -2041,6 +2041,7 @@ spa_init(int mode) zpool_feature_init(); spa_config_load(); l2arc_start(); + scan_init(); } void @@ -2058,6 +2059,7 @@ spa_fini(void) range_tree_fini(); unique_fini(); zfs_refcount_fini(); + scan_fini(); avl_destroy(&spa_namespace_avl); avl_destroy(&spa_spare_avl); @@ -2159,6 +2161,7 @@ spa_scan_stat_init(spa_t *spa) spa->spa_scan_pass_scrub_pause = 0; spa->spa_scan_pass_scrub_spent_paused = 0; spa->spa_scan_pass_exam = 0; + spa->spa_scan_pass_issued = 0; vdev_scan_stat_init(spa->spa_root_vdev); } @@ -2176,18 +2179,22 @@ spa_scan_get_stats(spa_t *spa, pool_scan_stat_t *ps) /* data stored on disk */ ps->pss_func = scn->scn_phys.scn_func; + ps->pss_state = scn->scn_phys.scn_state; ps->pss_start_time = scn->scn_phys.scn_start_time; ps->pss_end_time = scn->scn_phys.scn_end_time; ps->pss_to_examine = scn->scn_phys.scn_to_examine; - ps->pss_examined = scn->scn_phys.scn_examined; ps->pss_to_process = scn->scn_phys.scn_to_process; ps->pss_processed = scn->scn_phys.scn_processed; ps->pss_errors = scn->scn_phys.scn_errors; + ps->pss_examined = scn->scn_phys.scn_examined; + ps->pss_issued = + scn->scn_issued_before_pass + spa->spa_scan_pass_issued; ps->pss_state = scn->scn_phys.scn_state; /* data not stored on disk */ ps->pss_pass_start = spa->spa_scan_pass_start; ps->pss_pass_exam = spa->spa_scan_pass_exam; + ps->pss_pass_issued = spa->spa_scan_pass_issued; ps->pss_pass_scrub_pause = spa->spa_scan_pass_scrub_pause; ps->pss_pass_scrub_spent_paused = spa->spa_scan_pass_scrub_spent_paused; diff --git a/usr/src/uts/common/fs/zfs/sys/arc.h b/usr/src/uts/common/fs/zfs/sys/arc.h index 87d2d1d329a5..1ce4740fcf2b 100644 --- a/usr/src/uts/common/fs/zfs/sys/arc.h +++ b/usr/src/uts/common/fs/zfs/sys/arc.h @@ -58,11 +58,13 @@ _NOTE(CONSTCOND) } while (0) typedef struct arc_buf_hdr arc_buf_hdr_t; typedef struct arc_buf arc_buf_t; -typedef void arc_done_func_t(zio_t *zio, arc_buf_t *buf, void *private); +typedef void arc_read_done_func_t(zio_t *zio, const zbookmark_phys_t *zb, + const blkptr_t *bp, arc_buf_t *buf, void *private); +typedef void arc_write_done_func_t(zio_t *zio, arc_buf_t *buf, void *priv); /* generic arc_done_func_t's which you can use */ -arc_done_func_t arc_bcopy_func; -arc_done_func_t arc_getbuf_func; +arc_read_done_func_t arc_bcopy_func; +arc_read_done_func_t 
arc_getbuf_func; typedef enum arc_flags { @@ -75,35 +77,36 @@ typedef enum arc_flags ARC_FLAG_CACHED = 1 << 3, /* I/O was in cache */ ARC_FLAG_L2CACHE = 1 << 4, /* cache in L2ARC */ ARC_FLAG_PREDICTIVE_PREFETCH = 1 << 5, /* I/O from zfetch */ + ARC_FLAG_PRESCIENT_PREFETCH = 1 << 6, /* long min lifespan */ /* * Private ARC flags. These flags are private ARC only flags that * will show up in b_flags in the arc_hdr_buf_t. These flags should * only be set by ARC code. */ - ARC_FLAG_IN_HASH_TABLE = 1 << 6, /* buffer is hashed */ - ARC_FLAG_IO_IN_PROGRESS = 1 << 7, /* I/O in progress */ - ARC_FLAG_IO_ERROR = 1 << 8, /* I/O failed for buf */ - ARC_FLAG_INDIRECT = 1 << 9, /* indirect block */ + ARC_FLAG_IN_HASH_TABLE = 1 << 7, /* buffer is hashed */ + ARC_FLAG_IO_IN_PROGRESS = 1 << 8, /* I/O in progress */ + ARC_FLAG_IO_ERROR = 1 << 9, /* I/O failed for buf */ + ARC_FLAG_INDIRECT = 1 << 10, /* indirect block */ /* Indicates that block was read with ASYNC priority. */ - ARC_FLAG_PRIO_ASYNC_READ = 1 << 10, - ARC_FLAG_L2_WRITING = 1 << 11, /* write in progress */ - ARC_FLAG_L2_EVICTED = 1 << 12, /* evicted during I/O */ - ARC_FLAG_L2_WRITE_HEAD = 1 << 13, /* head of write list */ + ARC_FLAG_PRIO_ASYNC_READ = 1 << 11, + ARC_FLAG_L2_WRITING = 1 << 12, /* write in progress */ + ARC_FLAG_L2_EVICTED = 1 << 13, /* evicted during I/O */ + ARC_FLAG_L2_WRITE_HEAD = 1 << 14, /* head of write list */ /* indicates that the buffer contains metadata (otherwise, data) */ - ARC_FLAG_BUFC_METADATA = 1 << 14, + ARC_FLAG_BUFC_METADATA = 1 << 15, /* Flags specifying whether optional hdr struct fields are defined */ - ARC_FLAG_HAS_L1HDR = 1 << 15, - ARC_FLAG_HAS_L2HDR = 1 << 16, + ARC_FLAG_HAS_L1HDR = 1 << 16, + ARC_FLAG_HAS_L2HDR = 1 << 17, /* * Indicates the arc_buf_hdr_t's b_pdata matches the on-disk data. * This allows the l2arc to use the blkptr's checksum to verify * the data without having to store the checksum in the hdr. 
*/ - ARC_FLAG_COMPRESSED_ARC = 1 << 17, - ARC_FLAG_SHARED_DATA = 1 << 18, + ARC_FLAG_COMPRESSED_ARC = 1 << 18, + ARC_FLAG_SHARED_DATA = 1 << 19, /* * The arc buffer's compression mode is stored in the top 7 bits of the @@ -180,12 +183,12 @@ int arc_referenced(arc_buf_t *buf); #endif int arc_read(zio_t *pio, spa_t *spa, const blkptr_t *bp, - arc_done_func_t *done, void *private, zio_priority_t priority, int flags, - arc_flags_t *arc_flags, const zbookmark_phys_t *zb); + arc_read_done_func_t *done, void *private, zio_priority_t priority, + int flags, arc_flags_t *arc_flags, const zbookmark_phys_t *zb); zio_t *arc_write(zio_t *pio, spa_t *spa, uint64_t txg, blkptr_t *bp, arc_buf_t *buf, boolean_t l2arc, const zio_prop_t *zp, - arc_done_func_t *ready, arc_done_func_t *child_ready, - arc_done_func_t *physdone, arc_done_func_t *done, + arc_write_done_func_t *ready, arc_write_done_func_t *child_ready, + arc_write_done_func_t *physdone, arc_write_done_func_t *done, void *private, zio_priority_t priority, int zio_flags, const zbookmark_phys_t *zb); void arc_freed(spa_t *spa, const blkptr_t *bp); diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_pool.h b/usr/src/uts/common/fs/zfs/sys/dsl_pool.h index b23d19eef5cc..66098900db58 100644 --- a/usr/src/uts/common/fs/zfs/sys/dsl_pool.h +++ b/usr/src/uts/common/fs/zfs/sys/dsl_pool.h @@ -77,6 +77,7 @@ typedef struct zfs_blkstat { typedef struct zfs_all_blkstats { zfs_blkstat_t zab_type[DN_MAX_LEVELS + 1][DMU_OT_TOTAL + 1]; + kmutex_t zab_lock; } zfs_all_blkstats_t; diff --git a/usr/src/uts/common/fs/zfs/sys/dsl_scan.h b/usr/src/uts/common/fs/zfs/sys/dsl_scan.h index a738ebe3ee08..1b600405aed6 100644 --- a/usr/src/uts/common/fs/zfs/sys/dsl_scan.h +++ b/usr/src/uts/common/fs/zfs/sys/dsl_scan.h @@ -74,6 +74,8 @@ typedef enum dsl_scan_flags { DSF_SCRUB_PAUSED = 1<<1, } dsl_scan_flags_t; +#define DSL_SCAN_FLAGS_MASK (DSF_VISIT_DS_AGAIN) + /* * Every pool will have one dsl_scan_t and this structure will contain * in-memory information about the scan and a pointer to the on-disk @@ -106,12 +108,10 @@ typedef enum dsl_scan_flags { */ typedef struct dsl_scan { struct dsl_pool *scn_dp; - - boolean_t scn_suspending; uint64_t scn_restart_txg; uint64_t scn_done_txg; uint64_t scn_sync_start_time; - zio_t *scn_zio_root; + uint64_t scn_issued_before_pass; /* for freeing blocks */ boolean_t scn_is_bptree; @@ -119,12 +119,46 @@ typedef struct dsl_scan { boolean_t scn_async_stalled; uint64_t scn_async_block_min_time_ms; - /* for debugging / information */ - uint64_t scn_visited_this_txg; + /* flags and stats for controlling scan state */ + boolean_t scn_is_sorted; /* doing sequential scan */ + boolean_t scn_clearing; /* scan is issuing sequential extents */ + boolean_t scn_checkpointing; /* scan is issuing all queued extents */ + boolean_t scn_suspending; /* scan is suspending until next txg */ + uint64_t scn_last_checkpoint; /* time of last checkpoint */ + + /* members for thread synchronization */ + zio_t *scn_zio_root; /* root zio for waiting on IO */ + taskq_t *scn_taskq; /* task queue for issuing extents */ - dsl_scan_phys_t scn_phys; + /* for controlling scan prefetch, protected by spa_scrub_lock */ + boolean_t scn_prefetch_stop; /* prefetch should stop */ + zbookmark_phys_t scn_prefetch_bookmark; /* prefetch start bookmark */ + avl_tree_t scn_prefetch_queue; /* priority queue of prefetch IOs */ + uint64_t scn_maxinflight_bytes; /* max bytes in flight for pool */ + + /* per txg statistics */ + uint64_t scn_visited_this_txg; /* total bps visited this txg */ + uint64_t 
scn_holes_this_txg; + uint64_t scn_lt_min_this_txg; + uint64_t scn_gt_max_this_txg; + uint64_t scn_ddt_contained_this_txg; + uint64_t scn_objsets_visited_this_txg; + uint64_t scn_avg_seg_size_this_txg; + uint64_t scn_segs_this_txg; + uint64_t scn_avg_zio_size_this_txg; + uint64_t scn_zios_this_txg; + + /* members needed for syncing scan status to disk */ + dsl_scan_phys_t scn_phys; /* on disk representation of scan */ + dsl_scan_phys_t scn_phys_cached; + avl_tree_t scn_queue; /* queue of datasets to scan */ + uint64_t scn_bytes_pending; /* outstanding data to issue */ } dsl_scan_t; +typedef struct dsl_scan_io_queue dsl_scan_io_queue_t; + +void scan_init(void); +void scan_fini(void); int dsl_scan_init(struct dsl_pool *dp, uint64_t txg); void dsl_scan_fini(struct dsl_pool *dp); void dsl_scan_sync(struct dsl_pool *, dmu_tx_t *); @@ -143,6 +177,9 @@ void dsl_scan_ds_clone_swapped(struct dsl_dataset *ds1, struct dsl_dataset *ds2, struct dmu_tx *tx); boolean_t dsl_scan_active(dsl_scan_t *scn); boolean_t dsl_scan_is_paused_scrub(const dsl_scan_t *scn); +void dsl_scan_freed(spa_t *spa, const blkptr_t *bp); +void dsl_scan_io_queue_destroy(dsl_scan_io_queue_t *queue); +void dsl_scan_io_queue_vdev_xfer(vdev_t *svd, vdev_t *tvd); #ifdef __cplusplus } diff --git a/usr/src/uts/common/fs/zfs/sys/range_tree.h b/usr/src/uts/common/fs/zfs/sys/range_tree.h index 3816dabf7c1c..588f41fcb729 100644 --- a/usr/src/uts/common/fs/zfs/sys/range_tree.h +++ b/usr/src/uts/common/fs/zfs/sys/range_tree.h @@ -48,9 +48,13 @@ typedef struct range_tree_ops range_tree_ops_t; typedef struct range_tree { avl_tree_t rt_root; /* offset-ordered segment AVL tree */ uint64_t rt_space; /* sum of all segments in the map */ + uint64_t rt_gap; /* allowable inter-segment gap */ range_tree_ops_t *rt_ops; void *rt_arg; + /* rt_avl_compare should only be set if rt_arg is an AVL tree */ + int (*rt_avl_compare)(const void *, const void *); + /* * The rt_histogram maintains a histogram of ranges.
Each bucket, * rt_histogram[i], contains the number of ranges whose size is: @@ -64,6 +68,7 @@ typedef struct range_seg { avl_node_t rs_pp_node; /* AVL picker-private node */ uint64_t rs_start; /* starting offset of this segment */ uint64_t rs_end; /* ending offset (non-inclusive) */ + uint64_t rs_fill; /* actual fill if gap mode is on */ } range_seg_t; struct range_tree_ops { @@ -78,11 +83,16 @@ typedef void range_tree_func_t(void *arg, uint64_t start, uint64_t size); void range_tree_init(void); void range_tree_fini(void); +range_tree_t *range_tree_create_impl(range_tree_ops_t *ops, void *arg, + int (*avl_compare)(const void*, const void*), uint64_t gap); range_tree_t *range_tree_create(range_tree_ops_t *ops, void *arg); void range_tree_destroy(range_tree_t *rt); boolean_t range_tree_contains(range_tree_t *rt, uint64_t start, uint64_t size); void range_tree_verify_not_present(range_tree_t *rt, uint64_t start, uint64_t size); +range_seg_t *range_tree_find(range_tree_t *rt, uint64_t start, uint64_t size); +void range_tree_resize_segment(range_tree_t *rt, range_seg_t *rs, + uint64_t newstart, uint64_t newsize); uint64_t range_tree_space(range_tree_t *rt); boolean_t range_tree_is_empty(range_tree_t *rt); void range_tree_swap(range_tree_t **rtsrc, range_tree_t **rtdst); @@ -93,10 +103,27 @@ uint64_t range_tree_span(range_tree_t *rt); void range_tree_add(void *arg, uint64_t start, uint64_t size); void range_tree_remove(void *arg, uint64_t start, uint64_t size); +void range_tree_remove_fill(range_tree_t *rt, uint64_t start, uint64_t size); +void range_tree_adjust_fill(range_tree_t *rt, range_seg_t *rs, int64_t delta); void range_tree_clear(range_tree_t *rt, uint64_t start, uint64_t size); void range_tree_vacate(range_tree_t *rt, range_tree_func_t *func, void *arg); void range_tree_walk(range_tree_t *rt, range_tree_func_t *func, void *arg); +range_seg_t *range_tree_first(range_tree_t *rt); + +void rt_avl_create(range_tree_t *rt, void *arg); +void rt_avl_destroy(range_tree_t *rt, void *arg); +void rt_avl_add(range_tree_t *rt, range_seg_t *rs, void *arg); +void rt_avl_remove(range_tree_t *rt, range_seg_t *rs, void *arg); +void rt_avl_vacate(range_tree_t *rt, void *arg); +extern struct range_tree_ops rt_avl_ops; #ifdef __cplusplus } diff --git a/usr/src/uts/common/fs/zfs/sys/spa_impl.h b/usr/src/uts/common/fs/zfs/sys/spa_impl.h index dcb6cc9f19ef..b3840a98a338 100644 --- a/usr/src/uts/common/fs/zfs/sys/spa_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/spa_impl.h @@ -265,9 +265,9 @@ struct spa { uberblock_t spa_ubsync; /* last synced uberblock */ uberblock_t spa_uberblock; /* current uberblock */ boolean_t spa_extreme_rewind; /* rewind past deferred frees */ - uint64_t spa_last_io; /* lbolt of last non-scan I/O */ kmutex_t spa_scrub_lock; /* resilver/scrub lock */ - uint64_t spa_scrub_inflight; /* in-flight scrub I/Os */ + uint64_t spa_scrub_inflight; /* in-flight scrub bytes */ + uint64_t spa_load_verify_ios; /* in-flight verification IOs */ kcondvar_t spa_scrub_io_cv; /* scrub I/O completion */ uint8_t spa_scrub_active; /* active or suspended?
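The new range_tree_create_impl() lets a caller supply both an allowable inter-segment gap and a secondary comparator, and the rt_avl_ops callbacks keep a second AVL index of the same segments in sync (rt_arg pointing at that tree). One plausible use of the secondary index, an assumption here rather than something this header states, is ordering segments by size so the largest extents can be picked first. A standalone analogue of such a comparator, with demo_seg standing in for range_seg_t:

/* Illustrative analogue only; demo_seg stands in for range_seg_t. */
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>

typedef struct demo_seg {
    uint64_t ds_start;
    uint64_t ds_end;    /* non-inclusive, as in range_seg_t */
} demo_seg_t;

/* Order by segment size, largest first; break ties by start offset. */
static int
demo_size_compare(const void *a, const void *b)
{
    const demo_seg_t *sa = a, *sb = b;
    uint64_t za = sa->ds_end - sa->ds_start;
    uint64_t zb = sb->ds_end - sb->ds_start;

    if (za != zb)
        return (za > zb ? -1 : 1);
    if (sa->ds_start != sb->ds_start)
        return (sa->ds_start < sb->ds_start ? -1 : 1);
    return (0);
}

int
main(void)
{
    demo_seg_t segs[] = {
        { 0, 8192 }, { 16384, 131072 }, { 262144, 266240 }
    };

    qsort(segs, 3, sizeof (demo_seg_t), demo_size_compare);
    for (int i = 0; i < 3; i++)
        printf("[%llu, %llu)\n",
            (unsigned long long)segs[i].ds_start,
            (unsigned long long)segs[i].ds_end);
    return (0);
}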
*/ uint8_t spa_scrub_type; /* type of scrub we're doing */ @@ -278,6 +278,7 @@ struct spa { uint64_t spa_scan_pass_scrub_pause; /* scrub pause time */ uint64_t spa_scan_pass_scrub_spent_paused; /* total paused */ uint64_t spa_scan_pass_exam; /* examined bytes per pass */ + uint64_t spa_scan_pass_issued; /* issued bytes per pass */ kmutex_t spa_async_lock; /* protect async state */ kthread_t *spa_async_thread; /* thread doing async task */ int spa_async_suspended; /* async tasks suspended */ diff --git a/usr/src/uts/common/fs/zfs/sys/vdev.h b/usr/src/uts/common/fs/zfs/sys/vdev.h index ef3bc5dd09ee..0c0bc874c106 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev.h @@ -71,6 +71,7 @@ extern void vdev_dtl_dirty(vdev_t *vd, vdev_dtl_type_t d, extern boolean_t vdev_dtl_contains(vdev_t *vd, vdev_dtl_type_t d, uint64_t txg, uint64_t size); extern boolean_t vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t d); +extern boolean_t vdev_dtl_need_resilver(vdev_t *vd, uint64_t off, size_t size); extern void vdev_dtl_reassess(vdev_t *vd, uint64_t txg, uint64_t scrub_txg, int scrub_done); extern boolean_t vdev_dtl_required(vdev_t *vd); @@ -136,6 +137,7 @@ extern void vdev_queue_init(vdev_t *vd); extern void vdev_queue_fini(vdev_t *vd); extern zio_t *vdev_queue_io(zio_t *zio); extern void vdev_queue_io_done(zio_t *zio); +extern void vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority); extern void vdev_config_dirty(vdev_t *vd); extern void vdev_config_clean(vdev_t *vd); diff --git a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h index 6ddbe55a0c4b..4e1b09c27dda 100644 --- a/usr/src/uts/common/fs/zfs/sys/vdev_impl.h +++ b/usr/src/uts/common/fs/zfs/sys/vdev_impl.h @@ -73,6 +73,7 @@ typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize); typedef void vdev_io_start_func_t(zio_t *zio); typedef void vdev_io_done_func_t(zio_t *zio); typedef void vdev_state_change_func_t(vdev_t *vd, int, int); +typedef boolean_t vdev_need_resilver_func_t(vdev_t *vd, uint64_t, size_t); typedef void vdev_hold_func_t(vdev_t *vd); typedef void vdev_rele_func_t(vdev_t *vd); @@ -94,6 +95,7 @@ typedef struct vdev_ops { vdev_io_start_func_t *vdev_op_io_start; vdev_io_done_func_t *vdev_op_io_done; vdev_state_change_func_t *vdev_op_state_change; + vdev_need_resilver_func_t *vdev_op_need_resilver; vdev_hold_func_t *vdev_op_hold; vdev_rele_func_t *vdev_op_rele; vdev_remap_func_t *vdev_op_remap; @@ -304,6 +306,13 @@ struct vdev { range_tree_t *vdev_obsolete_segments; space_map_t *vdev_obsolete_sm; + /* + * Protects the vdev_scan_io_queue field itself as well as the + * structure's contents (when present). + */ + kmutex_t vdev_scan_io_queue_lock; + struct dsl_scan_io_queue *vdev_scan_io_queue; + /* * Leaf vdev state. 
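vdev_ops_t grows an optional vdev_op_need_resilver callback. Vdev types with no special logic leave it NULL (as the disk, file, mirror, missing, and root tables below do), and the caller falls back to a conservative default. A generic sketch of that optional-callback dispatch, with hypothetical demo_* names rather than the real vdev structures:

/* Generic optional-callback sketch; names are hypothetical. */
#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

typedef int boolean_t;
#define B_TRUE  1
#define B_FALSE 0

typedef struct demo_dev demo_dev_t;

typedef struct demo_ops {
    /* Optional: may be NULL if the device type has no special logic. */
    boolean_t (*op_need_resilver)(demo_dev_t *, uint64_t off, size_t sz);
    boolean_t op_leaf;
} demo_ops_t;

struct demo_dev {
    const demo_ops_t *d_ops;
};

/*
 * Conservative dispatch: if the type provides no callback, or the device
 * is a leaf, assume the range needs repair rather than risk skipping it.
 */
static boolean_t
demo_need_resilver(demo_dev_t *d, uint64_t off, size_t sz)
{
    if (d->d_ops->op_need_resilver == NULL || d->d_ops->op_leaf)
        return (B_TRUE);
    return (d->d_ops->op_need_resilver(d, off, sz));
}

static const demo_ops_t demo_leaf_ops = {
    .op_need_resilver = NULL,
    .op_leaf = B_TRUE
};

int
main(void)
{
    demo_dev_t dev = { .d_ops = &demo_leaf_ops };

    printf("need resilver: %d\n", demo_need_resilver(&dev, 0, 4096));
    return (0);
}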
*/ diff --git a/usr/src/uts/common/fs/zfs/sys/zio.h b/usr/src/uts/common/fs/zfs/sys/zio.h index 517764f1cec4..00d2ebbebb4c 100644 --- a/usr/src/uts/common/fs/zfs/sys/zio.h +++ b/usr/src/uts/common/fs/zfs/sys/zio.h @@ -565,6 +565,8 @@ extern void zio_vdev_io_bypass(zio_t *zio); extern void zio_vdev_io_reissue(zio_t *zio); extern void zio_vdev_io_redone(zio_t *zio); +extern void zio_change_priority(zio_t *pio, zio_priority_t priority); + extern void zio_checksum_verified(zio_t *zio); extern int zio_worst_error(int e1, int e2); diff --git a/usr/src/uts/common/fs/zfs/vdev.c b/usr/src/uts/common/fs/zfs/vdev.c index b8a385d39150..c7dca8377795 100644 --- a/usr/src/uts/common/fs/zfs/vdev.c +++ b/usr/src/uts/common/fs/zfs/vdev.c @@ -501,6 +501,7 @@ vdev_alloc_common(spa_t *spa, uint_t id, uint64_t guid, vdev_ops_t *ops) mutex_init(&vd->vdev_dtl_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_stat_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_probe_lock, NULL, MUTEX_DEFAULT, NULL); + mutex_init(&vd->vdev_scan_io_queue_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_initialize_lock, NULL, MUTEX_DEFAULT, NULL); mutex_init(&vd->vdev_initialize_io_lock, NULL, MUTEX_DEFAULT, NULL); cv_init(&vd->vdev_initialize_cv, NULL, CV_DEFAULT, NULL); @@ -801,6 +802,18 @@ vdev_free(vdev_t *vd) spa_t *spa = vd->vdev_spa; ASSERT3P(vd->vdev_initialize_thread, ==, NULL); + /* + * Scan queues are normally destroyed at the end of a scan. If the + * queue exists here, that implies the vdev is being removed while + * the scan is still running. + */ + if (vd->vdev_scan_io_queue != NULL) { + mutex_enter(&vd->vdev_scan_io_queue_lock); + dsl_scan_io_queue_destroy(vd->vdev_scan_io_queue); + vd->vdev_scan_io_queue = NULL; + mutex_exit(&vd->vdev_scan_io_queue_lock); + } + /* * vdev_free() implies closing the vdev first. This is simpler than * trying to ensure complicated semantics for all callers. @@ -891,6 +904,7 @@ vdev_free(vdev_t *vd) mutex_destroy(&vd->vdev_dtl_lock); mutex_destroy(&vd->vdev_stat_lock); mutex_destroy(&vd->vdev_probe_lock); + mutex_destroy(&vd->vdev_scan_io_queue_lock); mutex_destroy(&vd->vdev_initialize_lock); mutex_destroy(&vd->vdev_initialize_io_lock); cv_destroy(&vd->vdev_initialize_io_cv); @@ -1000,6 +1014,8 @@ vdev_top_transfer(vdev_t *svd, vdev_t *tvd) tvd->vdev_islog = svd->vdev_islog; svd->vdev_islog = 0; + + dsl_scan_io_queue_vdev_xfer(svd, tvd); } static void @@ -2161,6 +2177,7 @@ vdev_metaslab_set_size(vdev_t *vd) uint64_t ms_count = asize >> zfs_vdev_default_ms_shift; uint64_t ms_shift; + /* BEGIN CSTYLED */ /* * There are two dimensions to the metaslab sizing calculation: * the size of the metaslab and the count of metaslabs per vdev. @@ -2204,6 +2221,7 @@ vdev_metaslab_set_size(vdev_t *vd) * in additional metaslabs being allocated making it possible * to exceed the zfs_vdev_ms_count_limit. */ + /* END CSTYLED */ if (ms_count < zfs_vdev_min_ms_count) ms_shift = highbit64(asize / zfs_vdev_min_ms_count); @@ -2347,6 +2365,21 @@ vdev_dtl_empty(vdev_t *vd, vdev_dtl_type_t t) return (empty); } +/* + * Returns B_TRUE if vdev determines offset needs to be resilvered. + */ +boolean_t +vdev_dtl_need_resilver(vdev_t *vd, uint64_t offset, size_t psize) +{ + ASSERT(vd != vd->vdev_spa->spa_root_vdev); + + if (vd->vdev_ops->vdev_op_need_resilver == NULL || + vd->vdev_ops->vdev_op_leaf) + return (B_TRUE); + + return (vd->vdev_ops->vdev_op_need_resilver(vd, offset, psize)); +} + /* * Returns the lowest txg in the DTL range. 
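vdev_free() above has to handle a vdev going away while a sorted scan still owns a per-vdev queue, so the queue pointer is torn down under its dedicated mutex before the lock itself is destroyed. A userland pthreads analogue of that check-and-destroy teardown, using hypothetical names:

/* Userland analogue of tearing down a per-device queue under its lock. */
#include <pthread.h>
#include <stdlib.h>

typedef struct demo_queue {
    int q_pending;              /* stand-in for queued scan extents */
} demo_queue_t;

typedef struct demo_vdev {
    pthread_mutex_t v_scan_queue_lock;
    demo_queue_t    *v_scan_queue;  /* NULL when no scan is running */
} demo_vdev_t;

static void
demo_queue_destroy(demo_queue_t *q)
{
    free(q);
}

/*
 * Normally the queue is destroyed when the scan finishes; if it still
 * exists here, the device is being removed mid-scan, so drop it now.
 */
static void
demo_vdev_free(demo_vdev_t *vd)
{
    if (vd->v_scan_queue != NULL) {
        (void) pthread_mutex_lock(&vd->v_scan_queue_lock);
        demo_queue_destroy(vd->v_scan_queue);
        vd->v_scan_queue = NULL;
        (void) pthread_mutex_unlock(&vd->v_scan_queue_lock);
    }
    (void) pthread_mutex_destroy(&vd->v_scan_queue_lock);
}

int
main(void)
{
    demo_vdev_t vd = { .v_scan_queue = NULL };

    (void) pthread_mutex_init(&vd.v_scan_queue_lock, NULL);
    vd.v_scan_queue = calloc(1, sizeof (demo_queue_t));
    demo_vdev_free(&vd);
    return (0);
}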
*/ diff --git a/usr/src/uts/common/fs/zfs/vdev_disk.c b/usr/src/uts/common/fs/zfs/vdev_disk.c index 2d431373ce1d..93462ee2ba53 100644 --- a/usr/src/uts/common/fs/zfs/vdev_disk.c +++ b/usr/src/uts/common/fs/zfs/vdev_disk.c @@ -884,18 +884,19 @@ vdev_disk_io_done(zio_t *zio) } vdev_ops_t vdev_disk_ops = { - vdev_disk_open, - vdev_disk_close, - vdev_default_asize, - vdev_disk_io_start, - vdev_disk_io_done, - NULL, - vdev_disk_hold, - vdev_disk_rele, - NULL, - vdev_default_xlate, - VDEV_TYPE_DISK, /* name of this vdev type */ - B_TRUE /* leaf vdev */ + .vdev_op_open = vdev_disk_open, + .vdev_op_close = vdev_disk_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_io_start = vdev_disk_io_start, + .vdev_op_io_done = vdev_disk_io_done, + .vdev_op_state_change = NULL, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = vdev_disk_hold, + .vdev_op_rele = vdev_disk_rele, + .vdev_op_remap = NULL, + .vdev_op_xlate = vdev_default_xlate, + .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */ + .vdev_op_leaf = B_TRUE /* leaf vdev */ }; /* diff --git a/usr/src/uts/common/fs/zfs/vdev_file.c b/usr/src/uts/common/fs/zfs/vdev_file.c index 4dbac6321bc0..3aaebe8505cd 100644 --- a/usr/src/uts/common/fs/zfs/vdev_file.c +++ b/usr/src/uts/common/fs/zfs/vdev_file.c @@ -255,18 +255,19 @@ vdev_file_io_done(zio_t *zio) } vdev_ops_t vdev_file_ops = { - vdev_file_open, - vdev_file_close, - vdev_default_asize, - vdev_file_io_start, - vdev_file_io_done, - NULL, - vdev_file_hold, - vdev_file_rele, - NULL, - vdev_default_xlate, - VDEV_TYPE_FILE, /* name of this vdev type */ - B_TRUE /* leaf vdev */ + .vdev_op_open = vdev_file_open, + .vdev_op_close = vdev_file_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_io_start = vdev_file_io_start, + .vdev_op_io_done = vdev_file_io_done, + .vdev_op_state_change = NULL, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = vdev_file_hold, + .vdev_op_rele = vdev_file_rele, + .vdev_op_remap = NULL, + .vdev_op_xlate = vdev_default_xlate, + .vdev_op_type = VDEV_TYPE_FILE, /* name of this vdev type */ + .vdev_op_leaf = B_TRUE /* leaf vdev */ }; /* @@ -275,18 +276,19 @@ vdev_ops_t vdev_file_ops = { #ifndef _KERNEL vdev_ops_t vdev_disk_ops = { - vdev_file_open, - vdev_file_close, - vdev_default_asize, - vdev_file_io_start, - vdev_file_io_done, - NULL, - vdev_file_hold, - vdev_file_rele, - NULL, - vdev_default_xlate, - VDEV_TYPE_DISK, /* name of this vdev type */ - B_TRUE /* leaf vdev */ + .vdev_op_open = vdev_file_open, + .vdev_op_close = vdev_file_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_io_start = vdev_file_io_start, + .vdev_op_io_done = vdev_file_io_done, + .vdev_op_state_change = NULL, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = vdev_file_hold, + .vdev_op_rele = vdev_file_rele, + .vdev_op_remap = NULL, + .vdev_op_xlate = vdev_default_xlate, + .vdev_op_type = VDEV_TYPE_DISK, /* name of this vdev type */ + .vdev_op_leaf = B_TRUE /* leaf vdev */ }; #endif diff --git a/usr/src/uts/common/fs/zfs/vdev_indirect.c b/usr/src/uts/common/fs/zfs/vdev_indirect.c index 5b6415937f61..958e8cd8581e 100644 --- a/usr/src/uts/common/fs/zfs/vdev_indirect.c +++ b/usr/src/uts/common/fs/zfs/vdev_indirect.c @@ -1812,16 +1812,17 @@ vdev_indirect_io_done(zio_t *zio) } vdev_ops_t vdev_indirect_ops = { - vdev_indirect_open, - vdev_indirect_close, - vdev_default_asize, - vdev_indirect_io_start, - vdev_indirect_io_done, - NULL, - NULL, - NULL, - vdev_indirect_remap, - NULL, - VDEV_TYPE_INDIRECT, /* name of this vdev type */ - B_FALSE /* leaf vdev */ + .vdev_op_open = 
vdev_indirect_open, + .vdev_op_close = vdev_indirect_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_io_start = vdev_indirect_io_start, + .vdev_op_io_done = vdev_indirect_io_done, + .vdev_op_state_change = NULL, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = NULL, + .vdev_op_rele = NULL, + .vdev_op_remap = vdev_indirect_remap, + .vdev_op_xlate = NULL, + .vdev_op_type = VDEV_TYPE_INDIRECT, /* name of this vdev type */ + .vdev_op_leaf = B_FALSE /* leaf vdev */ }; diff --git a/usr/src/uts/common/fs/zfs/vdev_mirror.c b/usr/src/uts/common/fs/zfs/vdev_mirror.c index 133558d3d3d1..f489bb196774 100644 --- a/usr/src/uts/common/fs/zfs/vdev_mirror.c +++ b/usr/src/uts/common/fs/zfs/vdev_mirror.c @@ -555,46 +555,49 @@ vdev_mirror_state_change(vdev_t *vd, int faulted, int degraded) } vdev_ops_t vdev_mirror_ops = { - vdev_mirror_open, - vdev_mirror_close, - vdev_default_asize, - vdev_mirror_io_start, - vdev_mirror_io_done, - vdev_mirror_state_change, - NULL, - NULL, - NULL, - vdev_default_xlate, - VDEV_TYPE_MIRROR, /* name of this vdev type */ - B_FALSE /* not a leaf vdev */ + .vdev_op_open = vdev_mirror_open, + .vdev_op_close = vdev_mirror_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_io_start = vdev_mirror_io_start, + .vdev_op_io_done = vdev_mirror_io_done, + .vdev_op_state_change = vdev_mirror_state_change, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = NULL, + .vdev_op_rele = NULL, + .vdev_op_remap = NULL, + .vdev_op_xlate = vdev_default_xlate, + .vdev_op_type = VDEV_TYPE_MIRROR, /* name of this vdev type */ + .vdev_op_leaf = B_FALSE /* not a leaf vdev */ }; vdev_ops_t vdev_replacing_ops = { - vdev_mirror_open, - vdev_mirror_close, - vdev_default_asize, - vdev_mirror_io_start, - vdev_mirror_io_done, - vdev_mirror_state_change, - NULL, - NULL, - NULL, - vdev_default_xlate, - VDEV_TYPE_REPLACING, /* name of this vdev type */ - B_FALSE /* not a leaf vdev */ + .vdev_op_open = vdev_mirror_open, + .vdev_op_close = vdev_mirror_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_io_start = vdev_mirror_io_start, + .vdev_op_io_done = vdev_mirror_io_done, + .vdev_op_state_change = vdev_mirror_state_change, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = NULL, + .vdev_op_rele = NULL, + .vdev_op_remap = NULL, + .vdev_op_xlate = vdev_default_xlate, + .vdev_op_type = VDEV_TYPE_REPLACING, /* name of this vdev type */ + .vdev_op_leaf = B_FALSE /* not a leaf vdev */ }; vdev_ops_t vdev_spare_ops = { - vdev_mirror_open, - vdev_mirror_close, - vdev_default_asize, - vdev_mirror_io_start, - vdev_mirror_io_done, - vdev_mirror_state_change, - NULL, - NULL, - NULL, - vdev_default_xlate, - VDEV_TYPE_SPARE, /* name of this vdev type */ - B_FALSE /* not a leaf vdev */ + .vdev_op_open = vdev_mirror_open, + .vdev_op_close = vdev_mirror_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_io_start = vdev_mirror_io_start, + .vdev_op_io_done = vdev_mirror_io_done, + .vdev_op_state_change = vdev_mirror_state_change, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = NULL, + .vdev_op_rele = NULL, + .vdev_op_remap = NULL, + .vdev_op_xlate = vdev_default_xlate, + .vdev_op_type = VDEV_TYPE_SPARE, /* name of this vdev type */ + .vdev_op_leaf = B_FALSE /* not a leaf vdev */ }; diff --git a/usr/src/uts/common/fs/zfs/vdev_missing.c b/usr/src/uts/common/fs/zfs/vdev_missing.c index c761de8a204c..205b23eba7f5 100644 --- a/usr/src/uts/common/fs/zfs/vdev_missing.c +++ b/usr/src/uts/common/fs/zfs/vdev_missing.c @@ -80,31 +80,33 @@ vdev_missing_io_done(zio_t *zio) } vdev_ops_t vdev_missing_ops = { - 
vdev_missing_open, - vdev_missing_close, - vdev_default_asize, - vdev_missing_io_start, - vdev_missing_io_done, - NULL, - NULL, - NULL, - NULL, - NULL, - VDEV_TYPE_MISSING, /* name of this vdev type */ - B_TRUE /* leaf vdev */ + .vdev_op_open = vdev_missing_open, + .vdev_op_close = vdev_missing_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_io_start = vdev_missing_io_start, + .vdev_op_io_done = vdev_missing_io_done, + .vdev_op_state_change = NULL, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = NULL, + .vdev_op_rele = NULL, + .vdev_op_remap = NULL, + .vdev_op_xlate = NULL, + .vdev_op_type = VDEV_TYPE_MISSING, /* name of this vdev type */ + .vdev_op_leaf = B_TRUE /* leaf vdev */ }; vdev_ops_t vdev_hole_ops = { - vdev_missing_open, - vdev_missing_close, - vdev_default_asize, - vdev_missing_io_start, - vdev_missing_io_done, - NULL, - NULL, - NULL, - NULL, - NULL, - VDEV_TYPE_HOLE, /* name of this vdev type */ - B_TRUE /* leaf vdev */ + .vdev_op_open = vdev_missing_open, + .vdev_op_close = vdev_missing_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_io_start = vdev_missing_io_start, + .vdev_op_io_done = vdev_missing_io_done, + .vdev_op_state_change = NULL, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = NULL, + .vdev_op_rele = NULL, + .vdev_op_remap = NULL, + .vdev_op_xlate = NULL, + .vdev_op_type = VDEV_TYPE_HOLE, /* name of this vdev type */ + .vdev_op_leaf = B_TRUE /* leaf vdev */ }; diff --git a/usr/src/uts/common/fs/zfs/vdev_queue.c b/usr/src/uts/common/fs/zfs/vdev_queue.c index dff83e3108f5..0643c05f573d 100644 --- a/usr/src/uts/common/fs/zfs/vdev_queue.c +++ b/usr/src/uts/common/fs/zfs/vdev_queue.c @@ -169,7 +169,7 @@ int zfs_vdev_async_write_active_max_dirty_percent = 60; * we include spans of optional I/Os to aid aggregation at the disk even when * they aren't able to help us aggregate at this level. */ -int zfs_vdev_aggregation_limit = SPA_OLD_MAXBLOCKSIZE; +int zfs_vdev_aggregation_limit = 1 << 20; int zfs_vdev_read_gap_limit = 32 << 10; int zfs_vdev_write_gap_limit = 4 << 10; @@ -519,6 +519,7 @@ static zio_t * vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) { zio_t *first, *last, *aio, *dio, *mandatory, *nio; + zio_link_t *zl = NULL; uint64_t maxgap = 0; uint64_t size; boolean_t stretch = B_FALSE; @@ -657,9 +658,18 @@ vdev_queue_aggregate(vdev_queue_t *vq, zio_t *zio) zio_add_child(dio, aio); vdev_queue_io_remove(vq, dio); + } while (dio != last); + + /* + * We need to drop the vdev queue's lock to avoid a deadlock that we + * could encounter since this I/O will complete immediately. + */ + mutex_exit(&vq->vq_lock); + while ((dio = zio_walk_parents(aio, &zl)) != NULL) { zio_vdev_io_bypass(dio); zio_execute(dio); - } while (dio != last); + } + mutex_enter(&vq->vq_lock); return (aio); } @@ -797,3 +807,54 @@ vdev_queue_io_done(zio_t *zio) mutex_exit(&vq->vq_lock); } + +void +vdev_queue_change_io_priority(zio_t *zio, zio_priority_t priority) +{ + vdev_queue_t *vq = &zio->io_vd->vdev_queue; + avl_tree_t *tree; + + /* + * ZIO_PRIORITY_NOW is used by the vdev cache code and the aggregate zio + * code to issue IOs without adding them to the vdev queue. In this + * case, the zio is already going to be issued as quickly as possible + * and so it doesn't need any reprioitization to help. 
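The aggregation path now detaches the bypassed child I/Os while vq_lock is held and only executes them after dropping the lock, since an immediately completing child could otherwise re-enter the queue code and deadlock on the same mutex. A userland sketch of that collect-under-lock, run-unlocked pattern (pthreads, hypothetical names, not the vdev queue code):

/* Collect work under a lock, run it after dropping the lock. */
#include <pthread.h>
#include <stdio.h>

#define DEMO_MAX    8

static pthread_mutex_t demo_lock = PTHREAD_MUTEX_INITIALIZER;
static int demo_queue[DEMO_MAX];
static int demo_count;

static void
demo_execute(int item)
{
    /* May take demo_lock again in real code, hence run unlocked. */
    printf("executing %d\n", item);
}

static void
demo_drain(void)
{
    int batch[DEMO_MAX];
    int n;

    (void) pthread_mutex_lock(&demo_lock);
    /* Detach the pending items while the lock is held. */
    n = demo_count;
    for (int i = 0; i < n; i++)
        batch[i] = demo_queue[i];
    demo_count = 0;
    (void) pthread_mutex_unlock(&demo_lock);

    /* Run the detached items with the lock dropped. */
    for (int i = 0; i < n; i++)
        demo_execute(batch[i]);
}

int
main(void)
{
    demo_queue[0] = 1;
    demo_queue[1] = 2;
    demo_count = 2;
    demo_drain();
    return (0);
}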
+ */ + if (zio->io_priority == ZIO_PRIORITY_NOW) + return; + + ASSERT3U(zio->io_priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); + ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); + + if (zio->io_type == ZIO_TYPE_READ) { + if (priority != ZIO_PRIORITY_SYNC_READ && + priority != ZIO_PRIORITY_ASYNC_READ && + priority != ZIO_PRIORITY_SCRUB) + priority = ZIO_PRIORITY_ASYNC_READ; + } else { + ASSERT(zio->io_type == ZIO_TYPE_WRITE); + if (priority != ZIO_PRIORITY_SYNC_WRITE && + priority != ZIO_PRIORITY_ASYNC_WRITE) + priority = ZIO_PRIORITY_ASYNC_WRITE; + } + + mutex_enter(&vq->vq_lock); + + /* + * If the zio is in none of the queues we can simply change + * the priority. If the zio is waiting to be submitted we must + * remove it from the queue and re-insert it with the new priority. + * Otherwise, the zio is currently active and we cannot change its + * priority. + */ + tree = vdev_queue_class_tree(vq, zio->io_priority); + if (avl_find(tree, zio, NULL) == zio) { + avl_remove(vdev_queue_class_tree(vq, zio->io_priority), zio); + zio->io_priority = priority; + avl_add(vdev_queue_class_tree(vq, zio->io_priority), zio); + } else if (avl_find(&vq->vq_active_tree, zio, NULL) != zio) { + zio->io_priority = priority; + } + + mutex_exit(&vq->vq_lock); +} diff --git a/usr/src/uts/common/fs/zfs/vdev_raidz.c b/usr/src/uts/common/fs/zfs/vdev_raidz.c index 0e6dfcc2c02e..65023535421d 100644 --- a/usr/src/uts/common/fs/zfs/vdev_raidz.c +++ b/usr/src/uts/common/fs/zfs/vdev_raidz.c @@ -2598,6 +2598,44 @@ vdev_raidz_state_change(vdev_t *vd, int faulted, int degraded) vdev_set_state(vd, B_FALSE, VDEV_STATE_HEALTHY, VDEV_AUX_NONE); } +/* + * Determine if any portion of the provided block resides on a child vdev + * with a dirty DTL and therefore needs to be resilvered. The function + * assumes that at least one DTL is dirty which imples that full stripe + * width blocks must be resilvered. + */ +static boolean_t +vdev_raidz_need_resilver(vdev_t *vd, uint64_t offset, size_t psize) +{ + uint64_t dcols = vd->vdev_children; + uint64_t nparity = vd->vdev_nparity; + uint64_t ashift = vd->vdev_top->vdev_ashift; + /* The starting RAIDZ (parent) vdev sector of the block. */ + uint64_t b = offset >> ashift; + /* The zio's size in units of the vdev's minimum sector size. */ + uint64_t s = ((psize - 1) >> ashift) + 1; + /* The first column for this stripe. */ + uint64_t f = b % dcols; + + if (s + nparity >= dcols) + return (B_TRUE); + + for (uint64_t c = 0; c < s + nparity; c++) { + uint64_t devidx = (f + c) % dcols; + vdev_t *cvd = vd->vdev_child[devidx]; + + /* + * dsl_scan_need_resilver() already checked vd with + * vdev_dtl_contains(). So here just check cvd with + * vdev_dtl_empty(), cheaper and a good approximation. 
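vdev_raidz_need_resilver() reduces the question to sector arithmetic: convert the block's offset and physical size into device sectors, find the first column f = b % dcols, and visit s + nparity child columns (or treat the block as a full-width stripe when s + nparity >= dcols). A standalone sketch of just that column walk, printing which child indices a block would touch; the sample geometry is made up:

/* Which RAID-Z child columns does a block at (offset, psize) touch? */
#include <stdio.h>
#include <stdint.h>

static void
demo_raidz_columns(uint64_t offset, uint64_t psize, uint64_t ashift,
    uint64_t dcols, uint64_t nparity)
{
    uint64_t b = offset >> ashift;              /* starting parent sector */
    uint64_t s = ((psize - 1) >> ashift) + 1;   /* size in sectors */
    uint64_t f = b % dcols;                     /* first column */

    if (s + nparity >= dcols) {
        printf("full-width stripe: all %llu children touched\n",
            (unsigned long long)dcols);
        return;
    }
    for (uint64_t c = 0; c < s + nparity; c++)
        printf("child %llu\n", (unsigned long long)((f + c) % dcols));
}

int
main(void)
{
    /* e.g. a 10-wide raidz2, 4K sectors, a 16K block at offset 1 MiB */
    demo_raidz_columns(1ULL << 20, 16384, 12, 10, 2);
    return (0);
}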
+ */ + if (!vdev_dtl_empty(cvd, DTL_PARTIAL)) + return (B_TRUE); + } + + return (B_FALSE); +} + static void vdev_raidz_xlate(vdev_t *cvd, const range_seg_t *in, range_seg_t *res) { @@ -2630,16 +2668,17 @@ vdev_raidz_xlate(vdev_t *cvd, const range_seg_t *in, range_seg_t *res) } vdev_ops_t vdev_raidz_ops = { - vdev_raidz_open, - vdev_raidz_close, - vdev_raidz_asize, - vdev_raidz_io_start, - vdev_raidz_io_done, - vdev_raidz_state_change, - NULL, - NULL, - NULL, - vdev_raidz_xlate, - VDEV_TYPE_RAIDZ, /* name of this vdev type */ - B_FALSE /* not a leaf vdev */ + .vdev_op_open = vdev_raidz_open, + .vdev_op_close = vdev_raidz_close, + .vdev_op_asize = vdev_raidz_asize, + .vdev_op_io_start = vdev_raidz_io_start, + .vdev_op_io_done = vdev_raidz_io_done, + .vdev_op_state_change = vdev_raidz_state_change, + .vdev_op_need_resilver = vdev_raidz_need_resilver, + .vdev_op_hold = NULL, + .vdev_op_rele = NULL, + .vdev_op_remap = NULL, + .vdev_op_xlate = vdev_raidz_xlate, + .vdev_op_type = VDEV_TYPE_RAIDZ, /* name of this vdev type */ + .vdev_op_leaf = B_FALSE /* not a leaf vdev */ }; diff --git a/usr/src/uts/common/fs/zfs/vdev_root.c b/usr/src/uts/common/fs/zfs/vdev_root.c index edb52d6ca7fa..7170f7013608 100644 --- a/usr/src/uts/common/fs/zfs/vdev_root.c +++ b/usr/src/uts/common/fs/zfs/vdev_root.c @@ -140,16 +140,17 @@ vdev_root_state_change(vdev_t *vd, int faulted, int degraded) } vdev_ops_t vdev_root_ops = { - vdev_root_open, - vdev_root_close, - vdev_default_asize, - NULL, /* io_start - not applicable to the root */ - NULL, /* io_done - not applicable to the root */ - vdev_root_state_change, - NULL, - NULL, - NULL, - NULL, - VDEV_TYPE_ROOT, /* name of this vdev type */ - B_FALSE /* not a leaf vdev */ + .vdev_op_open = vdev_root_open, + .vdev_op_close = vdev_root_close, + .vdev_op_asize = vdev_default_asize, + .vdev_op_io_start = NULL, /* not applicable to the root */ + .vdev_op_io_done = NULL, /* not applicable to the root */ + .vdev_op_state_change = vdev_root_state_change, + .vdev_op_need_resilver = NULL, + .vdev_op_hold = NULL, + .vdev_op_rele = NULL, + .vdev_op_remap = NULL, + .vdev_op_xlate = NULL, + .vdev_op_type = VDEV_TYPE_ROOT, /* name of this vdev type */ + .vdev_op_leaf = B_FALSE /* not a leaf vdev */ }; diff --git a/usr/src/uts/common/fs/zfs/zio.c b/usr/src/uts/common/fs/zfs/zio.c index 619dad47f340..76312e6a74c7 100644 --- a/usr/src/uts/common/fs/zfs/zio.c +++ b/usr/src/uts/common/fs/zfs/zio.c @@ -41,6 +41,7 @@ #include #include #include +#include #include #include #include @@ -370,6 +371,8 @@ zio_walk_children(zio_t *pio, zio_link_t **zl) { list_t *cl = &pio->io_child_list; + ASSERT(MUTEX_HELD(&pio->io_lock)); + *zl = (*zl == NULL) ? 
list_head(cl) : list_next(cl, *zl); if (*zl == NULL) return (NULL); @@ -404,8 +407,8 @@ zio_add_child(zio_t *pio, zio_t *cio) zl->zl_parent = pio; zl->zl_child = cio; - mutex_enter(&cio->io_lock); mutex_enter(&pio->io_lock); + mutex_enter(&cio->io_lock); ASSERT(pio->io_state[ZIO_WAIT_DONE] == 0); @@ -418,8 +421,8 @@ zio_add_child(zio_t *pio, zio_t *cio) pio->io_child_count++; cio->io_parent_count++; - mutex_exit(&pio->io_lock); mutex_exit(&cio->io_lock); + mutex_exit(&pio->io_lock); } static void @@ -428,8 +431,8 @@ zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl) ASSERT(zl->zl_parent == pio); ASSERT(zl->zl_child == cio); - mutex_enter(&cio->io_lock); mutex_enter(&pio->io_lock); + mutex_enter(&cio->io_lock); list_remove(&pio->io_child_list, zl); list_remove(&cio->io_parent_list, zl); @@ -437,8 +440,8 @@ zio_remove_child(zio_t *pio, zio_t *cio, zio_link_t *zl) pio->io_child_count--; cio->io_parent_count--; - mutex_exit(&pio->io_lock); mutex_exit(&cio->io_lock); + mutex_exit(&pio->io_lock); kmem_cache_free(zio_link_cache, zl); } @@ -921,6 +924,7 @@ zio_free_sync(zio_t *pio, spa_t *spa, uint64_t txg, const blkptr_t *bp, metaslab_check_free(spa, bp); arc_freed(spa, bp); + dsl_scan_freed(spa, bp); /* * GANG and DEDUP blocks can induce a read (for the gang block header, @@ -1760,14 +1764,16 @@ zio_reexecute(zio_t *pio) * cannot be affected by any side effects of reexecuting 'cio'. */ zio_link_t *zl = NULL; + mutex_enter(&pio->io_lock); for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { cio_next = zio_walk_children(pio, &zl); - mutex_enter(&pio->io_lock); for (int w = 0; w < ZIO_WAIT_TYPES; w++) pio->io_children[cio->io_child_type][w]++; mutex_exit(&pio->io_lock); zio_reexecute(cio); + mutex_enter(&pio->io_lock); } + mutex_exit(&pio->io_lock); /* * Now that all children have been reexecuted, execute the parent. @@ -3142,37 +3148,17 @@ zio_vdev_io_start(zio_t *zio) if (zio->io_type == ZIO_TYPE_WRITE) { ASSERT(spa->spa_trust_config); + /* + * Note: the code can handle other kinds of writes, + * but we don't expect them. + */ if (zio->io_vd->vdev_removing) { - /* - * Note: the code can handle other kinds of writes, - * but we don't expect them. - */ ASSERT(zio->io_flags & (ZIO_FLAG_PHYSICAL | ZIO_FLAG_SELF_HEAL | ZIO_FLAG_RESILVER | ZIO_FLAG_INDUCE_DAMAGE)); } } - /* - * We keep track of time-sensitive I/Os so that the scan thread - * can quickly react to certain workloads. In particular, we care - * about non-scrubbing, top-level reads and writes with the following - * characteristics: - * - synchronous writes of user data to non-slog devices - * - any reads of user data - * When these conditions are met, adjust the timestamp of spa_last_io - * which allows the scan thread to adjust its workload accordingly. - */ - if (!(zio->io_flags & ZIO_FLAG_SCAN_THREAD) && zio->io_bp != NULL && - vd == vd->vdev_top && !vd->vdev_islog && - zio->io_bookmark.zb_objset != DMU_META_OBJSET && - zio->io_txg != spa_syncing_txg(spa)) { - uint64_t old = spa->spa_last_io; - uint64_t new = ddi_get_lbolt64(); - if (old != new) - (void) atomic_cas_64(&spa->spa_last_io, old, new); - } - align = 1ULL << vd->vdev_top->vdev_ashift; if (!(zio->io_flags & ZIO_FLAG_PHYSICAL) && @@ -3311,6 +3297,35 @@ zio_vdev_io_done(zio_t *zio) return (ZIO_PIPELINE_CONTINUE); } +/* + * This function is used to change the priority of an existing zio that is + * currently in-flight. 
This is used by the arc to upgrade priority in the + * event that a demand read is made for a block that is currently queued + * as a scrub or async read IO. Otherwise, the high priority read request + * would end up having to wait for the lower priority IO. + */ +void +zio_change_priority(zio_t *pio, zio_priority_t priority) +{ + zio_t *cio, *cio_next; + zio_link_t *zl = NULL; + + ASSERT3U(priority, <, ZIO_PRIORITY_NUM_QUEUEABLE); + + if (pio->io_vd != NULL && pio->io_vd->vdev_ops->vdev_op_leaf) { + vdev_queue_change_io_priority(pio, priority); + } else { + pio->io_priority = priority; + } + + mutex_enter(&pio->io_lock); + for (cio = zio_walk_children(pio, &zl); cio != NULL; cio = cio_next) { + cio_next = zio_walk_children(pio, &zl); + zio_change_priority(cio, priority); + } + mutex_exit(&pio->io_lock); +} + /* * For non-raidz ZIOs, we can just copy aside the bad data read from the * disk, and use that to finish the checksum ereport later. diff --git a/usr/src/uts/common/os/taskq.c b/usr/src/uts/common/os/taskq.c index d9fa450c999c..fcdfe292dd27 100644 --- a/usr/src/uts/common/os/taskq.c +++ b/usr/src/uts/common/os/taskq.c @@ -1366,6 +1366,12 @@ taskq_wait(taskq_t *tq) } } +void +taskq_wait_id(taskq_t *tq, taskqid_t id __unused) +{ + taskq_wait(tq); +} + /* * Suspend execution of tasks. * diff --git a/usr/src/uts/common/sys/fs/zfs.h b/usr/src/uts/common/sys/fs/zfs.h index ed80492dd583..b54fb27d1e84 100644 --- a/usr/src/uts/common/sys/fs/zfs.h +++ b/usr/src/uts/common/sys/fs/zfs.h @@ -806,17 +806,22 @@ typedef struct pool_scan_stat { uint64_t pss_start_time; /* scan start time */ uint64_t pss_end_time; /* scan end time */ uint64_t pss_to_examine; /* total bytes to scan */ - uint64_t pss_examined; /* total examined bytes */ + uint64_t pss_examined; /* total bytes located by scanner */ uint64_t pss_to_process; /* total bytes to process */ uint64_t pss_processed; /* total processed bytes */ uint64_t pss_errors; /* scan errors */ /* values not stored on disk */ uint64_t pss_pass_exam; /* examined bytes per scan pass */ + uint64_t pss_pass_issued; /* issued bytes per scan pass */ uint64_t pss_pass_start; /* start time of a scan pass */ uint64_t pss_pass_scrub_pause; /* pause time of a scurb pass */ /* cumulative time scrub spent paused, needed for rate calculation */ uint64_t pss_pass_scrub_spent_paused; + + /* Sorted scrubbing new fields */ + /* Stored on disk */ + uint64_t pss_issued; /* total bytes checked by scanner */ } pool_scan_stat_t; typedef struct pool_removal_stat { diff --git a/usr/src/uts/common/sys/taskq.h b/usr/src/uts/common/sys/taskq.h index 0fd72bd89140..adc260bc70f6 100644 --- a/usr/src/uts/common/sys/taskq.h +++ b/usr/src/uts/common/sys/taskq.h @@ -83,6 +83,7 @@ extern taskqid_t taskq_dispatch(taskq_t *, task_func_t, void *, uint_t); extern void nulltask(void *); extern void taskq_destroy(taskq_t *); extern void taskq_wait(taskq_t *); +void taskq_wait_id(taskq_t *, taskqid_t); extern boolean_t taskq_empty(taskq_t *); extern void taskq_suspend(taskq_t *); extern int taskq_suspended(taskq_t *);
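With pss_examined/pss_pass_exam counting bytes located by the scanner and pss_issued/pss_pass_issued counting bytes actually verified, a status consumer can derive two distinct rates and, as the pss_pass_scrub_spent_paused comment notes, subtract paused time before dividing. A rough standalone sketch of that arithmetic; the helper name and the choice to base the remaining-time estimate on the issue rate are assumptions, not something this header mandates:

/* Derive scan/issue rates and a rough ETA from scan-pass counters. */
#include <stdio.h>
#include <stdint.h>

static void
demo_scan_rates(uint64_t pass_exam, uint64_t pass_issued,
    uint64_t pass_elapsed_secs, uint64_t paused_secs,
    uint64_t to_examine, uint64_t issued)
{
    /* Only count time the pass was actually running. */
    uint64_t active = pass_elapsed_secs > paused_secs ?
        pass_elapsed_secs - paused_secs : 1;
    uint64_t scan_rate = pass_exam / active;    /* bytes/sec located */
    uint64_t issue_rate = pass_issued / active; /* bytes/sec verified */
    uint64_t remaining = to_examine > issued ? to_examine - issued : 0;
    uint64_t eta = issue_rate != 0 ? remaining / issue_rate : 0;

    printf("scan %llu B/s, issue %llu B/s, ~%llu s left\n",
        (unsigned long long)scan_rate, (unsigned long long)issue_rate,
        (unsigned long long)eta);
}

int
main(void)
{
    /* 40 GiB located and 32 GiB issued in a 1000 s pass of a 1 TiB pool */
    demo_scan_rates(40ULL << 30, 32ULL << 30, 1000, 0,
        1ULL << 40, 32ULL << 30);
    return (0);
}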