Merge pull request ceph#32844 from smithfarm/wip-43239-nautilus
nautilus: mgr/DaemonServer: fix 'osd ok-to-stop' for EC pools

Reviewed-by: Sage Weil <sage@redhat.com>
Reviewed-by: David Zafman <dzafman@redhat.com>
yuriw committed Feb 14, 2020
2 parents 21a2166 + b9e8232 commit 59eedd8
Showing 3 changed files with 142 additions and 2 deletions.
88 changes: 88 additions & 0 deletions qa/standalone/ceph-helpers.sh
@@ -1313,6 +1313,36 @@ function test_get_num_active_clean() {
teardown $dir || return 1
}

##
# Return the number of active or peered PGs in the cluster. A PG matches if
# ceph pg dump pgs reports its state as either **active** or **peered** and
# not **stale**.
#
# @param STDOUT the number of active or peered PGs
# @return 0 on success, 1 on error
#
function get_num_active_or_peered() {
local expression
expression+="select(contains(\"active\") or contains(\"peered\")) | "
expression+="select(contains(\"stale\") | not)"
ceph --format json pg dump pgs 2>/dev/null | \
jq ".pg_stats | [.[] | .state | $expression] | length"
}
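
A minimal sketch of what the jq expression selects, assuming a hypothetical
pg_stats document; a state counts only if it mentions active or peered and
does not mention stale:

    echo '{"pg_stats":[{"state":"active+clean"},
                       {"state":"peered"},
                       {"state":"stale+active+clean"}]}' | \
        jq '.pg_stats | [.[] | .state
            | select(contains("active") or contains("peered"))
            | select(contains("stale") | not)] | length'
    # prints 2: the stale PG is filtered out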

function test_get_num_active_or_peered() {
local dir=$1

setup $dir || return 1
run_mon $dir a --osd_pool_default_size=1 || return 1
run_mgr $dir x || return 1
run_osd $dir 0 || return 1
create_rbd_pool || return 1
wait_for_clean || return 1
local num_peered=$(get_num_active_or_peered)
test "$num_peered" = $PG_NUM || return 1
teardown $dir || return 1
}

#######################################################################

##
@@ -1588,6 +1618,64 @@ function test_wait_for_clean() {
teardown $dir || return 1
}

##
# Wait until all PGs in the cluster are active or peered, failing if no
# progress is made for $WAIT_FOR_CLEAN_TIMEOUT seconds.
# Progress is measured either via the **get_is_making_recovery_progress**
# predicate or via a change in the number of active or peered PGs
# (as returned by get_num_active_or_peered).
#
# @return 0 if the cluster is peered, 1 otherwise
#
function wait_for_peered() {
local cmd=$1
local num_peered=-1
local cur_peered
local -a delays=($(get_timeout_delays $WAIT_FOR_CLEAN_TIMEOUT .1))
local -i loop=0

flush_pg_stats || return 1
while test $(get_num_pgs) == 0 ; do
sleep 1
done

while true ; do
        # Comparing get_num_active_or_peered & get_num_pgs determines whether
        # every PG is active or peered. That's almost an inline of the
        # is_clean() pattern, adapted to peered PGs, to avoid multiple calls
        # of get_num_active_or_peered.
cur_peered=$(get_num_active_or_peered)
test $cur_peered = $(get_num_pgs) && break
if test $cur_peered != $num_peered ; then
loop=0
num_peered=$cur_peered
elif get_is_making_recovery_progress ; then
loop=0
elif (( $loop >= ${#delays[*]} )) ; then
ceph report
return 1
fi
# eval is a no-op if cmd is empty
eval $cmd
sleep ${delays[$loop]}
loop+=1
done
return 0
}
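
wait_for_peered takes an optional command that is eval'ed on every iteration
of the wait loop, mirroring the structure of wait_for_clean. A hedged usage
sketch (the call site is hypothetical):

    # wait for all PGs to peer, dumping pg states on each retry;
    # WAIT_FOR_CLEAN_TIMEOUT bounds how long no-progress is tolerated
    WAIT_FOR_CLEAN_TIMEOUT=60 wait_for_peered "ceph pg dump pgs" || return 1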

function test_wait_for_peered() {
local dir=$1

setup $dir || return 1
run_mon $dir a --osd_pool_default_size=2 || return 1
run_osd $dir 0 || return 1
run_mgr $dir x || return 1
create_rbd_pool || return 1
! WAIT_FOR_CLEAN_TIMEOUT=1 wait_for_clean || return 1
run_osd $dir 1 || return 1
wait_for_peered || return 1
teardown $dir || return 1
}


#######################################################################

##
48 changes: 48 additions & 0 deletions qa/standalone/misc/ok-to-stop.sh
@@ -237,5 +237,53 @@ function TEST_0_mds() {
kill_daemons $dir KILL mds.a
}

function TEST_0_osd() {
local dir=$1

CEPH_ARGS="$ORIG_CEPH_ARGS --mon-host=$CEPH_MON_A "

run_mon $dir a --public-addr=$CEPH_MON_A || return 1
run_mgr $dir x || return 1
run_osd $dir 0 || return 1
run_osd $dir 1 || return 1
run_osd $dir 2 || return 1
run_osd $dir 3 || return 1

ceph osd erasure-code-profile set ec-profile m=2 k=2 crush-failure-domain=osd || return 1
ceph osd pool create ec 8 erasure ec-profile || return 1

wait_for_clean || return 1

# with min_size 3, we can stop only 1 osd
ceph osd pool set ec min_size 3 || return 1
wait_for_clean || return 1

ceph osd ok-to-stop 0 || return 1
ceph osd ok-to-stop 1 || return 1
ceph osd ok-to-stop 2 || return 1
ceph osd ok-to-stop 3 || return 1
! ceph osd ok-to-stop 0 1 || return 1
! ceph osd ok-to-stop 2 3 || return 1

    # with min_size 2, we can stop up to 2 osds
ceph osd pool set ec min_size 2 || return 1
wait_for_clean || return 1

ceph osd ok-to-stop 0 1 || return 1
ceph osd ok-to-stop 2 3 || return 1
! ceph osd ok-to-stop 0 1 2 || return 1
! ceph osd ok-to-stop 1 2 3 || return 1

# we should get the same result with one of the osds already down
kill_daemons $dir TERM osd.0 || return 1
ceph osd down 0 || return 1
wait_for_peered || return 1

ceph osd ok-to-stop 0 || return 1
ceph osd ok-to-stop 0 1 || return 1
! ceph osd ok-to-stop 0 1 2 || return 1
! ceph osd ok-to-stop 1 2 3 || return 1
}
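
The expected results above follow from the erasure-code arithmetic: with k=2
and m=2 each PG has size 4, one shard per OSD (crush-failure-domain=osd), and
a PG stays active while at least min_size shards survive. A sketch of the
bound, assuming the pool created in the test:

    # OSDs that may stop at once = size - min_size
    #   min_size 3: 4 - 3 = 1 -> any single OSD is ok-to-stop, no pair is
    #   min_size 2: 4 - 2 = 2 -> any pair is ok-to-stop, no triple is
    ceph osd pool get ec size
    ceph osd pool get ec min_size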


main ok-to-stop "$@"
8 changes: 6 additions & 2 deletions src/mgr/DaemonServer.cc
@@ -1561,15 +1561,19 @@ bool DaemonServer::_handle_command(
found = true;
continue;
}
-        pg_acting.insert(anm.osd);
+        if (anm.osd != CRUSH_ITEM_NONE) {
+          pg_acting.insert(anm.osd);
+        }
}
} else {
for (auto& a : q.second.acting) {
if (osds.count(a)) {
found = true;
continue;
}
-        pg_acting.insert(a);
+        if (a != CRUSH_ITEM_NONE) {
+          pg_acting.insert(a);
+        }
}
}
if (!found) {
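
The fix itself is the guard against CRUSH_ITEM_NONE, the sentinel (reported
as 2147483647 in pg dump output) that marks a missing shard in an
erasure-coded PG's acting set. Before this change the sentinel was inserted
into pg_acting as if it were a live OSD, so 'osd ok-to-stop' over-counted the
OSDs that would remain acting. A hedged sketch of spotting the sentinel on a
degraded EC pool:

    # with osd.0 down, EC PGs list NONE (2147483647) in their acting set
    ceph --format json pg dump pgs | \
        jq '.pg_stats[] | select(.acting | contains([2147483647])) | .pgid'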
