test: reenable test_topology::test_decommission_node_add_column
Also improve the test to increase the probability of reproducing scylladb#11780
by injecting sleeps in appropriate places.

Without the fix for scylladb#11780 from the earlier commit, the test reproduces
the issue in roughly half of all runs in a dev build on my laptop.
kbr-scylla committed Nov 10, 2022
1 parent 8d355e7 commit 071251a
Showing 3 changed files with 39 additions and 4 deletions.
6 changes: 6 additions & 0 deletions service/storage_service.cc
@@ -64,6 +64,7 @@
#include <seastar/coroutine/maybe_yield.hh>
#include <seastar/coroutine/parallel_for_each.hh>
#include "utils/stall_free.hh"
#include "utils/error_injection.hh"

#include <boost/algorithm/string/split.hpp>
#include <boost/algorithm/string/classification.hpp>
@@ -2566,6 +2567,8 @@ future<node_ops_cmd_response> storage_service::node_ops_cmd_handler(gms::inet_address
} else if (req.cmd == node_ops_cmd::removenode_abort) {
node_ops_abort(ops_uuid).get();
} else if (req.cmd == node_ops_cmd::decommission_prepare) {
utils::get_local_injector().inject(
"storage_service_decommission_prepare_handler_sleep", std::chrono::milliseconds{1500}).get();
if (req.leaving_nodes.size() > 1) {
auto msg = format("decommission[{}]: Could not decommission more than one node at a time: leaving_nodes={}", req.ops_uuid, req.leaving_nodes);
slogger.warn("{}", msg);
@@ -3595,6 +3598,9 @@ future<> storage_service::notify_joined(inet_address endpoint) {
co_return;
}

co_await utils::get_local_injector().inject(
"storage_service_notify_joined_sleep", std::chrono::milliseconds{500});

co_await container().invoke_on_all([endpoint] (auto&& ss) {
ss._messaging.local().remove_rpc_client_with_ignored_topology(netw::msg_addr{endpoint, 0});
return ss._lifecycle_notifier.notify_joined(endpoint);
7 changes: 7 additions & 0 deletions test/pylib/rest_client.py
@@ -182,6 +182,13 @@ async def get_gossip_generation_number(self, node_ip: str, target_ip: str) -> int
assert(type(data) == int)
return data

async def get_joining_nodes(self, node_ip: str) -> list:
"""Get the list of joining nodes according to `node_ip`."""
resp = await self.client.get(f"/storage_service/nodes/joining", host=node_ip)
data = await resp.json()
assert(type(data) == list)
return data

async def enable_injection(self, node_ip: str, injection: str, one_shot: bool) -> None:
"""Enable error injection named `injection` on `node_ip`. Depending on `one_shot`,
the injection will be executed only once or every time the process passes the injection point.
30 changes: 26 additions & 4 deletions test/topology/test_topology.py
@@ -10,6 +10,9 @@
import logging
import asyncio
import random
import time

from test.pylib.util import wait_for

logger = logging.getLogger(__name__)

@@ -58,13 +61,32 @@ async def test_remove_node_add_column(manager, random_tables):


@pytest.mark.asyncio
@pytest.mark.skip(reason="Flaky due to #11780")
async def test_decommission_node_add_column(manager, random_tables):
"""Add a node, remove an original node, add a column"""
servers = await manager.running_servers()
table = await random_tables.add_table(ncolumns=5)
await manager.server_add()
await manager.decommission_node(servers[1]) # Decommission [1]
servers = await manager.running_servers()
decommission_target = servers[1]
# The sleep injections significantly increase the probability of reproducing #11780:
# 1. bootstrapped_server finishes bootstrapping and enters NORMAL state
# 2. decommission_target starts storage_service::handle_state_normal(bootstrapped_server),
# enters sleep before calling storage_service::notify_joined
# 3. we start decommission on decommission_target
# 4. decommission_target sends node_ops_verb with decommission_prepare request to bootstrapped_server
# 5. bootstrapped_server receives the RPC and enters sleep
# 6. decommission_target handle_state_normal wakes up,
# calls storage_service::notify_joined which drops some RPC clients
# 7. If #11780 is not fixed, this will fail the node_ops_verb RPC, causing decommission to fail
await manager.api.enable_injection(decommission_target, 'storage_service_notify_joined_sleep', one_shot=True)
bootstrapped_server = await manager.server_add()
async def no_joining_nodes():
joining_nodes = await manager.api.get_joining_nodes(decommission_target)
return not joining_nodes
# Wait until decommission_target thinks that bootstrapped_server is NORMAL
# note: when this wait finishes, we're usually in the middle of storage_service::handle_state_normal
await wait_for(no_joining_nodes, time.time() + 30, period=.1)
await manager.api.enable_injection(
bootstrapped_server, 'storage_service_decommission_prepare_handler_sleep', one_shot=True)
await manager.decommission_node(decommission_target)
await table.add_column()
await random_tables.verify_schema()

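Note on the polling step: the test relies on wait_for, which is imported from test.pylib.util but not shown in this diff. As a rough sketch only (the signature and exact semantics here are assumptions, not the actual Scylla test helper), such a helper can be written as an async loop that polls a predicate until it returns a truthy value or a deadline passes:

import asyncio
import time
from typing import Awaitable, Callable, TypeVar

T = TypeVar("T")

async def wait_for(pred: Callable[[], Awaitable[T]], deadline: float, period: float = 1.0) -> T:
    # Hypothetical sketch of a polling helper in the spirit of test.pylib.util.wait_for.
    # Repeatedly await `pred`; return its result as soon as it is truthy,
    # and give up once `deadline` (a time.time()-style timestamp) has passed.
    while True:
        value = await pred()
        if value:
            return value
        if time.time() > deadline:
            raise TimeoutError("wait_for: deadline exceeded")
        await asyncio.sleep(period)

Under this reading, wait_for(no_joining_nodes, time.time() + 30, period=.1) in the test above retries the /storage_service/nodes/joining query every 100 ms for up to roughly 30 seconds before failing the test.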
