
Resolve issues with cluster scale when removing the leader

This branch removes the erroneous logic that tracked the -departed
relation states and removed all participating peers. Instead, we now
rely on the behavior of the -broken hook, which executes only on the
unit being removed from the cluster. This may not be perfect, but it
is a substantial improvement over the current implementation.

Resolves kubernetes/kubernetes#43461
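
A minimal sketch of the hook semantics this change relies on (not part
of this commit; it assumes a charms.reactive charm with a "cluster"
peer relation):

from charms.reactive import hook
from charmhelpers.core.hookenv import log


@hook('cluster-relation-departed')
def on_departed():
    # Fires when a peer leaves the relation; the remaining units see
    # this for the departing peer. The removed logic acted here, so
    # every survivor attempted membership pruning.
    log('A peer is departing the cluster relation.')


@hook('cluster-relation-broken')
def on_broken():
    # Per the behavior described above, this executes on the unit whose
    # relation is going away, i.e. the unit leaving the cluster, which
    # makes it the natural place for self-unregistration.
    log('This unit is leaving the cluster relation.')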
lazypower committed Mar 24, 2017
1 parent 0e0979f commit 89b3ae2d9240d8b0add49244a933f5b6c9c83fdd
Showing 3 changed files with 31 additions and 30 deletions.
  1. +12 −6 lib/etcdctl.py
  2. +10 −24 reactive/etcd.py
  3. +9 −0 tests/10-deploy.py
lib/etcdctl.py
@@ -50,16 +50,22 @@ def register(self, cluster_data):
             reg['cluster'] = line.split('="')[-1].rstrip('"')
         return reg

-    def unregister(self, unit_id):
+    def unregister(self, unit_id, leader_address=None):
         ''' Perform self deregistration during unit teardown
-        @params cluster_data - a dict of data to fill out the request to push
-        our deregister command to the leader. requires keys: leader_address,
-        port, etcd_unit_guid
         @params unit_id - the ID for the unit assigned by etcd. Obtainable from
         member_list method.
+        The unit_id can be obtained from the EtcdDatabag dict
+
+        @params leader_address - The endpoint to communicate with the leader in
+        the event of self deregistration.
         '''
-        command = "{0} member remove {1}".format(self.ETCDCTL_COMMAND, unit_id)
+        if leader_address:
+            cmd = "{0} --endpoint {1} member remove {2}"
+            command = cmd.format(self.ETCDCTL_COMMAND, leader_address, unit_id)
+        else:
+            cmd = "{0} member remove {1}"
+            command = cmd.format(self.ETCDCTL_COMMAND, unit_id)
         return self.run(command)
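
A quick usage sketch of the new signature (illustrative only; the
member key and addresses below are placeholders, not values from this
repo):

etcdctl = EtcdCtl()
members = etcdctl.member_list()
unit_id = members['etcd0']['unit_id']  # 'etcd0' is a hypothetical member key

# Default behavior is unchanged and targets the local endpoint:
etcdctl.unregister(unit_id)
# runs: etcdctl member remove <unit_id>

# During self-teardown, direct the removal at the leader explicitly:
etcdctl.unregister(unit_id, leader_address='http://10.0.0.1:2379')
# runs: etcdctl --endpoint http://10.0.0.1:2379 member remove <unit_id>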
reactive/etcd.py
@@ -411,32 +411,18 @@ def render_default_user_ssl_exports():
     set_state('etcd.pillowmints')


-@when('cluster.departing')
-@when('leadership.is_leader')
-def unregister(cluster):
-    ''' The leader will process the departing event and attempt unregistration
-    for the departing unit. If the leader is departing, it will unregister
-    all units prior to termination.
-    '''
+@hook('cluster-relation-broken')
+def perform_self_unregistration(cluster=None):
+    ''' Attempt self removal during unit teardown. '''
+    # This method is not without its flaws. If you scale down to 2 units
+    # its likely to tank the cluster. Please be wary until
+    # https://bugs.launchpad.net/juju/+bug/1417874 is resolved.
     etcdctl = EtcdCtl()
-    peers = cluster.get_peers()
+    leader_address = leader_get('leader_address')
+    unit_name = os.getenv('JUJU_UNIT_NAME').replace('/', '')
     members = etcdctl.member_list()
-    for unit in peers:
-        cluster_name = unit.replace('/', '')
-        if cluster_name in members.keys():
-            log("Unregistering {0}".format(unit))
-            etcdctl.unregister(members[cluster_name]['unit_id'])
-        else:
-            log("Received removal for disconnected member {}".format(unit))
-    cluster.dismiss()
-
-
-@when('cluster.departing')
-@when_not('leadership.is_leader')
-def passive_dismiss_context(cluster):
-    ''' All units undergo the departing phase. This is a no-op unless you
-    are the leader '''
-    cluster.dismiss()
+    # Self Unregistration
+    etcdctl.unregister(members[unit_name]['unit_id'], leader_address)


 @hook('data-storage-attached')
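
The members[unit_name] lookup in the new hook works because
member_list() keys its results by the flattened Juju unit name; a small
illustrative sketch of that mapping (values are made up):

import os

# Juju sets JUJU_UNIT_NAME in the hook environment, e.g. 'etcd/2';
# the fallback here is only so the sketch runs outside a hook context.
unit_name = os.getenv('JUJU_UNIT_NAME', 'etcd/2').replace('/', '')

# member_list() is assumed to return a dict keyed by that name:
members = {'etcd2': {'unit_id': 'a94fe64fd3f0a334'}}
print(members[unit_name]['unit_id'])  # -> a94fe64fd3f0a334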
tests/10-deploy.py
@@ -81,6 +81,15 @@ def test_leader_knows_all_members(self):
         self.assertTrue('etcd cluster is unavailable' not in members)
         self.assertTrue(len(members) == len(self.etcd))

+    def test_node_scale_down_members(self):
+        ''' Scale the cluster down and ensure the cluster state is still
+        healthy '''
+        # Remove the leader
+        self.d.remove_unit(self.leader.info['unit_name'])
+        self.d.sentry.wait()
+
+        # re-use the cluster-health test to validate we are still healthy.
+        self.test_cluster_health()

 if __name__ == '__main__':
     unittest.main()
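
The test leans on a self.leader sentry that the suite is assumed to
resolve elsewhere; one hedged way to find it with amulet and the
is-leader hook tool (this helper is not part of the commit):

def find_leader(deployment, service='etcd'):
    ''' Return the amulet sentry for the unit reporting is-leader. '''
    for unit in deployment.sentry[service]:
        output, code = unit.run('is-leader')
        if code == 0 and output.strip() == 'True':
            return unit
    return None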
