Skip to content

Commit

Permalink
#85: Resolve issues with cluster scale when removing the leader (fixes
Browse files Browse the repository at this point in the history
…kubernetes/kubernetes#43461)

This branch removes the erronious logic when tracking the -departed
relation states to remove all participating peers. Instead we are now
relying on the behavior of the -broken hook to only execute on the unit
that is being removed from the cluster. This may not be perfect but
works 100% better than the current implementation.
  • Loading branch information
Charles Butler authored and marcoceppi committed Mar 24, 2017
1 parent 0e0979f commit 62cce56
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 30 deletions.
18 changes: 12 additions & 6 deletions lib/etcdctl.py
Expand Up @@ -50,16 +50,22 @@ def register(self, cluster_data):
reg['cluster'] = line.split('="')[-1].rstrip('"')
return reg

def unregister(self, unit_id):
def unregister(self, unit_id, leader_address=None):
''' Perform self deregistration during unit teardown
@params cluster_data - a dict of data to fill out the request to push
our deregister command to the leader. requires keys: leader_address,
port, etcd_unit_guid
@params unit_id - the ID for the unit assigned by etcd. Obtainable from
member_list method.
The unit_id can be obtained from the EtcdDatabag dict
@params leader_address - The endpoint to communicate with the leader in
the event of self deregistration.
'''
command = "{0} member remove {1}".format(self.ETCDCTL_COMMAND, unit_id)

if leader_address:
cmd = "{0} --endpoint {1} member remove {2}"
command = cmd.format(self.ETCDCTL_COMMAND, leader_address, unit_id)
else:
cmd = "{0} member remove {1}"
command = cmd.format(self.ETCDCTL_COMMAND, unit_id)

return self.run(command)

Expand Down
31 changes: 7 additions & 24 deletions reactive/etcd.py
Expand Up @@ -411,32 +411,15 @@ def render_default_user_ssl_exports():
set_state('etcd.pillowmints')


@when('cluster.departing')
@when('leadership.is_leader')
def unregister(cluster):
''' The leader will process the departing event and attempt unregistration
for the departing unit. If the leader is departing, it will unregister
all units prior to termination.
'''
@hook('cluster-relation-broken')
def perform_self_unregistration(cluster=None):
''' Attempt self removal during unit teardown. '''
etcdctl = EtcdCtl()
peers = cluster.get_peers()
leader_address = leader_get('leader_address')
unit_name = os.getenv('JUJU_UNIT_NAME').replace('/', '')
members = etcdctl.member_list()
for unit in peers:
cluster_name = unit.replace('/', '')
if cluster_name in members.keys():
log("Unregistering {0}".format(unit))
etcdctl.unregister(members[cluster_name]['unit_id'])
else:
log("Received removal for disconnected member {}".format(unit))
cluster.dismiss()


@when('cluster.departing')
@when_not('leadership.is_leader')
def passive_dismiss_context(cluster):
''' All units undergo the departing phase. This is a no-op unless you
are the leader '''
cluster.dismiss()
# Self Unregistration
etcdctl.unregister(members[unit_name]['unit_id'], leader_address)


@hook('data-storage-attached')
Expand Down
9 changes: 9 additions & 0 deletions tests/10-deploy.py
Expand Up @@ -81,6 +81,15 @@ def test_leader_knows_all_members(self):
self.assertTrue('etcd cluster is unavailable' not in members)
self.assertTrue(len(members) == len(self.etcd))

def test_node_scale_down_members(self):
''' Scale the cluster down and ensure the cluster state is still
healthy '''
# Remove the leader
self.d.remove_unit(self.leader.info['unit_name'])
self.d.sentry.wait()
# re-use the cluster-health test to validate we are still healthy.
self.test_cluster_health()


if __name__ == '__main__':
unittest.main()

0 comments on commit 62cce56

Please sign in to comment.