#85: Resolve issues with cluster scale when removing the leader (fixes …

…kubernetes/kubernetes#43461) This branch removes the erronious logic when tracking the -departed relation states to remove all participating peers. Instead we are now relying on the behavior of the -broken hook to only execute on the unit that is being removed from the cluster. This may not be perfect but works 100% better than the current implementation.
charmed-kubernetes · Mar 24, 2017 · 62cce56 · 62cce56
1 parent 0e0979f
commit 62cce56
Show file tree

Hide file tree

Showing 3 changed files with 28 additions and 30 deletions.
diff --git a/lib/etcdctl.py b/lib/etcdctl.py
@@ -50,16 +50,22 @@ def register(self, cluster_data):
                 reg['cluster'] = line.split('="')[-1].rstrip('"')
         return reg
 
-    def unregister(self, unit_id):
+    def unregister(self, unit_id, leader_address=None):
         ''' Perform self deregistration during unit teardown
 
-        @params cluster_data - a dict of data to fill out the request to push
-        our deregister command to the leader. requires  keys: leader_address,
-        port, etcd_unit_guid
+        @params unit_id - the ID for the unit assigned by etcd. Obtainable from
+        member_list method.
 
-        The unit_id can be obtained from the EtcdDatabag dict
+        @params leader_address - The endpoint to communicate with the leader in
+        the event of self deregistration.
         '''
-        command = "{0} member remove {1}".format(self.ETCDCTL_COMMAND, unit_id)
+
+        if leader_address:
+            cmd = "{0} --endpoint {1} member remove {2}"
+            command = cmd.format(self.ETCDCTL_COMMAND, leader_address, unit_id)
+        else:
+            cmd = "{0} member remove {1}"
+            command = cmd.format(self.ETCDCTL_COMMAND, unit_id)
 
         return self.run(command)
 

diff --git a/reactive/etcd.py b/reactive/etcd.py
@@ -411,32 +411,15 @@ def render_default_user_ssl_exports():
     set_state('etcd.pillowmints')
 
 
-@when('cluster.departing')
-@when('leadership.is_leader')
-def unregister(cluster):
-    ''' The leader will process the departing event and attempt unregistration
-        for the departing unit. If the leader is departing, it will unregister
-        all units prior to termination.
-    '''
+@hook('cluster-relation-broken')
+def perform_self_unregistration(cluster=None):
+    ''' Attempt self removal during unit teardown. '''
     etcdctl = EtcdCtl()
-    peers = cluster.get_peers()
+    leader_address = leader_get('leader_address')
+    unit_name = os.getenv('JUJU_UNIT_NAME').replace('/', '')
     members = etcdctl.member_list()
-    for unit in peers:
-        cluster_name = unit.replace('/', '')
-        if cluster_name in members.keys():
-            log("Unregistering {0}".format(unit))
-            etcdctl.unregister(members[cluster_name]['unit_id'])
-        else:
-            log("Received removal for disconnected member {}".format(unit))
-    cluster.dismiss()
-
-
-@when('cluster.departing')
-@when_not('leadership.is_leader')
-def passive_dismiss_context(cluster):
-    ''' All units undergo the departing phase. This is a no-op unless you
-        are the leader '''
-    cluster.dismiss()
+    # Self Unregistration
+    etcdctl.unregister(members[unit_name]['unit_id'], leader_address)
 
 
 @hook('data-storage-attached')

diff --git a/tests/10-deploy.py b/tests/10-deploy.py
@@ -81,6 +81,15 @@ def test_leader_knows_all_members(self):
             self.assertTrue('etcd cluster is unavailable' not in members)
         self.assertTrue(len(members) == len(self.etcd))
 
+    def test_node_scale_down_members(self):
+        ''' Scale the cluster down and ensure the cluster state is still
+        healthy '''
+        # Remove the leader
+        self.d.remove_unit(self.leader.info['unit_name'])
+        self.d.sentry.wait()
+        # re-use the cluster-health test to validate we are still healthy.
+        self.test_cluster_health()
+
 
 if __name__ == '__main__':
     unittest.main()