From 6495f80f89e0618b06315e00f9cb6aa4520be406 Mon Sep 17 00:00:00 2001 From: RainbowMango Date: Tue, 16 Aug 2022 21:13:48 +0800 Subject: [PATCH] enable node pod list only when needed for better performance Signed-off-by: RainbowMango --- cmd/agent/app/agent.go | 1 + cmd/agent/app/options/options.go | 4 ++ cmd/controller-manager/app/options/options.go | 5 +- pkg/controllers/context/context.go | 5 +- .../status/cluster_status_controller.go | 48 ++++++++++--------- 5 files changed, 39 insertions(+), 24 deletions(-) diff --git a/cmd/agent/app/agent.go b/cmd/agent/app/agent.go index b875c6bd8504..40b54ed38f4f 100644 --- a/cmd/agent/app/agent.go +++ b/cmd/agent/app/agent.go @@ -225,6 +225,7 @@ func setupControllers(mgr controllerruntime.Manager, opts *options.Options, stop ClusterAPIBurst: opts.ClusterAPIBurst, ConcurrentWorkSyncs: opts.ConcurrentWorkSyncs, RateLimiterOptions: opts.RateLimiterOpts, + EnableClusterResourceModeling: opts.EnableClusterResourceModeling, }, StopChan: stopChan, ResourceInterpreter: resourceInterpreter, diff --git a/cmd/agent/app/options/options.go b/cmd/agent/app/options/options.go index bcee5cefd7da..b9ffb891ae57 100644 --- a/cmd/agent/app/options/options.go +++ b/cmd/agent/app/options/options.go @@ -108,6 +108,10 @@ type Options struct { // ClusterRegion represents the region of the cluster locate in. ClusterRegion string + // EnableClusterResourceModeling indicates whether to enable cluster resource modeling. + // The resource modeling might be used by the scheduler to make scheduling decisions + // in the scenario of dynamic replica assignment based on cluster free resources. + // Disable it for better performance if it does not fit your use case. 
EnableClusterResourceModeling bool } diff --git a/cmd/controller-manager/app/options/options.go b/cmd/controller-manager/app/options/options.go index acd9aab518db..a29b218a4ba0 100644 --- a/cmd/controller-manager/app/options/options.go +++ b/cmd/controller-manager/app/options/options.go @@ -120,7 +120,10 @@ type Options struct { RateLimiterOpts ratelimiterflag.Options ProfileOpts profileflag.Options - + // EnableClusterResourceModeling indicates whether to enable cluster resource modeling. + // The resource modeling might be used by the scheduler to make scheduling decisions + // in the scenario of dynamic replica assignment based on cluster free resources. + // Disable it for better performance if it does not fit your use case. EnableClusterResourceModeling bool } diff --git a/pkg/controllers/context/context.go b/pkg/controllers/context/context.go index 90d2e83fff67..52448e0c1dbb 100644 --- a/pkg/controllers/context/context.go +++ b/pkg/controllers/context/context.go @@ -64,7 +64,10 @@ type Options struct { // GracefulEvictionTimeout is the timeout period waiting for the grace-eviction-controller performs the final // removal since the workload(resource) has been moved to the graceful eviction tasks. GracefulEvictionTimeout metav1.Duration - + // EnableClusterResourceModeling indicates whether to enable cluster resource modeling. + // The resource modeling might be used by the scheduler to make scheduling decisions + // in the scenario of dynamic replica assignment based on cluster free resources. + // Disable it for better performance if it does not fit your use case. 
EnableClusterResourceModeling bool } diff --git a/pkg/controllers/status/cluster_status_controller.go b/pkg/controllers/status/cluster_status_controller.go index b6395f450bd8..7059b487c596 100644 --- a/pkg/controllers/status/cluster_status_controller.go +++ b/pkg/controllers/status/cluster_status_controller.go @@ -92,6 +92,10 @@ type ClusterStatusController struct { ClusterCacheSyncTimeout metav1.Duration RateLimiterOptions ratelimiterflag.Options + // EnableClusterResourceModeling indicates whether to enable cluster resource modeling. + // The resource modeling might be used by the scheduler to make scheduling decisions + // in the scenario of dynamic replica assignment based on cluster free resources. + // Disable it for better performance if it does not fit your use case. EnableClusterResourceModeling bool } @@ -172,16 +176,7 @@ func (c *ClusterStatusController) syncClusterStatus(cluster *clusterv1alpha1.Clu } // skip collecting cluster status if not ready - if online && healthy && readyCondition.Status == metav1.ConditionTrue && c.EnableClusterResourceModeling { - // get or create informer for pods and nodes in member cluster - clusterInformerManager, err := c.buildInformerForCluster(clusterClient) - if err != nil { - klog.Errorf("Failed to get or create informer for Cluster %s. Error: %v.", cluster.GetName(), err) - // in large-scale clusters, the timeout may occur. - // if clusterInformerManager fails to be built, should be returned, otherwise, it may cause a nil pointer - return controllerruntime.Result{Requeue: true}, err - } - + if online && healthy && readyCondition.Status == metav1.ConditionTrue { if cluster.Spec.SyncMode == clusterv1alpha1.Pull { // init the lease controller for pull mode clusters c.initLeaseController(cluster) @@ -191,6 +186,7 @@ func (c *ClusterStatusController) syncClusterStatus(cluster *clusterv1alpha1.Clu if err != nil { klog.Errorf("Failed to get Kubernetes version for Cluster %s. 
Error: %v.", cluster.GetName(), err) } + currentClusterStatus.KubernetesVersion = clusterVersion // get the list of APIs installed in the member cluster apiEnables, err := getAPIEnablements(clusterClient) @@ -199,21 +195,29 @@ func (c *ClusterStatusController) syncClusterStatus(cluster *clusterv1alpha1.Clu } else if err != nil { klog.Warningf("Maybe get partial(%d) APIs installed in Cluster %s. Error: %v.", len(apiEnables), cluster.GetName(), err) } + currentClusterStatus.APIEnablements = apiEnables - nodes, err := listNodes(clusterInformerManager) - if err != nil { - klog.Errorf("Failed to list nodes for Cluster %s. Error: %v.", cluster.GetName(), err) - } + if c.EnableClusterResourceModeling { + // get or create informer for pods and nodes in member cluster + clusterInformerManager, err := c.buildInformerForCluster(clusterClient) + if err != nil { + klog.Errorf("Failed to get or create informer for Cluster %s. Error: %v.", cluster.GetName(), err) + // in large-scale clusters, the timeout may occur. + // if clusterInformerManager fails to be built, should be returned, otherwise, it may cause a nil pointer + return controllerruntime.Result{Requeue: true}, err + } + nodes, err := listNodes(clusterInformerManager) + if err != nil { + klog.Errorf("Failed to list nodes for Cluster %s. Error: %v.", cluster.GetName(), err) + } - pods, err := listPods(clusterInformerManager) - if err != nil { - klog.Errorf("Failed to list pods for Cluster %s. Error: %v.", cluster.GetName(), err) + pods, err := listPods(clusterInformerManager) + if err != nil { + klog.Errorf("Failed to list pods for Cluster %s. 
Error: %v.", cluster.GetName(), err) + } + currentClusterStatus.NodeSummary = getNodeSummary(nodes) + currentClusterStatus.ResourceSummary = getResourceSummary(nodes, pods) } - - currentClusterStatus.KubernetesVersion = clusterVersion - currentClusterStatus.APIEnablements = apiEnables - currentClusterStatus.NodeSummary = getNodeSummary(nodes) - currentClusterStatus.ResourceSummary = getResourceSummary(nodes, pods) } setTransitionTime(currentClusterStatus.Conditions, readyCondition)