Feat: bootstrap sharding
Signed-off-by: Somefive <yd219913@alibaba-inc.com>
Somefive committed Jan 28, 2023
1 parent f733d74 commit b0861c3
Showing 30 changed files with 1,015 additions and 53 deletions.
4 changes: 3 additions & 1 deletion .github/workflows/apiserver-test.yml
@@ -180,7 +180,9 @@ jobs:
make e2e-apiserver-test
- name: Stop kubevela, get profile
run: make end-e2e-core
run: |
make end-e2e-core
CORE_NAME=kubevela-shard sh ./hack/e2e/end_e2e_core.sh
- name: Upload coverage report
uses: codecov/codecov-action@d9f34f8cd5cb3b3eb79b3e4b5dae3a16df499a70
4 changes: 3 additions & 1 deletion .github/workflows/e2e-multicluster-test.yml
@@ -114,7 +114,9 @@ jobs:
make e2e-multicluster-test
- name: Stop kubevela, get profile
run: make end-e2e-core
run: |
make end-e2e-core
CORE_NAME=kubevela-shard sh ./hack/e2e/end_e2e_core.sh
- name: Upload coverage report
uses: codecov/codecov-action@d9f34f8cd5cb3b3eb79b3e4b5dae3a16df499a70
3 changes: 3 additions & 0 deletions charts/vela-core/README.md
@@ -100,6 +100,7 @@ helm install --create-namespace -n vela-system kubevela kubevela/vela-core --wai
| `featureGates.gzipApplicationRevision` | compress apprev using gzip (good) before being stored. This reduces network throughput when dealing with huge apprevs. | `false` |
| `featureGates.zstdApplicationRevision` | compress apprev using zstd (fast and good) before being stored. This reduces network throughput when dealing with huge apprevs. Note that zstd will be prioritized if you enable other compression options. | `true` |
| `featureGates.preDispatchDryRun` | enable dryrun before dispatching resources. Enabling this flag can help prevent unsuccessfully dispatched resources from entering the resourcetracker and improve the user experience of gc, at the cost of increasing network requests. | `true` |
| `featureGates.validateComponentWhenSharding` | enable component validation in the webhook when sharding mode is enabled | `false` |


### MultiCluster parameters
@@ -153,6 +154,8 @@ helm install --create-namespace -n vela-system kubevela kubevela/vela-core --wai
| `authentication.withUser` | Application authentication will impersonate as the request User | `true` |
| `authentication.defaultUser` | Application authentication will impersonate as the User if no user provided in Application | `kubevela:vela-core` |
| `authentication.groupPattern` | Application authentication will impersonate as the request Group that matches the pattern | `kubevela:*` |
| `sharding.enabled` | Enable sharding for the core controller | `false` |
| `sharding.shards` | The shards available for scheduling. If empty, dynamic discovery will be used. | `""` |


## Uninstallation
37 changes: 36 additions & 1 deletion charts/vela-core/templates/kubevela-controller.yaml
@@ -51,7 +51,6 @@ rules:
- apiGroups: ["authorization.k8s.io"]
resources: ["subjectaccessreviews"]
verbs: ["*"]

---

apiVersion: rbac.authorization.k8s.io/v1
@@ -85,6 +84,34 @@ subjects:
namespace: {{ .Release.Namespace }}

{{ end }}


{{ if and .Values.sharding.enabled .Values.authentication.enabled }}
---
apiVersion: rbac.authorization.k8s.io/v1
kind: Role
metadata:
name: {{ include "kubevela.fullname" . }}:shard-scheduler
namespace: {{ .Release.Namespace }}
rules:
- apiGroups: [""]
resources: ["pods"]
verbs: ["get", "list", "watch"]
---
apiVersion: rbac.authorization.k8s.io/v1
kind: RoleBinding
metadata:
name: {{ include "kubevela.fullname" . }}:shard-scheduler
namespace: {{ .Release.Namespace }}
roleRef:
apiGroup: rbac.authorization.k8s.io
kind: Role
name: {{ include "kubevela.fullname" . }}:shard-scheduler
subjects:
- kind: ServiceAccount
name: {{ include "kubevela.serviceAccountName" . }}
{{ end }}

---
# permissions to do leader election.
apiVersion: rbac.authorization.k8s.io/v1
@@ -183,6 +210,9 @@ spec:
metadata:
labels:
{{- include "kubevela.selectorLabels" . | nindent 8 }}
{{ if .Values.sharding.enabled }}
controller.core.oam.dev/shard-id: master
{{ end }}
annotations:
prometheus.io/path: /metrics
prometheus.io/port: "8080"
@@ -282,6 +312,11 @@ spec:
- "--authentication-default-user={{ .Values.authentication.defaultUser }}"
- "--authentication-group-pattern={{ .Values.authentication.groupPattern }}"
{{ end }}
{{ if .Values.sharding.enabled }}
- "--enable-sharding"
- "--schedulable-shards={{ .Values.sharding.shards }}"
- "--feature-gates=ValidateComponentWhenSharding={{- .Values.featureGates.validateComponentWhenSharding | toString -}}"
{{ end }}
image: {{ .Values.imageRegistry }}{{ .Values.image.repository }}:{{ .Values.image.tag }}
imagePullPolicy: {{ quote .Values.image.pullPolicy }}
resources:
8 changes: 8 additions & 0 deletions charts/vela-core/values.yaml
@@ -114,6 +114,7 @@ optimize:
##@param featureGates.gzipApplicationRevision compress apprev using gzip (good) before being stored. This reduces network throughput when dealing with huge apprevs.
##@param featureGates.zstdApplicationRevision compress apprev using zstd (fast and good) before being stored. This reduces network throughput when dealing with huge apprevs. Note that zstd will be prioritized if you enable other compression options.
##@param featureGates.preDispatchDryRun enable dryrun before dispatching resources. Enabling this flag can help prevent unsuccessfully dispatched resources from entering the resourcetracker and improve the user experience of gc, at the cost of increasing network requests.
##@param featureGates.validateComponentWhenSharding enable component validation in the webhook when sharding mode is enabled
##@param
featureGates:
enableLegacyComponentRevision: false
@@ -124,6 +125,7 @@ featureGates:
gzipApplicationRevision: false
zstdApplicationRevision: true
preDispatchDryRun: true
validateComponentWhenSharding: false

## @section MultiCluster parameters

@@ -268,3 +270,9 @@ authentication:
withUser: true
defaultUser: kubevela:vela-core
groupPattern: kubevela:*

## @param sharding.enabled When sharding is enabled, the controller will run in master mode. Refer to https://github.com/kubevela/kubevela/blob/master/design/vela-core/sharding.md for details.
## @param sharding.shards The shards available for scheduling. If empty, dynamic discovery will be used.
sharding:
enabled: false
shards: ""
10 changes: 4 additions & 6 deletions cmd/core/app/options/options.go
@@ -17,19 +17,19 @@ limitations under the License.
package options

import (
"flag"
"strconv"
"time"

ctrlrec "github.com/kubevela/pkg/controller/reconciler"
pkgmulticluster "github.com/kubevela/pkg/multicluster"
utillog "github.com/kubevela/pkg/util/log"
wfTypes "github.com/kubevela/workflow/pkg/types"
utilfeature "k8s.io/apiserver/pkg/util/feature"
cliflag "k8s.io/component-base/cli/flag"
"k8s.io/klog/v2"

standardcontroller "github.com/oam-dev/kubevela/pkg/controller"
commonconfig "github.com/oam-dev/kubevela/pkg/controller/common"
"github.com/oam-dev/kubevela/pkg/controller/sharding"
"github.com/oam-dev/kubevela/pkg/oam"
"github.com/oam-dev/kubevela/pkg/resourcekeeper"

@@ -165,11 +165,9 @@ func (s *CoreOptions) Flags() cliflag.NamedFlagSets {
pkgmulticluster.AddFlags(fss.FlagSet("multicluster"))
ctrlrec.AddFlags(fss.FlagSet("controllerreconciles"))
utilfeature.DefaultMutableFeatureGate.AddFlag(fss.FlagSet("featuregate"))

sharding.AddFlags(fss.FlagSet("sharding"))
kfs := fss.FlagSet("klog")
local := flag.NewFlagSet("klog", flag.ExitOnError)
klog.InitFlags(local)
kfs.AddGoFlagSet(local)
utillog.AddFlags(kfs)

if s.LogDebug {
_ = kfs.Set("v", strconv.Itoa(int(commonconfig.LogDebug)))
59 changes: 47 additions & 12 deletions cmd/core/app/server.go
@@ -34,6 +34,7 @@ import (
"k8s.io/klog/v2/klogr"
ctrl "sigs.k8s.io/controller-runtime"
"sigs.k8s.io/controller-runtime/pkg/healthz"
"sigs.k8s.io/controller-runtime/pkg/manager"
"sigs.k8s.io/controller-runtime/pkg/manager/signals"

"github.com/oam-dev/kubevela/apis/core.oam.dev/v1beta1"
@@ -44,6 +45,8 @@ import (
standardcontroller "github.com/oam-dev/kubevela/pkg/controller"
commonconfig "github.com/oam-dev/kubevela/pkg/controller/common"
oamv1alpha2 "github.com/oam-dev/kubevela/pkg/controller/core.oam.dev/v1alpha2"
"github.com/oam-dev/kubevela/pkg/controller/core.oam.dev/v1alpha2/application"
"github.com/oam-dev/kubevela/pkg/controller/sharding"
"github.com/oam-dev/kubevela/pkg/controller/utils"
"github.com/oam-dev/kubevela/pkg/features"
"github.com/oam-dev/kubevela/pkg/monitor/watcher"
@@ -133,6 +136,7 @@ func run(ctx context.Context, s *options.CoreOptions) error {
}

leaderElectionID := util.GenerateLeaderElectionID(types.KubeVelaName, s.ControllerArgs.IgnoreAppWithoutControllerRequirement)
leaderElectionID += sharding.GetShardIDSuffix()
mgr, err := ctrl.NewManager(restConfig, ctrl.Options{
Scheme: scheme,
MetricsBindAddress: s.MetricsAddr,
@@ -153,6 +157,7 @@ func run(ctx context.Context, s *options.CoreOptions) error {
// controller but also all other controllers like definition controller. Therefore, for
// functionalities like state-keep, they should be invented in other ways.
NewClient: velaclient.DefaultNewControllerClient,
NewCache: sharding.BuildCache(scheme, &v1beta1.Application{}, &v1beta1.ApplicationRevision{}, &v1beta1.ResourceTracker{}),
})
if err != nil {
klog.ErrorS(err, "Unable to create a controller manager")
@@ -184,6 +189,44 @@ func run(ctx context.Context, s *options.CoreOptions) error {
}
s.ControllerArgs.PackageDiscover = pd

if !sharding.EnableSharding {
if err = prepareRun(ctx, mgr, s); err != nil {
return err
}
} else {
if err = prepareRunInShardingMode(ctx, mgr, s); err != nil {
return err
}
}

if err := mgr.Start(ctx); err != nil {
klog.ErrorS(err, "Failed to run manager")
return err
}
if s.LogFilePath != "" {
klog.Flush()
}
klog.Info("Safely stops Program...")
return nil
}

func prepareRunInShardingMode(ctx context.Context, mgr manager.Manager, s *options.CoreOptions) error {
if sharding.IsMaster() {
klog.Infof("controller running in sharding mode, current shard is master")
go sharding.DefaultApplicationScheduler.Get().Start(ctx)
if err := prepareRun(ctx, mgr, s); err != nil {
return err
}
} else {
klog.Infof("controller running in sharding mode, current shard id: %s", sharding.ShardID)
if err := application.Setup(mgr, *s.ControllerArgs); err != nil {
return err
}
}
return nil
}

func prepareRun(ctx context.Context, mgr manager.Manager, s *options.CoreOptions) error {
if s.UseWebhook {
klog.InfoS("Enable webhook", "server port", strconv.Itoa(s.WebhookPort))
oamwebhook.Register(mgr, *s.ControllerArgs)
@@ -193,23 +236,23 @@ func run(ctx context.Context, s *options.CoreOptions) error {
}
}

if err = oamv1alpha2.Setup(mgr, *s.ControllerArgs); err != nil {
if err := oamv1alpha2.Setup(mgr, *s.ControllerArgs); err != nil {
klog.ErrorS(err, "Unable to setup the oam controller")
return err
}

if err = standardcontroller.Setup(mgr, s.DisableCaps, *s.ControllerArgs); err != nil {
if err := standardcontroller.Setup(mgr, s.DisableCaps, *s.ControllerArgs); err != nil {
klog.ErrorS(err, "Unable to setup the vela core controller")
return err
}

if err = multicluster.InitClusterInfo(restConfig); err != nil {
if err := multicluster.InitClusterInfo(mgr.GetConfig()); err != nil {
klog.ErrorS(err, "Init control plane cluster info")
return err
}

klog.Info("Start the vela application monitor")
informer, err := mgr.GetCache().GetInformer(context.Background(), &v1beta1.Application{})
informer, err := mgr.GetCache().GetInformer(ctx, &v1beta1.Application{})
if err != nil {
klog.ErrorS(err, "Unable to get informer for application")
}
@@ -223,14 +266,6 @@ func run(ctx context.Context, s *options.CoreOptions) error {
}
}

if err := mgr.Start(ctx); err != nil {
klog.ErrorS(err, "Failed to run manager")
return err
}
if s.LogFilePath != "" {
klog.Flush()
}
klog.Info("Safely stops Program...")
return nil
}

78 changes: 78 additions & 0 deletions design/vela-core/sharding.md
@@ -0,0 +1,78 @@
# Application Controller Sharding

### Background

As more adopters choose KubeVela to manage thousands of Applications in production, performance becomes an increasingly critical issue.

A typical Kubernetes operator usually runs one controller to manage all related custom resources, and its reconciliation is designed to be relatively lightweight. In KubeVela, supporting customized delivery processes and managing the lifecycles of tens of resources per application makes reconciliation heavier than in typical controllers.

Although multiple replicas of the KubeVela controller can provide high availability, only one replica works at a time to avoid concurrent reconciliation of the same application (which could lead to conflicts). This is enforced by all replicas competing for the same leader-election lock.

So users usually give the KubeVela controller more resources (more CPU, memory, ...) and configure more worker threads and a larger rate limiter to support more applications in the system. This can lead to problems in several cases:
1. There is a limit to how far a single KubeVela controller can grow. If the control-plane cluster consists of many small machines, a single KubeVela controller can only run on one of them instead of spreading out.
2. A failure of the single KubeVela controller blocks the delivery process of all applications. Failures can have various causes, and some common ones (such as OOM or a crash caused by unexpected application input) cannot be recovered by restarting the controller.
3. In multi-tenancy scenarios, fairness cannot be guaranteed if some users have a huge number of applications. They can slow the controller down, and users with only a few applications are affected as well because everyone shares the single KubeVela controller.

Therefore, this KEP proposes dividing a single large KubeVela controller into multiple smaller application controllers.

### Proposal

![image](./vela-core-sharding-arch.png)

When the KubeVela core controller runs with the `--enable-sharding` arg, it runs in sharding mode; the relevant flags are sketched after this list.
- If the `--shard-id` arg is set to `master`, this instance runs in **master** mode.
- If the `--shard-id` arg is set to some other value, such as `shard-0`, this instance runs in **slave** mode.
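
A minimal Go sketch of how these flags could be registered, assuming pflag-style flag sets as used by the core options; the defaults and variable names here are assumptions and the real `pkg/controller/sharding` package may differ.

```go
// Hypothetical sketch of the sharding flags described above; the real
// implementation in pkg/controller/sharding may differ in names and defaults.
package sharding

import "github.com/spf13/pflag"

var (
	// EnableSharding toggles sharding mode (--enable-sharding).
	EnableSharding bool
	// ShardID identifies this instance; "master" means master mode (--shard-id).
	ShardID string
	// SchedulableShards statically lists schedulable shards; empty means
	// dynamic discovery via pod watching (--schedulable-shards).
	SchedulableShards []string
)

// AddFlags registers the sharding flags on the given flag set.
func AddFlags(fs *pflag.FlagSet) {
	fs.BoolVar(&EnableSharding, "enable-sharding", false, "Run the core controller in sharding mode.")
	fs.StringVar(&ShardID, "shard-id", "master", "Shard ID of this instance; 'master' runs in master mode (default assumed here).")
	fs.StringSliceVar(&SchedulableShards, "schedulable-shards", nil, "Shards available for scheduling; empty enables dynamic discovery.")
}
```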

#### master mode

The **master** mode enables all the controllers, such as the ComponentDefinitionController, TraitDefinitionController, ApplicationController, and the webhooks.

The application controller will only handle applications with the label `controller.core.oam.dev/scheduled-shard-id: master`, and only the applications, applicationrevisions and resourcetrackers that carry this label will be watched and cached.

By default, it watches the pods in the namespace it runs in. Pods with the label `app.kubernetes.io/name: vela-core` that also carry the `controller.core.oam.dev/shard-id` label key are selected and their health status is recorded. The selected ready pods are registered as schedulable shards. The mutating webhook automatically assigns a shard-id to unassigned applications when requests come in.
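
The following Go sketch shows how such shard discovery could look with a controller-runtime client; the function name, the one-shot listing, and the exclusion of the master shard are illustrative assumptions, while the real scheduler in this commit watches pods continuously.

```go
// Sketch: list ready vela-core pods in the given namespace and collect their
// shard IDs, following the label contract described above. The real scheduler
// watches pods continuously instead of listing once.
package sharding

import (
	"context"

	corev1 "k8s.io/api/core/v1"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// schedulableShards returns the shard IDs of ready slave pods.
func schedulableShards(ctx context.Context, c client.Client, namespace string) ([]string, error) {
	pods := &corev1.PodList{}
	if err := c.List(ctx, pods,
		client.InNamespace(namespace),
		client.MatchingLabels{"app.kubernetes.io/name": "vela-core"},
		client.HasLabels{"controller.core.oam.dev/shard-id"},
	); err != nil {
		return nil, err
	}
	var shards []string
	for _, pod := range pods.Items {
		id := pod.Labels["controller.core.oam.dev/shard-id"]
		if id == "master" {
			continue // the master shard is not schedulable by default
		}
		for _, cond := range pod.Status.Conditions {
			if cond.Type == corev1.PodReady && cond.Status == corev1.ConditionTrue {
				shards = append(shards, id)
				break
			}
		}
	}
	return shards, nil
}
```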

#### slave mode

The **slave** mode controller only starts the ApplicationController and does not enable others like the webhook or the ComponentDefinitionController. It is dedicated to applications that carry the matching label `controller.core.oam.dev/scheduled-shard-id=<shard-id>`.
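
Below is a minimal sketch of how such a shard-scoped cache could be built with controller-runtime's `cache.BuilderWithOptions` (the API available around this release); the actual `sharding.BuildCache` helper used in this commit may be implemented differently, and `buildShardCache`/`shardID` are illustrative names.

```go
// Sketch of a shard-scoped cache: only objects labeled with this shard's
// scheduled-shard-id are watched and cached. The real sharding.BuildCache
// may be implemented differently.
package sharding

import (
	"k8s.io/apimachinery/pkg/labels"
	"k8s.io/apimachinery/pkg/runtime"
	"sigs.k8s.io/controller-runtime/pkg/cache"
	"sigs.k8s.io/controller-runtime/pkg/client"
)

// buildShardCache returns a NewCacheFunc that restricts the watch and cache
// of the given object types to this shard's label selector.
func buildShardCache(scheme *runtime.Scheme, shardID string, objs ...client.Object) cache.NewCacheFunc {
	sel := labels.SelectorFromSet(labels.Set{
		"controller.core.oam.dev/scheduled-shard-id": shardID,
	})
	selectors := cache.SelectorsByObject{}
	for _, obj := range objs {
		selectors[obj] = cache.ObjectSelector{Label: sel}
	}
	return cache.BuilderWithOptions(cache.Options{
		Scheme:            scheme,
		SelectorsByObject: selectors,
	})
}
```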

### Example

First, install KubeVela with sharding enabled. This deploys the KubeVela core controller in **master** mode.

```shell
helm install kubevela kubevela/vela-core -n vela-system --set sharding.enabled=true
```

Second, deploy the **slave mode** application controllers.

There are different ways to do it.
1. Use the addon for installation: `vela addon enable vela-core-shard-manager nShards=3`. Supported by https://github.com/kubevela/catalog/pull/606
2. Use kubectl to copy the **master** deployment and modify it: `kubectl get deploy kubevela-vela-core -oyaml -n vela-system | sed 's/schedulable-shards=/shard-id=shard-0/g' | sed 's/instance: kubevela/instance: kubevela-shard/g' | sed 's/shard-id: master/shard-id: shard-0/g' | sed 's/name: kubevela/name: kubevela-shard/g' | kubectl apply -f -` This creates a copy of the **master** vela-core and runs a **slave** replica with the shard-id *shard-0*.
3. Just create a new deployment and set the labels to match the requirements mentioned above.

> In case you do not want dynamic discovery of available application controllers, you can specify which shards are schedulable by adding the arg `--schedulable-shards=shard-0,shard-1` to the **master mode** vela-core.

### Future Work

The implemented webhook will only schedule an application when
1. The application is being mutated (created or updated).
2. The application has not been scheduled yet (it has no `controller.core.oam.dev/scheduled-shard-id` label).
3. There is an available controller to assign (the **master mode** controller is not automatically schedulable, in order to offload the burden of handling applications to the **slave mode** controllers).

So it cannot handle the reschedule scenario and cannot automatically schedule applications that existed before sharding was enabled. A sketch of the assignment logic is shown below.
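
For illustration, here is a hypothetical sketch of that assignment decision in Go; the round-robin policy and all names (`scheduleApplication`, `next`) are assumptions, not the actual webhook implementation.

```go
// Hypothetical sketch of the shard assignment made by the mutating webhook;
// the round-robin policy and the names used here are illustrative only.
package sharding

import (
	"sync/atomic"

	"github.com/oam-dev/kubevela/apis/core.oam.dev/v1beta1"
)

const scheduledShardIDLabel = "controller.core.oam.dev/scheduled-shard-id"

var next uint64 // simple round-robin counter

// scheduleApplication assigns a slave shard to the application if it has not
// been scheduled yet and at least one schedulable shard is available.
func scheduleApplication(app *v1beta1.Application, shards []string) {
	if _, scheduled := app.Labels[scheduledShardIDLabel]; scheduled {
		return // already scheduled: keep the existing assignment
	}
	if len(shards) == 0 {
		return // no schedulable shard: leave the application unscheduled
	}
	id := shards[atomic.AddUint64(&next, 1)%uint64(len(shards))]
	if app.Labels == nil {
		app.Labels = map[string]string{}
	}
	app.Labels[scheduledShardIDLabel] = id
}
```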

For the next step, we can support:
1. `vela up <app-name> --shard-id <shard-id>`: use the vela CLI to help users manually reschedule an application. NOTE: rescheduling an application requires resetting not only the label of that application but also the labels of the related applicationrevisions and resourcetrackers.
2. Support a background scheduler or a scheduling script that schedules all unscheduled applications.
3. Support `DisableAutoSchedule` for the case where the user wants to disable automatic scheduling in the mutating webhook.
4. Observability tools (Prometheus, Loki, Grafana) should be able to collect data from all vela-core controllers in sharding mode.

### Extend Usage
1. The sharding package could potentially be moved to `kubevela/pkg` if we later find it also helpful for other controllers like `kubevela/workflow`.
2. It is possible to let different shards use different vela-core versions so that applications of multiple versions are handled by different controllers.
3. It is possible to let different tenants use different vela-core controllers to improve fairness and reduce the blast radius when one controller fails.
4. The original multi-replica high-availability setup of the KubeVela core controller is deferred to each shard's deployment, which means each shard can have multiple replicas (only one replica is active at a time, but multiple shards run in parallel).

### Tech Details
1. Since the caches of Application/ApplicationRevision/ResourceTracker are divided across shards, the memory consumption of each controller is expected to be divided as well. Other resources like `ConfigMap` (used by the Workflow Context) are not divided.
2. The validating webhook needs to read the ApplicationRevision when `publish-version` is set on an application, but since ApplicationRevisions are sharded, the cache of the **master mode** vela-core does not hold them. So a native Kubernetes API request is used to read the ApplicationRevision (a sketch of such a direct read follows this list). This may degrade the performance of the validating webhook, so it is disabled by default when sharding is enabled.
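
A minimal sketch of such a cache-bypassing read, assuming the webhook is given a non-cached `client.Reader` (for example from `mgr.GetAPIReader()`); the function name and wiring are illustrative.

```go
// Sketch: read an ApplicationRevision directly from the API server, bypassing
// the sharded cache of the master-mode controller.
package webhook

import (
	"context"

	"k8s.io/apimachinery/pkg/types"
	"sigs.k8s.io/controller-runtime/pkg/client"

	"github.com/oam-dev/kubevela/apis/core.oam.dev/v1beta1"
)

// getAppRevDirectly fetches an ApplicationRevision with a non-cached reader.
func getAppRevDirectly(ctx context.Context, reader client.Reader, namespace, name string) (*v1beta1.ApplicationRevision, error) {
	rev := &v1beta1.ApplicationRevision{}
	if err := reader.Get(ctx, types.NamespacedName{Namespace: namespace, Name: name}, rev); err != nil {
		return nil, err
	}
	return rev, nil
}
```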
Binary file added design/vela-core/vela-core-sharding-arch.png