diff --git a/.github/workflows/test-build.yml b/.github/workflows/test-build.yml index 77b2a34..7e1c960 100644 --- a/.github/workflows/test-build.yml +++ b/.github/workflows/test-build.yml @@ -29,6 +29,8 @@ jobs: uses: actions/setup-go@v5 with: go-version-file: 'go.mod' + - name: List open ports and processes + run: sudo lsof -i -P -n | grep LISTEN - name: Unit test run: make utest race: @@ -42,4 +44,4 @@ jobs: with: go-version-file: 'go.mod' - name: Test race - run: go test -count=1 -parallel 1 -race ./... + run: go test -count=1 -parallel 1 -race -skip 'TestTracesForSanity/rdt_trace|TestPowerForSanity/power_efficiency' ./... diff --git a/Dockerfile b/Dockerfile index acb15be..a5465d4 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,7 @@ # Copyright (c) 2022 Intel Corporation # SPDX-License-Identifier: Apache-2.0 -FROM golang:1.24.2 AS build +FROM golang:1.24.6 AS build WORKDIR /app diff --git a/Makefile b/Makefile index e20b431..b186b9e 100644 --- a/Makefile +++ b/Makefile @@ -3,8 +3,9 @@ SCALEOUT_PLUGIN=scale_out RMPOD_PLUGIN=rm_pod RDT_PLUGIN=rdt CPU_PLUGIN=cpu_scale +ENERGY_PLUGIN=energy GO_CILINT_CHECKERS=errcheck,goimports,gosec,gosimple,govet,ineffassign,nilerr,revive,staticcheck,unused -DOCKER_IMAGE_VERSION=0.3.0 +DOCKER_IMAGE_VERSION=0.4.0 api: hack/generate_code.sh @@ -32,7 +33,10 @@ build-plugin-rdt: build-plugin-cpu: CGO_ENABLED=0 go build -o bin/plugins/${CPU_PLUGIN} plugins/${CPU_PLUGIN}/cmd/${CPU_PLUGIN}.go -build-plugins: build-plugin-scaleout build-plugin-rmpod build-plugin-rdt build-plugin-cpu +build-plugin-energy: + CGO_ENABLED=0 go build -o bin/plugins/${ENERGY_PLUGIN} plugins/${ENERGY_PLUGIN}/cmd/${ENERGY_PLUGIN}.go + +build-plugins: build-plugin-scaleout build-plugin-rmpod build-plugin-rdt build-plugin-cpu build-plugin-energy controller-images: docker build -t planner:${DOCKER_IMAGE_VERSION} . --no-cache --pull @@ -42,6 +46,7 @@ plugin-images: docker build -t rmpod:${DOCKER_IMAGE_VERSION} -f plugins/rm_pod/Dockerfile . 
--no-cache --pull docker build -t rdt:${DOCKER_IMAGE_VERSION} -f plugins/rdt/Dockerfile . --no-cache --pull docker build -t cpuscale:${DOCKER_IMAGE_VERSION} -f plugins/cpu_scale/Dockerfile . --no-cache --pull + docker build -t energy:${DOCKER_IMAGE_VERSION} -f plugins/energy/Dockerfile . --no-cache --pull all-images: controller-images plugin-images @@ -57,7 +62,8 @@ prepare-build: go mod tidy utest: - go test -count=1 -parallel 1 -v ./... + # Skipping certain trace tests, as they cannot be run safely on public runners. + go test -count=1 -parallel 1 -v -skip 'TestTracesForSanity/rdt_trace|TestPowerForSanity/power_efficiency' ./... test: hack/run_test.sh diff --git a/README.md b/README.md index 9bed2e8..37ea5c6 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,4 @@ - # Intent Driven Orchestration Planner ![planner.png](planner.png) diff --git a/artefacts/deploy/manifest.yaml b/artefacts/deploy/manifest.yaml index 2fd0134..0c1b707 100644 --- a/artefacts/deploy/manifest.yaml +++ b/artefacts/deploy/manifest.yaml @@ -199,7 +199,7 @@ spec: serviceAccountName: planner-service-account containers: - name: planner - image: 127.0.0.1:5000/planner:0.3.0 + image: 127.0.0.1:5000/planner:0.4.0 ports: - containerPort: 33333 imagePullPolicy: Always diff --git a/artefacts/examples/default_profiles.yaml b/artefacts/examples/default_profiles.yaml index dfe3dca..429aac3 100644 --- a/artefacts/examples/default_profiles.yaml +++ b/artefacts/examples/default_profiles.yaml @@ -5,7 +5,7 @@ metadata: name: p50latency spec: type: "latency" - description: "Measures P50 latency in ms over a 30ms time window as reported by Linkerd service mesh." + description: "Measures P50 latency in ms over a 30s time window as reported by Linkerd service mesh." --- apiVersion: "ido.intel.com/v1alpha1" kind: KPIProfile @@ -13,7 +13,7 @@ metadata: name: p95latency spec: type: "latency" - description: "Measures P95 latency in ms over a 30ms time window as reported by Linkerd service mesh." 
+ description: "Measures P95 latency in ms over a 30s time window as reported by Linkerd service mesh." --- apiVersion: "ido.intel.com/v1alpha1" kind: KPIProfile @@ -21,7 +21,7 @@ metadata: name: p99latency spec: type: "latency" - description: "Measures P99 latency in ms over a 30ms time window as reported by Linkerd service mesh." + description: "Measures P99 latency in ms over a 30s time window as reported by Linkerd service mesh." --- apiVersion: "ido.intel.com/v1alpha1" kind: KPIProfile @@ -29,11 +29,13 @@ metadata: name: throughput spec: type: "throughput" - description: "Measures requests per second aggregated over a 30ms time window as reported by Linkerd service mesh." + minimize: False + description: "Measures requests per second aggregated over a 30s time window as reported by Linkerd service mesh." --- apiVersion: "ido.intel.com/v1alpha1" kind: KPIProfile metadata: name: availability spec: - type: "availability" \ No newline at end of file + type: "availability" + minimize: False diff --git a/artefacts/examples/example_deployment.yaml b/artefacts/examples/example_deployment.yaml index f45bb8f..4f2e9cf 100644 --- a/artefacts/examples/example_deployment.yaml +++ b/artefacts/examples/example_deployment.yaml @@ -16,7 +16,7 @@ spec: spec: containers: - name: sample-function - image: testfunction/rust_function:0.1 + image: testfunction/rust_function:0.2 ports: - containerPort: 8080 env: diff --git a/artefacts/intents_crds_v1alpha1.yaml b/artefacts/intents_crds_v1alpha1.yaml index 332a31a..a4a8e02 100644 --- a/artefacts/intents_crds_v1alpha1.yaml +++ b/artefacts/intents_crds_v1alpha1.yaml @@ -20,7 +20,7 @@ spec: type: object properties: kind: - description: 'Kind of the owner.' + description: 'Kind of the owner (defaults to Deployment kind).' type: string enum: - Deployment @@ -33,11 +33,15 @@ spec: - name priority: type: number - description: "Priority for a set of PODs" + description: "Priority for a set of PODs (defaults to 0.01)." 
format: float minimum: 0.01 # prevents any div 0! maximum: 1.0 default: 0.01 + active: + type: boolean + description: "Indicates if the planner should actively managed this intent (defaults to true)." + default: true objectives: type: array description: "Objectives for a set of PODs." @@ -54,6 +58,12 @@ spec: measuredBy: type: string description: "Defines what kind of an objective this is. Also defines if the objective is an upper or lower bound objective." + tolerance: + type: number + description: "Indicates a tolerance as percentage in context of the specified target value for the objective (defaults to 0.0) - e.g. 0.1 & target 10ms ==> 11ms." + format: float + minimum: 0.0 + default: 0.0 required: - name - value @@ -108,13 +118,16 @@ spec: spec: type: object properties: - # TODO: add weight. query: type: string description: "This is an optional parameter - if defined, the user needs to provide a query string defining how to capture the objective's KPI. Optional parameters - in accordance with the provide documentation - can be detailed under props." description: type: string description: "Ideally includes a description on what is measured by the query - including e.g. information on units etc." + minimize: + type: boolean + description: "Indicates whether the planner should try to minimize this or not (defaults to true)." + default: true type: type: string description: "Defines the type of the KPI." 
diff --git a/cmd/main.go b/cmd/main.go index ca50dd5..c6dc8a8 100644 --- a/cmd/main.go +++ b/cmd/main.go @@ -2,6 +2,8 @@ package main import ( "flag" + "io" + "os" "time" "github.com/intel/intent-driven-orchestration/pkg/controller" @@ -38,7 +40,32 @@ func main() { } cfg, err := common.ParseConfig(config) if err != nil { - klog.Fatalf("Error loading planner config: %s", err) + klog.Fatalf("Error loading planner config: %v", err) + } + + // set logFile + if cfg.Generic.LogFile != "" { + err := flag.Set("logtostderr", "false") + if err != nil { + klog.Fatalf("Error setting flag logtostderr: %v", err) + } + err = flag.Set("alsologtostderr", "true") + if err != nil { + klog.Fatalf("Error setting flag alsologtostderr: %v", err) + } + + logFile, err := os.OpenFile(cfg.Generic.LogFile, os.O_CREATE|os.O_WRONLY|os.O_APPEND, 0600) + if err != nil { + klog.Fatalf("Failed to open log file: %v", err) + } + defer logFile.Close() + + multiWriter := io.MultiWriter(os.Stdout, logFile) + klog.SetOutput(multiWriter) + defer func() { + klog.Flush() + }() + klog.Infof("Successfuly added to klog output the log file: %s", cfg.Generic.LogFile) } // K8s genClient setup @@ -65,7 +92,7 @@ func main() { planner := astar.NewAPlanner(actuatorList, cfg) defer planner.Stop() - // This is main controller. + // This is the main controller. tracer := controller.NewMongoTracer(cfg.Generic.MongoEndpoint) c := controller.NewController(cfg, tracer, k8sClient, podInformerFactory.Core().V1().Pods()) c.SetPlanner(planner) diff --git a/docs/actuators.md b/docs/actuators.md index 5fffb80..17e9b85 100644 --- a/docs/actuators.md +++ b/docs/actuators.md @@ -68,6 +68,10 @@ functions in the next section to better understand how the system selects what t Furthermore, the implementation of ***NextState()*** can support the opportunistic planning capabilities, by adding new states, that although they do not satisfy the desired still at least move the system in the right direction. 
+Note that the parameters of the actions are defined by **interface{}**. Ideally a map is used to represent the +parameters. For example, they can be represented as a _map[string]int64_ or _map[string]string_. Other +types of values in the map will be cast to string to support the gRPC plugin mechanism. + #### Utility/Cost functions Utilities are used to steer the planner. Planners will deem an action to be favorable if the actuator returns a low diff --git a/docs/fig/intents_objectives_kpis.png b/docs/fig/intents_objectives_kpis.png index a8233ae..fedfd6d 100644 Binary files a/docs/fig/intents_objectives_kpis.png and b/docs/fig/intents_objectives_kpis.png differ diff --git a/docs/fig/intents_objectives_kpis.puml b/docs/fig/intents_objectives_kpis.puml index 67779a9..444571b 100644 --- a/docs/fig/intents_objectives_kpis.puml +++ b/docs/fig/intents_objectives_kpis.puml @@ -14,11 +14,13 @@ hide class circle class Intent { targetKey targetKind - Priority + priority + active } class Objective { name value + tolerance } enum KPIType { latency @@ -30,6 +32,7 @@ class KPIProfile { query: string endpoint: address external: bool + minimize: bool } Intent "1" *-right- "1..n" Objective: objectives Objective "0..*" -- "1" KPIProfile: measuredBy diff --git a/docs/getting_started.md b/docs/getting_started.md index 5ee2d9d..7564636 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -117,6 +117,7 @@ sections for each of the [framework's major components](framework.md) as well as | Property | Description | |----------------|-----------------------------------------------------------------------------| | mongo_endpoint | URI for the Mongo database - representing the knowledge base of the system. | +| log_file | (Optional) Path to a log file; when set, klog output is also written to this file. | ### Controller @@ -185,21 +186,22 @@ Each actuator will have its own configuration. 
### cpu scale actuator -| Property | Description | -|------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| interpreter | Path to a python interpreter. | -| analytics_script | Path to the analytics python script used to determine the scaling model. | -| cpu_max | Maximum CPU resource units (in millis) that the actuator will allow. | -| cpu_rounding | Multiple of 10 defining how to round up CPU resource units. | -| cpu_safeguard_factor | Define the factor the actuator will use to stay below the targeted objective. | -| look_back | Time in minutes defining how old the ML model can be. | -| max_proactive_cpu | Maximum CPU resource units (in millis) that the actuator will allow when proactively scaling. If set to 0, proactive planning is disabled. A fraction of this value is used for proactive scale ups/downs. | -| proactive_latency_percentage | Float defining the potential percentage change in latency by scaling the resources. | -| endpoint | Name of the endpoint to use for registering this plugin. | -| port | Port this actuator should listen on. | -| mongo_endpoint | URI for the Mongo database - representing the knowledge base of the system. | -| plugin_manager_endpoint | String defining the plugin manager's endpoint to which actuators can register themselves. | -| plugin_manager_port | Port number of the plugin manager's endpoint to which actuators can register themselves. | +| Property | Description | +|------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| interpreter | Path to a python interpreter. | +| analytics_script | Path to the analytics python script used to determine the scaling model. 
| +| cpu_max | Maximum CPU resource units (in millis) that the actuator will allow. | +| cpu_rounding | Multiple of 10 defining how to round up CPU resource units. | +| cpu_safeguard_factor | Define the factor the actuator will use to stay below the targeted objective. | +| boostFactor | Defines the multiplication factor for calculating resource limits from requests. If set to 1.0 PODs will be in a Guaranteed QoS, smaller or larger values lead to a BestEffort or Burstable QoS accordingly. | +| look_back | Time in minutes defining how old the ML model can be. | +| max_proactive_cpu | Maximum CPU resource units (in millis) that the actuator will allow when proactively scaling. If set to 0, proactive planning is disabled. A fraction of this value is used for proactive scale ups/downs. | +| proactive_latency_percentage | Float defining the potential percentage change in latency by scaling the resources. | +| endpoint | Name of the endpoint to use for registering this plugin. | +| port | Port this actuator should listen on. | +| mongo_endpoint | URI for the Mongo database - representing the knowledge base of the system. | +| plugin_manager_endpoint | String defining the plugin manager's endpoint to which actuators can register themselves. | +| plugin_manager_port | Port number of the plugin manager's endpoint to which actuators can register themselves. | ### RDT actuator diff --git a/docs/planner_logs.md b/docs/planner_logs.md new file mode 100644 index 0000000..bc07501 --- /dev/null +++ b/docs/planner_logs.md @@ -0,0 +1,200 @@ +# Observability for Planner Logs + +This document outlines how to set up the IDO planner config, logging utilities and observability tools in order to +collect, export and query logs. The logging utilities (Fluent-bit and Logrotate) and the observability tools +(OpenTelemetry Collector, Loki, and Grafana) used here are provided as examples, and the same principles should apply +to any other similar tools. 
**Please note that we do not maintain or support these specific tools**. + +## 1. IDO Planner + +This section explains how to set up the [IDO framework](framework.md) to save logs to a file (in addition to the +standard output), along with setting up logging utilities that handle log forwarding and log rotation, respectively. + +### 1.1. Log file configuration +To have the IDO planner save its logs to a file, add the `log_file` attribute to the configuration file as shown +below: + +```json + ... + "generic": { + "log_file": "" + } + ... +``` + +Key points to consider when configuring the log file: +* **Empty String**: If the value of the `log_file` attribute is an empty string, the planner will treat it as if the log + file is not set at all. +* **Incorrect Path**: If the specified log file cannot be opened, the planner will log a fatal error and exit. +* **Path Types**: Both relative and absolute paths are accepted. +* **File Creation**: If the log file path is correct but the log file does not exist, it will be created automatically. +* **File Append**: If the log file already exists, new log entries will be appended to it rather than overwriting the + existing file. + +### 1.2. Log forwarder +The log forwarder is responsible for collecting log data from the `log_file` produced by the IDO planner and forwarding +it to a centralized logging system. Below is a sample configuration for [Fluent-bit](https://docs.fluentbit.io/manual) forwarding logs to an +OpenTelemetry endpoint: + +```ini +[SERVICE] + Flush 1 + Log_Level info + Parsers_File parsers.conf +[INPUT] + Name tail + Path + Parser docker + Tag kube.* + Refresh_Interval 5 + Skip_Long_Lines On + DB /var/log/flb_kube.db + DB.Sync Normal +[OUTPUT] + Name opentelemetry + Match * + Host + Port 4318 + Logs_uri /v1/logs +parsers.conf: | +[PARSER] + Name docker + Format json + Time_Key time + Time_Format %Y-%m-%dT%H:%M:%S.%L +``` + +### 1.3. 
Log rotation +Log rotation is responsible for managing the size and lifecycle of the IDO planner's log file, ensuring it does not +consume excessive disk space. Below is a sample configuration for [Logrotate](https://github.com/blacklabelops/logrotate), derived from the +default [kubelet log rotate configuration](https://kubernetes.io/docs/concepts/cluster-administration/logging/#log-rotation): + +```ini + { + su root root + size 10M + rotate 5 + compress + copytruncate + missingok + notifempty + } +``` + +### 1.4. Deployment considerations +To ensure all the components work correctly together: +* The `log_file` path should be consistent across the IDO planner configuration and the utilities for log forwarding + and log rotation. +* In a Kubernetes deployment: + * Consider creating a shared volume to store the log file, allowing it to be mounted and accessed by both the + IDO planner and the log utility containers. + * The utilities for log forwarding and log rotation can be deployed as sidecars within the same IDO planner Pod. + +## 2. Observability Collector +The observability collector is responsible for receiving the logs from the log forwarder and preparing them for +ingestion into a log aggregator. Below is a sample configuration for the [OpenTelemetry Collector](https://opentelemetry.io/docs/collector/) ingesting logs +to the log aggregator [Loki](https://grafana.com/docs/loki/latest/send-data/otel/): + +```yaml +receivers: + otlp: + protocols: + http: + endpoint: 0.0.0.0:4318 +processors: + batch: +exporters: + otlphttp: + endpoint: :3100/otlp + tls: + insecure: true +service: + pipelines: + logs: + receivers: [otlp] + processors: [batch] + exporters: [otlphttp] +``` + +## 3. Log Aggregator +The log aggregator is responsible for collecting, storing, and querying log entries. 
Below is a sample configuration of +the log aggregator [Loki](https://grafana.com/docs/loki/latest/): + +```yaml +auth_enabled: false +limits_config: + allow_structured_metadata: true + volume_enabled: true + reject_old_samples: false +server: + http_listen_port: 3100 +common: + instance_addr: 0.0.0.0 + ring: + kvstore: + store: inmemory + replication_factor: 1 + path_prefix: /tmp/loki +schema_config: + configs: + - from: 2020-05-15 + store: tsdb + object_store: filesystem + schema: v13 + index: + prefix: index_ + period: 24h +storage_config: + tsdb_shipper: + active_index_directory: /tmp/loki/index + cache_location: /tmp/loki/index_cache + filesystem: + directory: /tmp/loki/chunks +pattern_ingester: + enabled: true +``` + +## 4. Log Queries +Log queries for the IDO planner can be efficiently handled using a log aggregator. For instance, with Loki, it is +possible to perform log queries through HTTP requests or a dashboard. + +### 4.1. HTTP Requests +Loki exposes [HTTP endpoints](https://grafana.com/docs/loki/latest/reference/loki-http-api/#query-endpoints) to query log-related data. For more information on the query logic and syntax, refer +to the [Loki log queries documentation](https://grafana.com/docs/loki/latest/query/log_queries/). Some example queries are shown below: + +* Get all labels in log entries: +```sh +curl -G "http://:3100/loki/api/v1/labels" +``` +* Get logs filtered using a label, within a time interval: +```sh +curl -G ":3100/loki/api/v1/query" --data-urlencode 'query={