From 1e2274bb4f54c66c736aa7afd32e61ed07868a53 Mon Sep 17 00:00:00 2001 From: Ukri Niemimuukko Date: Thu, 8 Jun 2023 14:46:09 +0300 Subject: [PATCH] version updates and lint fixes Tighter linter settings, version updates and related fixes. Signed-off-by: Ukri Niemimuukko --- gpu-aware-scheduling/.golangci.yml | 15 +- gpu-aware-scheduling/Makefile | 4 +- .../cmd/gas-scheduler-extender/main.go | 8 +- gpu-aware-scheduling/go.mod | 45 +- gpu-aware-scheduling/go.sum | 64 +++ .../pkg/gpuscheduler/cache_api.go | 3 +- .../pkg/gpuscheduler/node_resource_cache.go | 298 +++++++------ .../gpuscheduler/node_resource_cache_test.go | 198 ++++----- .../pkg/gpuscheduler/resource_map_test.go | 48 +-- .../pkg/gpuscheduler/scheduler.go | 392 +++++++++++------- .../pkg/gpuscheduler/scheduler_test.go | 127 +++--- .../pkg/gpuscheduler/utils.go | 64 +-- .../pkg/gpuscheduler/utils_test.go | 8 +- 13 files changed, 767 insertions(+), 507 deletions(-) diff --git a/gpu-aware-scheduling/.golangci.yml b/gpu-aware-scheduling/.golangci.yml index 093b5ff8..ab9cba7c 100644 --- a/gpu-aware-scheduling/.golangci.yml +++ b/gpu-aware-scheduling/.golangci.yml @@ -7,18 +7,13 @@ linters: enable-all: true disable: - paralleltest - - gomoddirectives - exhaustivestruct - - varnamelen - - gofumpt - - nonamedreturns - - exhaustruct linters-settings: gofmt: simplify: true gofumpt: - lang-version: "1.18" + lang-version: "1.19" golint: min-confidence: 0.9 govet: @@ -26,13 +21,13 @@ linters-settings: enable: - "fieldalignment" gocyclo: - min-complexity: 15 + min-complexity: 10 gocognit: min-complexity: 31 funlen: lines: 70 cyclop: - max-complexity: 12 + max-complexity: 10 issues: exclude-rules: @@ -45,3 +40,7 @@ issues: - goimports - gofmt - unparam + - exhaustivestruct + - exhaustruct + - gocyclo + - cyclop diff --git a/gpu-aware-scheduling/Makefile b/gpu-aware-scheduling/Makefile index 5c2f708e..27d1bd13 100644 --- a/gpu-aware-scheduling/Makefile +++ b/gpu-aware-scheduling/Makefile @@ -2,7 +2,7 @@ ifneq ($(TAG),) IMAGE_TAG=:$(TAG) endif -GOLICENSES_VERSION?=v1.5.0 +GOLICENSES_VERSION?=v1.6.0 BUILD_OUTPUT_DIR?=./bin ifneq ("$(wildcard licenses/)","") @@ -40,7 +40,7 @@ image: -f deploy/images/Dockerfile_gpu-extender ../ -t $(IMAGE_PATH)gpu-extender$(IMAGE_TAG) release-image: clean - docker build --build-arg GOLICENSES_VERSION=$(GOLICENSES_IN_GO_MOD) -f deploy/images/Dockerfile_gpu-extender ../ \ + docker build --build-arg GOLICENSES_VERSION=$(GOLICENSES_VERSION) -f deploy/images/Dockerfile_gpu-extender ../ \ -t $(IMAGE_PATH)gpu-extender$(IMAGE_TAG) format: diff --git a/gpu-aware-scheduling/cmd/gas-scheduler-extender/main.go b/gpu-aware-scheduling/cmd/gas-scheduler-extender/main.go index b9315dfa..95a28470 100644 --- a/gpu-aware-scheduling/cmd/gas-scheduler-extender/main.go +++ b/gpu-aware-scheduling/cmd/gas-scheduler-extender/main.go @@ -23,7 +23,7 @@ var ( ) const ( - l1 = klog.Level(1) + logL1 = klog.Level(1) defaultQPS = 5 defaultBurst = 10 maxQPSandBurst = 1000 @@ -44,12 +44,13 @@ func main() { flag.BoolVar(&enableAllowlist, "enableAllowlist", false, "enable allowed GPUs annotation (csv list of names)") flag.BoolVar(&enableDenylist, "enableDenylist", false, "enable denied GPUs annotation (csv list of names)") flag.StringVar(&balancedRes, "balancedResource", "", "enable resource balacing within a node") - flag.UintVar(&burst, "burst", defaultBurst, fmt.Sprintf("burst value used with kube client (limited to %d)", maxQPSandBurst)) + flag.UintVar(&burst, "burst", defaultBurst, fmt.Sprintf("burst value used with kube client (limited to %d)", + maxQPSandBurst)) flag.UintVar(&qps, "qps", defaultQPS, fmt.Sprintf("qps value used with kube client (limited to %d)", maxQPSandBurst)) klog.InitFlags(nil) flag.Parse() - klog.V(l1).Infof("%s built on %s with go %s", version, buildDate, goVersion) + klog.V(logL1).Infof("%s built on %s with go %s", version, buildDate, goVersion) for _, ptr := range []*uint{&qps, &burst} { if *ptr > maxQPSandBurst { @@ -59,7 +60,6 @@ func main() { } kubeClient, _, err := extender.GetKubeClientExt(kubeConfig, int(burst), float32(qps)) - if err != nil { klog.Error("couldn't get kube client, cannot continue: ", err.Error()) os.Exit(1) diff --git a/gpu-aware-scheduling/go.mod b/gpu-aware-scheduling/go.mod index e43b36ba..97a091b6 100644 --- a/gpu-aware-scheduling/go.mod +++ b/gpu-aware-scheduling/go.mod @@ -5,29 +5,30 @@ go 1.19 require ( github.com/intel/platform-aware-scheduling/extender v0.5.0 github.com/pkg/errors v0.9.1 - github.com/smartystreets/goconvey v1.7.2 - github.com/stretchr/testify v1.8.1 - k8s.io/api v0.26.0 - k8s.io/apimachinery v0.26.0 - k8s.io/client-go v0.26.0 - k8s.io/klog/v2 v2.80.1 + github.com/smartystreets/goconvey v1.8.0 + github.com/stretchr/testify v1.8.3 + k8s.io/api v0.27.2 + k8s.io/apimachinery v0.27.2 + k8s.io/client-go v0.27.2 + k8s.io/klog/v2 v2.100.1 ) require ( github.com/davecgh/go-spew v1.1.1 // indirect - github.com/emicklei/go-restful/v3 v3.10.1 // indirect + github.com/emicklei/go-restful/v3 v3.10.2 // indirect github.com/evanphx/json-patch v5.6.0+incompatible // indirect - github.com/go-logr/logr v1.2.3 // indirect - github.com/go-openapi/jsonpointer v0.19.5 // indirect - github.com/go-openapi/jsonreference v0.20.0 // indirect + github.com/go-logr/logr v1.2.4 // indirect + github.com/go-openapi/jsonpointer v0.19.6 // indirect + github.com/go-openapi/jsonreference v0.20.2 // indirect github.com/go-openapi/swag v0.22.3 // indirect github.com/gogo/protobuf v1.3.2 // indirect - github.com/golang/protobuf v1.5.2 // indirect + github.com/golang/protobuf v1.5.3 // indirect github.com/google/gnostic v0.6.9 // indirect github.com/google/go-cmp v0.5.9 // indirect github.com/google/gofuzz v1.2.0 // indirect + github.com/google/uuid v1.3.0 // indirect github.com/gopherjs/gopherjs v1.17.2 // indirect - github.com/imdario/mergo v0.3.13 // indirect + github.com/imdario/mergo v0.3.15 // indirect github.com/josharian/intern v1.0.0 // indirect github.com/json-iterator/go v1.1.12 // indirect github.com/jtolds/gls v4.20.0+incompatible // indirect @@ -36,22 +37,24 @@ require ( github.com/modern-go/reflect2 v1.0.2 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/pmezard/go-difflib v1.0.0 // indirect - github.com/smartystreets/assertions v1.13.0 // indirect + github.com/smartystreets/assertions v1.13.1 // indirect github.com/spf13/pflag v1.0.5 // indirect github.com/stretchr/objx v0.5.0 // indirect - golang.org/x/net v0.7.0 // indirect - golang.org/x/oauth2 v0.3.0 // indirect - golang.org/x/sys v0.5.0 // indirect - golang.org/x/term v0.5.0 // indirect - golang.org/x/text v0.7.0 // indirect + golang.org/x/mod v0.10.0 // indirect + golang.org/x/net v0.10.0 // indirect + golang.org/x/oauth2 v0.8.0 // indirect + golang.org/x/sys v0.8.0 // indirect + golang.org/x/term v0.8.0 // indirect + golang.org/x/text v0.9.0 // indirect golang.org/x/time v0.3.0 // indirect + golang.org/x/tools v0.9.1 // indirect google.golang.org/appengine v1.6.7 // indirect - google.golang.org/protobuf v1.28.1 // indirect + google.golang.org/protobuf v1.30.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect - k8s.io/kube-openapi v0.0.0-20221207184640-f3cff1453715 // indirect - k8s.io/utils v0.0.0-20221128185143-99ec85e7a448 // indirect + k8s.io/kube-openapi v0.0.0-20230525220651-2546d827e515 // indirect + k8s.io/utils v0.0.0-20230505201702-9f6742963106 // indirect sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect sigs.k8s.io/structured-merge-diff/v4 v4.2.3 // indirect sigs.k8s.io/yaml v1.3.0 // indirect diff --git a/gpu-aware-scheduling/go.sum b/gpu-aware-scheduling/go.sum index c3ec6dff..eb15cbf2 100644 --- a/gpu-aware-scheduling/go.sum +++ b/gpu-aware-scheduling/go.sum @@ -10,12 +10,15 @@ github.com/client9/misspell v0.3.4/go.mod h1:qj6jICC3Q7zFZvVWo7KLAzC3yx5G7kyvSDk github.com/cncf/udpa/go v0.0.0-20191209042840-269d4d468f6f/go.mod h1:M8M6+tZqaGXZJjfX53e64911xZQV5JYwmTeXPW+k8Sc= github.com/cncf/udpa/go v0.0.0-20201120205902-5459f2c99403/go.mod h1:WmhPx2Nbnhtbo57+VJT5O0JRkEi1Wbu0z5j0R8u5Hbk= github.com/cncf/xds/go v0.0.0-20210312221358-fbca930ec8ed/go.mod h1:eXthEFrGJvWHgFFCl3hGmgk+/aYT6PnTQLykKQRLhEs= +github.com/creack/pty v1.1.9/go.mod h1:oKZEueFk5CKHvIhNR5MUki03XCEU+Q6VDXinZuGJ33E= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/docopt/docopt-go v0.0.0-20180111231733-ee0de3bc6815/go.mod h1:WwZ+bS3ebgob9U8Nd0kOddGdZWjyMGR8Wziv+TBNwSE= github.com/emicklei/go-restful/v3 v3.10.1 h1:rc42Y5YTp7Am7CS630D7JmhRjq4UlEUuEKfrDac4bSQ= github.com/emicklei/go-restful/v3 v3.10.1/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= +github.com/emicklei/go-restful/v3 v3.10.2 h1:hIovbnmBTLjHXkqEBUz3HGpXZdM7ZrE9fJIZIqlJLqE= +github.com/emicklei/go-restful/v3 v3.10.2/go.mod h1:6n3XBCmQQb25CM2LCACGz8ukIrRry+4bhvbpWn3mrbc= github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98= @@ -29,11 +32,17 @@ github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeME github.com/go-logr/logr v1.2.0/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= github.com/go-logr/logr v1.2.3 h1:2DntVwHkVopvECVRSlL5PSo9eG+cAkDCuckLubN+rq0= github.com/go-logr/logr v1.2.3/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= +github.com/go-logr/logr v1.2.4 h1:g01GSCwiDw2xSZfjJ2/T9M+S6pFdcNtFYsp+Y43HYDQ= +github.com/go-logr/logr v1.2.4/go.mod h1:jdQByPbusPIv2/zmleS9BjJVeZ6kBagPoEUsqbVz/1A= github.com/go-openapi/jsonpointer v0.19.3/go.mod h1:Pl9vOtqEWErmShwVjC8pYs9cog34VGT37dQOVbmoatg= github.com/go-openapi/jsonpointer v0.19.5 h1:gZr+CIYByUqjcgeLXnQu2gHYQC9o73G2XUeOFYEICuY= github.com/go-openapi/jsonpointer v0.19.5/go.mod h1:Pl9vOtqEWErmShwVjC8pYs9cog34VGT37dQOVbmoatg= +github.com/go-openapi/jsonpointer v0.19.6 h1:eCs3fxoIi3Wh6vtgmLTOjdhSpiqphQ+DaPn38N2ZdrE= +github.com/go-openapi/jsonpointer v0.19.6/go.mod h1:osyAmYz/mB/C3I+WsTTSgw1ONzaLJoLCyoi6/zppojs= github.com/go-openapi/jsonreference v0.20.0 h1:MYlu0sBgChmCfJxxUKZ8g1cPWFOB37YSZqewK7OKeyA= github.com/go-openapi/jsonreference v0.20.0/go.mod h1:Ag74Ico3lPc+zR+qjn4XBUmXymS4zJbYVCZmcgkasdo= +github.com/go-openapi/jsonreference v0.20.2 h1:3sVjiK66+uXK/6oQ8xgcRKcFgQ5KXa2KvnJRumpMGbE= +github.com/go-openapi/jsonreference v0.20.2/go.mod h1:Bl1zwGIM8/wsvqjsOQLJ/SH+En5Ap4rVB5KVcIDZG2k= github.com/go-openapi/swag v0.19.5/go.mod h1:POnQmlKehdgb5mhVOsnJFsivZCEZ/vjK9gh66Z9tfKk= github.com/go-openapi/swag v0.22.3 h1:yMBqmnQ0gyZvEb/+KzuWZOXgllrXT4SADYbvDaXHv/g= github.com/go-openapi/swag v0.22.3/go.mod h1:UzaqsxGiab7freDnrUUra0MwWfN/q7tE4j+VcZ0yl14= @@ -56,6 +65,8 @@ github.com/golang/protobuf v1.4.3/go.mod h1:oDoupMAO8OvCJWAcko0GGGIgR6R6ocIYbsSw github.com/golang/protobuf v1.5.0/go.mod h1:FsONVRAS9T7sI+LIUmWTfcYkHO4aIWwzhcaSAoJOfIk= github.com/golang/protobuf v1.5.2 h1:ROPKBNFfQgOUMifHyP+KYbvpjbdoFNs+aK7DXlji0Tw= github.com/golang/protobuf v1.5.2/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= +github.com/golang/protobuf v1.5.3 h1:KhyjKVUg7Usr/dYsdSqoFveMYd5ko72D+zANwlG1mmg= +github.com/golang/protobuf v1.5.3/go.mod h1:XVQd3VNwM+JqD3oG2Ue2ip4fOMUkwXdXDdiuN0vRsmY= github.com/google/gnostic v0.6.9 h1:ZK/5VhkoX835RikCHpSUJV9a+S3e1zLh59YnyWeBW+0= github.com/google/gnostic v0.6.9/go.mod h1:Nm8234We1lq6iB9OmlgNv3nH91XLLVZHCDayfA3xq+E= github.com/google/go-cmp v0.2.0/go.mod h1:oXzfMopK8JAjlY9xF4vHSVASa0yLyX7SntLO5aqRK0M= @@ -71,12 +82,16 @@ github.com/google/gofuzz v1.2.0 h1:xRy4A+RhZaiKjJ1bPfwQ8sedCA+YS2YcCHW6ec7JMi0= github.com/google/gofuzz v1.2.0/go.mod h1:dBl0BpW6vV/+mYPU4Po3pmUjxk6FQPldtuIdl/M65Eg= github.com/google/uuid v1.1.2 h1:EVhdT+1Kseyi1/pUmXKaFxYsDNy9RQYkMWRH68J/W7Y= github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= +github.com/google/uuid v1.3.0 h1:t6JiXgmwXMjEs8VusXIJk2BXHsn+wx8BZdTaoZ5fu7I= +github.com/google/uuid v1.3.0/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/gopherjs/gopherjs v0.0.0-20181017120253-0766667cb4d1/go.mod h1:wJfORRmW1u3UXTncJ5qlYoELFm8eSnnEO6hX4iZ3EWY= github.com/gopherjs/gopherjs v1.17.2 h1:fQnZVsXk8uxXIStYb0N4bGk7jeyTalG/wsZjQ25dO0g= github.com/gopherjs/gopherjs v1.17.2/go.mod h1:pRRIvn/QzFLrKfvEz3qUuEhtE/zLCWfreZ6J5gM2i+k= github.com/grpc-ecosystem/grpc-gateway v1.16.0/go.mod h1:BDjrQk3hbvj6Nolgz8mAMFbcEtjT1g+wF4CSlocrBnw= github.com/imdario/mergo v0.3.13 h1:lFzP57bqS/wsqKssCGmtLAb8A0wKjLGrve2q3PPVcBk= github.com/imdario/mergo v0.3.13/go.mod h1:4lJ1jqUDcsbIECGy0RUJAXNIhg+6ocWgb1ALK2O4oXg= +github.com/imdario/mergo v0.3.15 h1:M8XP7IuFNsqUx6VPK2P9OSmsYsI/YFaGil0uD21V3dM= +github.com/imdario/mergo v0.3.15/go.mod h1:WBLT9ZmE3lPoWsEzCh9LPo3TiwVN+ZKEjmz+hD27ysY= github.com/intel/platform-aware-scheduling/extender v0.5.0 h1:8uVRUpDJH1k3/dti99PdVe8oFjQ8P1zQsDxccT6qo4M= github.com/intel/platform-aware-scheduling/extender v0.5.0/go.mod h1:chYNDpdZ4P1veKCf5etN6nEYkgU1rWHRApViSHcxDxQ= github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= @@ -90,9 +105,12 @@ github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+o github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= github.com/kr/pretty v0.2.0 h1:s5hAObm+yFO5uHYt5dYjxi2rXrsnmRpJx4OYvIWUaQs= github.com/kr/pretty v0.2.0/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= +github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= +github.com/kr/pretty v0.3.0 h1:WgNl7dwNpEZ6jJ9k1snq4pZsg7DOEN8hP9Xw0Tsjwk0= github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/kr/text v0.2.0 h1:5Nx0Ya0ZqY2ygV366QzturHI13Jq95ApcVaJBhpS+AY= +github.com/kr/text v0.2.0/go.mod h1:eLer722TekiGuMkidMxC/pM04lWEeraHUUmBw8l2grE= github.com/mailru/easyjson v0.0.0-20190614124828-94de47d64c63/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= github.com/mailru/easyjson v0.0.0-20190626092158-b2ccc519800e/go.mod h1:C1wdFJiN94OJF2b5HbByQZoLdCWB1Yqtg26g4irojpc= github.com/mailru/easyjson v0.7.7 h1:UGYAvKxe3sBsEDzO8ZeWOSlIQfWFlxbzLZe7hwFURr0= @@ -105,7 +123,9 @@ github.com/modern-go/reflect2 v1.0.2/go.mod h1:yWuevngMOJpCy52FWWMvUC8ws7m/LJsjY github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq1c1nUAm88MOHcQC9l5mIlSMApZMrHA= github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/onsi/ginkgo/v2 v2.4.0 h1:+Ig9nvqgS5OBSACXNk15PLdp0U9XPYROt9CFzVdFGIs= +github.com/onsi/ginkgo/v2 v2.9.1 h1:zie5Ly042PD3bsCvsSOPvRnFwyo3rKe64TJlD6nu0mk= github.com/onsi/gomega v1.23.0 h1:/oxKu9c2HVap+F3PfKort2Hw5DEU+HGlW8n+tguWsys= +github.com/onsi/gomega v1.27.4 h1:Z2AnStgsdSayCMDiCU42qIz+HLqEPcgiOCXjAU/w+8E= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= @@ -115,8 +135,12 @@ github.com/rogpeppe/fastuuid v1.2.0/go.mod h1:jVj6XXZzXRy/MSR5jhDC/2q6DgLz+nrA6L github.com/smartystreets/assertions v1.2.0/go.mod h1:tcbTF8ujkAEcZ8TElKY+i30BzYlVhC/LOxJk7iOWnoo= github.com/smartystreets/assertions v1.13.0 h1:Dx1kYM01xsSqKPno3aqLnrwac2LetPvN23diwyr69Qs= github.com/smartystreets/assertions v1.13.0/go.mod h1:wDmR7qL282YbGsPy6H/yAsesrxfxaaSlJazyFLYVFx8= +github.com/smartystreets/assertions v1.13.1 h1:Ef7KhSmjZcK6AVf9YbJdvPYG9avaF0ZxudX+ThRdWfU= +github.com/smartystreets/assertions v1.13.1/go.mod h1:cXr/IwVfSo/RbCSPhoAPv73p3hlSdrBH/b3SdnW/LMY= github.com/smartystreets/goconvey v1.7.2 h1:9RBaZCeXEQ3UselpuwUQHltGVXvdwm6cv1hgR6gDIPg= github.com/smartystreets/goconvey v1.7.2/go.mod h1:Vw0tHAZW6lzCRk3xgdin6fKYcG+G3Pg9vgXWeJpQFMM= +github.com/smartystreets/goconvey v1.8.0 h1:Oi49ha/2MURE0WexF052Z0m+BNSGirfjg5RL+JXWq3w= +github.com/smartystreets/goconvey v1.8.0/go.mod h1:EdX8jtrTIj26jmjCOVNMVSIYAtgexqXKHOXW2Dx9JLg= github.com/spaolacci/murmur3 v0.0.0-20180118202830-f09979ecbc72/go.mod h1:JwIasOWyU6f++ZhiEuf87xNszmSA2myDM2Kzu9HwQUA= github.com/spf13/pflag v1.0.5 h1:iy+VFUOCP1a+8yFto/drg2CJ5u0yRoB7fZw3DKv/JXA= github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= @@ -131,6 +155,8 @@ github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/ github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= github.com/stretchr/testify v1.8.1 h1:w7B6lhMri9wdJUVmEZPGGhZzrYTPvgJArz7wNPgYKsk= github.com/stretchr/testify v1.8.1/go.mod h1:w2LPCIKwWwSfY2zedu0+kehJoqGctiVI29o6fzry7u4= +github.com/stretchr/testify v1.8.3 h1:RP3t2pwF7cMEbC1dqtB6poj3niw/9gnV4Cjg5oW5gtY= +github.com/stretchr/testify v1.8.3/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= github.com/xeipuuv/gojsonpointer v0.0.0-20180127040702-4e3ac2762d5f/go.mod h1:N2zxlSyiKSe5eX1tZViRH5QA0qijqEDrYZiPEAiq3wU= github.com/xeipuuv/gojsonreference v0.0.0-20180127040603-bd5ef7bd5415/go.mod h1:GwrjFmJcFw6At/Gs6z4yjiIwzuJ1/+UwLxMQDVQXShQ= github.com/xeipuuv/gojsonschema v1.2.0/go.mod h1:anYRn/JVcOK2ZgGU+IjEV4nwlhoK5sQluxsYJ78Id3Y= @@ -146,6 +172,10 @@ golang.org/x/lint v0.0.0-20190227174305-5b3e6a55c961/go.mod h1:wehouNa3lNwaWXcvx golang.org/x/lint v0.0.0-20190313153728-d0100b6bd8b3/go.mod h1:6SW0HCj/g11FgYtHlgUYUwCkIfeOF89ocIRzGO/8vkc= golang.org/x/mod v0.2.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= golang.org/x/mod v0.3.0/go.mod h1:s0Qsj1ACt9ePp/hMypM3fl4fZqREWJwdYDEqhRiZZUA= +golang.org/x/mod v0.9.0 h1:KENHtAZL2y3NLMYZeHY9DW8HW8V+kQyJsY/V9JlKvCs= +golang.org/x/mod v0.9.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= +golang.org/x/mod v0.10.0 h1:lFO9qtOdlre5W1jxS3r/4szv2/6iXxScdzjoBMXNhYk= +golang.org/x/mod v0.10.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190108225652-1e06a53dbb7e/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= @@ -161,10 +191,15 @@ golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4/go.mod h1:p54w0d4576C0XHj96b golang.org/x/net v0.0.0-20210805182204-aaa1db679c0d/go.mod h1:9nx3DQGgdP8bBQD5qxJ1jj9UTztislL4KSBs9R2vV5Y= golang.org/x/net v0.7.0 h1:rJrUqqhjsgNp7KqAIc25s9pZnjU7TUcSY7HcVZjdn1g= golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= +golang.org/x/net v0.8.0/go.mod h1:QVkue5JL9kW//ek3r6jTKnTFis1tRmNAW2P1shuFdJc= +golang.org/x/net v0.10.0 h1:X2//UzNDwYmtCLn7To6G58Wr6f5ahEAQgKNzv9Y951M= +golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20200107190931-bf48bf16ab8d/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.3.0 h1:6l90koy8/LaBLmLu8jpHeHexzMwEita0zFfYlggy2F8= golang.org/x/oauth2 v0.3.0/go.mod h1:rQrIauxkUhJ6CuwEXwymO2/eh4xz2ZWF1nBkcxS+tGk= +golang.org/x/oauth2 v0.8.0 h1:6dkIjl3j3LtZ/O3sTgZTMsLKSftL/B8Zgq4huOIIUu8= +golang.org/x/oauth2 v0.8.0/go.mod h1:yr7u4HXZRm1R1kBWqr/xKNqewf0plRYoB7sla+BCIXE= golang.org/x/sync v0.0.0-20180314180146-1d60e4601c6f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181108010431-42b317875d0f/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20181221193216-37e7f081c4d4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= @@ -182,9 +217,16 @@ golang.org/x/sys v0.0.0-20210423082822-04245dca01da/go.mod h1:h1NjWce9XRLGQEsW7w golang.org/x/sys v0.0.0-20210510120138-977fb7262007/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.5.0 h1:MUK/U/4lj1t1oPg0HfuXDN/Z1wv31ZJ/YcPiGccS4DU= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.6.0 h1:MVltZSvRTcU2ljQOhs94SXPftV6DCNnZViHeQps87pQ= +golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +golang.org/x/sys v0.8.0 h1:EBmGv8NaZBZTWvrbjNoL6HVt+IVy3QDQpJs7VRIw3tU= +golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.5.0 h1:n2a8QNdAb0sZNpU9R1ALUXBbY+w51fCQDN+7EdxNBsY= golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= +golang.org/x/term v0.6.0/go.mod h1:m6U89DPEgQRMq3DNkDClhWw02AUbt2daBVO4cn4Hv9U= +golang.org/x/term v0.8.0 h1:n5xxQn2i3PC0yLAbjTpNT85q/Kgzcr2gIoX9OrJUols= +golang.org/x/term v0.8.0/go.mod h1:xPskH00ivmX89bAKVGSKKtLOWNx2+17Eiy94tnKShWo= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.2/go.mod h1:bEr9sfX3Q8Zfm5fL9x+3itogRgK3+ptLWKqgva+5dAk= golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= @@ -192,6 +234,9 @@ golang.org/x/text v0.3.5/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.6/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.7.0 h1:4BRB4x83lYWy72KwLD/qYDuTu7q9PjSagHvijDw7cLo= golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= +golang.org/x/text v0.8.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= +golang.org/x/text v0.9.0 h1:2sjJmO8cDvYveuX97RDLsxlyUxLl+GHoLxBiRdHllBE= +golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= golang.org/x/time v0.3.0 h1:rg5rLMjNzMS1RkNLzCG38eapWhnYLFYXDXj2gOlr8j4= golang.org/x/time v0.3.0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= @@ -203,6 +248,10 @@ golang.org/x/tools v0.0.0-20190524140312-2c0ae7006135/go.mod h1:RgjU9mgBXZiqYHBn golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= golang.org/x/tools v0.0.0-20200619180055-7c47624df98f/go.mod h1:EkVYQZoAsY45+roYkvgYkIh4xh/qjgUK9TdY2XT94GE= golang.org/x/tools v0.0.0-20210106214847-113979e3529a/go.mod h1:emZCQorbCU4vsT4fOWvOPXz4eW1wZW4PmDk9uLelYpA= +golang.org/x/tools v0.7.0 h1:W4OVu8VVOaIO0yzWMNdepAulS7YfoS3Zabrm8DOXXU4= +golang.org/x/tools v0.7.0/go.mod h1:4pg6aUX35JBAogB10C9AtvVL+qowtN4pT3CGSQex14s= +golang.org/x/tools v0.9.1 h1:8WMNJAz3zrtPmnYC7ISf5dEn3MT0gY7jBJfw27yrrLo= +golang.org/x/tools v0.9.1/go.mod h1:owI94Op576fPu3cIGQeHs3joujW/2Oc6MtlxbF5dfNc= golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191011141410-1b5146add898/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= golang.org/x/xerrors v0.0.0-20191204190536-9bdfabe68543/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= @@ -237,10 +286,13 @@ google.golang.org/protobuf v1.26.0/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQ google.golang.org/protobuf v1.27.1/go.mod h1:9q0QmTI4eRPtz6boOQmLYwt+qCgq0jsYwAQnmE0givc= google.golang.org/protobuf v1.28.1 h1:d0NfwRgPtno5B1Wa6L2DAG+KivqkdutMf1UhdNx175w= google.golang.org/protobuf v1.28.1/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= +google.golang.org/protobuf v1.30.0 h1:kPPoIgf3TsEvrm0PFe15JQ+570QVxYzEvvHqChK+cng= +google.golang.org/protobuf v1.30.0/go.mod h1:HV8QOd/L58Z+nl8r43ehVNZIU/HEI6OcFqwMG9pJV4I= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20190902080502-41f04d3bba15/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= gopkg.in/inf.v0 v0.9.1 h1:73M5CoZyi3ZLMOyDlQh031Cx6N9NDJ2Vvfl76EDAgDc= gopkg.in/inf.v0 v0.9.1/go.mod h1:cWUDdTG/fYaXco+Dcufb5Vnc6Gp2YChqWtbxRZE0mXw= gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= @@ -257,16 +309,28 @@ honnef.co/go/tools v0.0.0-20190102054323-c2f93a96b099/go.mod h1:rf3lG4BRIbNafJWh honnef.co/go/tools v0.0.0-20190523083050-ea95bdfd59fc/go.mod h1:rf3lG4BRIbNafJWhAfAdb/ePZxsR/4RtNHQocxwk9r4= k8s.io/api v0.26.0 h1:IpPlZnxBpV1xl7TGk/X6lFtpgjgntCg8PJ+qrPHAC7I= k8s.io/api v0.26.0/go.mod h1:k6HDTaIFC8yn1i6pSClSqIwLABIcLV9l5Q4EcngKnQg= +k8s.io/api v0.27.2 h1:+H17AJpUMvl+clT+BPnKf0E3ksMAzoBBg7CntpSuADo= +k8s.io/api v0.27.2/go.mod h1:ENmbocXfBT2ADujUXcBhHV55RIT31IIEvkntP6vZKS4= k8s.io/apimachinery v0.26.0 h1:1feANjElT7MvPqp0JT6F3Ss6TWDwmcjLypwoPpEf7zg= k8s.io/apimachinery v0.26.0/go.mod h1:tnPmbONNJ7ByJNz9+n9kMjNP8ON+1qoAIIC70lztu74= +k8s.io/apimachinery v0.27.2 h1:vBjGaKKieaIreI+oQwELalVG4d8f3YAMNpWLzDXkxeg= +k8s.io/apimachinery v0.27.2/go.mod h1:XNfZ6xklnMCOGGFNqXG7bUrQCoR04dh/E7FprV6pb+E= k8s.io/client-go v0.26.0 h1:lT1D3OfO+wIi9UFolCrifbjUUgu7CpLca0AD8ghRLI8= k8s.io/client-go v0.26.0/go.mod h1:I2Sh57A79EQsDmn7F7ASpmru1cceh3ocVT9KlX2jEZg= +k8s.io/client-go v0.27.2 h1:vDLSeuYvCHKeoQRhCXjxXO45nHVv2Ip4Fe0MfioMrhE= +k8s.io/client-go v0.27.2/go.mod h1:tY0gVmUsHrAmjzHX9zs7eCjxcBsf8IiNe7KQ52biTcQ= k8s.io/klog/v2 v2.80.1 h1:atnLQ121W371wYYFawwYx1aEY2eUfs4l3J72wtgAwV4= k8s.io/klog/v2 v2.80.1/go.mod h1:y1WjHnz7Dj687irZUWR/WLkLc5N1YHtjLdmgWjndZn0= +k8s.io/klog/v2 v2.100.1 h1:7WCHKK6K8fNhTqfBhISHQ97KrnJNFZMcQvKp7gP/tmg= +k8s.io/klog/v2 v2.100.1/go.mod h1:y1WjHnz7Dj687irZUWR/WLkLc5N1YHtjLdmgWjndZn0= k8s.io/kube-openapi v0.0.0-20221207184640-f3cff1453715 h1:tBEbstoM+K0FiBV5KGAKQ0kuvf54v/hwpldiJt69w1s= k8s.io/kube-openapi v0.0.0-20221207184640-f3cff1453715/go.mod h1:+Axhij7bCpeqhklhUTe3xmOn6bWxolyZEeyaFpjGtl4= +k8s.io/kube-openapi v0.0.0-20230525220651-2546d827e515 h1:OmK1d0WrkD3IPfkskvroRykOulHVHf0s0ZIFRjyt+UI= +k8s.io/kube-openapi v0.0.0-20230525220651-2546d827e515/go.mod h1:kzo02I3kQ4BTtEfVLaPbjvCkX97YqGve33wzlb3fofQ= k8s.io/utils v0.0.0-20221128185143-99ec85e7a448 h1:KTgPnR10d5zhztWptI952TNtt/4u5h3IzDXkdIMuo2Y= k8s.io/utils v0.0.0-20221128185143-99ec85e7a448/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= +k8s.io/utils v0.0.0-20230505201702-9f6742963106 h1:EObNQ3TW2D+WptiYXlApGNLVy0zm/JIBVY9i+M4wpAU= +k8s.io/utils v0.0.0-20230505201702-9f6742963106/go.mod h1:OLgZIPagt7ERELqWJFomSt595RzquPNLL48iOWgYOg0= sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd h1:EDPBXCAspyGV4jQlpZSudPeMmr1bNJefnuqLsRAsHZo= sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd/go.mod h1:B8JuhiUyNFVKdsE8h686QcCxMaH6HrOAZj4vswFpcB0= sigs.k8s.io/structured-merge-diff/v4 v4.2.3 h1:PRbqxJClWWYMNV1dhaG4NsibJbArud9kFxnAMREiWFE= diff --git a/gpu-aware-scheduling/pkg/gpuscheduler/cache_api.go b/gpu-aware-scheduling/pkg/gpuscheduler/cache_api.go index ebf77472..385ed8b8 100644 --- a/gpu-aware-scheduling/pkg/gpuscheduler/cache_api.go +++ b/gpu-aware-scheduling/pkg/gpuscheduler/cache_api.go @@ -27,7 +27,8 @@ func (r *cacheAPI) GetNodeResourceStatus(cache *Cache, nodeName string) nodeReso } func (r *cacheAPI) AdjustPodResourcesL(cache *Cache, pod *v1.Pod, adj bool, annotation, - tileAnnotation, nodeName string) error { + tileAnnotation, nodeName string, +) error { return cache.adjustPodResourcesL(pod, adj, annotation, tileAnnotation, nodeName) } diff --git a/gpu-aware-scheduling/pkg/gpuscheduler/node_resource_cache.go b/gpu-aware-scheduling/pkg/gpuscheduler/node_resource_cache.go index b52110cc..df9a8ea8 100644 --- a/gpu-aware-scheduling/pkg/gpuscheduler/node_resource_cache.go +++ b/gpu-aware-scheduling/pkg/gpuscheduler/node_resource_cache.go @@ -163,13 +163,13 @@ func NewCache(client kubernetes.Interface) *Cache { podLister := podInformer.Lister() stopChannel := signalHandler() - klog.V(l1).Info("starting shared informer factory (cache)") + klog.V(logL1).Info("starting shared informer factory (cache)") go sharedInformerFactory.Start(stopChannel) syncOk := internCacheAPI.WaitForCacheSync(stopChannel, nodeInformer.Informer().HasSynced) if syncOk { - klog.V(l2).Info("node cache created and synced successfully") + klog.V(logL2).Info("node cache created and synced successfully") } else { klog.Error("Couldn't sync clientgo cache for nodes") @@ -178,14 +178,14 @@ func NewCache(client kubernetes.Interface) *Cache { syncOk = internCacheAPI.WaitForCacheSync(stopChannel, podInformer.Informer().HasSynced) if syncOk { - klog.V(l2).Info("POD cache created and synced successfully") + klog.V(logL2).Info("POD cache created and synced successfully") } else { klog.Error("Couldn't sync clientgo cache for PODs") return nil } - c := Cache{ + cache := Cache{ clientset: client, sharedInformerFactory: sharedInformerFactory, nodeLister: nodeLister, @@ -193,15 +193,16 @@ func NewCache(client kubernetes.Interface) *Cache { nodeWorkQueue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "nodeWorkQueue"), podLister: podLister, annotatedPods: make(map[string]string), + nodeStatuses: make(map[string]nodeResources), + nodeTileStatuses: make(map[string]nodeTiles), previousDeschedCards: make(map[string][]string), previousDeschedTiles: make(map[string][]string), podDeschedStatuses: make(map[string]bool), - nodeStatuses: make(map[string]nodeResources), - nodeTileStatuses: make(map[string]nodeTiles), + rwmutex: sync.RWMutex{}, } - _, err := podInformer.Informer().AddEventHandler(c.createFilteringPodResourceHandler()) - _, err2 := nodeInformer.Informer().AddEventHandler(c.createFilteringNodeResourceHandler()) + _, err := podInformer.Informer().AddEventHandler(cache.createFilteringPodResourceHandler()) + _, err2 := nodeInformer.Informer().AddEventHandler(cache.createFilteringNodeResourceHandler()) if err != nil || err2 != nil { klog.Errorf("informer event handler init failure (%v, %v)", err, err2) @@ -209,24 +210,24 @@ func NewCache(client kubernetes.Interface) *Cache { return nil } - go func() { c.startPodWork(stopChannel) }() - go func() { c.startNodeWork(stopChannel) }() + go func() { cache.startPodWork(stopChannel) }() + go func() { cache.startNodeWork(stopChannel) }() - return &c + return &cache } func (c *Cache) podFilter(obj interface{}) bool { var pod *v1.Pod - var ok bool + var ok1 bool switch t := obj.(type) { case *v1.Pod: pod, _ = obj.(*v1.Pod) case cache.DeletedFinalStateUnknown: - pod, ok = t.Obj.(*v1.Pod) + pod, ok1 = t.Obj.(*v1.Pod) - if !ok { + if !ok1 { return false } default: @@ -239,15 +240,15 @@ func (c *Cache) podFilter(obj interface{}) bool { func (c *Cache) nodeFilter(obj interface{}) bool { var node *v1.Node - var ok bool + var ok1 bool switch t := obj.(type) { case *v1.Node: node, _ = obj.(*v1.Node) case cache.DeletedFinalStateUnknown: - node, ok = t.Obj.(*v1.Node) + node, ok1 = t.Obj.(*v1.Node) - if !ok { + if !ok1 { return false } default: @@ -260,9 +261,9 @@ func (c *Cache) nodeFilter(obj interface{}) bool { // This must be called with rwmutex unlocked // set add=true to add, false to remove resources. func (c *Cache) adjustPodResourcesL(pod *v1.Pod, adj bool, annotation, tileAnnotation, nodeName string) error { - klog.V(l4).Infof("adjustPodResourcesL %v %v", nodeName, pod.Name) + klog.V(logL4).Infof("adjustPodResourcesL %v %v", nodeName, pod.Name) c.rwmutex.Lock() - klog.V(l5).Infof("adjustPodResourcesL %v %v locked", nodeName, pod.Name) + klog.V(logL5).Infof("adjustPodResourcesL %v %v locked", nodeName, pod.Name) defer c.rwmutex.Unlock() err := c.adjustPodResources(pod, adj, annotation, tileAnnotation, nodeName) @@ -289,28 +290,35 @@ func (c *Cache) newCopyNodeStatus(nodeName string) nodeResources { // This must be called with the rwmutex at least read-locked. // set adj=true to add, false to remove resources. func (c *Cache) checkPodResourceAdjustment(containerRequests []resourceMap, - nodeName string, containerCards []string, adj bool) error { + nodeName string, containerCards []string, adj bool, +) error { if len(containerRequests) != len(containerCards) || nodeName == "" { klog.Errorf("bad args, node %v pod creqs %v ccards %v", nodeName, containerRequests, containerCards) return errBadArgs } + return c.checkPodResourceAdjustmentImpl(containerRequests, nodeName, containerCards, adj) +} + +func (c *Cache) checkPodResourceAdjustmentImpl(containerRequests []resourceMap, + nodeName string, containerCards []string, adj bool, +) error { numContainers := len(containerRequests) nodeRes := c.newCopyNodeStatus(nodeName) var err error - for i := 0; i < numContainers; i++ { + for index := 0; index < numContainers; index++ { // get slice of card names from the CSV list of container nr i - cardNames := strings.Split(containerCards[i], ",") + cardNames := strings.Split(containerCards[index], ",") numCards := len(cardNames) - if numCards == 0 || len(containerCards[i]) == 0 { + if numCards == 0 || len(containerCards[index]) == 0 { continue } - request := containerRequests[i].newCopy() + request := containerRequests[index].newCopy() err = request.divide(numCards) if err != nil { @@ -403,6 +411,14 @@ func (c *Cache) adjustTiles(adj bool, nodeName, tileAnnotation string) { } } +func (c *Cache) blindAdjustResources(adj bool, srcResMap, dstResMap resourceMap) { + if adj { // add + _ = dstResMap.addRM(srcResMap) + } else { + _ = dstResMap.subtractRM(srcResMap) + } +} + // This must be called with rwmutex locked // set adj=true to add, false to remove resources. func (c *Cache) adjustPodResources(pod *v1.Pod, adj bool, annotation, tileAnnotation, nodeName string) error { @@ -420,16 +436,16 @@ func (c *Cache) adjustPodResources(pod *v1.Pod, adj bool, annotation, tileAnnota // now that we have checked, error checks are omitted below numContainers := len(containerRequests) - for i := 0; i < numContainers; i++ { + for index := 0; index < numContainers; index++ { // get slice of card names from the CSV list of container nr i - cardNames := strings.Split(containerCards[i], ",") + cardNames := strings.Split(containerCards[index], ",") numCards := len(cardNames) - if numCards == 0 || len(containerCards[i]) == 0 { + if numCards == 0 || len(containerCards[index]) == 0 { continue } - err = containerRequests[i].divide(numCards) + err = containerRequests[index].divide(numCards) if err != nil { return err } @@ -444,11 +460,7 @@ func (c *Cache) adjustPodResources(pod *v1.Pod, adj bool, annotation, tileAnnota c.nodeStatuses[nodeName][cardName] = resourceMap{} } - if adj { // add - _ = c.nodeStatuses[nodeName][cardName].addRM(containerRequests[i]) - } else { - _ = c.nodeStatuses[nodeName][cardName].subtractRM(containerRequests[i]) - } + c.blindAdjustResources(adj, containerRequests[index], c.nodeStatuses[nodeName][cardName]) } } @@ -465,7 +477,7 @@ func (c *Cache) adjustPodResources(pod *v1.Pod, adj bool, annotation, tileAnnota return nil } -func signalHandler() (stopChannel <-chan struct{}) { +func signalHandler() <-chan struct{} { stopChan := make(chan struct{}) //nolint:gomnd signalChan := make(chan os.Signal, 2) @@ -542,7 +554,7 @@ func (c *Cache) addNodeToCache(nodeObj interface{}) { c.nodeWorkQueue.Add(item) } -func (c *Cache) updateNodeInCache(oldNodeObj, newNodeObj interface{}) { +func (c *Cache) updateNodeInCache(_, newNodeObj interface{}) { node, ok := newNodeObj.(*v1.Node) if !ok { klog.Warningf("cannot convert to *v1.Node: %v", newNodeObj) @@ -560,20 +572,20 @@ func (c *Cache) updateNodeInCache(oldNodeObj, newNodeObj interface{}) { func (c *Cache) deleteNodeFromCache(nodeObj interface{}) { var node *v1.Node - switch t := nodeObj.(type) { + switch aType := nodeObj.(type) { case *v1.Node: - node = t + node = aType case cache.DeletedFinalStateUnknown: var ok bool - node, ok = t.Obj.(*v1.Node) + node, ok = aType.Obj.(*v1.Node) if !ok { - klog.Warningf("cannot convert to *v1.Node: %v", t.Obj) + klog.Warningf("cannot convert to *v1.Node: %v", aType.Obj) return } default: - klog.Warningf("cannot convert to *v1.Node: %v", t) + klog.Warningf("cannot convert to *v1.Node: %v", aType) return } @@ -595,8 +607,8 @@ func (c *Cache) addPodToCache(podObj interface{}) { } // if POD does not have the necessary annotation, working on it is futile, then update must be waited for - annotation, ok := pod.Annotations[cardAnnotationName] - if !ok { + annotation, ok2 := pod.Annotations[cardAnnotationName] + if !ok2 { return } @@ -613,7 +625,7 @@ func (c *Cache) addPodToCache(podObj interface{}) { c.podWorkQueue.Add(item) } -func (c *Cache) updatePodInCache(oldPodObj, newPodObj interface{}) { +func (c *Cache) updatePodInCache(_, newPodObj interface{}) { newPod, ok := newPodObj.(*v1.Pod) if !ok { klog.Warningf("conversion of newObj -> pod failed: %v", newPodObj) @@ -622,8 +634,8 @@ func (c *Cache) updatePodInCache(oldPodObj, newPodObj interface{}) { } // if POD does not have the necessary annotation, can't work on it yet - annotation, ok := newPod.Annotations[cardAnnotationName] - if !ok { + annotation, ok2 := newPod.Annotations[cardAnnotationName] + if !ok2 { return } @@ -647,26 +659,26 @@ func (c *Cache) updatePodInCache(oldPodObj, newPodObj interface{}) { } func (c *Cache) deletePodFromCache(podObj interface{}) { - klog.V(l4).Infof("deletePodFromCache") + klog.V(logL4).Infof("deletePodFromCache") c.rwmutex.RLock() // reads c.annotatedPods - klog.V(l5).Infof("deletePodFromCache locked") + klog.V(logL5).Infof("deletePodFromCache locked") defer c.rwmutex.RUnlock() var pod *v1.Pod - switch t := podObj.(type) { + switch aType := podObj.(type) { case *v1.Pod: - pod = t + pod = aType case cache.DeletedFinalStateUnknown: var ok bool - pod, ok = t.Obj.(*v1.Pod) + pod, ok = aType.Obj.(*v1.Pod) if !ok { - klog.Warningf("cannot convert to *v1.Pod: %v", t.Obj) + klog.Warningf("cannot convert to *v1.Pod: %v", aType.Obj) return } default: - klog.Warningf("cannot convert to *v1.Pod: %v", t) + klog.Warningf("cannot convert to *v1.Pod: %v", aType) return } @@ -674,7 +686,7 @@ func (c *Cache) deletePodFromCache(podObj interface{}) { key := getKey(pod) _, annotatedPod := c.annotatedPods[key] - klog.V(l4).Infof("delete pod %s in ns %s annotated:%v", pod.Name, pod.Namespace, annotatedPod) + klog.V(logL4).Infof("delete pod %s in ns %s annotated:%v", pod.Name, pod.Namespace, annotatedPod) if !annotatedPod { return @@ -695,12 +707,12 @@ func (c *Cache) startNodeWork(stopChannel <-chan struct{}) { defer c.nodeWorkQueue.ShutDown() defer runtime.HandleCrash() - klog.V(l2).Info("starting node worker") + klog.V(logL2).Info("starting node worker") // block calling goroutine wait.Until(c.nodeWorkerRun, workerWaitTime, stopChannel) - klog.V(l2).Info("node worker shutting down") + klog.V(logL2).Info("node worker shutting down") } // This steals the calling goroutine and blocks doing work. @@ -708,37 +720,39 @@ func (c *Cache) startPodWork(stopChannel <-chan struct{}) { defer c.podWorkQueue.ShutDown() defer runtime.HandleCrash() - klog.V(l2).Info("starting pod worker") + klog.V(logL2).Info("starting pod worker") // block calling goroutine wait.Until(c.podWorkerRun, workerWaitTime, stopChannel) - klog.V(l2).Info("pod worker shutting down") + klog.V(logL2).Info("pod worker shutting down") } func (c *Cache) podWorkerRun() { + //nolint:revive for c.podWork() { } } func (c *Cache) nodeWorkerRun() { + //nolint:revive for c.nodeWork() { } } func (c *Cache) nodeWork() bool { - klog.V(l5).Info("node worker started") + klog.V(logL5).Info("node worker started") itemI, quit := c.nodeWorkQueue.Get() if quit { - klog.V(l2).Info("node worker quitting") + klog.V(logL2).Info("node worker quitting") return false } defer c.nodeWorkQueue.Done(itemI) - defer klog.V(l5).Info("node worker ended work") + defer klog.V(logL5).Info("node worker ended work") item, ok := itemI.(nodeWorkQueueItem) @@ -763,18 +777,18 @@ func (c *Cache) nodeWork() bool { } func (c *Cache) podWork() bool { - klog.V(l5).Info("pod worker started") + klog.V(logL5).Info("pod worker started") itemI, quit := c.podWorkQueue.Get() if quit { - klog.V(l2).Info("pod worker quitting") + klog.V(logL2).Info("pod worker quitting") return false } defer c.podWorkQueue.Done(itemI) - defer klog.V(l5).Info("pod worker ended work") + defer klog.V(logL5).Info("pod worker ended work") item, ok := itemI.(podWorkQueueItem) @@ -827,9 +841,9 @@ func (c *Cache) fetchPod(ns, name string) (*v1.Pod, error) { // getNodeTileStatus returns a copy of current tile status for a node. func (c *Cache) getNodeTileStatus(nodeName string) nodeTiles { - klog.V(l4).Infof("getNodeTileStatus %v", nodeName) + klog.V(logL4).Infof("getNodeTileStatus %v", nodeName) c.rwmutex.RLock() - klog.V(l5).Infof("getNodeTileStatus %v locked", nodeName) + klog.V(logL5).Infof("getNodeTileStatus %v locked", nodeName) defer c.rwmutex.RUnlock() dstNodeTiles := nodeTiles{} @@ -844,9 +858,9 @@ func (c *Cache) getNodeTileStatus(nodeName string) nodeTiles { // getNodeResourceStatus returns a copy of current resource status for a node (map of per card resource maps). func (c *Cache) getNodeResourceStatus(nodeName string) nodeResources { - klog.V(l4).Infof("getNodeResourceStatus %v", nodeName) + klog.V(logL4).Infof("getNodeResourceStatus %v", nodeName) c.rwmutex.RLock() - klog.V(l5).Infof("getNodeResourceStatus %v locked", nodeName) + klog.V(logL5).Infof("getNodeResourceStatus %v locked", nodeName) defer c.rwmutex.RUnlock() dstNodeResources := nodeResources{} @@ -942,9 +956,18 @@ func (c *Cache) handlePodDescheduleLabeling(deschedule bool, pod *v1.Pod) error } _, err := c.clientset.CoreV1().Pods(pod.GetNamespace()).Patch( - context.TODO(), pod.GetName(), types.JSONPatchType, payloadBytes, metav1.PatchOptions{}) + context.TODO(), pod.GetName(), types.JSONPatchType, payloadBytes, metav1.PatchOptions{ + TypeMeta: metav1.TypeMeta{ + Kind: "", + APIVersion: "", + }, + DryRun: []string{}, + Force: new(bool), + FieldManager: "", + FieldValidation: "", + }) if err == nil { - klog.V(l4).Infof("Pod %s labeled successfully.", pod.GetName()) + klog.V(logL4).Infof("Pod %s labeled successfully.", pod.GetName()) return nil } @@ -954,84 +977,107 @@ func (c *Cache) handlePodDescheduleLabeling(deschedule bool, pod *v1.Pod) error return fmt.Errorf("pod label failed: %w", err) } -func (c *Cache) handleNode(item nodeWorkQueueItem) error { - klog.V(l4).Infof("handleNode %s", item.nodeName) +func createListOptions(selector string) *metav1.ListOptions { + return &metav1.ListOptions{ + TypeMeta: metav1.TypeMeta{Kind: "", APIVersion: ""}, + LabelSelector: "", + FieldSelector: selector, + Watch: false, + AllowWatchBookmarks: false, + ResourceVersion: "", + ResourceVersionMatch: "", + TimeoutSeconds: new(int64), + Limit: 0, + Continue: "", + SendInitialEvents: nil, + } +} - c.rwmutex.Lock() // reads and writes c. fields - klog.V(l5).Infof("handleNode %v locked", item.nodeName) - defer c.rwmutex.Unlock() +func (c *Cache) handleNodeUpdated(item nodeWorkQueueItem) error { + // add and remove related labels + // calculate set of cards that trigger descheduling and compare it to the previous + // set of cards. then if it has changed, move to study pods/containers for changes. + descheduledCards := calculateCardsFromDescheduleLabels(item.node) + descheduledTiles := calculateTilesFromDescheduleLabels(item.node) - switch item.action { - case nodeAdded: - fallthrough - case nodeUpdated: - // add and remove related labels - // calculate set of cards that trigger descheduling and compare it to the previous - // set of cards. then if it has changed, move to study pods/containers for changes. - descheduledCards := calculateCardsFromDescheduleLabels(item.node) - descheduledTiles := calculateTilesFromDescheduleLabels(item.node) + sort.Strings(descheduledCards) + sort.Strings(descheduledTiles) - sort.Strings(descheduledCards) - sort.Strings(descheduledTiles) + prevDescheduleCards := c.previousDeschedCards[item.nodeName] + prevDescheduleTiles := c.previousDeschedTiles[item.nodeName] - prevDescheduleCards := c.previousDeschedCards[item.nodeName] - prevDescheduleTiles := c.previousDeschedTiles[item.nodeName] + if reflect.DeepEqual(descheduledCards, prevDescheduleCards) && + reflect.DeepEqual(descheduledTiles, prevDescheduleTiles) { + return nil + } - if reflect.DeepEqual(descheduledCards, prevDescheduleCards) && - reflect.DeepEqual(descheduledTiles, prevDescheduleTiles) { - return nil - } + selector, err := fields.ParseSelector("spec.nodeName=" + item.nodeName + + ",status.phase=" + string(v1.PodRunning)) + if err != nil { + klog.Error(err.Error()) - selector, err := fields.ParseSelector("spec.nodeName=" + item.nodeName + - ",status.phase=" + string(v1.PodRunning)) + return fmt.Errorf("error with fetching object: %w", err) + } - if err != nil { - klog.Error(err.Error()) + runningPodList, err := c.clientset.CoreV1().Pods(v1.NamespaceAll).List(context.TODO(), + *createListOptions(selector.String())) + if err != nil { + klog.Error(err.Error()) - return fmt.Errorf("error with fetching object: %w", err) - } + return fmt.Errorf("error with listing pods: %w", err) + } - runningPodList, err := c.clientset.CoreV1().Pods(v1.NamespaceAll).List(context.TODO(), metav1.ListOptions{ - FieldSelector: selector.String(), - }) + for index := range runningPodList.Items { + podName := runningPodList.Items[index].Name + needDeschedule := (isDeschedulingNeededCards(&runningPodList.Items[index], descheduledCards) || + isDeschedulingNeededTiles(&runningPodList.Items[index], descheduledTiles)) - if err != nil { - klog.Error(err.Error()) + // change pod's descheduling label based on the need (if it doesn't exist vs. if it does) + if needDeschedule != c.podDeschedStatuses[podName] { + if err := c.handlePodDescheduleLabeling(needDeschedule, &runningPodList.Items[index]); err != nil { + return err + } - return fmt.Errorf("error with listing pods: %w", err) + c.podDeschedStatuses[podName] = needDeschedule } + } - for i := range runningPodList.Items { - podName := runningPodList.Items[i].Name - needDeschedule := (isDeschedulingNeededCards(&runningPodList.Items[i], descheduledCards) || - isDeschedulingNeededTiles(&runningPodList.Items[i], descheduledTiles)) + // update previous descheduling cards + c.previousDeschedCards[item.nodeName] = descheduledCards + c.previousDeschedTiles[item.nodeName] = descheduledTiles - // change pod's descheduling label based on the need (if it doesn't exist vs. if it does) - if needDeschedule != c.podDeschedStatuses[podName] { - if err := c.handlePodDescheduleLabeling(needDeschedule, &runningPodList.Items[i]); err != nil { - return err - } + return nil +} - c.podDeschedStatuses[podName] = needDeschedule - } - } +func (c *Cache) handleNode(item nodeWorkQueueItem) error { + klog.V(logL4).Infof("handleNode %s", item.nodeName) + + c.rwmutex.Lock() // reads and writes c. fields + klog.V(logL5).Infof("handleNode %v locked", item.nodeName) + defer c.rwmutex.Unlock() + + var err error - // update previous descheduling cards - c.previousDeschedCards[item.nodeName] = descheduledCards - c.previousDeschedTiles[item.nodeName] = descheduledTiles + switch item.action { + case nodeAdded: + fallthrough + case nodeUpdated: + err = c.handleNodeUpdated(item) case nodeDeleted: delete(c.previousDeschedCards, item.nodeName) delete(c.previousDeschedTiles, item.nodeName) } - return nil + return err } -func (c *Cache) handlePod(item podWorkQueueItem) (forget bool, err error) { - klog.V(l4).Infof("handlePod %s in ns %s", item.name, item.ns) +func (c *Cache) handlePod(item podWorkQueueItem) (bool, error) { + var err error + + klog.V(logL4).Infof("handlePod %s in ns %s", item.name, item.ns) c.rwmutex.Lock() // adjusts podresources - klog.V(l5).Infof("handlePod %v locked", item.name) + klog.V(logL5).Infof("handlePod %v locked", item.name) defer c.rwmutex.Unlock() msg := "" @@ -1071,7 +1117,7 @@ func (c *Cache) handlePod(item podWorkQueueItem) (forget bool, err error) { err = errUnknownAction } - klog.V(l4).Infof(msg) + klog.V(logL4).Infof(msg) c.printNodeStatus(item.pod.Spec.NodeName) @@ -1079,7 +1125,7 @@ func (c *Cache) handlePod(item podWorkQueueItem) (forget bool, err error) { } func (c *Cache) printNodeStatus(nodeName string) { - if klog.V(l4).Enabled() { + if klog.V(logL4).Enabled() { klog.Info(nodeName, ":") resources, ok := c.nodeStatuses[nodeName] @@ -1089,9 +1135,9 @@ func (c *Cache) printNodeStatus(nodeName string) { } } - tileUsage, ok := c.nodeTileStatuses[nodeName] + tileUsage, ok2 := c.nodeTileStatuses[nodeName] - if ok { + if ok2 { for key, value := range tileUsage { klog.Info(" ", key, " used tiles:", value) } diff --git a/gpu-aware-scheduling/pkg/gpuscheduler/node_resource_cache_test.go b/gpu-aware-scheduling/pkg/gpuscheduler/node_resource_cache_test.go index 3ad1c0bb..1ee879f0 100644 --- a/gpu-aware-scheduling/pkg/gpuscheduler/node_resource_cache_test.go +++ b/gpu-aware-scheduling/pkg/gpuscheduler/node_resource_cache_test.go @@ -75,13 +75,13 @@ func getDummyCache() *Cache { } func TestCacheFilters(t *testing.T) { - c := getDummyCache() + dummyCache := getDummyCache() Convey("When the object is wrong type", t, func() { s := "wrong object" - result := c.podFilter(s) + result := dummyCache.podFilter(s) So(result, ShouldBeFalse) - result = c.nodeFilter(s) + result = dummyCache.nodeFilter(s) So(result, ShouldBeFalse) }) Convey("When the object is DeleteFinalStateUnknown", t, func() { @@ -89,44 +89,44 @@ func TestCacheFilters(t *testing.T) { Key: "unknown", Obj: &v1.Pod{}, } - result := c.podFilter(unknown) + result := dummyCache.podFilter(unknown) So(result, ShouldBeFalse) unknown.Obj = &v1.Node{} - result = c.nodeFilter(unknown) + result = dummyCache.nodeFilter(unknown) So(result, ShouldBeFalse) }) } func TestPodCacheEventFunctions(t *testing.T) { // we need a mock cache which doesn't call work() itself to avoid race conditions at work queue length checks - c := createMockCache() + mockCache := createMockCache() badType := "bad type" Convey("When trying to add a non-pod object to cache", t, func() { - wqLen := c.podWorkQueue.Len() - c.addPodToCache(badType) - So(c.podWorkQueue.Len(), ShouldEqual, wqLen) + wqLen := mockCache.podWorkQueue.Len() + mockCache.addPodToCache(badType) + So(mockCache.podWorkQueue.Len(), ShouldEqual, wqLen) }) // annotated pod doesn't always get to cache during validation run, // so let's do that here always Convey("When a pod with a proper annotation is added to the cache", t, func() { - wqLen := c.podWorkQueue.Len() + wqLen := mockCache.podWorkQueue.Len() pod := v1.Pod{} pod.Annotations = map[string]string{} pod.Annotations[cardAnnotationName] = properAnnotation - c.addPodToCache(&pod) - So(c.podWorkQueue.Len(), ShouldEqual, wqLen+1) + mockCache.addPodToCache(&pod) + So(mockCache.podWorkQueue.Len(), ShouldEqual, wqLen+1) }) Convey("When trying to update a non-pod object in cache", t, func() { - wqLen := c.podWorkQueue.Len() - c.updatePodInCache(badType, badType) - So(c.podWorkQueue.Len(), ShouldEqual, wqLen) + wqLen := mockCache.podWorkQueue.Len() + mockCache.updatePodInCache(badType, badType) + So(mockCache.podWorkQueue.Len(), ShouldEqual, wqLen) }) Convey("When trying to delete a non-pod object from cache", t, func() { - wqLen := c.podWorkQueue.Len() - c.deletePodFromCache(badType) - So(c.podWorkQueue.Len(), ShouldEqual, wqLen) + wqLen := mockCache.podWorkQueue.Len() + mockCache.deletePodFromCache(badType) + So(mockCache.podWorkQueue.Len(), ShouldEqual, wqLen) }) unknown := cache.DeletedFinalStateUnknown{ @@ -135,41 +135,41 @@ func TestPodCacheEventFunctions(t *testing.T) { } Convey("When trying to delete a non-pod state-unknown-object from cache", t, func() { - wqLen := c.podWorkQueue.Len() - c.deletePodFromCache(unknown) - So(c.podWorkQueue.Len(), ShouldEqual, wqLen) + wqLen := mockCache.podWorkQueue.Len() + mockCache.deletePodFromCache(unknown) + So(mockCache.podWorkQueue.Len(), ShouldEqual, wqLen) }) Convey("When deleting a proper POD from a proper namespace with a proper annotation", t, func() { - wqLen := c.podWorkQueue.Len() + wqLen := mockCache.podWorkQueue.Len() pod := v1.Pod{} pod.Name = properName pod.Namespace = properName - c.annotatedPods[getKey(&pod)] = properAnnotation - c.deletePodFromCache(&pod) - So(c.podWorkQueue.Len(), ShouldEqual, wqLen+1) + mockCache.annotatedPods[getKey(&pod)] = properAnnotation + mockCache.deletePodFromCache(&pod) + So(mockCache.podWorkQueue.Len(), ShouldEqual, wqLen+1) }) } func TestNodeCacheEventFunctions(t *testing.T) { // we need a mock cache which doesn't call work() itself to avoid race conditions at work queue length checks - c := createMockCache() + mockCache := createMockCache() badType := "bad type" Convey("When trying to add a non-node object to cache", t, func() { - wqLen := c.nodeWorkQueue.Len() - c.addNodeToCache(badType) - So(c.nodeWorkQueue.Len(), ShouldEqual, wqLen) + wqLen := mockCache.nodeWorkQueue.Len() + mockCache.addNodeToCache(badType) + So(mockCache.nodeWorkQueue.Len(), ShouldEqual, wqLen) }) Convey("When trying to update a non-node object in cache", t, func() { - wqLen := c.nodeWorkQueue.Len() - c.updateNodeInCache(badType, badType) - So(c.nodeWorkQueue.Len(), ShouldEqual, wqLen) + wqLen := mockCache.nodeWorkQueue.Len() + mockCache.updateNodeInCache(badType, badType) + So(mockCache.nodeWorkQueue.Len(), ShouldEqual, wqLen) }) Convey("When trying to delete a non-node object from cache", t, func() { - wqLen := c.nodeWorkQueue.Len() - c.deleteNodeFromCache(badType) - So(c.nodeWorkQueue.Len(), ShouldEqual, wqLen) + wqLen := mockCache.nodeWorkQueue.Len() + mockCache.deleteNodeFromCache(badType) + So(mockCache.nodeWorkQueue.Len(), ShouldEqual, wqLen) }) unknown := cache.DeletedFinalStateUnknown{ @@ -178,21 +178,21 @@ func TestNodeCacheEventFunctions(t *testing.T) { } Convey("When trying to delete a non-node state-unknown-object from cache", t, func() { - wqLen := c.nodeWorkQueue.Len() - c.deleteNodeFromCache(unknown) - So(c.nodeWorkQueue.Len(), ShouldEqual, wqLen) + wqLen := mockCache.nodeWorkQueue.Len() + mockCache.deleteNodeFromCache(unknown) + So(mockCache.nodeWorkQueue.Len(), ShouldEqual, wqLen) }) Convey("When deleting a proper Node from cache", t, func() { - wqLen := c.nodeWorkQueue.Len() + wqLen := mockCache.nodeWorkQueue.Len() node := v1.Node{} node.Name = properName - c.deleteNodeFromCache(&node) - So(c.nodeWorkQueue.Len(), ShouldEqual, wqLen+1) + mockCache.deleteNodeFromCache(&node) + So(mockCache.nodeWorkQueue.Len(), ShouldEqual, wqLen+1) }) } func TestHandlePodError(t *testing.T) { - c := getDummyCache() + dummyCache := getDummyCache() item := podWorkQueueItem{ action: -1, pod: &v1.Pod{ @@ -212,14 +212,14 @@ func TestHandlePodError(t *testing.T) { } Convey("When I call HandlePod with a bad action", t, func() { - forget, err := c.handlePod(item) + forget, err := dummyCache.handlePod(item) So(forget, ShouldBeTrue) So(err, ShouldNotBeNil) }) Convey("When I call HandlePod with podAdded action", t, func() { item.action = podAdded - forget, err := c.handlePod(item) + forget, err := dummyCache.handlePod(item) So(forget, ShouldBeTrue) So(err, ShouldBeNil) }) @@ -241,60 +241,60 @@ func createMockCache() *Cache { func TestPodWork(t *testing.T) { // to be able to call work() directly, we need a mock cache which doesn't call work() itself - c := createMockCache() + cache := createMockCache() Convey("When working on a bad pod", t, func() { - wqLen := c.podWorkQueue.Len() + wqLen := cache.podWorkQueue.Len() badPod := v1.Pod{} item := podWorkQueueItem{ action: -1, pod: &badPod, } - c.podWorkQueue.Add(item) - So(c.podWorkQueue.Len(), ShouldEqual, wqLen+1) - ret := c.podWork() + cache.podWorkQueue.Add(item) + So(cache.podWorkQueue.Len(), ShouldEqual, wqLen+1) + ret := cache.podWork() So(ret, ShouldBeTrue) - So(c.podWorkQueue.Len(), ShouldEqual, wqLen) + So(cache.podWorkQueue.Len(), ShouldEqual, wqLen) }) Convey("When the work queue is shutting down", t, func() { - c.podWorkQueue.ShutDown() - ret := c.podWork() + cache.podWorkQueue.ShutDown() + ret := cache.podWork() So(ret, ShouldBeFalse) }) } func TestNodeWork(t *testing.T) { // to be able to call work() directly, we need a mock cache which doesn't call work() itself - c := createMockCache() + cache := createMockCache() Convey("When working on a bad node", t, func() { - wqLen := c.nodeWorkQueue.Len() + wqLen := cache.nodeWorkQueue.Len() badNode := v1.Node{} item := nodeWorkQueueItem{ action: -1, node: &badNode, } - c.nodeWorkQueue.Add(item) - So(c.nodeWorkQueue.Len(), ShouldEqual, wqLen+1) - ret := c.nodeWork() + cache.nodeWorkQueue.Add(item) + So(cache.nodeWorkQueue.Len(), ShouldEqual, wqLen+1) + ret := cache.nodeWork() So(ret, ShouldBeTrue) - So(c.nodeWorkQueue.Len(), ShouldEqual, wqLen) + So(cache.nodeWorkQueue.Len(), ShouldEqual, wqLen) }) Convey("When the node work queue is shutting down", t, func() { - c.nodeWorkQueue.ShutDown() - ret := c.nodeWork() + cache.nodeWorkQueue.ShutDown() + ret := cache.nodeWork() So(ret, ShouldBeFalse) }) } func TestAdjustTiles(t *testing.T) { - c := getDummyCache() + dummyCache := getDummyCache() Convey("When node's tile statuses doesn't exist yet", t, func() { - c.nodeTileStatuses = make(map[string]nodeTiles) - c.adjustTiles(true, "node1", "card0:gt0+gt1") + dummyCache.nodeTileStatuses = make(map[string]nodeTiles) + dummyCache.adjustTiles(true, "node1", "card0:gt0+gt1") - statuses, ok := c.nodeTileStatuses["node1"] + statuses, ok := dummyCache.nodeTileStatuses["node1"] So(ok, ShouldEqual, true) tileInfo, ok := statuses["card0"] @@ -305,11 +305,11 @@ func TestAdjustTiles(t *testing.T) { }) Convey("When node's tile statuses are updated", t, func() { - c.nodeTileStatuses = make(map[string]nodeTiles) - c.adjustTiles(true, "node1", "card0:gt0+gt1") - c.adjustTiles(true, "node1", "card0:gt0+gt1+gt3") + dummyCache.nodeTileStatuses = make(map[string]nodeTiles) + dummyCache.adjustTiles(true, "node1", "card0:gt0+gt1") + dummyCache.adjustTiles(true, "node1", "card0:gt0+gt1+gt3") - statuses := c.nodeTileStatuses["node1"] + statuses := dummyCache.nodeTileStatuses["node1"] tileInfo := statuses["card0"] So(0, ShouldBeIn, tileInfo) So(1, ShouldBeIn, tileInfo) @@ -318,22 +318,22 @@ func TestAdjustTiles(t *testing.T) { }) Convey("When node's tile statuses are removed", t, func() { - c.nodeTileStatuses = make(map[string]nodeTiles) - c.adjustTiles(true, "node1", "card0:gt0+gt1") - c.adjustTiles(false, "node1", "card0:gt0") + dummyCache.nodeTileStatuses = make(map[string]nodeTiles) + dummyCache.adjustTiles(true, "node1", "card0:gt0+gt1") + dummyCache.adjustTiles(false, "node1", "card0:gt0") - statuses := c.nodeTileStatuses["node1"] + statuses := dummyCache.nodeTileStatuses["node1"] tileInfo := statuses["card0"] So(0, ShouldNotBeIn, tileInfo) So(1, ShouldBeIn, tileInfo) }) Convey("When second gpu's tiles are reserved", t, func() { - c.nodeTileStatuses = make(map[string]nodeTiles) - c.adjustTiles(true, "node1", "card0:gt0+gt1") - c.adjustTiles(true, "node1", "card1:gt3+gt4") + dummyCache.nodeTileStatuses = make(map[string]nodeTiles) + dummyCache.adjustTiles(true, "node1", "card0:gt0+gt1") + dummyCache.adjustTiles(true, "node1", "card1:gt3+gt4") - statuses := c.nodeTileStatuses["node1"] + statuses := dummyCache.nodeTileStatuses["node1"] _, ok := statuses["card0"] So(ok, ShouldEqual, true) @@ -347,14 +347,14 @@ func TestAdjustTiles(t *testing.T) { }) Convey("When everything is reserved and released", t, func() { - c.nodeTileStatuses = make(map[string]nodeTiles) - c.adjustTiles(true, "node1", "card0:gt0+gt1") - c.adjustTiles(true, "node1", "card1:gt3+gt4") + dummyCache.nodeTileStatuses = make(map[string]nodeTiles) + dummyCache.adjustTiles(true, "node1", "card0:gt0+gt1") + dummyCache.adjustTiles(true, "node1", "card1:gt3+gt4") - c.adjustTiles(false, "node1", "card1:gt3+gt4") - c.adjustTiles(false, "node1", "card0:gt0+gt1") + dummyCache.adjustTiles(false, "node1", "card1:gt3+gt4") + dummyCache.adjustTiles(false, "node1", "card0:gt0+gt1") - statuses := c.nodeTileStatuses["node1"] + statuses := dummyCache.nodeTileStatuses["node1"] tiles, ok := statuses["card0"] So(ok, ShouldEqual, true) @@ -367,24 +367,26 @@ func TestAdjustTiles(t *testing.T) { } func TestAdjustPodResources(t *testing.T) { - c := getDummyCache() + dummyCache := getDummyCache() pod := v1.Pod{} podContainer := v1.Container{Name: "foobarContainer"} - podRequests := v1.ResourceRequirements{Requests: v1.ResourceList{ - "gpu.intel.com/tiles": resource.MustParse("1"), - "gpu.intel.com/i915": resource.MustParse("1")}, + podRequests := v1.ResourceRequirements{ + Requests: v1.ResourceList{ + "gpu.intel.com/tiles": resource.MustParse("1"), + "gpu.intel.com/i915": resource.MustParse("1"), + }, } podContainer.Resources = podRequests pod.Spec.Containers = append(pod.Spec.Containers, podContainer) Convey("When adjusting pod resources with pod with one container", t, func() { - c.nodeTileStatuses = make(map[string]nodeTiles) - err := c.adjustPodResources(&pod, true, "card0", "card0:gt0", "node1") + dummyCache.nodeTileStatuses = make(map[string]nodeTiles) + err := dummyCache.adjustPodResources(&pod, true, "card0", "card0:gt0", "node1") So(err, ShouldEqual, nil) - statuses, ok := c.nodeTileStatuses["node1"] + statuses, ok := dummyCache.nodeTileStatuses["node1"] So(ok, ShouldEqual, true) tiles, ok := statuses["card0"] @@ -394,11 +396,11 @@ func TestAdjustPodResources(t *testing.T) { }) Convey("When adjusting pod resources back and forth", t, func() { - c.nodeTileStatuses = make(map[string]nodeTiles) - err1 := c.adjustPodResources(&pod, true, "card0", "card0:gt0", "node1") - err2 := c.adjustPodResources(&pod, false, "card0", "card0:gt0", "node1") + dummyCache.nodeTileStatuses = make(map[string]nodeTiles) + err1 := dummyCache.adjustPodResources(&pod, true, "card0", "card0:gt0", "node1") + err2 := dummyCache.adjustPodResources(&pod, false, "card0", "card0:gt0", "node1") - statuses := c.nodeTileStatuses["node1"] + statuses := dummyCache.nodeTileStatuses["node1"] tiles, ok := statuses["card0"] So(ok, ShouldEqual, true) @@ -408,12 +410,12 @@ func TestAdjustPodResources(t *testing.T) { }) Convey("When adjusting pod resources via L", t, func() { - c.nodeTileStatuses = make(map[string]nodeTiles) - err := c.adjustPodResourcesL(&pod, true, "card0", "card0:gt0", "node1") + dummyCache.nodeTileStatuses = make(map[string]nodeTiles) + err := dummyCache.adjustPodResourcesL(&pod, true, "card0", "card0:gt0", "node1") So(err, ShouldEqual, nil) - statuses, ok := c.nodeTileStatuses["node1"] + statuses, ok := dummyCache.nodeTileStatuses["node1"] So(ok, ShouldEqual, true) tiles, ok := statuses["card0"] @@ -480,7 +482,7 @@ func TestDeschedulingCards(t *testing.T) { }) applied := 0 - applyCheck := func(action k8stesting.Action) (handled bool, ret runtime.Object, err error) { + applyCheck := func(action k8stesting.Action) (bool, runtime.Object, error) { patchAction, ok := action.(k8stesting.PatchAction) if !ok { return false, nil, nil @@ -500,7 +502,7 @@ func TestDeschedulingCards(t *testing.T) { } removed := 0 - removeCheck := func(action k8stesting.Action) (handled bool, ret runtime.Object, err error) { + removeCheck := func(action k8stesting.Action) (bool, runtime.Object, error) { patchAction, ok := action.(k8stesting.PatchAction) if !ok { return false, nil, nil diff --git a/gpu-aware-scheduling/pkg/gpuscheduler/resource_map_test.go b/gpu-aware-scheduling/pkg/gpuscheduler/resource_map_test.go index ec021199..d1c3bdf3 100644 --- a/gpu-aware-scheduling/pkg/gpuscheduler/resource_map_test.go +++ b/gpu-aware-scheduling/pkg/gpuscheduler/resource_map_test.go @@ -18,57 +18,57 @@ const ( ) func TestDivision(t *testing.T) { - rm := resourceMap{key: 2} + resMap := resourceMap{key: 2} Convey("When I divide a rm with -1", t, func() { - err := rm.divide(-1) - So(rm[key], ShouldEqual, 2) + err := resMap.divide(-1) + So(resMap[key], ShouldEqual, 2) So(err, ShouldNotBeNil) }) Convey("When I divide a rm with 1", t, func() { - err := rm.divide(1) - So(rm[key], ShouldEqual, 2) + err := resMap.divide(1) + So(resMap[key], ShouldEqual, 2) So(err, ShouldBeNil) }) Convey("When I divide a rm with 2", t, func() { - err := rm.divide(2) - So(rm[key], ShouldEqual, 1) + err := resMap.divide(2) + So(resMap[key], ShouldEqual, 1) So(err, ShouldBeNil) }) } func TestAdd(t *testing.T) { int64Max := int64(9223372036854775807) - rm := resourceMap{key: 2} + resMap := resourceMap{key: 2} Convey("When I add to RM 9223372036854775805", t, func() { - err := rm.add(key, (int64Max - 2)) - So(rm[key], ShouldEqual, int64Max) + err := resMap.add(key, (int64Max - 2)) + So(resMap[key], ShouldEqual, int64Max) So(err, ShouldBeNil) }) Convey("When I still add to RM 1", t, func() { - err := rm.add(key, 1) - So(rm[key], ShouldEqual, int64Max) + err := resMap.add(key, 1) + So(resMap[key], ShouldEqual, int64Max) So(err, ShouldEqual, errOverflow) }) } func TestSubtract(t *testing.T) { - rm := resourceMap{key: 2} + resMap := resourceMap{key: 2} Convey("When I subtract an unknown key from RM", t, func() { - err := rm.subtract("bar", 2) - So(rm[key], ShouldEqual, 2) + err := resMap.subtract("bar", 2) + So(resMap[key], ShouldEqual, 2) So(err, ShouldNotBeNil) }) Convey("When I subtract 1 from RM", t, func() { - err := rm.subtract(key, 1) - So(rm[key], ShouldEqual, 1) + err := resMap.subtract(key, 1) + So(resMap[key], ShouldEqual, 1) So(err, ShouldBeNil) }) Convey("When I again subtract 2 from RM", t, func() { - err := rm.subtract(key, 2) - So(rm[key], ShouldEqual, 0) + err := resMap.subtract(key, 2) + So(resMap[key], ShouldEqual, 0) So(err, ShouldBeNil) }) } @@ -77,7 +77,7 @@ func TestAddRM(t *testing.T) { int64Max := int64(9223372036854775807) key2 := "foo2" key3 := "foo3" - rm := resourceMap{key: 2, key2: 3} + rm1 := resourceMap{key: 2, key2: 3} rm2 := resourceMap{key: 4, key2: 5, key3: int64Max} rm3 := resourceMap{key: 2, key2: 3, key3: int64Max} @@ -89,10 +89,10 @@ func TestAddRM(t *testing.T) { So(err, ShouldEqual, errOverflow) }) Convey("When I add RM to another RM which fits", t, func() { - err := rm.addRM(rm2) - So(rm[key], ShouldEqual, 6) - So(rm[key2], ShouldEqual, 8) - So(rm[key3], ShouldEqual, int64Max) + err := rm1.addRM(rm2) + So(rm1[key], ShouldEqual, 6) + So(rm1[key2], ShouldEqual, 8) + So(rm1[key3], ShouldEqual, int64Max) So(err, ShouldBeNil) }) } diff --git a/gpu-aware-scheduling/pkg/gpuscheduler/scheduler.go b/gpu-aware-scheduling/pkg/gpuscheduler/scheduler.go index fee13e0b..297adea1 100644 --- a/gpu-aware-scheduling/pkg/gpuscheduler/scheduler.go +++ b/gpu-aware-scheduling/pkg/gpuscheduler/scheduler.go @@ -49,11 +49,11 @@ const ( gpuPluginResource = gpuPrefix + "i915" gpuTileResource = gpuPrefix + "tiles" numaMappingLabel = gpuPrefix + "numa-gpu-map" - l1 = klog.Level(1) - l2 = klog.Level(2) - l3 = klog.Level(3) - l4 = klog.Level(4) - l5 = klog.Level(5) + logL1 = klog.Level(1) + logL2 = klog.Level(2) + logL3 = klog.Level(3) + logL4 = klog.Level(4) + logL5 = klog.Level(5) maxLabelParts = 2 numaSplitParts = 2 base10 = 10 @@ -99,20 +99,35 @@ type Card struct { // NewGASExtender returns a new GAS Extender. func NewGASExtender(clientset kubernetes.Interface, enableAllowlist, - enableDenylist bool, balanceResource string) *GASExtender { + enableDenylist bool, balanceResource string, +) *GASExtender { return &GASExtender{ - cache: iCache.NewCache(clientset), clientset: clientset, + cache: iCache.NewCache(clientset), + balancedResource: balanceResource, + rwmutex: sync.RWMutex{}, allowlistEnabled: enableAllowlist, denylistEnabled: enableDenylist, - balancedResource: balanceResource, + } +} + +func createPatchOptions() *metav1.PatchOptions { + return &metav1.PatchOptions{ + TypeMeta: metav1.TypeMeta{ + Kind: "", + APIVersion: "", + }, + DryRun: []string{}, + Force: nil, + FieldManager: "", + FieldValidation: "", } } func (m *GASExtender) annotatePodBind(ctx context.Context, annotation, tileAnnotation string, pod *v1.Pod) error { var err error - ts := strconv.FormatInt(time.Now().UnixNano(), base10) + timeStamp := strconv.FormatInt(time.Now().UnixNano(), base10) var payload []patchValue @@ -129,7 +144,7 @@ func (m *GASExtender) annotatePodBind(ctx context.Context, annotation, tileAnnot payload = append(payload, patchValue{ Op: "add", Path: "/metadata/annotations/" + tsAnnotationName, - Value: ts, + Value: timeStamp, }) payload = append(payload, patchValue{ @@ -154,9 +169,9 @@ func (m *GASExtender) annotatePodBind(ctx context.Context, annotation, tileAnnot } _, err = m.clientset.CoreV1().Pods(pod.GetNamespace()).Patch( - ctx, pod.GetName(), types.JSONPatchType, payloadBytes, metav1.PatchOptions{}) + ctx, pod.GetName(), types.JSONPatchType, payloadBytes, *createPatchOptions()) if err == nil { - klog.V(l2).Infof("Annotated pod %v with annotation %v", pod.GetName(), annotation) + klog.V(logL2).Infof("Annotated pod %v with annotation %v", pod.GetName(), annotation) } else { klog.Errorf("Pod %s annotating failed. Err %v", pod.GetName(), err.Error()) err = fmt.Errorf("pod %s annotation failed: %w", pod.GetName(), err) @@ -183,7 +198,7 @@ func getNodeGPUList(node *v1.Node) []string { return nil } - var cards = []string{} + cards := []string{} if gpuNumbersValue := concatenateSplitLabel(node, gpuNumbersLabel); gpuNumbersValue != "" { cards = getCardNameSlice(gpuNumbersValue) @@ -272,12 +287,12 @@ func (m *GASExtender) isGPUUsable(gpuName string, node *v1.Node, pod *v1.Pod) bo // 3) there is an allowlist-annotation in the Pod, and it contains the given GPU name -> true. func (m *GASExtender) isGPUAllowed(gpuName string, pod *v1.Pod) bool { if !m.allowlistEnabled || pod.Annotations == nil { - klog.V(l5).InfoS("gpu allowed", "gpuName", gpuName, "podName", pod.Name, "allowlistEnabled", m.allowlistEnabled) + klog.V(logL5).InfoS("gpu allowed", "gpuName", gpuName, "podName", pod.Name, "allowlistEnabled", m.allowlistEnabled) return true } - allow := false + var allow bool csvAllowlist, ok := pod.Annotations[allowlistAnnotationName] if ok { @@ -287,7 +302,7 @@ func (m *GASExtender) isGPUAllowed(gpuName string, pod *v1.Pod) bool { allow = true } - klog.V(l4).InfoS("gpu allow status", + klog.V(logL4).InfoS("gpu allow status", "allow", allow, "gpuName", gpuName, "podName", pod.Name, "allowlist", csvAllowlist) return allow @@ -299,7 +314,8 @@ func (m *GASExtender) isGPUAllowed(gpuName string, pod *v1.Pod) bool { // Otherwise, GPU is not considered denied. Usage of allowlist at the same time, might make it in practice denied. func (m *GASExtender) isGPUDenied(gpuName string, pod *v1.Pod) bool { if !m.denylistEnabled || pod.Annotations == nil { - klog.V(l5).InfoS("gpu use not denied", "gpuName", gpuName, "podName", pod.Name, "denylistEnabled", m.denylistEnabled) + klog.V(logL5).InfoS("gpu use not denied", + "gpuName", gpuName, "podName", pod.Name, "denylistEnabled", m.denylistEnabled) return false } @@ -312,7 +328,7 @@ func (m *GASExtender) isGPUDenied(gpuName string, pod *v1.Pod) bool { deny = deniedGPUs[gpuName] } - klog.V(l4).InfoS("gpu deny status", "deny", deny, "gpuName", gpuName, "podName", pod.Name, "denylist", csvDenylist) + klog.V(logL4).InfoS("gpu deny status", "deny", deny, "gpuName", gpuName, "podName", pod.Name, "denylist", csvDenylist) return deny } @@ -361,7 +377,8 @@ func movePreferredCardToFront(gpuNames []string, preferredCard string) { // The given gpuNames array must be sorted. func arrangeGPUNamesPerResourceAvailability(nodeResourcesUsed nodeResources, - gpuNames []string, balancedResource string) { + gpuNames []string, balancedResource string, +) { keys := make([]string, 0, len(gpuNames)) keys = append(keys, gpuNames...) @@ -390,7 +407,8 @@ func getSortedGPUNamesForNode(nodeResourcesUsed nodeResources) []string { } func (m *GASExtender) createTileAnnotation(card Card, numCards int64, containerRequest, perGPUCapacity resourceMap, - node *v1.Node, currentlyAllocatingTilesMap map[string][]int, preferredTiles []int) string { + node *v1.Node, currentlyAllocatingTilesMap map[string][]int, preferredTiles []int, +) string { requestedTiles := containerRequest[gpuTileResource] requestedTilesPerGPU := requestedTiles / numCards @@ -439,7 +457,8 @@ func (m *GASExtender) createTileAnnotation(card Card, numCards int64, containerR } func (m *GASExtender) getFreeTiles(tileCapacityPerGPU int64, node *v1.Node, - gpuName string, currentlyAllocatingTilesMap map[string][]int) []int { + gpuName string, currentlyAllocatingTilesMap map[string][]int, +) []int { nTiles := iCache.GetNodeTileStatus(m.cache, node.Name) freeTilesMap := map[int]bool{} @@ -469,9 +488,10 @@ func (m *GASExtender) getFreeTiles(tileCapacityPerGPU int64, node *v1.Node, } func (m *GASExtender) checkGpuAvailability(gpuName string, node *v1.Node, pod *v1.Pod, - usedGPUmap map[string]bool, gpuMap map[string]bool) bool { + usedGPUmap map[string]bool, gpuMap map[string]bool, +) bool { if usedGPUmap[gpuName] { - klog.V(l4).Infof("gpu %v is already used for this container", gpuName) + klog.V(logL4).Infof("gpu %v is already used for this container", gpuName) return false } @@ -482,7 +502,7 @@ func (m *GASExtender) checkGpuAvailability(gpuName string, node *v1.Node, pod *v // skip GPUs which are not usable and continue to next if need be if !m.isGPUUsable(gpuName, node, pod) { - klog.V(l4).Infof("node %v gpu %v is not usable, skipping it", node.Name, gpuName) + klog.V(logL4).Infof("node %v gpu %v is not usable, skipping it", node.Name, gpuName) return false } @@ -506,26 +526,24 @@ func (m *GASExtender) findXeLinkedGPUPair(gpuNames []string, nodeResourcesUsed nodeResources, availableTiles, nodeTilesAllocating nodeTiles, perGPUResourceRequest, perGPUCapacity resourceMap, - gpuMap, usedGPUmap map[string]bool) (cards []Card, err error) { - cards = []Card{} - err = errWontFit + gpuMap, usedGPUmap map[string]bool, +) ([]Card, error) { + cards := []Card{} + err := errWontFit found := false for _, gpuName := range gpuNames { usedResMap := nodeResourcesUsed[gpuName] - klog.V(l4).Info("Checking gpu ", gpuName) + klog.V(logL4).Info("Checking gpu ", gpuName) - if !m.checkGpuAvailability(gpuName, node, pod, usedGPUmap, gpuMap) { - continue - } - - if !checkResourceCapacity(perGPUResourceRequest, perGPUCapacity, usedResMap) { + if !m.checkGpuAvailability(gpuName, node, pod, usedGPUmap, gpuMap) || + !checkResourceCapacity(perGPUResourceRequest, perGPUCapacity, usedResMap) { continue } for _, tileIndex := range availableTiles[gpuName] { linkedGpuName, linkedTileID := getXeLinkedGPUInfo(gpuName, tileIndex, node) - klog.V(l4).Infof("Checking linked gpu %v tile id %v", gpuName, linkedTileID) + klog.V(logL4).Infof("Checking linked gpu %v tile id %v", gpuName, linkedTileID) if !m.checkGpuAvailability(linkedGpuName, node, pod, usedGPUmap, gpuMap) { continue @@ -534,26 +552,22 @@ func (m *GASExtender) findXeLinkedGPUPair(gpuNames []string, linkedGpuUsedResMap := nodeResourcesUsed[linkedGpuName] if contains, _ := containsInt(availableTiles[linkedGpuName], linkedTileID); contains && checkResourceCapacity(perGPUResourceRequest, perGPUCapacity, linkedGpuUsedResMap) { - err = usedResMap.addRM(perGPUResourceRequest) - if err != nil { - return []Card{}, err - } + // can't fail, checked with checkResourceCapacity at around line 540 + _ = usedResMap.addRM(perGPUResourceRequest) - err = linkedGpuUsedResMap.addRM(perGPUResourceRequest) - if err != nil { - err2 := usedResMap.subtractRM(perGPUResourceRequest) - klog.Errorf("resource addition failure: %v, subtraction result: %v", err.Error(), err2) + // can't fail, checked with checkResourceCapacity at around line 554 + _ = linkedGpuUsedResMap.addRM(perGPUResourceRequest) - return []Card{}, err - } - - klog.V(l4).Infof("gpu %v tile id %v and linked gpu %v tile id %v fits", + klog.V(logL4).Infof("gpu %v tile id %v and linked gpu %v tile id %v fits", gpuName, tileIndex, linkedGpuName, linkedTileID) found = true + err = nil - cards = append(cards, []Card{{gpuName: gpuName, xeLinkedTileIds: []int{tileIndex}}, - {gpuName: linkedGpuName, xeLinkedTileIds: []int{linkedTileID}}}...) + cards = append(cards, []Card{ + {gpuName: gpuName, xeLinkedTileIds: []int{tileIndex}}, + {gpuName: linkedGpuName, xeLinkedTileIds: []int{linkedTileID}}, + }...) usedGPUmap[gpuName] = true usedGPUmap[linkedGpuName] = true @@ -577,8 +591,11 @@ func (m *GASExtender) getXELinkedCardsForContainerGPURequest(containerRequest, p node *v1.Node, pod *v1.Pod, nodeResourcesUsed nodeResources, nodeTilesAllocating nodeTiles, - gpuMap map[string]bool) (cards []Card, preferred bool, err error) { - cards = []Card{} + gpuMap map[string]bool, +) ([]Card, bool, error) { + var preferred bool + + cards := []Card{} if len(containerRequest) == 0 { return cards, preferred, nil @@ -611,7 +628,6 @@ func (m *GASExtender) getXELinkedCardsForContainerGPURequest(containerRequest, p cardPair, err := m.findXeLinkedGPUPair(gpuNames, node, pod, nodeResourcesUsed, availableTiles, nodeTilesAllocating, perGPUResourceRequest, perGPUCapacity, gpuMap, usedGPUmap) - if err != nil { return []Card{}, preferred, err } @@ -627,13 +643,23 @@ func (m *GASExtender) getXELinkedCardsForContainerGPURequest(containerRequest, p func (m *GASExtender) getCardsForContainerGPURequest(containerRequest, perGPUCapacity resourceMap, node *v1.Node, pod *v1.Pod, nodeResourcesUsed nodeResources, - gpuMap map[string]bool) (cards []Card, preferred bool, err error) { - cards = []Card{} - + gpuMap map[string]bool, +) ([]Card, bool, error) { if len(containerRequest) == 0 { - return cards, preferred, nil + return []Card{}, false, nil } + return m.getCardsForContainerGPURequestImpl(containerRequest, perGPUCapacity, node, pod, nodeResourcesUsed, gpuMap) +} + +func (m *GASExtender) getCardsForContainerGPURequestImpl(containerRequest, perGPUCapacity resourceMap, + node *v1.Node, pod *v1.Pod, + nodeResourcesUsed nodeResources, + gpuMap map[string]bool, +) ([]Card, bool, error) { + var preferred bool + + cards := []Card{} usedGPUmap := map[string]bool{} // figure out container resources per gpu @@ -653,31 +679,33 @@ func (m *GASExtender) getCardsForContainerGPURequest(containerRequest, perGPUCap for gpuIndex, gpuName := range gpuNames { usedResMap := nodeResourcesUsed[gpuName] - klog.V(l4).Info("Checking gpu ", gpuName) + klog.V(logL4).Info("Checking gpu ", gpuName) if !m.checkGpuAvailability(gpuName, node, pod, usedGPUmap, gpuMap) { continue } if checkResourceCapacity(perGPUResourceRequest, perGPUCapacity, usedResMap) { - err := usedResMap.addRM(perGPUResourceRequest) - if err == nil { - fitted = true - - if gpuIndex == 0 && preferredCardAtFront { - preferred = true - } + // can't fail, checked with checkResourceCapacity above + _ = usedResMap.addRM(perGPUResourceRequest) + fitted = true - cards = append(cards, Card{gpuName: gpuName}) - usedGPUmap[gpuName] = true + if gpuIndex == 0 && preferredCardAtFront { + preferred = true } + cards = append(cards, Card{ + gpuName: gpuName, + xeLinkedTileIds: []int{}, + }) + usedGPUmap[gpuName] = true + break } } if !fitted { - klog.V(l4).Infof("pod %v will not fit node %v", pod.Name, node.Name) + klog.V(logL4).Infof("pod %v will not fit node %v", pod.Name, node.Name) return nil, false, errWontFit } @@ -741,7 +769,6 @@ func combineSamegpuResourceRequests(indexMap map[int]bool, resourceRequests []re func (m *GASExtender) getNodeForName(name string) (*v1.Node, error) { node, err := iCache.FetchNode(m.cache, name) - if err != nil { klog.Warningf("Node %s couldn't be read or node vanished", name) @@ -822,7 +849,7 @@ func (m *GASExtender) checkForSpaceAndRetrieveCards(pod *v1.Pod, node *v1.Node) } gpus := getNodeGPUList(node) - klog.V(l4).Infof("Node %v gpu list: %v", node.Name, gpus) + klog.V(logL4).Infof("Node %v gpu list: %v", node.Name, gpus) gpuCount := len(gpus) if gpuCount == 0 { @@ -832,8 +859,8 @@ func (m *GASExtender) checkForSpaceAndRetrieveCards(pod *v1.Pod, node *v1.Node) } perGPUCapacity := getPerGPUResourceCapacity(node, gpuCount) - nodeResourcesUsed, err := m.readNodeResources(node.Name) + nodeResourcesUsed, err := m.readNodeResources(node.Name) if err != nil { klog.Warningf("Node %s resources couldn't be read or node vanished", node.Name) @@ -849,14 +876,14 @@ func (m *GASExtender) checkForSpaceAndRetrieveCards(pod *v1.Pod, node *v1.Node) tilesPerGpu := perGPUCapacity[gpuTileResource] unavailableResources := m.createUnavailableNodeResources(node, tilesPerGpu) - klog.V(l4).Infof("Node %v unavailable resources: %v", node.Name, unavailableResources) + klog.V(logL4).Infof("Node %v unavailable resources: %v", node.Name, unavailableResources) // add unavailable resources as used, unavailable resources are // (possible) unused resources but are marked as do-not-use externally // e.g. too high temperature detected on a particular resource addUnavailableToUsedResources(nodeResourcesUsed, unavailableResources) - klog.V(l4).Infof("Node %v used resources: %v", node.Name, nodeResourcesUsed) + klog.V(logL4).Infof("Node %v used resources: %v", node.Name, nodeResourcesUsed) containerCards, preferred, err = m.checkForSpaceResourceRequests( perGPUCapacity, pod, node, nodeResourcesUsed, gpuMaps) @@ -865,7 +892,8 @@ func (m *GASExtender) checkForSpaceAndRetrieveCards(pod *v1.Pod, node *v1.Node) } func (m *GASExtender) checkForSpaceResourceRequests(perGPUCapacity resourceMap, pod *v1.Pod, node *v1.Node, - nodeResourcesUsed nodeResources, gpuMaps []map[string]bool) ([][]Card, bool, error) { + nodeResourcesUsed nodeResources, gpuMaps []map[string]bool, +) ([][]Card, bool, error) { var err error var cards []Card @@ -892,9 +920,9 @@ func (m *GASExtender) checkForSpaceResourceRequests(perGPUCapacity resourceMap, nodeTilesAllocating := nodeTiles{} - for i, containerRequest := range allContainerRequests { - if samegpuIndexMap[i] { - klog.V(l4).Infof("found container %v in same-gpu list", i) + for index, containerRequest := range allContainerRequests { + if samegpuIndexMap[index] { + klog.V(logL4).Infof("found container %v in same-gpu list", index) containerCards = append(containerCards, samegpuCard) @@ -903,7 +931,7 @@ func (m *GASExtender) checkForSpaceResourceRequests(perGPUCapacity resourceMap, // loop through gpu maps per numa node, or all gpus if single numa allocation is not requested for _, gpuMap := range gpuMaps { - klog.V(l4).Infof("getting cards for container %v", i) + klog.V(logL4).Infof("getting cards for container %v", index) if _, ok := pod.Annotations[xelinkAnnotationName]; ok { cards, preferred, err = m.getXELinkedCardsForContainerGPURequest(containerRequest, perGPUCapacity, @@ -921,7 +949,7 @@ func (m *GASExtender) checkForSpaceResourceRequests(perGPUCapacity resourceMap, } if err != nil { - klog.V(l4).Infof("Node %v container %v out of %v did not fit", node.Name, i+1, len(allContainerRequests)) + klog.V(logL4).Infof("Node %v container %v out of %v did not fit", node.Name, index+1, len(allContainerRequests)) return containerCards, preferred, err } @@ -932,7 +960,8 @@ func (m *GASExtender) checkForSpaceResourceRequests(perGPUCapacity resourceMap, func (m *GASExtender) getCardForSamegpu(samegpuIndexMap map[int]bool, allContainerRequests []resourceMap, perGPUCapacity resourceMap, node *v1.Node, pod *v1.Pod, nodeResourcesUsed nodeResources, - gpuMap map[string]bool) ([]Card, bool, error) { + gpuMap map[string]bool, +) ([]Card, bool, error) { gpuMapCopy := deepCopySimpleMap(gpuMap) if err := sanitizeSamegpuResourcesRequest(samegpuIndexMap, allContainerRequests); err != nil { @@ -955,7 +984,7 @@ func (m *GASExtender) getCardForSamegpu(samegpuIndexMap map[int]bool, allContain samegpuCard, preferred, err := m.getCardsForContainerGPURequest( combinedResourcesRequest, perGPUCapacity, node, pod, nodeResourcesUsed, gpuMapCopy) if err != nil { - klog.V(l4).Infof("Node %v same-gpu containers of pod %v did not fit", node.Name, pod.Name) + klog.V(logL4).Infof("Node %v same-gpu containers of pod %v did not fit", node.Name, pod.Name) return []Card{}, false, err } @@ -969,8 +998,8 @@ func (m *GASExtender) getCardForSamegpu(samegpuIndexMap map[int]bool, allContain return []Card{}, false, err } - klog.V(l4).Infof("Pod %v same-gpu containers fit to node %v", pod.Name, node.Name) - klog.V(l4).Infof("Node %v used resources: %v", node.Name, nodeResourcesUsed) + klog.V(logL4).Infof("Pod %v same-gpu containers fit to node %v", pod.Name, node.Name) + klog.V(logL4).Infof("Node %v used resources: %v", node.Name, nodeResourcesUsed) return samegpuCard, preferred, nil } @@ -978,9 +1007,13 @@ func (m *GASExtender) getCardForSamegpu(samegpuIndexMap map[int]bool, allContain // convertNodeCardsToAnnotations converts given container cards into card and tile // annotation strings. func (m *GASExtender) convertNodeCardsToAnnotations(pod *v1.Pod, - node *v1.Node, containerCards [][]Card) (annotation, tileAnnotation string) { + node *v1.Node, containerCards [][]Card, +) (string, string) { + annotation := "" + tileAnnotation := "" gpuCount := len(getNodeGPUList(node)) - klog.V(l4).Info("Node gpu count:", gpuCount) + + klog.V(logL4).Info("Node gpu count:", gpuCount) perGPUCapacity := getPerGPUResourceCapacity(node, gpuCount) @@ -1059,7 +1092,8 @@ func (m *GASExtender) createUnavailableTilesStat(node *v1.Node, tilesPerGpu int, func (m *GASExtender) createAvailableXeLinkedTilesStat(node *v1.Node, tileCapacityPerGPU int, gpuNames []string, - nodeTilesAllocating nodeTiles) nodeTiles { + nodeTilesAllocating nodeTiles, +) nodeTiles { availableTiles := nodeTiles{} unavailableTiles := m.createUnavailableTilesStat(node, tileCapacityPerGPU, nodeTilesAllocating) @@ -1126,7 +1160,7 @@ func checkResourceCapacity(neededResources, capacity, used resourceMap) bool { resCapacity, ok := capacity[resName] if !ok || resCapacity <= 0 { - klog.V(l4).Info(" no capacity available for ", resName) + klog.V(logL4).Info(" no capacity available for ", resName) return false } @@ -1139,7 +1173,7 @@ func checkResourceCapacity(neededResources, capacity, used resourceMap) bool { return false } - klog.V(l4).Info(" resource ", resName, " capacity:", strconv.FormatInt(resCapacity, base10), " used:", + klog.V(logL4).Info(" resource ", resName, " capacity:", strconv.FormatInt(resCapacity, base10), " used:", strconv.FormatInt(resUsed, base10), " need:", strconv.FormatInt(resNeed, base10)) if resUsed+resNeed < 0 { @@ -1149,13 +1183,13 @@ func checkResourceCapacity(neededResources, capacity, used resourceMap) bool { } if resCapacity < resUsed+resNeed { - klog.V(l4).Info(" not enough resources") + klog.V(logL4).Info(" not enough resources") return false } } - klog.V(l4).Info(" there is enough resources") + klog.V(logL4).Info(" there is enough resources") return true } @@ -1178,18 +1212,75 @@ func (m *GASExtender) retrievePod(podName, podNamespace string, uid types.UID) ( return pod, nil } +func createBindResult() *extender.BindingResult { + return &extender.BindingResult{ + Error: "", + } +} + +func createV1Binding(args *extender.BindingArgs) *v1.Binding { + return &v1.Binding{ + TypeMeta: metav1.TypeMeta{ + Kind: "", + APIVersion: "", + }, + ObjectMeta: metav1.ObjectMeta{ + Name: args.PodName, + GenerateName: "", + Namespace: "", + SelfLink: "", + UID: args.PodUID, + ResourceVersion: "", + Generation: 0, + CreationTimestamp: metav1.Time{ + Time: time.Time{}, + }, + DeletionTimestamp: &metav1.Time{ + Time: time.Time{}, + }, + DeletionGracePeriodSeconds: new(int64), + Labels: map[string]string{}, + Annotations: map[string]string{}, + OwnerReferences: []metav1.OwnerReference{}, + Finalizers: []string{}, + ManagedFields: []metav1.ManagedFieldsEntry{}, + }, + Target: v1.ObjectReference{ + Kind: "Node", + Namespace: "", + Name: args.Node, + UID: "", + APIVersion: "", + ResourceVersion: "", + FieldPath: "", + }, + } +} + +func createOptions() *metav1.CreateOptions { + return &metav1.CreateOptions{ + TypeMeta: metav1.TypeMeta{ + Kind: "", + APIVersion: "", + }, + DryRun: []string{}, + FieldManager: "", + FieldValidation: "", + } +} + func (m *GASExtender) bindNode(ctx context.Context, args *extender.BindingArgs) *extender.BindingResult { - result := extender.BindingResult{} + result := createBindResult() pod, err := m.retrievePod(args.PodName, args.PodNamespace, args.PodUID) if err != nil { result.Error = err.Error() - return &result + return result } m.rwmutex.Lock() - klog.V(l5).Infof("bind %v:%v to node %v locked", args.PodNamespace, args.PodName, args.Node) + klog.V(logL5).Infof("bind %v:%v to node %v locked", args.PodNamespace, args.PodName, args.Node) defer m.rwmutex.Unlock() resourcesAdjusted := false @@ -1213,42 +1304,60 @@ func (m *GASExtender) bindNode(ctx context.Context, args *extender.BindingArgs) // pod should always fit, but one never knows if something bad happens between filtering and binding node, err := m.getNodeForName(args.Node) if err != nil { - return &result + return result } cards, _, err := m.checkForSpaceAndRetrieveCards(pod, node) if err != nil { - return &result + return result } annotation, tileAnnotation = m.convertNodeCardsToAnnotations(pod, node, cards) if annotation == "" { - return &result + return result } - klog.V(l3).Infof("bind %v:%v to node %v annotation %v tileAnnotation %v", + klog.V(logL3).Infof("bind %v:%v to node %v annotation %v tileAnnotation %v", args.PodNamespace, args.PodName, args.Node, annotation, tileAnnotation) err = iCache.AdjustPodResourcesL(m.cache, pod, add, annotation, tileAnnotation, args.Node) if err != nil { - return &result + return result } resourcesAdjusted = true err = m.annotatePodBind(ctx, annotation, tileAnnotation, pod) // annotate POD with per-container GPU selection if err != nil { - return &result + return result } - binding := &v1.Binding{ - ObjectMeta: metav1.ObjectMeta{Name: args.PodName, UID: args.PodUID}, - Target: v1.ObjectReference{Kind: "Node", Name: args.Node}, - } - opts := metav1.CreateOptions{} - err = m.clientset.CoreV1().Pods(args.PodNamespace).Bind(ctx, binding, opts) + binding := createV1Binding(args) + opts := createOptions() + err = m.clientset.CoreV1().Pods(args.PodNamespace).Bind(ctx, binding, *opts) + + return result +} - return &result +func createFilterResult() *extender.FilterResult { + return &extender.FilterResult{ + Nodes: &v1.NodeList{ + TypeMeta: metav1.TypeMeta{ + Kind: "", + APIVersion: "", + }, + ListMeta: metav1.ListMeta{ + SelfLink: "", + ResourceVersion: "", + Continue: "", + RemainingItemCount: new(int64), + }, + Items: []v1.Node{}, + }, + NodeNames: &[]string{}, + FailedNodes: map[string]string{}, + Error: "", + } } // filterNodes takes in the arguments for the scheduler and filters nodes based on @@ -1259,18 +1368,18 @@ func (m *GASExtender) filterNodes(args *extender.Args) *extender.FilterResult { var preferredNodeNames []string failedNodes := extender.FailedNodesMap{} - result := extender.FilterResult{} + result := createFilterResult() if args.NodeNames == nil || len(*args.NodeNames) == 0 { result.Error = "No nodes to compare. " + "This should not happen, perhaps the extender is misconfigured with NodeCacheCapable == false." klog.Error(result.Error) - return &result + return result } m.rwmutex.Lock() - klog.V(l5).Infof("filter %v:%v from %v locked", args.Pod.Namespace, args.Pod.Name, *args.NodeNames) + klog.V(logL5).Infof("filter %v:%v from %v locked", args.Pod.Namespace, args.Pod.Name, *args.NodeNames) defer m.rwmutex.Unlock() for _, nodeName := range *args.NodeNames { @@ -1292,41 +1401,38 @@ func (m *GASExtender) filterNodes(args *extender.Args) *extender.FilterResult { } } - result = extender.FilterResult{ - NodeNames: &nodeNames, - FailedNodes: failedNodes, - Error: "", - } + result.NodeNames = &nodeNames + result.FailedNodes = failedNodes + result.Error = "" if len(preferredNodeNames) > 0 { result.NodeNames = &preferredNodeNames } - return &result + return result } // decodeRequest reads the json request into the given interface args. // It returns an error if the request is not in the required format. -func (m *GASExtender) decodeRequest(args interface{}, r *http.Request) error { - if r.Body == nil { +func (m *GASExtender) decodeRequest(args interface{}, request *http.Request) error { + if request.Body == nil { return errEmptyBody } - if klog.V(l5).Enabled() { - requestDump, err := httputil.DumpRequest(r, true) + if klog.V(logL5).Enabled() { + requestDump, err := httputil.DumpRequest(request, true) if err == nil { klog.Infof("http-request:\n%v", string(requestDump)) } } - decoder := json.NewDecoder(r.Body) + decoder := json.NewDecoder(request.Body) if err := decoder.Decode(&args); err != nil { return errDecode } - err := r.Body.Close() - + err := request.Body.Close() if err != nil { err = fmt.Errorf("failed to close request body: %w", err) } @@ -1344,21 +1450,23 @@ func (m *GASExtender) writeResponse(w http.ResponseWriter, result interface{}) { // Prioritize manages all prioritize requests from the scheduler extender. // Not implemented yet by GAS, hence response with StatusNotFound. -func (m *GASExtender) Prioritize(w http.ResponseWriter, r *http.Request) { +func (m *GASExtender) Prioritize(w http.ResponseWriter, _ *http.Request) { w.WriteHeader(http.StatusNotFound) } // Filter manages all filter requests from the scheduler. First it decodes the request, // then it calls the filter logic and writes a response to the scheduler. -func (m *GASExtender) Filter(w http.ResponseWriter, r *http.Request) { - klog.V(l4).Info("filter request received") +func (m *GASExtender) Filter(writer http.ResponseWriter, request *http.Request) { + klog.V(logL4).Info("filter request received") + // extenderArgs is too big of a struct for any sane create-function, funlen would fail + //nolint:exhaustruct extenderArgs := extender.Args{} - err := m.decodeRequest(&extenderArgs, r) + err := m.decodeRequest(&extenderArgs, request) if err != nil { klog.Errorf("cannot decode request %v", err) - w.WriteHeader(http.StatusNotFound) + writer.WriteHeader(http.StatusNotFound) return } @@ -1366,35 +1474,40 @@ func (m *GASExtender) Filter(w http.ResponseWriter, r *http.Request) { filteredNodes := m.filterNodes(&extenderArgs) if filteredNodes.Error != "" { klog.Error("filtering failed") - w.WriteHeader(http.StatusNotFound) + writer.WriteHeader(http.StatusNotFound) } - m.writeResponse(w, filteredNodes) - klog.V(l4).Info("filter function done, responded") + m.writeResponse(writer, filteredNodes) + klog.V(logL4).Info("filter function done, responded") } // Bind binds the pod to the node. -func (m *GASExtender) Bind(w http.ResponseWriter, r *http.Request) { - klog.V(l4).Info("bind request received") +func (m *GASExtender) Bind(writer http.ResponseWriter, request *http.Request) { + klog.V(logL4).Info("bind request received") - extenderArgs := extender.BindingArgs{} - err := m.decodeRequest(&extenderArgs, r) + extenderArgs := extender.BindingArgs{ + PodName: "", + PodNamespace: "", + PodUID: "", + Node: "", + } + err := m.decodeRequest(&extenderArgs, request) if err != nil { klog.Errorf("cannot decode request %v", err) - w.WriteHeader(http.StatusNotFound) + writer.WriteHeader(http.StatusNotFound) return } - result := m.bindNode(r.Context(), &extenderArgs) + result := m.bindNode(request.Context(), &extenderArgs) if result.Error != "" { klog.Error("bind failed") - w.WriteHeader(http.StatusNotFound) + writer.WriteHeader(http.StatusNotFound) } - m.writeResponse(w, result) - klog.V(l4).Info("bind function done, responded") + m.writeResponse(writer, result) + klog.V(logL4).Info("bind function done, responded") } // error handler deals with requests sent to an invalid endpoint and returns a 404. @@ -1455,14 +1568,15 @@ func containersRequestingSamegpu(pod *v1.Pod) (map[string]bool, error) { samegpuMap[containerName] = true } - klog.V(l4).Infof("Successfully parsed %v annotation in pod %v", + klog.V(logL4).Infof("Successfully parsed %v annotation in pod %v", samegpuAnnotationName, pod.Name) return samegpuMap, nil } func sanitizeSamegpuResourcesRequest( - samegpuIndexMap map[int]bool, allResourceRequests []resourceMap) error { + samegpuIndexMap map[int]bool, allResourceRequests []resourceMap, +) error { if len(samegpuIndexMap) == 0 { return nil } diff --git a/gpu-aware-scheduling/pkg/gpuscheduler/scheduler_test.go b/gpu-aware-scheduling/pkg/gpuscheduler/scheduler_test.go index 1519e8a2..2706a751 100644 --- a/gpu-aware-scheduling/pkg/gpuscheduler/scheduler_test.go +++ b/gpu-aware-scheduling/pkg/gpuscheduler/scheduler_test.go @@ -276,13 +276,13 @@ func (t *testWriter) WriteHeader(statusCode int) { } func TestErrorHandler(t *testing.T) { - w := testWriter{headerStatus: 0} + writer := testWriter{headerStatus: 0} Convey("When error handler is called", t, func() { gas := getEmptyExtender() - gas.errorHandler(&w, nil) - So(w.headerStatus, ShouldEqual, http.StatusNotFound) + gas.errorHandler(&writer, nil) + So(writer.headerStatus, ShouldEqual, http.StatusNotFound) }) } @@ -581,7 +581,10 @@ func TestPreferredGPU(t *testing.T) { gpuMap) So(len(cards), ShouldEqual, 1) - So(cards[0], ShouldResemble, Card{gpuName: "card0"}) + So(cards[0], ShouldResemble, Card{ + gpuName: "card0", + xeLinkedTileIds: []int{}, + }) So(err, ShouldBeNil) So(preferred, ShouldBeFalse) }) @@ -594,7 +597,10 @@ func TestPreferredGPU(t *testing.T) { gpuMap) So(len(cards), ShouldEqual, 1) - So(cards[0], ShouldResemble, Card{gpuName: "card2"}) + So(cards[0], ShouldResemble, Card{ + gpuName: "card2", + xeLinkedTileIds: []int{}, + }) So(err, ShouldBeNil) So(preferred, ShouldBeTrue) }) @@ -604,14 +610,14 @@ func TestFilter(t *testing.T) { gas := getEmptyExtender() Convey("When Filter is called", t, func() { - w := testWriter{} - r := http.Request{} + writer := testWriter{} + request := http.Request{} Convey("when args are fine but request body is empty", func() { - r.Method = http.MethodPost - r.ContentLength = 100 - r.Header = http.Header{} - r.Header.Set("Content-Type", "application/json") - gas.Filter(&w, &r) + request.Method = http.MethodPost + request.ContentLength = 100 + request.Header = http.Header{} + request.Header.Set("Content-Type", "application/json") + gas.Filter(&writer, &request) }) Convey("when args are fine but request body is ok", func() { content, err := json.Marshal(map[string]string{"foo": "bar"}) @@ -620,7 +626,7 @@ func TestFilter(t *testing.T) { http.MethodPost, "http://foo/bar", bytes.NewBuffer(content)) So(err, ShouldBeNil) request.Header.Set("Content-Type", "application/json") - gas.Filter(&w, request) + gas.Filter(&writer, request) }) }) } @@ -633,14 +639,14 @@ func TestBind(t *testing.T) { iCache = &mockCache Convey("When Bind is called", t, func() { - w := testWriter{} - r := http.Request{} + writer := testWriter{} + request := http.Request{} Convey("when args are fine but request body is empty", func() { - r.Method = http.MethodPost - r.ContentLength = 100 - r.Header = http.Header{} - r.Header.Set("Content-Type", "application/json") - gas.Bind(&w, &r) + request.Method = http.MethodPost + request.ContentLength = 100 + request.Header = http.Header{} + request.Header.Set("Content-Type", "application/json") + gas.Bind(&writer, &request) }) Convey("when args are fine but request body is ok", func() { content, err := json.Marshal(map[string]string{"foo": "bar"}) @@ -650,7 +656,7 @@ func TestBind(t *testing.T) { So(err, ShouldBeNil) request.Header.Set("Content-Type", "application/json") mockCache.On("FetchPod", mock.Anything, mock.Anything, mock.Anything).Return(nil, errMock).Once() - gas.Bind(&w, request) + gas.Bind(&writer, request) }) }) @@ -676,7 +682,8 @@ func TestGetNodeGPUListFromGpuNumbers(t *testing.T) { node := v1.Node{ ObjectMeta: metav1.ObjectMeta{ Labels: map[string]string{ - gpuNumbersLabel: "0.1.2"}, + gpuNumbersLabel: "0.1.2", + }, }, } @@ -690,7 +697,8 @@ func TestGetNodeGPUListFromGpuNumbers(t *testing.T) { Labels: map[string]string{ gpuNumbersLabel: "0.1.2", gpuNumbersLabel + "2": "Z.5.8.9", - gpuNumbersLabel + "3": "Z.10"}, + gpuNumbersLabel + "3": "Z.10", + }, }, } @@ -834,8 +842,10 @@ func TestResourceBalancedCardsForContainerGPURequest(t *testing.T) { containerRequest := resourceMap{"gpu.intel.com/i915": 1, "gpu.intel.com/foo": 1} perGPUCapacity := resourceMap{"gpu.intel.com/i915": 1, "gpu.intel.com/foo": 4} - nodeResourcesUsed := nodeResources{"card0": resourceMap{"gpu.intel.com/foo": 1}, - "card1": resourceMap{"gpu.intel.com/foo": 2}, "card2": resourceMap{}} + nodeResourcesUsed := nodeResources{ + "card0": resourceMap{"gpu.intel.com/foo": 1}, + "card1": resourceMap{"gpu.intel.com/foo": 2}, "card2": resourceMap{}, + } gpuMap := map[string]bool{"card0": true, "card1": true, "card2": true} Convey("When GPUs are resource balanced, the least consumed GPU should be used", t, func() { @@ -845,7 +855,10 @@ func TestResourceBalancedCardsForContainerGPURequest(t *testing.T) { gpuMap) So(len(cards), ShouldEqual, 1) - So(cards[0], ShouldResemble, Card{gpuName: "card2"}) + So(cards[0], ShouldResemble, Card{ + gpuName: "card2", + xeLinkedTileIds: []int{}, + }) So(err, ShouldBeNil) So(preferred, ShouldBeFalse) }) @@ -879,15 +892,17 @@ func TestFilterWithXeLinkedDisabledTiles(t *testing.T) { }, { description: "when two tiles are disabled and there are no good xe-links left", - extraLabels: map[string]string{tasNSPrefix + "policy/" + tileDisableLabelPrefix + "card0_gt0": trueValueString, - tasNSPrefix + "policy/" + tileDisableLabelPrefix + "card2_gt1": trueValueString}, + extraLabels: map[string]string{ + tasNSPrefix + "policy/" + tileDisableLabelPrefix + "card0_gt0": trueValueString, + tasNSPrefix + "policy/" + tileDisableLabelPrefix + "card2_gt1": trueValueString, + }, expectedResult: true, // node fails (is filtered) }, } Convey("When node has four cards with two xelinks and one disabled xe-linked tile, pod should still fit", t, func() { - for _, tc := range testCases { - t.Logf("test %v", tc.description) + for _, testCase := range testCases { + t.Logf("test %v", testCase.description) mockCache.On("FetchPod", mock.Anything, args.PodNamespace, args.PodName).Return(&v1.Pod{ Spec: *getMockPodSpecWithTile(1), @@ -912,7 +927,7 @@ func TestFilterWithXeLinkedDisabledTiles(t *testing.T) { }, }, } - for key, value := range tc.extraLabels { + for key, value := range testCase.extraLabels { node.Labels[key] = value } mockCache.On("FetchNode", mock.Anything, args.Node).Return(&node, nil).Once() @@ -932,7 +947,7 @@ func TestFilterWithXeLinkedDisabledTiles(t *testing.T) { result := gas.filterNodes(&args) So(result.Error, ShouldEqual, "") _, ok := result.FailedNodes[nodename] - So(ok, ShouldEqual, tc.expectedResult) + So(ok, ShouldEqual, testCase.expectedResult) } }) @@ -967,8 +982,8 @@ func TestFilterWithNContainerSameGPU(t *testing.T) { } Convey("When node has 3 i915 left in cards, pod should not fit", t, func() { - for _, tc := range testCases { - t.Logf("test %v", tc.description) + for _, testCase := range testCases { + t.Logf("test %v", testCase.description) node := v1.Node{ ObjectMeta: metav1.ObjectMeta{ Labels: map[string]string{ @@ -986,12 +1001,13 @@ func TestFilterWithNContainerSameGPU(t *testing.T) { }, }, } - for key, value := range tc.extraLabels { + for key, value := range testCase.extraLabels { node.Labels[key] = value } mockCache.On("FetchNode", mock.Anything, args.Node).Return(&node, nil).Once() - usedResources := nodeResources{"card0": resourceMap{"gpu.intel.com/i915": 5, "gpu.intel.com/millicores": 500}, + usedResources := nodeResources{ + "card0": resourceMap{"gpu.intel.com/i915": 5, "gpu.intel.com/millicores": 500}, "card1": resourceMap{"gpu.intel.com/i915": 5, "gpu.intel.com/millicores": 500}, } @@ -1005,7 +1021,7 @@ func TestFilterWithNContainerSameGPU(t *testing.T) { result := gas.filterNodes(&args) So(result.Error, ShouldEqual, "") _, ok := result.FailedNodes[nodename] - So(ok, ShouldEqual, tc.expectedResult) + So(ok, ShouldEqual, testCase.expectedResult) } }) @@ -1100,7 +1116,7 @@ func TestRunSchedulingLogicWithMultiContainerXelinkedTileResourceReq(t *testing. } Convey("When running scheduling logic with multi-container pod with tile request", t, func() { - for _, tc := range testCases { + for _, testCase := range testCases { pod := getFakePod() mockNode := getMockNode(4, 4, "card0", "card1", "card2", "card3") pod.Spec = *getMockPodSpecMultiContXeLinked(2) @@ -1114,18 +1130,18 @@ func TestRunSchedulingLogicWithMultiContainerXelinkedTileResourceReq(t *testing. nodeRes := nodeResources{"card0": resourceMap{"gpu.intel.com/i915": 0, "gpu.intel.com/tiles": 0}} noTilesInUse := nodeTiles{"card0": []int{}} - for key, value := range tc.extraLabels { + for key, value := range testCase.extraLabels { mockNode.Labels[key] = value } - for key, value := range tc.extraAnnotations { + for key, value := range testCase.extraAnnotations { pod.Annotations[key] = value } cardAnnotation := "" tileAnnotation := "" timestampFound := false - applyCheck := func(action k8stesting.Action) (handled bool, ret runtime.Object, err error) { + applyCheck := func(action k8stesting.Action) (bool, runtime.Object, error) { patchAction, _ := action.(k8stesting.PatchAction) patch := patchAction.GetPatch() @@ -1161,8 +1177,8 @@ func TestRunSchedulingLogicWithMultiContainerXelinkedTileResourceReq(t *testing. result := gas.bindNode(ctx, &args) clientset.Fake.ReactionChain = clientset.Fake.ReactionChain[1:] - So(cardAnnotation, ShouldEqual, tc.expectedCardAnnotation) - if tc.defaultTileCheck { + So(cardAnnotation, ShouldEqual, testCase.expectedCardAnnotation) + if testCase.defaultTileCheck { split := strings.Split(tileAnnotation, "|") // Check the tile split between containers So(len(split), ShouldEqual, 2) @@ -1171,8 +1187,8 @@ func TestRunSchedulingLogicWithMultiContainerXelinkedTileResourceReq(t *testing. So(strings.Count(split[1], "card3:gt2"), ShouldEqual, 1) } - So(timestampFound, ShouldEqual, tc.expectTimestamp) - if tc.expectError { + So(timestampFound, ShouldEqual, testCase.expectTimestamp) + if testCase.expectError { So(result.Error, ShouldNotEqual, "") } else { So(result.Error, ShouldEqual, "") @@ -1208,7 +1224,7 @@ func TestRunSchedulingLogicWithMultiContainerTileResourceReq(t *testing.T) { cardAnnotation := "" tileAnnotation := "" timestampFound := false - applyCheck := func(action k8stesting.Action) (handled bool, ret runtime.Object, err error) { + applyCheck := func(action k8stesting.Action) (bool, runtime.Object, error) { patchAction, _ := action.(k8stesting.PatchAction) patch := patchAction.GetPatch() @@ -1293,10 +1309,12 @@ func TestTileDisablingDeschedulingAndPreference(t *testing.T) { Status: v1.NodeStatus{ Capacity: v1.ResourceList{ "gpu.intel.com/i915": resource.MustParse("1"), - "gpu.intel.com/tiles": resource.MustParse("1")}, + "gpu.intel.com/tiles": resource.MustParse("1"), + }, Allocatable: v1.ResourceList{ "gpu.intel.com/i915": resource.MustParse("1"), - "gpu.intel.com/tiles": resource.MustParse("1")}, + "gpu.intel.com/tiles": resource.MustParse("1"), + }, }, }, nil).Once() mockCache.On("GetNodeResourceStatus", mock.Anything, mock.Anything).Return(nodeResources{}, nil).Once() @@ -1326,10 +1344,12 @@ func TestTileDisablingDeschedulingAndPreference(t *testing.T) { Status: v1.NodeStatus{ Capacity: v1.ResourceList{ "gpu.intel.com/i915": resource.MustParse("2"), - "gpu.intel.com/tiles": resource.MustParse("2")}, + "gpu.intel.com/tiles": resource.MustParse("2"), + }, Allocatable: v1.ResourceList{ "gpu.intel.com/i915": resource.MustParse("2"), - "gpu.intel.com/tiles": resource.MustParse("2")}, + "gpu.intel.com/tiles": resource.MustParse("2"), + }, }, }, nil).Once() mockCache.On("GetNodeResourceStatus", mock.Anything, mock.Anything).Return(nodeResources{}, nil).Once() @@ -1344,7 +1364,7 @@ func TestTileDisablingDeschedulingAndPreference(t *testing.T) { Convey("When node has a preferred card label and fits", t, func() { applied := false - applyCheck := func(action k8stesting.Action) (handled bool, ret runtime.Object, err error) { + applyCheck := func(action k8stesting.Action) (bool, runtime.Object, error) { patchAction, _ := action.(k8stesting.PatchAction) requiredStr := "card1" patch := patchAction.GetPatch() @@ -1391,7 +1411,7 @@ func TestTileDisablingDeschedulingAndPreference(t *testing.T) { Convey("When node has a tile preferred-label", t, func() { applied := false - applyCheck := func(action k8stesting.Action) (handled bool, ret runtime.Object, err error) { + applyCheck := func(action k8stesting.Action) (bool, runtime.Object, error) { patchAction, _ := action.(k8stesting.PatchAction) requiredStr := "card0:gt3" patch := patchAction.GetPatch() @@ -1627,7 +1647,8 @@ func TestSanitizeSamegpuResourcesRequest(t *testing.T) { // success samegpuIndexes = map[int]bool{0: true} resourceRequests = []resourceMap{ - {"gpu.intel.com/i915": 1, + { + "gpu.intel.com/i915": 1, "gpu.intel.com/millicores": 100, "gpu.intel.com/memory.max": 8589934592, }, diff --git a/gpu-aware-scheduling/pkg/gpuscheduler/utils.go b/gpu-aware-scheduling/pkg/gpuscheduler/utils.go index bb8b1aea..e59238d4 100644 --- a/gpu-aware-scheduling/pkg/gpuscheduler/utils.go +++ b/gpu-aware-scheduling/pkg/gpuscheduler/utils.go @@ -34,26 +34,29 @@ var ( xeLinkReg = regexp.MustCompile(regexXeLink) ) -type DisabledTilesMap map[string][]int -type DescheduledTilesMap map[string][]int -type PreferredTilesMap map[string][]int +type ( + DisabledTilesMap map[string][]int + DescheduledTilesMap map[string][]int + PreferredTilesMap map[string][]int +) // Return all resources requests and samegpuSearchmap indicating which resourceRequests // should be counted together. samegpuSearchmap is same length as samegpuContainerNames arg, // Key is index of allResource item, value is true if container was listed in same-gpu annotation. func containerRequests(pod *v1.Pod, samegpuContainerNames map[string]bool) ( - map[int]bool, []resourceMap) { + map[int]bool, []resourceMap, +) { samegpuSearchMap := map[int]bool{} allResources := []resourceMap{} for idx, container := range pod.Spec.Containers { - rm := resourceMap{} + resMap := resourceMap{} for name, quantity := range container.Resources.Requests { resourceName := name.String() if strings.HasPrefix(resourceName, gpuPrefix) { value, _ := quantity.AsInt64() - rm[resourceName] = value + resMap[resourceName] = value } } @@ -61,7 +64,7 @@ func containerRequests(pod *v1.Pod, samegpuContainerNames map[string]bool) ( samegpuSearchMap[idx] = true } - allResources = append(allResources, rm) + allResources = append(allResources, resMap) } return samegpuSearchMap, allResources @@ -81,26 +84,27 @@ func addPCIGroupGPUs(node *v1.Node, card string, cards []string) []string { return cards } -func createTileMapping(labels map[string]string) ( - DisabledTilesMap, DescheduledTilesMap, PreferredTilesMap) { - disabled := DisabledTilesMap{} - descheduled := DescheduledTilesMap{} - preferred := PreferredTilesMap{} +func extractCardAndTile(cardTileCombo string) (string, int, error) { + card := "" + tile := -1 - extractCardAndTile := func(cardTileCombo string) (card string, tile int, err error) { - card = "" - tile = -1 + values := cardTileReg.FindStringSubmatch(cardTileCombo) + if len(values) != regexDesiredCount { + return card, tile, errExtractFail + } - values := cardTileReg.FindStringSubmatch(cardTileCombo) - if len(values) != regexDesiredCount { - return card, tile, errExtractFail - } + card = "card" + values[1] + tile, _ = strconv.Atoi(values[2]) - card = "card" + values[1] - tile, _ = strconv.Atoi(values[2]) + return card, tile, nil +} - return card, tile, nil - } +func createTileMapping(labels map[string]string) ( + DisabledTilesMap, DescheduledTilesMap, PreferredTilesMap, +) { + disabled := DisabledTilesMap{} + descheduled := DescheduledTilesMap{} + preferred := PreferredTilesMap{} for label, value := range labels { stripped, ok := labelWithoutTASNS(label) @@ -162,7 +166,8 @@ func createDisabledTileMapping(labels map[string]string) map[string][]int { // creates two card to tile-index maps where first is disabled and second is preferred mapping. func createDisabledAndPreferredTileMapping(labels map[string]string) ( - DisabledTilesMap, PreferredTilesMap) { + DisabledTilesMap, PreferredTilesMap, +) { dis, des, pref := createTileMapping(labels) combineMappings(des, dis) @@ -219,7 +224,7 @@ func concatenateSplitLabel(node *v1.Node, labelName string) string { postFix := 2 value := node.Labels[labelName] - for continuingLabelValue, ok := node.Labels[labelName+strconv.Itoa(postFix)]; ok; { + for continuingLabelValue, ok1 := node.Labels[labelName+strconv.Itoa(postFix)]; ok1; { if !strings.HasPrefix(continuingLabelValue, labelControlChar) { klog.Warningf("concatenated chuck has invalid prefix: %s", continuingLabelValue[:len(labelControlChar)]) @@ -229,7 +234,7 @@ func concatenateSplitLabel(node *v1.Node, labelName string) string { value += continuingLabelValue[len(labelControlChar):] postFix++ - continuingLabelValue, ok = node.Labels[labelName+strconv.Itoa(postFix)] + continuingLabelValue, ok1 = node.Labels[labelName+strconv.Itoa(postFix)] } return value @@ -398,7 +403,12 @@ type linkInfo struct { } func parseXeLink(link string) (linkInfo, error) { - lInfo := linkInfo{} + lInfo := linkInfo{ + lZeroDeviceID: 0, + lZeroSubdeviceID: 0, + linkedLZeroDeviceID: 0, + linkedLZeroSubdeviceID: 0, + } submatches := xeLinkReg.FindStringSubmatch(link) diff --git a/gpu-aware-scheduling/pkg/gpuscheduler/utils_test.go b/gpu-aware-scheduling/pkg/gpuscheduler/utils_test.go index 85ac48cb..1cd74dbb 100644 --- a/gpu-aware-scheduling/pkg/gpuscheduler/utils_test.go +++ b/gpu-aware-scheduling/pkg/gpuscheduler/utils_test.go @@ -40,14 +40,14 @@ func TestGetXeLinkedGPUInfo(t *testing.T) { // remember links are in lzero identifiers, gpu names are numbered from devfs // so 1.0-2.1 = card2-card3 if gpu numbers happen to start from 1 instead of 0 - name, id := getXeLinkedGPUInfo("card2", 0, &node) + name, linkedLZeroSubdeviceID := getXeLinkedGPUInfo("card2", 0, &node) So(name, ShouldEqual, "card3") - So(id, ShouldEqual, 1) + So(linkedLZeroSubdeviceID, ShouldEqual, 1) // no link test - name, id = getXeLinkedGPUInfo("card8", 0, &node) + name, linkedLZeroSubdeviceID = getXeLinkedGPUInfo("card8", 0, &node) So(name, ShouldEqual, "") - So(id, ShouldEqual, -1) + So(linkedLZeroSubdeviceID, ShouldEqual, -1) }) Convey("When gpu-numbers are malformed", t, func() {