Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added ability for vSphere to reconnect on secret update #90836

Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions staging/src/k8s.io/legacy-cloud-providers/vsphere/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ go_test(
"//vendor/github.com/vmware/govmomi/vapi/tags:go_default_library",
"//vendor/github.com/vmware/govmomi/vim25/mo:go_default_library",
"//vendor/github.com/vmware/govmomi/vim25/types:go_default_library",
"//vendor/k8s.io/klog/v2:go_default_library",
],
)

Expand Down
102 changes: 47 additions & 55 deletions staging/src/k8s.io/legacy-cloud-providers/vsphere/nodemanager.go
Original file line number Diff line number Diff line change
Expand Up @@ -281,13 +281,45 @@ func (nm *NodeManager) removeNode(node *v1.Node) {
//
// This method is a getter but it can cause side-effect of updating NodeInfo object.
func (nm *NodeManager) GetNodeInfo(nodeName k8stypes.NodeName) (NodeInfo, error) {
getNodeInfo := func(nodeName k8stypes.NodeName) *NodeInfo {
nm.nodeInfoLock.RLock()
nodeInfo := nm.nodeInfoMap[convertToString(nodeName)]
nm.nodeInfoLock.RUnlock()
return nodeInfo
return nm.getRefreshedNodeInfo(nodeName)
}

// GetNodeDetails returns NodeDetails for all the discovered nodes.
//
// This method is a getter but it can cause side-effect of updating NodeInfo objects.
func (nm *NodeManager) GetNodeDetails() ([]NodeDetails, error) {
nm.registeredNodesLock.Lock()
defer nm.registeredNodesLock.Unlock()
var nodeDetails []NodeDetails

for nodeName, nodeObj := range nm.registeredNodes {
nodeInfo, err := nm.GetNodeInfoWithNodeObject(nodeObj)
if err != nil {
return nil, err
}
klog.V(4).Infof("Updated NodeInfo %v for node %q.", nodeInfo, nodeName)
nodeDetails = append(nodeDetails, NodeDetails{nodeName, nodeInfo.vm, nodeInfo.vmUUID, nodeInfo.zone})
}
nodeInfo := getNodeInfo(nodeName)
return nodeDetails, nil
}

// refreshNodes refreshes the cached NodeInfo of every registered node.
//
// registeredNodesLock is held for the whole pass. A failure on one node does
// not abort the pass: the error is recorded and the remaining nodes are still
// processed. The returned slice is nil when no node failed.
func (nm *NodeManager) refreshNodes() (errList []error) {
	nm.registeredNodesLock.Lock()
	defer nm.registeredNodesLock.Unlock()

	for name := range nm.registeredNodes {
		info, err := nm.getRefreshedNodeInfo(convertToK8sType(name))
		if err == nil {
			klog.V(4).Infof("Updated NodeInfo %v for node %q.", info, name)
			continue
		}
		// Remember the failure and move on to the next node.
		errList = append(errList, err)
	}
	return errList
}

func (nm *NodeManager) getRefreshedNodeInfo(nodeName k8stypes.NodeName) (NodeInfo, error) {
nodeInfo := nm.getNodeInfo(nodeName)
var err error
if nodeInfo == nil {
// Rediscover node if no NodeInfo found.
Expand All @@ -297,7 +329,7 @@ func (nm *NodeManager) GetNodeInfo(nodeName k8stypes.NodeName) (NodeInfo, error)
klog.Errorf("Error %q node info for node %q not found", err, convertToString(nodeName))
return NodeInfo{}, err
}
nodeInfo = getNodeInfo(nodeName)
nodeInfo = nm.getNodeInfo(nodeName)
} else {
// Renew the found NodeInfo to avoid stale vSphere connection.
klog.V(4).Infof("Renewing NodeInfo %+v for node %q", nodeInfo, convertToString(nodeName))
Expand All @@ -311,31 +343,19 @@ func (nm *NodeManager) GetNodeInfo(nodeName k8stypes.NodeName) (NodeInfo, error)
return *nodeInfo, nil
}

// GetNodeDetails returns NodeDetails for all the discovered nodes.
//
// Although this is a getter, it has the side effect of updating the NodeInfo
// of each registered node. It stops at the first node whose NodeInfo cannot
// be obtained and returns that error together with a nil slice.
func (nm *NodeManager) GetNodeDetails() ([]NodeDetails, error) {
	nm.registeredNodesLock.Lock()
	defer nm.registeredNodesLock.Unlock()

	var details []NodeDetails
	for name, node := range nm.registeredNodes {
		info, err := nm.GetNodeInfoWithNodeObject(node)
		if err != nil {
			return nil, err
		}
		klog.V(4).Infof("Updated NodeInfo %v for node %q.", info, name)
		details = append(details, NodeDetails{name, info.vm, info.vmUUID, info.zone})
	}
	return details, nil
}

// addNodeInfo stores nodeInfo in nodeInfoMap under nodeName, replacing any
// previous entry, while holding nodeInfoLock.
func (nm *NodeManager) addNodeInfo(nodeName string, nodeInfo *NodeInfo) {
	nm.nodeInfoLock.Lock()
	defer nm.nodeInfoLock.Unlock()

	nm.nodeInfoMap[nodeName] = nodeInfo
}

// getNodeInfo returns the NodeInfo cached in nodeInfoMap for nodeName, or nil
// when the map has no entry for it. The lookup is done under the read side of
// nodeInfoLock.
func (nm *NodeManager) getNodeInfo(nodeName k8stypes.NodeName) *NodeInfo {
	nm.nodeInfoLock.RLock()
	defer nm.nodeInfoLock.RUnlock()

	return nm.nodeInfoMap[convertToString(nodeName)]
}

func (nm *NodeManager) GetVSphereInstance(nodeName k8stypes.NodeName) (VSphereInstance, error) {
nodeInfo, err := nm.GetNodeInfo(nodeName)
if err != nil {
Expand Down Expand Up @@ -417,35 +437,7 @@ func (nm *NodeManager) vcConnect(ctx context.Context, vsphereInstance *VSphereIn
//
// This method is a getter but it can cause side-effect of updating NodeInfo object.
func (nm *NodeManager) GetNodeInfoWithNodeObject(node *v1.Node) (NodeInfo, error) {
nodeName := node.Name
getNodeInfo := func(nodeName string) *NodeInfo {
nm.nodeInfoLock.RLock()
nodeInfo := nm.nodeInfoMap[nodeName]
nm.nodeInfoLock.RUnlock()
return nodeInfo
}
nodeInfo := getNodeInfo(nodeName)
var err error
if nodeInfo == nil {
// Rediscover node if no NodeInfo found.
klog.V(4).Infof("No VM found for node %q. Initiating rediscovery.", nodeName)
err = nm.DiscoverNode(node)
if err != nil {
klog.Errorf("Error %q node info for node %q not found", err, nodeName)
return NodeInfo{}, err
}
nodeInfo = getNodeInfo(nodeName)
} else {
// Renew the found NodeInfo to avoid stale vSphere connection.
klog.V(4).Infof("Renewing NodeInfo %+v for node %q", nodeInfo, nodeName)
nodeInfo, err = nm.renewNodeInfo(nodeInfo, true)
if err != nil {
klog.Errorf("Error %q occurred while renewing NodeInfo for %q", err, nodeName)
return NodeInfo{}, err
}
nm.addNodeInfo(nodeName, nodeInfo)
}
return *nodeInfo, nil
return nm.getRefreshedNodeInfo(convertToK8sType(node.Name))
}

func (nm *NodeManager) CredentialManager() *SecretCredentialManager {
Expand Down
62 changes: 62 additions & 0 deletions staging/src/k8s.io/legacy-cloud-providers/vsphere/vsphere.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ import (
"os"
"path"
"path/filepath"
"reflect"
"runtime"
"strings"
"sync"
Expand Down Expand Up @@ -98,6 +99,7 @@ type VSphere struct {
nodeManager *NodeManager
vmUUID string
isSecretInfoProvided bool
isSecretManaged bool
}

// Represents a vSphere instance where one or more kubernetes nodes are running.
Expand Down Expand Up @@ -175,6 +177,8 @@ type VSphereConfig struct {
SecretName string `gcfg:"secret-name"`
// Secret Namespace where secret will be present that has vCenter credentials.
SecretNamespace string `gcfg:"secret-namespace"`
// SecretNotManaged, when true, means changes to the secret are ignored for cloud resources.
SecretNotManaged bool `gcfg:"secret-not-managed"`
}

VirtualCenter map[string]*VirtualCenterConfig
Expand Down Expand Up @@ -276,6 +280,15 @@ func (vs *VSphere) SetInformers(informerFactory informers.SharedInformerFactory)
VirtualCenter: make(map[string]*Credential),
},
}
if vs.isSecretManaged {
klog.V(4).Infof("Setting up secret informers for vSphere Cloud Provider")
secretInformer := informerFactory.Core().V1().Secrets().Informer()
secretInformer.AddEventHandler(cache.ResourceEventHandlerFuncs{
AddFunc: vs.SecretAdded,
UpdateFunc: vs.SecretUpdated,
})
klog.V(4).Infof("Secret informers in vSphere cloud provider initialized")
}
vs.nodeManager.UpdateCredentialManager(secretCredentialManager)
}

Expand Down Expand Up @@ -530,6 +543,7 @@ func buildVSphereFromConfig(cfg VSphereConfig) (*VSphere, error) {
registeredNodes: make(map[string]*v1.Node),
},
isSecretInfoProvided: isSecretInfoProvided,
isSecretManaged: !cfg.Global.SecretNotManaged,
cfg: &cfg,
}
return &vs, nil
Expand Down Expand Up @@ -1502,6 +1516,54 @@ func (vs *VSphere) NodeDeleted(obj interface{}) {
}
}

// Notification handler when credentials secret is added.
func (vs *VSphere) SecretAdded(obj interface{}) {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

all informers will get an added event at startup, which means we'll always trigger RediscoverNodes... is that intentional?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

That's the issue faced by QE at some point. If some of the configuration is dedicated to the secret content, and there is another manager resource updating cluster configuration on config's content change, this part was left over, as nothing would recognize the secret being added/updated/deleted

secret, ok := obj.(*v1.Secret)
if secret == nil || !ok {
klog.Warningf("Unrecognized secret object %T", obj)
return
}

if secret.Name != vs.cfg.Global.SecretName ||
secret.Namespace != vs.cfg.Global.SecretNamespace {
return
}

klog.V(4).Infof("secret added: %+v", obj)
vs.refreshNodesForSecretChange()
}

// SecretUpdated is the notification handler invoked when a secret is updated.
//
// obj is the previous state of the secret and newObj the current one. Only
// the credentials secret identified by cfg.Global.SecretName and
// cfg.Global.SecretNamespace is acted upon, and only when its Data differs
// from the previous state; in that case the NodeInfo of all registered nodes
// is refreshed. Objects that are not a *v1.Secret are logged and ignored.
func (vs *VSphere) SecretUpdated(obj interface{}, newObj interface{}) {
	oldSecret, ok := obj.(*v1.Secret)
	if oldSecret == nil || !ok {
		klog.Warningf("Unrecognized secret object %T", obj)
		return
	}

	secret, ok := newObj.(*v1.Secret)
	if secret == nil || !ok {
		klog.Warningf("Unrecognized secret object %T", newObj)
		return
	}

	// Ignore unrelated secrets and updates that leave the credential data
	// unchanged.
	if secret.Name != vs.cfg.Global.SecretName ||
		secret.Namespace != vs.cfg.Global.SecretNamespace ||
		reflect.DeepEqual(secret.Data, oldSecret.Data) {
		return
	}

	// Log only the secret's identity: formatting the object with %+v would
	// write the credential data it carries into the log.
	klog.V(4).Infof("secret updated: %s/%s", secret.Namespace, secret.Name)
	vs.refreshNodesForSecretChange()
}

func (vs *VSphere) refreshNodesForSecretChange() {
err := vs.nodeManager.refreshNodes()
if err != nil {
klog.Errorf("failed to rediscover nodes: %v", err)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

returning here without retry means a single node lookup failure will leave the remaining nodes in the list with stale credentials, right?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Having and error here means that it failed during couple of retries while trying to (re)discover node, and update VM state, so this logic is already handled.

}
}

func (vs *VSphere) NodeManager() (nodeManager *NodeManager) {
if vs == nil {
return nil
Expand Down
Loading