
Commit

Merge pull request #104912 from gnufied/fix-dangling-volume-vsphere-19
Fix dangling volume vsphere detaches
k8s-ci-robot committed Oct 6, 2021
2 parents 7b343ec + 60b4500 commit 719be54
Showing 8 changed files with 396 additions and 12 deletions.
1 change: 0 additions & 1 deletion pkg/volume/vsphere_volume/attacher.go
@@ -275,7 +275,6 @@ func (plugin *vsphereVolumePlugin) NewDeviceUnmounter() (volume.DeviceUnmounter,

// Detach the given device from the given node.
func (detacher *vsphereVMDKDetacher) Detach(volumeName string, nodeName types.NodeName) error {

volPath := getVolPathfromVolumeName(volumeName)
attached, newVolumePath, err := detacher.vsphereVolumes.DiskIsAttached(volPath, nodeName)
if err != nil {
4 changes: 4 additions & 0 deletions staging/src/k8s.io/legacy-cloud-providers/vsphere/BUILD
@@ -18,6 +18,7 @@ go_library(
"vsphere_util_linux.go",
"vsphere_util_unsupported.go",
"vsphere_util_windows.go",
"vsphere_volume_map.go",
],
importmap = "k8s.io/kubernetes/vendor/k8s.io/legacy-cloud-providers/vsphere",
importpath = "k8s.io/legacy-cloud-providers/vsphere",
@@ -32,6 +33,7 @@ go_library(
"//staging/src/k8s.io/client-go/tools/cache:go_default_library",
"//staging/src/k8s.io/cloud-provider:go_default_library",
"//staging/src/k8s.io/cloud-provider/node/helpers:go_default_library",
"//staging/src/k8s.io/cloud-provider/volume/errors:go_default_library",
"//staging/src/k8s.io/cloud-provider/volume/helpers:go_default_library",
"//staging/src/k8s.io/legacy-cloud-providers/vsphere/vclib:go_default_library",
"//staging/src/k8s.io/legacy-cloud-providers/vsphere/vclib/diskmanagers:go_default_library",
@@ -55,6 +57,7 @@ go_test(
"credentialmanager_test.go",
"vsphere_test.go",
"vsphere_util_test.go",
"vsphere_volume_map_test.go",
],
embed = [":go_default_library"],
deps = [
@@ -71,6 +74,7 @@ go_test(
"//vendor/github.com/vmware/govmomi:go_default_library",
"//vendor/github.com/vmware/govmomi/find:go_default_library",
"//vendor/github.com/vmware/govmomi/lookup/simulator:go_default_library",
"//vendor/github.com/vmware/govmomi/object:go_default_library",
"//vendor/github.com/vmware/govmomi/property:go_default_library",
"//vendor/github.com/vmware/govmomi/simulator:go_default_library",
"//vendor/github.com/vmware/govmomi/simulator/vpx:go_default_library",
11 changes: 11 additions & 0 deletions staging/src/k8s.io/legacy-cloud-providers/vsphere/nodemanager.go
@@ -311,6 +311,17 @@ func (nm *NodeManager) GetNodeDetails() ([]NodeDetails, error) {
return nodeDetails, nil
}

// GetNodeNames returns a list of node names that are known to the vsphere cloud provider.
// These are typically the nodes that make up the k8s cluster.
func (nm *NodeManager) GetNodeNames() []k8stypes.NodeName {
nodes := nm.getNodes()
var nodeNameList []k8stypes.NodeName
for _, node := range nodes {
nodeNameList = append(nodeNameList, k8stypes.NodeName(node.Name))
}
return nodeNameList
}

func (nm *NodeManager) refreshNodes() (errList []error) {
for nodeName := range nm.getNodes() {
nodeInfo, err := nm.getRefreshedNodeInfo(convertToK8sType(nodeName))
Original file line number Diff line number Diff line change
@@ -24,7 +24,6 @@ import (

"github.com/vmware/govmomi/find"
"github.com/vmware/govmomi/object"
"github.com/vmware/govmomi/vim25/mo"
"github.com/vmware/govmomi/vim25/soap"
"github.com/vmware/govmomi/vim25/types"
"k8s.io/klog/v2"
@@ -181,14 +180,6 @@ func IsInvalidCredentialsError(err error) bool {
return isInvalidCredentialsError
}

// VerifyVolumePathsForVM verifies if the volume paths (volPaths) are attached to VM.
func VerifyVolumePathsForVM(vmMo mo.VirtualMachine, volPaths []string, nodeName string, nodeVolumeMap map[string]map[string]bool) {
// Verify if the volume paths are present on the VM backing virtual disk devices
vmDevices := object.VirtualDeviceList(vmMo.Config.Hardware.Device)
VerifyVolumePathsForVMDevices(vmDevices, volPaths, nodeName, nodeVolumeMap)

}

// VerifyVolumePathsForVMDevices verifies if the volume paths (volPaths) are attached to VM.
func VerifyVolumePathsForVMDevices(vmDevices object.VirtualDeviceList, volPaths []string, nodeName string, nodeVolumeMap map[string]map[string]bool) {
volPathsMap := make(map[string]bool)
30 changes: 29 additions & 1 deletion staging/src/k8s.io/legacy-cloud-providers/vsphere/vsphere.go
@@ -23,6 +23,7 @@ import (
"errors"
"fmt"
"io"

"net"
"net/url"
"os"
@@ -48,6 +49,7 @@ import (
"k8s.io/client-go/tools/cache"
cloudprovider "k8s.io/cloud-provider"
nodehelpers "k8s.io/cloud-provider/node/helpers"
volerr "k8s.io/cloud-provider/volume/errors"
volumehelpers "k8s.io/cloud-provider/volume/helpers"
"k8s.io/klog/v2"

@@ -95,6 +97,7 @@ type VSphere struct {
hostName string
// Maps the VSphere IP address to VSphereInstance
vsphereInstanceMap map[string]*VSphereInstance
vsphereVolumeMap *VsphereVolumeMap
// Responsible for managing discovery of k8s node, their location etc.
nodeManager *NodeManager
vmUUID string
@@ -542,6 +545,7 @@ func buildVSphereFromConfig(cfg VSphereConfig) (*VSphere, error) {
nodeInfoMap: make(map[string]*NodeInfo),
registeredNodes: make(map[string]*v1.Node),
},
vsphereVolumeMap: NewVsphereVolumeMap(),
isSecretInfoProvided: isSecretInfoProvided,
isSecretManaged: !cfg.Global.SecretNotManaged,
cfg: &cfg,
@@ -950,6 +954,20 @@ func (vs *VSphere) AttachDisk(vmDiskPath string, storagePolicyName string, nodeN
}
}
klog.V(4).Infof("AttachDisk executed for node %s and volume %s with diskUUID %s. Err: %s", convertToString(nodeName), vmDiskPath, diskUUID, err)
if err != nil {
// If attach failed, check whether the disk is attached to some other node. This can happen for several reasons,
// and returning a dangling volume error here allows the attach-detach controller to detach the disk from the node
// where it is not needed.
existingNode, ok := vs.vsphereVolumeMap.CheckForVolume(vmDiskPath)
if ok {
attached, newVolumePath, diskAttachedError := vs.DiskIsAttached(vmDiskPath, existingNode)
// if the disk is attached to a different node, return a dangling volume error
if diskAttachedError == nil && attached && (nodeName != existingNode) {
klog.V(3).Infof("found dangling volume %s to node %s", vmDiskPath, existingNode)
return "", volerr.NewDanglingError(err.Error(), existingNode, newVolumePath)
}
}
}
vclib.RecordvSphereMetric(vclib.OperationAttachVolume, requestTime, err)
return diskUUID, err
}
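The hunk above makes AttachDisk return a volerr.DanglingAttachError when the disk turns out to be attached to a different node than the one being attached to. Below is a minimal, illustrative sketch of how a caller could act on that error; handleAttachError and detachFromNode are hypothetical helpers, not the attach-detach controller's actual code, which uses the error's CurrentNode and DevicePath fields to record the stale attachment so it can later be detached.

package main

import (
	"errors"
	"fmt"

	k8stypes "k8s.io/apimachinery/pkg/types"
	volerr "k8s.io/cloud-provider/volume/errors"
)

// detachFromNode is a hypothetical stand-in for whatever would clean up the
// stale attachment before the attach is retried.
func detachFromNode(volPath string, node k8stypes.NodeName) error {
	fmt.Printf("detaching %s from node %s\n", volPath, node)
	return nil
}

// handleAttachError inspects an AttachDisk error and, if it is a dangling
// volume error, triggers cleanup on the node the disk is really attached to.
func handleAttachError(attachErr error) error {
	var danglingErr *volerr.DanglingAttachError
	if errors.As(attachErr, &danglingErr) {
		return detachFromNode(danglingErr.DevicePath, danglingErr.CurrentNode)
	}
	return attachErr
}

func main() {
	// Simulate the error path added in the hunk above.
	err := volerr.NewDanglingError("disk is already attached", k8stypes.NodeName("node-a"), "[datastore1] kubevols/vol.vmdk")
	if err := handleAttachError(err); err != nil {
		fmt.Println("attach failed:", err)
	}
}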
@@ -1083,6 +1101,7 @@ func (vs *VSphere) DiskIsAttached(volPath string, nodeName k8stypes.NodeName) (b
// 5b. If VMs are removed from vSphere inventory they are ignored.
func (vs *VSphere) DisksAreAttached(nodeVolumes map[k8stypes.NodeName][]string) (map[k8stypes.NodeName]map[string]bool, error) {
disksAreAttachedInternal := func(nodeVolumes map[k8stypes.NodeName][]string) (map[k8stypes.NodeName]map[string]bool, error) {
vs.vsphereVolumeMap.StartDiskVerification()

// disksAreAttach checks whether disks are attached to the nodes.
// Returns nodes that need to be retried if retry is true
@@ -1194,8 +1213,17 @@ func (vs *VSphere) DisksAreAttached(nodeVolumes map[k8stypes.NodeName][]string)
for nodeName, volPaths := range attached {
disksAttached[convertToK8sType(nodeName)] = volPaths
}

}
klog.V(4).Infof("DisksAreAttach successfully executed. result: %+v", attached)
klog.V(4).Infof("DisksAreAttached successfully executed. result: %+v", attached)
// There could be nodes in the cluster which do not have any pods with vsphere volumes running on them.
// Such nodes won't be part of the nodeVolumes map because the attach-detach controller does not keep track
// of such nodes. But such nodes may still have dangling volumes on them, so we need to scan the
// remaining nodes which weren't covered by the check above.
vs.BuildMissingVolumeNodeMap(ctx)
// any volume which we could not verify will be removed from the map.
vs.vsphereVolumeMap.RemoveUnverified()
klog.V(4).Infof("current node volume map is: %+v", vs.vsphereVolumeMap.volumeNodeMap)
return disksAttached, nil
}
requestTime := time.Now()
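DisksAreAttached now brackets each verification pass with StartDiskVerification and, after scanning the nodes that were missed, prunes stale entries with RemoveUnverified, while AttachDisk consults CheckForVolume to find the node a disk was last seen on. The vsphere_volume_map.go file that implements this map is one of the changed files but is not rendered on this page, so the following is only an assumption-based sketch of the bookkeeping those call sites imply, not the file's actual contents.

package vsphere // hypothetical placement, for illustration only

import (
	"sync"

	"github.com/vmware/govmomi/object"
	vimtypes "github.com/vmware/govmomi/vim25/types"
	k8stypes "k8s.io/apimachinery/pkg/types"
)

type nodeVolumeStatus struct {
	nodeName k8stypes.NodeName
	verified bool
}

// VsphereVolumeMap remembers, per volume path, the node a disk was last seen
// attached to, plus whether that sighting was confirmed in the current scan.
type VsphereVolumeMap struct {
	lock          sync.RWMutex
	volumeNodeMap map[string]nodeVolumeStatus
}

func NewVsphereVolumeMap() *VsphereVolumeMap {
	return &VsphereVolumeMap{volumeNodeMap: map[string]nodeVolumeStatus{}}
}

// StartDiskVerification marks every known volume as unverified before a new
// round of attachment checks.
func (m *VsphereVolumeMap) StartDiskVerification() {
	m.lock.Lock()
	defer m.lock.Unlock()
	for path, status := range m.volumeNodeMap {
		status.verified = false
		m.volumeNodeMap[path] = status
	}
}

// Add records the backing file of every virtual disk found on a node's VM and
// marks those volumes as verified.
func (m *VsphereVolumeMap) Add(node k8stypes.NodeName, devices object.VirtualDeviceList) {
	m.lock.Lock()
	defer m.lock.Unlock()
	for _, device := range devices {
		if disk, ok := device.(*vimtypes.VirtualDisk); ok {
			if backing, ok := disk.Backing.(*vimtypes.VirtualDiskFlatVer2BackingInfo); ok {
				m.volumeNodeMap[backing.FileName] = nodeVolumeStatus{nodeName: node, verified: true}
			}
		}
	}
}

// CheckForVolume returns the node a volume path was last seen attached to.
func (m *VsphereVolumeMap) CheckForVolume(volPath string) (k8stypes.NodeName, bool) {
	m.lock.RLock()
	defer m.lock.RUnlock()
	status, ok := m.volumeNodeMap[volPath]
	if !ok {
		return "", false
	}
	return status.nodeName, true
}

// CheckForNode reports whether any volume is currently recorded for the node.
func (m *VsphereVolumeMap) CheckForNode(node k8stypes.NodeName) bool {
	m.lock.RLock()
	defer m.lock.RUnlock()
	for _, status := range m.volumeNodeMap {
		if status.nodeName == node {
			return true
		}
	}
	return false
}

// RemoveUnverified drops entries that were not re-confirmed during the scan,
// e.g. disks that have since been detached.
func (m *VsphereVolumeMap) RemoveUnverified() {
	m.lock.Lock()
	defer m.lock.Unlock()
	for path, status := range m.volumeNodeMap {
		if !status.verified {
			delete(m.volumeNodeMap, path)
		}
	}
}

In this reading, the verified flag is what lets DisksAreAttached treat the map as a cache that is rebuilt on every pass rather than a source of truth that can go stale.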
124 changes: 123 additions & 1 deletion staging/src/k8s.io/legacy-cloud-providers/vsphere/vsphere_util.go
@@ -26,6 +26,7 @@ import (
"os"
"path/filepath"
"strings"
"sync"
"time"

"github.com/vmware/govmomi/find"
@@ -569,6 +570,7 @@ func (vs *VSphere) checkDiskAttached(ctx context.Context, nodes []k8stypes.NodeN
return nodesToRetry, err
}
klog.V(4).Infof("Verifying Volume Paths by devices for node %s and VM %s", nodeName, nodeInfo.vm)
vs.vsphereVolumeMap.Add(nodeName, devices)
vclib.VerifyVolumePathsForVMDevices(devices, nodeVolumes[nodeName], convertToString(nodeName), attached)
}
}
@@ -599,11 +601,131 @@ func (vs *VSphere) checkDiskAttached(ctx context.Context, nodes []k8stypes.NodeN
}
nodeUUID = strings.ToLower(nodeUUID)
klog.V(9).Infof("Verifying volume for node %s with nodeuuid %q: %v", nodeName, nodeUUID, vmMoMap)
vclib.VerifyVolumePathsForVM(vmMoMap[nodeUUID], nodeVolumes[nodeName], convertToString(nodeName), attached)
vmMo := vmMoMap[nodeUUID]
vmDevices := object.VirtualDeviceList(vmMo.Config.Hardware.Device)
vs.vsphereVolumeMap.Add(nodeName, vmDevices)
vclib.VerifyVolumePathsForVMDevices(vmDevices, nodeVolumes[nodeName], convertToString(nodeName), attached)
}
return nodesToRetry, nil
}

// BuildMissingVolumeNodeMap builds a map of volumes and nodes which are not known to the attach-detach controller.
// There could be nodes in the cluster which do not have any pods with vsphere volumes running on them.
// Such nodes won't be part of the disk verification check because the attach-detach controller does not keep
// track of such nodes. But such nodes may still have dangling volumes on them, so we need to scan the
// remaining nodes which weren't covered by the earlier check.
func (vs *VSphere) BuildMissingVolumeNodeMap(ctx context.Context) {
nodeNames := vs.nodeManager.GetNodeNames()
// Segregate nodes according to VC-DC
dcNodes := make(map[string][]k8stypes.NodeName)

for _, nodeName := range nodeNames {
// if given node is not in node volume map
if !vs.vsphereVolumeMap.CheckForNode(nodeName) {
nodeInfo, err := vs.nodeManager.GetNodeInfo(nodeName)
if err != nil {
klog.V(4).Infof("Failed to get node info: %+v. err: %+v", nodeInfo.vm, err)
continue
}
vcDC := nodeInfo.vcServer + nodeInfo.dataCenter.String()
dcNodes[vcDC] = append(dcNodes[vcDC], nodeName)
}
}

var wg sync.WaitGroup

for _, nodeNames := range dcNodes {
// Start go routines per VC-DC to check disks are attached
wg.Add(1)
go func(nodes []k8stypes.NodeName) {
err := vs.checkNodeDisks(ctx, nodes)
if err != nil {
klog.Errorf("Failed to check disk attached for nodes: %+v. err: %+v", nodes, err)
}
wg.Done()
}(nodeNames)
}
wg.Wait()
}

func (vs *VSphere) checkNodeDisks(ctx context.Context, nodeNames []k8stypes.NodeName) error {
var vmList []*vclib.VirtualMachine
var nodeInfo NodeInfo
var err error

for _, nodeName := range nodeNames {
nodeInfo, err = vs.nodeManager.GetNodeInfo(nodeName)
if err != nil {
return err
}
vmList = append(vmList, nodeInfo.vm)
}

// Making sure session is valid
_, err = vs.getVSphereInstanceForServer(nodeInfo.vcServer, ctx)
if err != nil {
return err
}

// If any of the nodes are not present property collector query will fail for entire operation
vmMoList, err := nodeInfo.dataCenter.GetVMMoList(ctx, vmList, []string{"config.hardware.device", "name", "config.uuid"})
if err != nil {
if vclib.IsManagedObjectNotFoundError(err) {
klog.V(4).Infof("checkNodeDisks: ManagedObjectNotFound for property collector query for nodes: %+v vms: %+v", nodeNames, vmList)
// Property Collector Query failed
// VerifyVolumePaths per VM
for _, nodeName := range nodeNames {
nodeInfo, err := vs.nodeManager.GetNodeInfo(nodeName)
if err != nil {
return err
}
devices, err := nodeInfo.vm.VirtualMachine.Device(ctx)
if err != nil {
if vclib.IsManagedObjectNotFoundError(err) {
klog.V(4).Infof("checkNodeDisks: ManagedObjectNotFound for Kubernetes node: %s with vSphere Virtual Machine reference: %v", nodeName, nodeInfo.vm)
continue
}
return err
}
klog.V(4).Infof("Verifying Volume Paths by devices for node %s and VM %s", nodeName, nodeInfo.vm)
vs.vsphereVolumeMap.Add(nodeName, devices)
}
return nil
}
return err
}

vmMoMap := make(map[string]mo.VirtualMachine)
for _, vmMo := range vmMoList {
if vmMo.Config == nil {
klog.Errorf("Config is not available for VM: %q", vmMo.Name)
continue
}
klog.V(9).Infof("vmMoMap vmname: %q vmuuid: %s", vmMo.Name, strings.ToLower(vmMo.Config.Uuid))
vmMoMap[strings.ToLower(vmMo.Config.Uuid)] = vmMo
}

klog.V(9).Infof("vmMoMap: +%v", vmMoMap)

for _, nodeName := range nodeNames {
node, err := vs.nodeManager.GetNode(nodeName)
if err != nil {
return err
}
nodeUUID, err := GetNodeUUID(&node)
if err != nil {
klog.Errorf("Node Discovery failed to get node uuid for node %s with error: %v", node.Name, err)
return err
}
nodeUUID = strings.ToLower(nodeUUID)
klog.V(9).Infof("Verifying volume for node %s with nodeuuid %q: %v", nodeName, nodeUUID, vmMoMap)
vmMo := vmMoMap[nodeUUID]
vmDevices := object.VirtualDeviceList(vmMo.Config.Hardware.Device)
vs.vsphereVolumeMap.Add(nodeName, vmDevices)
}
return nil
}

func (vs *VSphere) GetNodeNameFromProviderID(providerID string) (string, error) {
var nodeName string
nodes, err := vs.nodeManager.GetNodeDetails()
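The comment in checkNodeDisks above notes that the batched property-collector query fails for the entire operation if any VM in the list no longer exists in the vSphere inventory, which is why the code falls back to querying devices VM by VM on ManagedObjectNotFound. The sketch below shows the underlying govmomi pattern that dataCenter.GetVMMoList wraps; retrieveVMHardware and its package are hypothetical, not code from this PR.

package vspherehelpers // hypothetical package, for illustration only

import (
	"context"

	"github.com/vmware/govmomi/property"
	"github.com/vmware/govmomi/vim25"
	"github.com/vmware/govmomi/vim25/mo"
	"github.com/vmware/govmomi/vim25/types"
)

// retrieveVMHardware fetches the name, BIOS UUID and virtual device list for
// a batch of VMs in a single property-collector round trip. If any reference
// in refs has been removed from the inventory, the whole Retrieve call fails
// (typically with a ManagedObjectNotFound fault), so callers that want
// partial results must fall back to per-VM queries.
func retrieveVMHardware(ctx context.Context, c *vim25.Client, refs []types.ManagedObjectReference) ([]mo.VirtualMachine, error) {
	var vms []mo.VirtualMachine
	pc := property.DefaultCollector(c)
	props := []string{"name", "config.uuid", "config.hardware.device"}
	if err := pc.Retrieve(ctx, refs, props, &vms); err != nil {
		return nil, err
	}
	return vms, nil
}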