Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Rework volume reconstruction #108180

Closed
wants to merge 9 commits into from
10 changes: 10 additions & 0 deletions pkg/features/kube_features.go
Expand Up @@ -834,6 +834,15 @@ const (
//
// Enable MinDomains in Pod Topology Spread.
MinDomainsInPodTopologySpread featuregate.Feature = "MinDomainsInPodTopologySpread"
// owner: @jsafrane

// kep: http://kep.k8s.io/1710
// alpha: v1.24
//
// Speed up container startup by mounting volumes with the correct SELinux label
// instead of changing each file on the volumes recursively.
// Initial implementation focused on ReadWriteOncePod volumes.
SELinuxMountReadWriteOncePod featuregate.Feature = "SELinuxMountReadWriteOncePod"
)

func init() {
Expand Down Expand Up @@ -956,6 +965,7 @@ var defaultKubernetesFeatureGates = map[featuregate.Feature]featuregate.FeatureS
GRPCContainerProbe: {Default: false, PreRelease: featuregate.Alpha},
LegacyServiceAccountTokenNoAutoGeneration: {Default: true, PreRelease: featuregate.Beta},
MinDomainsInPodTopologySpread: {Default: false, PreRelease: featuregate.Alpha},
SELinuxMountReadWriteOncePod: {Default: false, PreRelease: featuregate.Alpha},

// inherited features from generic apiserver, relisted here to get a conflict if it is changed
// unintentionally on either side:
Expand Down
35 changes: 34 additions & 1 deletion pkg/kubelet/volumemanager/cache/actual_state_of_world.go
Expand Up @@ -175,6 +175,12 @@ type ActualStateOfWorld interface {
// SyncReconstructedVolume check the volume.outerVolumeSpecName in asw and
// the one populated from dsw , if they do not match, update this field from the value from dsw.
SyncReconstructedVolume(volumeName v1.UniqueVolumeName, podName volumetypes.UniquePodName, outerVolumeSpecName string)

// UpdateReconstructedDevicePath updates devicePath of a reconstructed volume
// from Node.Status.VolumesAttached. The ASW is updated only when the volume is still
// uncertain. If the volume got mounted in the meantime, its devicePath must have
// been fixed by such an update.
UpdateReconstructedDevicePath(volumeName v1.UniqueVolumeName, devicePath string)
}

// MountedVolume represents a volume that has successfully been mounted to a pod.
Expand Down Expand Up @@ -390,6 +396,24 @@ func (asw *actualStateOfWorld) MarkDeviceAsUnmounted(
return asw.SetDeviceMountState(volumeName, operationexecutor.DeviceNotMounted, "", "")
}

func (asw *actualStateOfWorld) UpdateReconstructedDevicePath(volumeName v1.UniqueVolumeName, devicePath string) {
asw.RLock()
defer asw.RUnlock()

volumeObj, volumeExists := asw.attachedVolumes[volumeName]
if !volumeExists {
return
}
if volumeObj.deviceMountState != operationexecutor.DeviceMountUncertain {
// Reconciler must have updated volume state, i.e. when a pod uses the volume and
// succeeded mounting the volume. Such update has fixed the device path.
return
}

volumeObj.devicePath = devicePath
asw.attachedVolumes[volumeName] = volumeObj
}

func (asw *actualStateOfWorld) GetDeviceMountState(volumeName v1.UniqueVolumeName) operationexecutor.DeviceMountState {
asw.RLock()
defer asw.RUnlock()
Expand Down Expand Up @@ -525,7 +549,16 @@ func (asw *actualStateOfWorld) AddPodToVolume(markVolumeOpts operationexecutor.M
}

podObj, podExists := volumeObj.mountedPods[podName]
if !podExists {

updateUncertainVolume := false
if podExists {
// Update uncertain volumes - the new markVolumeOpts may have updated information.
// Especially reconstructed volumes (marked as uncertain during reconstruction) need
// an update.
updateUncertainVolume = utilfeature.DefaultFeatureGate.Enabled(features.SELinuxMountReadWriteOncePod) && podObj.volumeMountStateForPod == operationexecutor.VolumeMountUncertain
}
if !podExists || updateUncertainVolume {
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure I follow, can you please elaborate?

Copy link
Member

@gnufied gnufied Mar 16, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So there was this bug - #103143 which was reported because during reconstruction we do not get correct outerSpecName and if populator loop did not add volume to DSOW because of api-server unavailability then we never properly mount the volume.

After your change - it looks like the entire data structure stored in ASOW is replaced for uncertain volumes, so we still need SyncReconstructedVolume ?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You're correct, with SELinuxMountReadWriteOncePod feature enabled it should not be necessary to call SyncReconstructedVolume. I added a new commit that removes it.

// Add new mountedPod or update existing one.
podObj = mountedPod{
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Does not doing this results in real breakage? (sorry just curious, I suspected it will but I don't know for sure).

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't know, you told me that we can't trust reconstructed volumes. And I think we should not trust Uncertain volumes either, so I update everything.

podName: podName,
podUID: podUID,
Expand Down
156 changes: 156 additions & 0 deletions pkg/kubelet/volumemanager/cache/actual_state_of_world_test.go
Expand Up @@ -22,6 +22,7 @@ import (
"github.com/stretchr/testify/require"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/types"
"k8s.io/kubernetes/pkg/volume"
volumetesting "k8s.io/kubernetes/pkg/volume/testing"
"k8s.io/kubernetes/pkg/volume/util"
Expand Down Expand Up @@ -782,6 +783,26 @@ func verifyPodExistsInVolumeAsw(
}
}

func verifyDeviceStateInVolumeAsw(
t *testing.T,
podName volumetypes.UniquePodName,
volumeName v1.UniqueVolumeName,
expectedDevicePath string,
expectMounted bool,
asw ActualStateOfWorld) {

mounted, devicePath, err := asw.PodExistsInVolume(podName, volumeName)
if err != nil {
t.Fatalf("Failed to check PodExistsInVolume: %s", err)
}
if expectMounted != mounted {
t.Errorf("ASW reports the volume mounted %t, got %t", expectMounted, mounted)
}
if devicePath != expectedDevicePath {
t.Errorf("Expected devicePath %q, got %q", expectedDevicePath, devicePath)
}
}

func verifyVolumeMountedElsewhere(
t *testing.T,
expectedPodName volumetypes.UniquePodName,
Expand Down Expand Up @@ -872,3 +893,138 @@ func verifyVolumeSpecNameInVolumeAsw(
}
}
}

func TestUpdateReconstructedDevicePath(t *testing.T) {
// Arrange - add a reconstructed volume with empty devicePath to ASW
volumePluginMgr, plugin := volumetesting.GetTestKubeletVolumePluginMgr(t)
asw := NewActualStateOfWorld("mynode" /* nodeName */, volumePluginMgr)

pod := &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: "pod1",
UID: "pod1uid",
},
Spec: v1.PodSpec{
Volumes: []v1.Volume{
{
Name: "volume-name-1",
VolumeSource: v1.VolumeSource{
GCEPersistentDisk: &v1.GCEPersistentDiskVolumeSource{
PDName: "fake-device1",
},
},
},
},
},
}
volumeSpec := &volume.Spec{Volume: &pod.Spec.Volumes[0]}
reconstrucedVolumePath := ""
volumeName, err := util.GetUniqueVolumeNameFromSpec(plugin, volumeSpec)
require.NoError(t, err)

// This is how a reconstructed volume looks like, copied from reconciler.updateStates()
err = asw.MarkVolumeAsAttached(volumeName, volumeSpec, "" /* nodeName */, "")
require.NoError(t, err)

mounter, err := plugin.NewMounter(volumeSpec, pod, volume.VolumeOptions{})
require.NoError(t, err)
deviceMounter, err := plugin.NewDeviceMounter()
require.NoError(t, err)

markVolumeOpts := operationexecutor.MarkVolumeOpts{
PodName: util.GetUniquePodName(pod),
PodUID: types.UID(pod.UID),
VolumeName: volumeName,
Mounter: mounter,
BlockVolumeMapper: nil,
OuterVolumeSpecName: volumeSpec.Name(),
VolumeSpec: volumeSpec,
VolumeMountState: operationexecutor.VolumeMountUncertain,
}
err = asw.MarkVolumeMountAsUncertain(markVolumeOpts)
require.NoError(t, err)

deviceMountPath, err := deviceMounter.GetDeviceMountPath(volumeSpec)
require.NoError(t, err)
err = asw.MarkDeviceAsUncertain(volumeName, reconstrucedVolumePath, deviceMountPath)
require.NoError(t, err)

// Act - update the device path from node.status
nodeStatusDevicePath := "/dev/foo"
asw.UpdateReconstructedDevicePath(volumeName, nodeStatusDevicePath)

// Assert - ensure the device path was updated
verifyDeviceStateInVolumeAsw(t, util.GetUniquePodName(pod), volumeName, nodeStatusDevicePath, false, asw)
}

func TestUpdateReconstructedDevicePath_Skio(t *testing.T) {
// Arrange - add a reconstructed volume with empty devicePath to ASW and replace it with a real volume
volumePluginMgr, plugin := volumetesting.GetTestKubeletVolumePluginMgr(t)
asw := NewActualStateOfWorld("mynode" /* nodeName */, volumePluginMgr)

pod := &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: "pod1",
UID: "pod1uid",
},
Spec: v1.PodSpec{
Volumes: []v1.Volume{
{
Name: "volume-name-1",
VolumeSource: v1.VolumeSource{
GCEPersistentDisk: &v1.GCEPersistentDiskVolumeSource{
PDName: "fake-device1",
},
},
},
},
},
}
volumeSpec := &volume.Spec{Volume: &pod.Spec.Volumes[0]}
reconstrucedVolumePath := ""
volumeName, err := util.GetUniqueVolumeNameFromSpec(plugin, volumeSpec)
require.NoError(t, err)

// This is how a reconstructed volume looks like, copied from reconciler.updateStates()
err = asw.MarkVolumeAsAttached(volumeName, volumeSpec, "" /* nodeName */, "")
require.NoError(t, err)

mounter, err := plugin.NewMounter(volumeSpec, pod, volume.VolumeOptions{})
require.NoError(t, err)
deviceMounter, err := plugin.NewDeviceMounter()
require.NoError(t, err)

markVolumeOpts := operationexecutor.MarkVolumeOpts{
PodName: util.GetUniquePodName(pod),
PodUID: types.UID(pod.UID),
VolumeName: volumeName,
Mounter: mounter,
BlockVolumeMapper: nil,
OuterVolumeSpecName: volumeSpec.Name(),
VolumeSpec: volumeSpec,
VolumeMountState: operationexecutor.VolumeMountUncertain,
}
err = asw.MarkVolumeMountAsUncertain(markVolumeOpts)
require.NoError(t, err)

deviceMountPath, err := deviceMounter.GetDeviceMountPath(volumeSpec)
require.NoError(t, err)
err = asw.MarkDeviceAsUncertain(volumeName, reconstrucedVolumePath, deviceMountPath)
require.NoError(t, err)

// Simulate successful volume setup done by the reconciler before device paths are reconstructed from node status
realDevicePath := "/dev/sdb"
err = asw.MarkDeviceAsMounted(volumeName, realDevicePath, deviceMountPath)
require.NoError(t, err)

markVolumeOpts.VolumeMountState = operationexecutor.VolumeMounted
err = asw.MarkVolumeAsMounted(markVolumeOpts)
require.NoError(t, err)

// Act - update the device path from node.status
nodeStatusDevicePath := "/dev/foo"
asw.UpdateReconstructedDevicePath(volumeName, nodeStatusDevicePath)

// Assert - ensure the device path was not updated and the volume is mounted
verifyDeviceStateInVolumeAsw(t, util.GetUniquePodName(pod), volumeName, realDevicePath, true, asw)
}
Expand Up @@ -151,7 +151,10 @@ func (dswp *desiredStateOfWorldPopulator) Run(sourcesReady config.SourcesReady,
return done, nil
}, stopCh)
dswp.hasAddedPodsLock.Lock()
dswp.hasAddedPods = true
if !dswp.hasAddedPods {
klog.InfoS("Finished populating initial desired state of world")
dswp.hasAddedPods = true
}
dswp.hasAddedPodsLock.Unlock()
wait.Until(dswp.populatorLoop, dswp.loopSleepDuration, stopCh)
}
Expand Down Expand Up @@ -316,8 +319,12 @@ func (dswp *desiredStateOfWorldPopulator) processPodVolumes(
} else {
klog.V(4).InfoS("Added volume to desired state", "pod", klog.KObj(pod), "volumeName", podVolume.Name, "volumeSpecName", volumeSpec.Name())
}
// sync reconstructed volume
dswp.actualStateOfWorld.SyncReconstructedVolume(uniqueVolumeName, uniquePodName, podVolume.Name)
if !utilfeature.DefaultFeatureGate.Enabled(features.SELinuxMountReadWriteOncePod) {
// sync reconstructed volume. This is necessary only when the old-style reconstruction is still used.
// With reconstruct_new.go, AWS.MarkVolumeAsMounted will update the outer spec name of previously
// uncertain volumes.
dswp.actualStateOfWorld.SyncReconstructedVolume(uniqueVolumeName, uniquePodName, podVolume.Name)
}

if expandInUsePV {
dswp.checkVolumeFSResize(pod, podVolume, pvc, volumeSpec,
Expand Down
Expand Up @@ -84,6 +84,9 @@ func prepareDswpWithVolume(t *testing.T) (*desiredStateOfWorldPopulator, kubepod
}

func TestFindAndAddNewPods_WithRescontructedVolume(t *testing.T) {
// Outer volume spec replacement is needed only when the old volume reconstruction is used
// (i.e. with SELinuxMountReadWriteOncePod disabled)
defer featuregatetesting.SetFeatureGateDuringTest(t, utilfeature.DefaultFeatureGate, features.SELinuxMountReadWriteOncePod, false)()
// create dswp
dswp, fakePodManager := prepareDswpWithVolume(t)

Expand Down