Skip to content

Commit

Permalink
Merge pull request #113 from leakingtapan/watch-dog
Browse files Browse the repository at this point in the history
Add watch dog for efs mount with stunnel
  • Loading branch information
Cheng Pan committed Dec 30, 2019
2 parents 60d489b + 4385428 commit 019e989
Show file tree
Hide file tree
Showing 8 changed files with 311 additions and 6 deletions.
19 changes: 19 additions & 0 deletions Dockerfile.dev
@@ -0,0 +1,19 @@
# Copyright 2019 The Kubernetes Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Base image: Amazon Linux 2.
FROM amazonlinux:2
# Install util-linux and amazon-efs-utils with yum; -y answers prompts automatically.
# NOTE(review): amazon-efs-utils presumably ships the EFS mount helper and the
# amazon-efs-mount-watchdog binary that the driver launches — confirm.
RUN yum install util-linux amazon-efs-utils -y
# Copy the driver binary from the local bin/ directory into the image.
COPY bin/aws-efs-csi-driver /bin/aws-efs-csi-driver
# Copy the THIRD-PARTY file into the image root.
COPY THIRD-PARTY /

# Run the driver binary as the container entry point.
ENTRYPOINT ["/bin/aws-efs-csi-driver"]
4 changes: 4 additions & 0 deletions examples/kubernetes/volume_path/specs/example.yaml
Expand Up @@ -16,6 +16,8 @@ spec:
- ReadWriteMany
persistentVolumeReclaimPolicy: Retain
storageClassName: efs-sc
mountOptions:
- tls
csi:
driver: efs.csi.aws.com
volumeHandle: fs-e8a95a42:/dir1
Expand Down Expand Up @@ -44,6 +46,8 @@ spec:
- ReadWriteMany
persistentVolumeReclaimPolicy: Retain
storageClassName: efs-sc
mountOptions:
- tls
csi:
driver: efs.csi.aws.com
volumeHandle: fs-e8a95a42:/dir2
Expand Down
17 changes: 14 additions & 3 deletions pkg/driver/driver.go
Expand Up @@ -38,6 +38,8 @@ type Driver struct {
srv *grpc.Server

mounter Mounter

efsWatchdog Watchdog
}

func NewDriver(endpoint string) *Driver {
Expand All @@ -46,10 +48,12 @@ func NewDriver(endpoint string) *Driver {
klog.Fatalln(err)
}

watchdog := newExecWatchdog("amazon-efs-mount-watchdog")
return &Driver{
endpoint: endpoint,
nodeID: cloud.GetMetadata().GetInstanceID(),
mounter: newNodeMounter(),
endpoint: endpoint,
nodeID: cloud.GetMetadata().GetInstanceID(),
mounter: newNodeMounter(),
efsWatchdog: watchdog,
}
}

Expand Down Expand Up @@ -79,6 +83,13 @@ func (d *Driver) Run() error {
csi.RegisterIdentityServer(d.srv, d)
csi.RegisterNodeServer(d.srv, d)

klog.Info("Starting watchdog")
d.efsWatchdog.start()

reaper := newReaper()
klog.Info("Staring subreaper")
reaper.start()

klog.Infof("Listening for connections on address: %#v", listener.Addr())
return d.srv.Serve(listener)
}
133 changes: 133 additions & 0 deletions pkg/driver/efs_watch_dog.go
@@ -0,0 +1,133 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package driver

import (
"fmt"
"os/exec"
"sync"

"k8s.io/klog"
)

// Watchdog defines the interface for monitoring and supervising a process.
// Both methods are unexported, so only types in this package can implement it.
type Watchdog interface {
// start starts the watchdog along with the process it supervises.
start()

// stop stops the watchdog along with the process it supervises.
stop()
}

// execWatchdog is a watchdog that runs a command as a child process and
// runs it again every time it exits, until the watchdog is stopped.
type execWatchdog struct {
// execCmd is the command to be executed and monitored.
execCmd string
// execArg holds the arguments passed to execCmd.
execArg []string
// cmd is the most recently created command; exec sets it and stop reads it.
cmd *exec.Cmd
// stopCh is closed by stop to tell the run loop that it should stop.
stopCh chan struct{}

// mu is held by exec while the command is started and by stop while the
// process is killed.
mu sync.Mutex
}

// newExecWatchdog builds a Watchdog that supervises the command cmd invoked
// with the arguments arg. The returned watchdog is idle until start is called.
func newExecWatchdog(cmd string, arg ...string) Watchdog {
	wd := new(execWatchdog)
	wd.execCmd = cmd
	wd.execArg = arg
	wd.stopCh = make(chan struct{})
	return wd
}

// start launches the monitoring loop in its own goroutine and returns
// immediately. The loop is handed the watchdog's stop channel.
func (w *execWatchdog) start() {
	stopCh := w.stopCh
	go w.runLoop(stopCh)
}

// stop signals the run loop to exit and kills the supervised process if one
// has been started. It is safe to call before any process has been launched
// and safe to call more than once; only the first call has an effect.
func (w *execWatchdog) stop() {
	w.mu.Lock()
	defer w.mu.Unlock()

	// Close stopCh at most once: closing an already-closed channel panics.
	select {
	case <-w.stopCh:
		return
	default:
		close(w.stopCh)
	}

	// cmd stays nil until a command has been created, and cmd.Process stays
	// nil until that command has actually been started.
	if w.cmd == nil || w.cmd.Process == nil {
		return
	}
	if err := w.cmd.Process.Kill(); err != nil {
		klog.Errorf("Failed to kill process: %s", err)
	}
}

// runLoop runs the supervised command over and over until stopCh is closed.
// Each iteration blocks in exec until the command exits (or fails to start);
// a non-nil result is logged and the command is run again.
//
// NOTE(review): there is no delay between restarts, so a command that fails
// immediately is retried in a tight loop.
func (w *execWatchdog) runLoop(stopCh <-chan struct{}) {
	for {
		// Non-blocking check for the stop request. A plain break here would
		// only leave the select statement, so return to end the loop.
		select {
		case <-stopCh:
			klog.Info("stopping...")
			return
		default:
		}

		if err := w.exec(); err != nil {
			klog.Errorf("Process %s exits %s", w.execCmd, err)
		}
	}
}

// exec runs the supervised command once and blocks until it exits. The
// command's stdout and stderr are forwarded to the log redirects.
//
// It returns the error from starting the command, or otherwise the result of
// waiting for it. If a stop has already been requested, nothing is started
// and nil is returned.
func (w *execWatchdog) exec() error {
	cmd := exec.Command(w.execCmd, w.execArg...)
	cmd.Stdout = newInfoRedirect(w.execCmd)
	cmd.Stderr = newErrRedirect(w.execCmd)

	// Publish and start the command under the mutex so that stop never sees
	// a half-initialized command.
	w.mu.Lock()
	w.cmd = cmd
	select {
	case <-w.stopCh:
		// Stop was requested before launch; do not start a new process.
		w.mu.Unlock()
		return nil
	default:
	}
	err := cmd.Start()
	// Release the lock on every path, including a failed start; otherwise the
	// next exec or stop would block forever.
	w.mu.Unlock()
	if err != nil {
		return err
	}

	return cmd.Wait()
}

// logRedirect forwards bytes written to it to a logging function, prefixing
// each message with the process name and a level label.
type logRedirect struct {
// processName is the name of the process, used as the message prefix.
processName string
// level is the label placed in square brackets after the process name.
level string
// logFunc is the printf-style logging function that receives each message.
logFunc func(string, ...interface{})
}

// newInfoRedirect returns a logRedirect for the process called name that
// labels messages "Info" and logs them through klog at verbosity level 4.
// The verbosity lookup happens once, when the redirect is created.
func newInfoRedirect(name string) *logRedirect {
	redirect := new(logRedirect)
	redirect.processName = name
	redirect.level = "Info"
	redirect.logFunc = klog.V(4).Infof
	return redirect
}

// newErrRedirect returns a logRedirect for the process called name that
// labels messages "Error" and logs them through klog.Errorf.
func newErrRedirect(name string) *logRedirect {
	redirect := new(logRedirect)
	redirect.processName = name
	redirect.level = "Error"
	redirect.logFunc = klog.Errorf
	return redirect
}
// Write logs p as a single message of the form "<processName>[<level>]: <p>"
// through logFunc. It always reports all of p as written and never fails.
func (l *logRedirect) Write(p []byte) (n int, err error) {
	msg := fmt.Sprintf("%s[%s]: %s", l.processName, l.level, string(p))
	l.logFunc("%s", msg)
	// Report the number of input bytes consumed, not the length of the
	// prefixed message: a writer must never return more than len(p).
	return len(p), nil
}
26 changes: 26 additions & 0 deletions pkg/driver/efs_watch_dog_test.go
@@ -0,0 +1,26 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package driver

import (
"testing"
"time"
)

// TestExecWatchdog starts a watchdog around "sleep 300", lets it run for one
// second and then stops it. It makes no assertions beyond not panicking.
func TestExecWatchdog(t *testing.T) {
	const runFor = time.Second

	wd := newExecWatchdog("sleep", "300")
	wd.start()
	defer wd.stop()

	time.Sleep(runFor)
}
72 changes: 72 additions & 0 deletions pkg/driver/reaper.go
@@ -0,0 +1,72 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package driver

import (
"os"
"os/signal"
"syscall"

"k8s.io/klog"
)

// reaper waits on exited child processes in response to SIGCHLD.
type reaper struct {
// sigs receives the SIGCHLD notifications registered in newReaper.
sigs chan os.Signal
// stopCh receives a value from stop to signal the run loop.
stopCh chan struct{}
}

// newReaper creates a reaper and subscribes its signal channel to SIGCHLD.
// The signal channel has room for one pending notification; the stop channel
// is unbuffered.
func newReaper() *reaper {
	r := &reaper{
		sigs:   make(chan os.Signal, 1),
		stopCh: make(chan struct{}),
	}
	signal.Notify(r.sigs, syscall.SIGCHLD)
	return r
}

// start launches the reaper's run loop in its own goroutine and returns
// immediately.
func (r *reaper) start() {
	loop := r.runLoop
	go loop()
}

// runLoop reaps exited child processes each time a SIGCHLD notification
// arrives, and returns once a value is received on stopCh.
// Currently only the stunnel process is created by the efs mount helper
// and is inherited as a child process of the driver.
//
// NOTE(review): waiting on pid -1 reaps any child of this process, so it can
// race with code that waits on a child it started itself — confirm callers
// tolerate that.
func (r *reaper) runLoop() {
	for {
		select {
		case <-r.sigs:
			reapChildren()
		case <-r.stopCh:
			// A plain break would only leave the select statement and keep
			// the goroutine alive, so return to end the loop.
			return
		}
	}
}

// reapChildren collects every child process that has already exited, without
// blocking. Several exits can be reported by a single SIGCHLD, so one wait
// call per notification is not enough.
func reapChildren() {
	for {
		var status syscall.WaitStatus
		var rusage syscall.Rusage
		childPid, err := syscall.Wait4(-1, &status, syscall.WNOHANG, &rusage)
		if err == syscall.EINTR {
			// Interrupted by a signal; try again.
			continue
		}
		if err == syscall.ECHILD {
			// There are no children left to wait for; nothing to report.
			return
		}
		if err != nil {
			klog.Warningf("Failed to wait for child process %s", err)
			return
		}
		if childPid <= 0 {
			// Children exist, but none of them has exited yet.
			return
		}
		klog.V(4).Infof("Waited for child process %d", childPid)
	}
}

// stop asks the run loop to stop by sending one value on stopCh. The channel
// is unbuffered, so the call blocks until the value is received.
func (r *reaper) stop() {
	var token struct{}
	r.stopCh <- token
}
30 changes: 30 additions & 0 deletions pkg/driver/reaper_test.go
@@ -0,0 +1,30 @@
/*
Copyright 2019 The Kubernetes Authors.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

package driver

import (
"testing"
"time"
)

// TestReaper starts a reaper, lets it run for one second and then stops it.
// It makes no assertions beyond not panicking or hanging.
func TestReaper(t *testing.T) {
	const runFor = time.Second

	rp := newReaper()
	rp.start()
	defer rp.stop()

	time.Sleep(runFor)
}
16 changes: 13 additions & 3 deletions pkg/driver/sanity_test.go
Expand Up @@ -28,6 +28,15 @@ import (
"github.com/kubernetes-sigs/aws-efs-csi-driver/pkg/driver/mocks"
)

// mockWatchdog is a no-op watchdog used by the sanity test so that no real
// process is launched.
type mockWatchdog struct {
}

// start does nothing.
func (w *mockWatchdog) start() {
}

// stop does nothing.
func (w *mockWatchdog) stop() {
}

func TestSanityEFSCSI(t *testing.T) {
// Setup the full driver and its environment
dir, err := ioutil.TempDir("", "sanity-efs-csi")
Expand All @@ -48,9 +57,10 @@ func TestSanityEFSCSI(t *testing.T) {

mockCtrl := gomock.NewController(t)
drv := Driver{
endpoint: endpoint,
nodeID: "sanity",
mounter: mocks.NewMockMounter(mockCtrl),
endpoint: endpoint,
nodeID: "sanity",
mounter: mocks.NewMockMounter(mockCtrl),
efsWatchdog: &mockWatchdog{},
}
defer func() {
if r := recover(); r != nil {
Expand Down

0 comments on commit 019e989

Please sign in to comment.