From 6f203d6463a0cae450f7e0cf65767966ada193cb Mon Sep 17 00:00:00 2001
From: Zhen Wang <zhenw@google.com>
Date: Thu, 7 Feb 2019 15:40:31 -0800
Subject: [PATCH 1/2] Allow configuring NPD release and flags on GCI and add
 cluster e2e test

---
 cluster/gce/config-default.sh          |   2 +
 cluster/gce/config-test.sh             |   2 +
 cluster/gce/gci/configure-helper.sh    |  30 +--
 cluster/gce/gci/configure.sh           |   6 +-
 cluster/gce/util.sh                    |   2 +
 test/e2e/framework/kubelet_stats.go    |   6 +-
 test/e2e/node/BUILD                    |   2 +
 test/e2e/node/node_problem_detector.go | 282 +++++++++++++++++++++++++
 8 files changed, 313 insertions(+), 19 deletions(-)
 create mode 100644 test/e2e/node/node_problem_detector.go

diff --git a/cluster/gce/config-default.sh b/cluster/gce/config-default.sh
index da8e119c9e52..7c8003c60861 100755
--- a/cluster/gce/config-default.sh
+++ b/cluster/gce/config-default.sh
@@ -284,6 +284,8 @@ else
 fi
 NODE_PROBLEM_DETECTOR_VERSION="${NODE_PROBLEM_DETECTOR_VERSION:-}"
 NODE_PROBLEM_DETECTOR_TAR_HASH="${NODE_PROBLEM_DETECTOR_TAR_HASH:-}"
+NODE_PROBLEM_DETECTOR_RELEASE_PATH="${NODE_PROBLEM_DETECTOR_RELEASE_PATH:-}"
+NODE_PROBLEM_DETECTOR_CUSTOM_FLAGS="${NODE_PROBLEM_DETECTOR_CUSTOM_FLAGS:-}"
 
 # Optional: Create autoscaler for cluster's nodes.
 ENABLE_CLUSTER_AUTOSCALER="${KUBE_ENABLE_CLUSTER_AUTOSCALER:-false}"
diff --git a/cluster/gce/config-test.sh b/cluster/gce/config-test.sh
index fcbb9597e8e1..debc10afddc5 100755
--- a/cluster/gce/config-test.sh
+++ b/cluster/gce/config-test.sh
@@ -291,6 +291,8 @@ else
 fi
 NODE_PROBLEM_DETECTOR_VERSION="${NODE_PROBLEM_DETECTOR_VERSION:-}"
 NODE_PROBLEM_DETECTOR_TAR_HASH="${NODE_PROBLEM_DETECTOR_TAR_HASH:-}"
+NODE_PROBLEM_DETECTOR_RELEASE_PATH="${NODE_PROBLEM_DETECTOR_RELEASE_PATH:-}"
+NODE_PROBLEM_DETECTOR_CUSTOM_FLAGS="${NODE_PROBLEM_DETECTOR_CUSTOM_FLAGS:-}"
 
 # Optional: Create autoscaler for cluster's nodes.
 ENABLE_CLUSTER_AUTOSCALER="${KUBE_ENABLE_CLUSTER_AUTOSCALER:-false}"
diff --git a/cluster/gce/gci/configure-helper.sh b/cluster/gce/gci/configure-helper.sh
index 11feaeeaaa38..ee66d1e74912 100644
--- a/cluster/gce/gci/configure-helper.sh
+++ b/cluster/gce/gci/configure-helper.sh
@@ -1197,21 +1197,25 @@ EOF
 function start-node-problem-detector {
   echo "Start node problem detector"
   local -r npd_bin="${KUBE_HOME}/bin/node-problem-detector"
-  local -r km_config="${KUBE_HOME}/node-problem-detector/config/kernel-monitor.json"
-  # TODO(random-liu): Handle this for alternative container runtime.
-  local -r dm_config="${KUBE_HOME}/node-problem-detector/config/docker-monitor.json"
-  local -r custom_km_config="${KUBE_HOME}/node-problem-detector/config/kernel-monitor-counter.json,${KUBE_HOME}/node-problem-detector/config/systemd-monitor-counter.json,${KUBE_HOME}/node-problem-detector/config/docker-monitor-counter.json"
   echo "Using node problem detector binary at ${npd_bin}"
-  local flags="${NPD_TEST_LOG_LEVEL:-"--v=2"} ${NPD_TEST_ARGS:-}"
-  flags+=" --logtostderr"
-  flags+=" --system-log-monitors=${km_config},${dm_config}"
-  flags+=" --custom-plugin-monitors=${custom_km_config}"
-  flags+=" --apiserver-override=https://${KUBERNETES_MASTER_NAME}?inClusterConfig=false&auth=/var/lib/node-problem-detector/kubeconfig"
-  local -r npd_port=${NODE_PROBLEM_DETECTOR_PORT:-20256}
-  flags+=" --port=${npd_port}"
-  if [[ -n "${EXTRA_NPD_ARGS:-}" ]]; then
-    flags+=" ${EXTRA_NPD_ARGS}"
+
+  local flags="${NODE_PROBLEM_DETECTOR_CUSTOM_FLAGS:-}"
+  if [[ -z "${flags}" ]]; then
+    local -r km_config="${KUBE_HOME}/node-problem-detector/config/kernel-monitor.json"
+    # TODO(random-liu): Handle this for alternative container runtime.
+    local -r dm_config="${KUBE_HOME}/node-problem-detector/config/docker-monitor.json"
+    local -r custom_km_config="${KUBE_HOME}/node-problem-detector/config/kernel-monitor-counter.json,${KUBE_HOME}/node-problem-detector/config/systemd-monitor-counter.json,${KUBE_HOME}/node-problem-detector/config/docker-monitor-counter.json"
+    flags="${NPD_TEST_LOG_LEVEL:-"--v=2"} ${NPD_TEST_ARGS:-}"
+    flags+=" --logtostderr"
+    flags+=" --system-log-monitors=${km_config},${dm_config}"
+    flags+=" --custom-plugin-monitors=${custom_km_config}"
+    local -r npd_port=${NODE_PROBLEM_DETECTOR_PORT:-20256}
+    flags+=" --port=${npd_port}"
+    if [[ -n "${EXTRA_NPD_ARGS:-}" ]]; then
+      flags+=" ${EXTRA_NPD_ARGS}"
+    fi
   fi
+  flags+=" --apiserver-override=https://${KUBERNETES_MASTER_NAME}?inClusterConfig=false&auth=/var/lib/node-problem-detector/kubeconfig"
 
   # Write the systemd service file for node problem detector.
   cat <<EOF >/etc/systemd/system/node-problem-detector.service
diff --git a/cluster/gce/gci/configure.sh b/cluster/gce/gci/configure.sh
index 82b9fe3d1ac1..6a40a6894ebe 100644
--- a/cluster/gce/gci/configure.sh
+++ b/cluster/gce/gci/configure.sh
@@ -202,12 +202,12 @@ function install-node-problem-detector {
   local -r npd_tar="node-problem-detector-${npd_version}.tar.gz"
 
   if is-preloaded "${npd_tar}" "${npd_sha1}"; then
-    echo "node-problem-detector is preloaded."
+    echo "${npd_tar} is preloaded."
     return
   fi
 
-  echo "Downloading node problem detector."
-  local -r npd_release_path="https://storage.googleapis.com/kubernetes-release"
+  echo "Downloading ${npd_tar}."
+  local -r npd_release_path="${NODE_PROBLEM_DETECTOR_RELEASE_PATH:-https://storage.googleapis.com/kubernetes-release}"
   download-or-bust "${npd_sha1}" "${npd_release_path}/node-problem-detector/${npd_tar}"
   local -r npd_dir="${KUBE_HOME}/node-problem-detector"
   mkdir -p "${npd_dir}"
diff --git a/cluster/gce/util.sh b/cluster/gce/util.sh
index 5ca120491ab6..e6d77410db47 100755
--- a/cluster/gce/util.sh
+++ b/cluster/gce/util.sh
@@ -829,6 +829,8 @@ ENABLE_CLUSTER_UI: $(yaml-quote ${ENABLE_CLUSTER_UI:-false})
 ENABLE_NODE_PROBLEM_DETECTOR: $(yaml-quote ${ENABLE_NODE_PROBLEM_DETECTOR:-none})
 NODE_PROBLEM_DETECTOR_VERSION: $(yaml-quote ${NODE_PROBLEM_DETECTOR_VERSION:-})
 NODE_PROBLEM_DETECTOR_TAR_HASH: $(yaml-quote ${NODE_PROBLEM_DETECTOR_TAR_HASH:-})
+NODE_PROBLEM_DETECTOR_RELEASE_PATH: $(yaml-quote ${NODE_PROBLEM_DETECTOR_RELEASE_PATH:-})
+NODE_PROBLEM_DETECTOR_CUSTOM_FLAGS: $(yaml-quote ${NODE_PROBLEM_DETECTOR_CUSTOM_FLAGS:-})
 ENABLE_NODE_LOGGING: $(yaml-quote ${ENABLE_NODE_LOGGING:-false})
 ENABLE_RESCHEDULER: $(yaml-quote ${ENABLE_RESCHEDULER:-false})
 LOGGING_DESTINATION: $(yaml-quote ${LOGGING_DESTINATION:-})
diff --git a/test/e2e/framework/kubelet_stats.go b/test/e2e/framework/kubelet_stats.go
index 1304cbfcf599..da077b3e220c 100644
--- a/test/e2e/framework/kubelet_stats.go
+++ b/test/e2e/framework/kubelet_stats.go
@@ -281,8 +281,8 @@ func HighLatencyKubeletOperations(c clientset.Interface, threshold time.Duration
 	return badMetrics, nil
 }
 
-// getStatsSummary contacts kubelet for the container information.
-func getStatsSummary(c clientset.Interface, nodeName string) (*stats.Summary, error) {
+// GetStatsSummary contacts kubelet for the container information.
+func GetStatsSummary(c clientset.Interface, nodeName string) (*stats.Summary, error) {
 	ctx, cancel := context.WithTimeout(context.Background(), SingleCallTimeout)
 	defer cancel()
 
@@ -348,7 +348,7 @@ func getOneTimeResourceUsageOnNode(
 		return nil, fmt.Errorf("numStats needs to be > 1 and < %d", maxNumStatsToRequest)
 	}
 	// Get information of all containers on the node.
-	summary, err := getStatsSummary(c, nodeName)
+	summary, err := GetStatsSummary(c, nodeName)
 	if err != nil {
 		return nil, err
 	}
diff --git a/test/e2e/node/BUILD b/test/e2e/node/BUILD
index 88305fc4320e..878d69e1999d 100644
--- a/test/e2e/node/BUILD
+++ b/test/e2e/node/BUILD
@@ -9,6 +9,7 @@ go_library(
         "kubelet.go",
         "kubelet_perf.go",
         "mount_propagation.go",
+        "node_problem_detector.go",
         "pod_gc.go",
         "pods.go",
         "pre_stop.go",
@@ -18,6 +19,7 @@ go_library(
     importpath = "k8s.io/kubernetes/test/e2e/node",
     visibility = ["//visibility:public"],
     deps = [
+        "//pkg/api/v1/node:go_default_library",
         "//pkg/kubelet/apis/stats/v1alpha1:go_default_library",
         "//test/e2e/common:go_default_library",
         "//test/e2e/framework:go_default_library",
diff --git a/test/e2e/node/node_problem_detector.go b/test/e2e/node/node_problem_detector.go
new file mode 100644
index 000000000000..3f3289421be2
--- /dev/null
+++ b/test/e2e/node/node_problem_detector.go
@@ -0,0 +1,282 @@
+/*
+Copyright 2019 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package node
+
+import (
+	"fmt"
+	"net"
+	"sort"
+	"strconv"
+	"strings"
+	"time"
+
+	"k8s.io/api/core/v1"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/fields"
+	nodeutil "k8s.io/kubernetes/pkg/api/v1/node"
+	"k8s.io/kubernetes/test/e2e/framework"
+
+	. "github.com/onsi/ginkgo"
+	. "github.com/onsi/gomega"
+)
+
+// This test checks that node-problem-detector (NPD) runs without error on
+// the nodes in the cluster. NPD's functionality is tested in e2e_node tests.
+var _ = SIGDescribe("NodeProblemDetector", func() {
+	const (
+		pollInterval = 1 * time.Second
+		pollTimeout  = 1 * time.Minute
+	)
+	f := framework.NewDefaultFramework("node-problem-detector")
+
+	BeforeEach(func() {
+		framework.SkipUnlessSSHKeyPresent()
+		framework.SkipUnlessProviderIs(framework.ProvidersWithSSH...)
+		framework.SkipUnlessProviderIs("gce", "gke")
+		framework.SkipUnlessNodeOSDistroIs("gci", "ubuntu")
+		framework.WaitForAllNodesHealthy(f.ClientSet, time.Minute)
+	})
+
+	It("should run without error", func() {
+		By("Getting all nodes and their SSH-able IP addresses")
+		nodes := framework.GetReadySchedulableNodesOrDie(f.ClientSet)
+		Expect(len(nodes.Items)).NotTo(BeZero())
+		hosts := []string{}
+		for _, node := range nodes.Items {
+			for _, addr := range node.Status.Addresses {
+				if addr.Type == v1.NodeExternalIP {
+					hosts = append(hosts, net.JoinHostPort(addr.Address, "22"))
+					break
+				}
+			}
+		}
+		Expect(len(hosts)).To(Equal(len(nodes.Items))) // every node has an external IP, so hosts[i] pairs with nodes.Items[i]
+
+		isStandaloneMode := make(map[string]bool)
+		cpuUsageStats := make(map[string][]float64)
+		uptimeStats := make(map[string][]float64)
+		rssStats := make(map[string][]float64)
+		workingSetStats := make(map[string][]float64)
+
+		for _, host := range hosts {
+			cpuUsageStats[host] = []float64{}
+			uptimeStats[host] = []float64{}
+			rssStats[host] = []float64{}
+			workingSetStats[host] = []float64{}
+
+			cmd := "systemctl status node-problem-detector.service"
+			result, err := framework.SSH(cmd, host, framework.TestContext.Provider)
+			isStandaloneMode[host] = (err == nil && result.Code == 0)
+
+			By(fmt.Sprintf("Check node %q has node-problem-detector process", host))
+			// Using brackets "[n]" is a trick to prevent grep command itself from
+			// showing up, because string text "[n]ode-problem-detector" does not
+			// match regular expression "[n]ode-problem-detector".
+			psCmd := "ps aux | grep [n]ode-problem-detector"
+			result, err = framework.SSH(psCmd, host, framework.TestContext.Provider)
+			framework.ExpectNoError(err)
+			Expect(result.Code).To(BeZero())
+			Expect(result.Stdout).To(ContainSubstring("node-problem-detector"))
+
+			By(fmt.Sprintf("Check node-problem-detector is running fine on node %q", host))
+			journalctlCmd := "sudo journalctl -u node-problem-detector"
+			result, err = framework.SSH(journalctlCmd, host, framework.TestContext.Provider)
+			framework.ExpectNoError(err)
+			Expect(result.Code).To(BeZero())
+			Expect(result.Stdout).NotTo(ContainSubstring("node-problem-detector.service: Failed"))
+
+			if isStandaloneMode[host] {
+				cpuUsage, uptime := getCpuStat(f, host)
+				cpuUsageStats[host] = append(cpuUsageStats[host], cpuUsage)
+				uptimeStats[host] = append(uptimeStats[host], uptime)
+			}
+
+			By(fmt.Sprintf("Inject log to trigger AUFSUmountHung on node %q", host))
+			log := "INFO: task umount.aufs:21568 blocked for more than 120 seconds."
+			injectLogCmd := "sudo sh -c \"echo 'kernel: " + log + "' >> /dev/kmsg\""
+			result, err = framework.SSH(injectLogCmd, host, framework.TestContext.Provider)
+			framework.ExpectNoError(err)
+			Expect(result.Code).To(BeZero()) // exit status of the injection command itself
+		}
+
+		By("Check node-problem-detector can post conditions and events to API server")
+		for _, node := range nodes.Items {
+			By(fmt.Sprintf("Check node-problem-detector posted KernelDeadlock condition on node %q", node.Name))
+			Eventually(func() error {
+				return verifyNodeCondition(f, "KernelDeadlock", v1.ConditionTrue, "AUFSUmountHung", node.Name)
+			}, pollTimeout, pollInterval).Should(Succeed())
+
+			By(fmt.Sprintf("Check node-problem-detector posted AUFSUmountHung event on node %q", node.Name))
+			eventListOptions := metav1.ListOptions{FieldSelector: fields.Set{"involvedObject.kind": "Node"}.AsSelector().String()}
+			Eventually(func() error {
+				return verifyEvents(f, eventListOptions, 1, "AUFSUmountHung", node.Name)
+			}, pollTimeout, pollInterval).Should(Succeed())
+		}
+
+		By("Gather node-problem-detector cpu and memory stats")
+		numIterations := 60
+		for i := 1; i <= numIterations; i++ {
+			for j, host := range hosts {
+				if isStandaloneMode[host] {
+					rss, workingSet := getMemoryStat(f, host)
+					rssStats[host] = append(rssStats[host], rss)
+					workingSetStats[host] = append(workingSetStats[host], workingSet)
+					if i == numIterations {
+						cpuUsage, uptime := getCpuStat(f, host)
+						cpuUsageStats[host] = append(cpuUsageStats[host], cpuUsage)
+						uptimeStats[host] = append(uptimeStats[host], uptime)
+					}
+				} else {
+					cpuUsage, rss, workingSet := getNpdPodStat(f, nodes.Items[j].Name)
+					cpuUsageStats[host] = append(cpuUsageStats[host], cpuUsage)
+					rssStats[host] = append(rssStats[host], rss)
+					workingSetStats[host] = append(workingSetStats[host], workingSet)
+				}
+			}
+			time.Sleep(time.Second)
+		}
+
+		cpuStatsMsg := "CPU (core):"
+		rssStatsMsg := "RSS (MB):"
+		workingSetStatsMsg := "WorkingSet (MB):"
+		for i, host := range hosts {
+			if isStandaloneMode[host] {
+				// When in standalone mode, NPD is running as systemd service. We
+				// calculate its cpu usage from cgroup cpuacct value differences.
+				cpuUsage := cpuUsageStats[host][1] - cpuUsageStats[host][0]
+				totaltime := uptimeStats[host][1] - uptimeStats[host][0]
+				cpuStatsMsg += fmt.Sprintf(" %s[%.3f];", nodes.Items[i].Name, cpuUsage/totaltime)
+			} else {
+				sort.Float64s(cpuUsageStats[host])
+				cpuStatsMsg += fmt.Sprintf(" %s[%.3f|%.3f|%.3f];", nodes.Items[i].Name,
+					cpuUsageStats[host][0], cpuUsageStats[host][len(cpuUsageStats[host])/2], cpuUsageStats[host][len(cpuUsageStats[host])-1])
+			}
+
+			sort.Float64s(rssStats[host])
+			rssStatsMsg += fmt.Sprintf(" %s[%.1f|%.1f|%.1f];", nodes.Items[i].Name,
+				rssStats[host][0], rssStats[host][len(rssStats[host])/2], rssStats[host][len(rssStats[host])-1])
+
+			sort.Float64s(workingSetStats[host])
+			workingSetStatsMsg += fmt.Sprintf(" %s[%.1f|%.1f|%.1f];", nodes.Items[i].Name,
+				workingSetStats[host][0], workingSetStats[host][len(workingSetStats[host])/2], workingSetStats[host][len(workingSetStats[host])-1])
+		}
+		framework.Logf("Node-Problem-Detector CPU and Memory Stats:\n\t%s\n\t%s\n\t%s", cpuStatsMsg, rssStatsMsg, workingSetStatsMsg)
+	})
+})
+
+// verifyEvents errors unless default-namespace events matching reason and source host nodeName have Counts summing to num.
+func verifyEvents(f *framework.Framework, options metav1.ListOptions, num int, reason, nodeName string) error {
+	events, err := f.ClientSet.CoreV1().Events(metav1.NamespaceDefault).List(options)
+	if err != nil {
+		return err
+	}
+	total := 0
+	for _, ev := range events.Items {
+		if ev.Reason == reason && ev.Source.Host == nodeName {
+			total += int(ev.Count)
+		}
+	}
+	if total == num {
+		return nil
+	}
+	return fmt.Errorf("expect event number %d, got %d: %v", num, total, events.Items)
+}
+
+// verifyNodeCondition fetches nodeName and errors unless the condition is present with the given status and reason.
+func verifyNodeCondition(f *framework.Framework, condition v1.NodeConditionType, status v1.ConditionStatus, reason, nodeName string) error {
+	node, err := f.ClientSet.CoreV1().Nodes().Get(nodeName, metav1.GetOptions{})
+	if err != nil {
+		return err
+	}
+	switch _, cond := nodeutil.GetNodeCondition(&node.Status, condition); {
+	case cond == nil:
+		return fmt.Errorf("node condition %q not found", condition)
+	case cond.Status != status || cond.Reason != reason:
+		return fmt.Errorf("unexpected node condition %q: %+v", condition, cond)
+	}
+	return nil
+}
+
+// getMemoryStat reads the node-problem-detector.service memory cgroup over SSH
+// and returns its total_rss and working set (usage minus total_inactive_file) in MB.
+func getMemoryStat(f *framework.Framework, host string) (rss, workingSet float64) {
+	memCmd := "cat /sys/fs/cgroup/memory/system.slice/node-problem-detector.service/memory.usage_in_bytes && cat /sys/fs/cgroup/memory/system.slice/node-problem-detector.service/memory.stat"
+	result, err := framework.SSH(memCmd, host, framework.TestContext.Provider)
+	framework.ExpectNoError(err)
+	Expect(result.Code).To(BeZero())
+	lines := strings.Split(result.Stdout, "\n")
+
+	memoryUsage, err := strconv.ParseFloat(lines[0], 64)
+	Expect(err).To(BeNil())
+
+	var totalInactiveFile float64
+	for _, line := range lines[1:] {
+		fields := strings.Split(line, " ")
+		switch fields[0] {
+		case "total_rss":
+			rss, err = strconv.ParseFloat(fields[1], 64)
+			Expect(err).To(BeNil())
+		case "total_inactive_file":
+			totalInactiveFile, err = strconv.ParseFloat(fields[1], 64)
+			Expect(err).To(BeNil())
+		}
+	}
+
+	// The working set is floored at zero when inactive file cache exceeds usage.
+	workingSet = memoryUsage - totalInactiveFile
+	if workingSet < 0 {
+		workingSet = 0
+	}
+
+	// Convert bytes to MB.
+	rss, workingSet = rss/1024/1024, workingSet/1024/1024
+	return
+}
+
+// getCpuStat returns NPD's cgroup cpuacct usage (in seconds) and the first field of /proc/uptime; unparsable output fails the test.
+func getCpuStat(f *framework.Framework, host string) (usage, uptime float64) {
+	cpuCmd := "cat /sys/fs/cgroup/cpu/system.slice/node-problem-detector.service/cpuacct.usage && cat /proc/uptime | awk '{print $1}'"
+	result, err := framework.SSH(cpuCmd, host, framework.TestContext.Provider)
+	framework.ExpectNoError(err)
+	Expect(result.Code).To(BeZero())
+	lines := strings.Split(result.Stdout, "\n")
+	usage, err = strconv.ParseFloat(lines[0], 64)
+	Expect(err).To(BeNil())
+	uptime, err = strconv.ParseFloat(lines[1], 64)
+	Expect(err).To(BeNil())
+	usage *= 1e-9 // Convert from nanoseconds to seconds
+	return
+}
+
+// getNpdPodStat returns CPU (cores) and memory (MB) from the kubelet stats summary for the first pod on nodeName named "npd*".
+func getNpdPodStat(f *framework.Framework, nodeName string) (cpuUsage, rss, workingSet float64) {
+	summary, err := framework.GetStatsSummary(f.ClientSet, nodeName)
+	framework.ExpectNoError(err)
+
+	found := false
+	for _, p := range summary.Pods {
+		if strings.HasPrefix(p.PodRef.Name, "npd") {
+			cpuUsage = float64(*p.CPU.UsageNanoCores) * 1e-9
+			rss = float64(*p.Memory.RSSBytes) / 1024 / 1024
+			workingSet = float64(*p.Memory.WorkingSetBytes) / 1024 / 1024
+			found = true
+			break
+		}
+	}
+	Expect(found).To(BeTrue())
+	return
+}

From ffa6f476c4e96343a3bfbc8e33b3d84bef3e04af Mon Sep 17 00:00:00 2001
From: Zhen Wang <zhenw@google.com>
Date: Thu, 7 Feb 2019 15:41:16 -0800
Subject: [PATCH 2/2] Allow configuring NPD image version in node e2e test and
 fix the test

---
 hack/make-rules/test-e2e-node.sh              |  7 +--
 test/e2e/framework/test_context.go            |  3 ++
 test/e2e_node/conformance/build/Dockerfile    |  5 +-
 test/e2e_node/e2e_node_suite_test.go          | 23 ++++++++-
 test/e2e_node/image_list.go                   | 19 ++++++-
 .../conformance/conformance-jenkins.sh        |  3 +-
 test/e2e_node/jenkins/e2e-node-jenkins.sh     |  3 +-
 test/e2e_node/node_problem_detector_linux.go  | 51 ++++++++++++-------
 test/e2e_node/remote/cadvisor_e2e.go          |  2 +-
 test/e2e_node/remote/node_conformance.go      |  6 +--
 test/e2e_node/remote/node_e2e.go              |  6 +--
 test/e2e_node/remote/remote.go                |  4 +-
 test/e2e_node/remote/types.go                 |  3 +-
 test/e2e_node/runner/local/run_local.go       |  3 +-
 test/e2e_node/runner/remote/run_remote.go     |  3 +-
 15 files changed, 103 insertions(+), 38 deletions(-)

diff --git a/hack/make-rules/test-e2e-node.sh b/hack/make-rules/test-e2e-node.sh
index 2e5c95ae2646..451486ca39c8 100755
--- a/hack/make-rules/test-e2e-node.sh
+++ b/hack/make-rules/test-e2e-node.sh
@@ -34,6 +34,7 @@ image_service_endpoint=${IMAGE_SERVICE_ENDPOINT:-""}
 run_until_failure=${RUN_UNTIL_FAILURE:-"false"}
 test_args=${TEST_ARGS:-""}
 system_spec_name=${SYSTEM_SPEC_NAME:-}
+extra_envs=${EXTRA_ENVS:-}
 
 # Parse the flags to pass to ginkgo
 ginkgoflags=""
@@ -148,7 +149,7 @@ if [ $remote = true ] ; then
     --image-project="$image_project" --instance-name-prefix="$instance_prefix" \
     --delete-instances="$delete_instances" --test_args="$test_args" --instance-metadata="$metadata" \
     --image-config-file="$image_config_file" --system-spec-name="$system_spec_name" \
-    --test-suite="$test_suite" \
+    --extra-envs="$extra_envs" --test-suite="$test_suite" \
     2>&1 | tee -i "${artifacts}/build-log.txt"
   exit $?
 
@@ -169,8 +170,8 @@ else
   # Test using the host the script was run on
   # Provided for backwards compatibility
   go run test/e2e_node/runner/local/run_local.go \
-    --system-spec-name="$system_spec_name" --ginkgo-flags="$ginkgoflags" \
-    --test-flags="--container-runtime=${runtime} \
+    --system-spec-name="$system_spec_name" --extra-envs="$extra_envs" \
+    --ginkgo-flags="$ginkgoflags" --test-flags="--container-runtime=${runtime} \
     --alsologtostderr --v 4 --report-dir=${artifacts} --node-name $(hostname) \
     $test_args" --build-dependencies=true 2>&1 | tee -i "${artifacts}/build-log.txt"
   exit $?
diff --git a/test/e2e/framework/test_context.go b/test/e2e/framework/test_context.go
index 71b6aab09f3a..81df49d707b9 100644
--- a/test/e2e/framework/test_context.go
+++ b/test/e2e/framework/test_context.go
@@ -160,6 +160,8 @@ type NodeTestContextType struct {
 	// the node e2e test. If empty, the default one (system.DefaultSpec) is
 	// used. The system specs are in test/e2e_node/system/specs/.
 	SystemSpecName string
+	// ExtraEnvs is a map of environment names to values.
+	ExtraEnvs map[string]string
 }
 
 // StorageConfig contains the shared settings for storage 2e2 tests.
@@ -301,6 +303,7 @@ func RegisterNodeFlags() {
 	flag.BoolVar(&TestContext.PrepullImages, "prepull-images", true, "If true, prepull images so image pull failures do not cause test failures.")
 	flag.StringVar(&TestContext.ImageDescription, "image-description", "", "The description of the image which the test will be running on.")
 	flag.StringVar(&TestContext.SystemSpecName, "system-spec-name", "", "The name of the system spec (e.g., gke) that's used in the node e2e test. The system specs are in test/e2e_node/system/specs/. This is used by the test framework to determine which tests to run for validating the system requirements.")
+	flag.Var(utilflag.NewMapStringString(&TestContext.ExtraEnvs), "extra-envs", "The extra environment variables needed for node e2e tests. Format: a list of key=value pairs, e.g., env1=val1,env2=val2")
 }
 
 func RegisterStorageFlags() {
diff --git a/test/e2e_node/conformance/build/Dockerfile b/test/e2e_node/conformance/build/Dockerfile
index 5783726a08b3..288649683f20 100644
--- a/test/e2e_node/conformance/build/Dockerfile
+++ b/test/e2e_node/conformance/build/Dockerfile
@@ -27,12 +27,14 @@ COPY_SYSTEM_SPEC_FILE
 # REPORT_PATH is the path in the container to save test result and logs.
 # FLAKE_ATTEMPTS is the time to retry when there is a test failure. By default 2.
 # TEST_ARGS is the test arguments passed into the test.
+# EXTRA_ENVS is the extra environment variables needed for node e2e tests.
 ENV FOCUS="\[Conformance\]" \
 	   SKIP="\[Flaky\]|\[Serial\]" \
 	   PARALLELISM=8 \
 	   REPORT_PATH="/var/result" \
 	   FLAKE_ATTEMPTS=2 \
-	   TEST_ARGS=""
+	   TEST_ARGS="" \
+	   EXTRA_ENVS=""
 
 ENTRYPOINT ginkgo --focus="$FOCUS" \
 	--skip="$SKIP" \
@@ -46,4 +48,5 @@ ENTRYPOINT ginkgo --focus="$FOCUS" \
 	--system-spec-name=SYSTEM_SPEC_NAME \
 	# This is a placeholder that will be substituted in the Makefile.
 	--system-spec-file=SYSTEM_SPEC_FILE_PATH \
+	--extra-envs=$EXTRA_ENVS \
 	$TEST_ARGS
diff --git a/test/e2e_node/e2e_node_suite_test.go b/test/e2e_node/e2e_node_suite_test.go
index 7b017401f037..6d920e2a16f0 100644
--- a/test/e2e_node/e2e_node_suite_test.go
+++ b/test/e2e_node/e2e_node_suite_test.go
@@ -76,6 +76,7 @@ func init() {
 func TestMain(m *testing.M) {
 	pflag.Parse()
 	framework.AfterReadingAllFlags(&framework.TestContext)
+	setExtraEnvs()
 	os.Exit(m.Run())
 }
 
@@ -146,6 +147,7 @@ var _ = SynchronizedBeforeSuite(func() []byte {
 	// This helps with debugging test flakes since it is hard to tell when a test failure is due to image pulling.
 	if framework.TestContext.PrepullImages {
 		glog.Infof("Pre-pulling images so that they are cached for the tests.")
+		updateImageWhiteList()
 		err := PrePullAllImages()
 		Expect(err).ShouldNot(HaveOccurred())
 	}
@@ -244,6 +246,9 @@ func waitForNodeReady() {
 // TODO(random-liu): Using dynamic kubelet configuration feature to
 // update test context with node configuration.
 func updateTestContext() error {
+	setExtraEnvs()
+	updateImageWhiteList()
+
 	client, err := getAPIServerClient()
 	if err != nil {
 		return fmt.Errorf("failed to get apiserver client: %v", err)
@@ -261,7 +266,7 @@ func updateTestContext() error {
 	if err != nil {
 		return fmt.Errorf("failed to get kubelet configuration: %v", err)
 	}
-	framework.TestContext.KubeletConfig = *kubeletCfg // Set kubelet config.
+	framework.TestContext.KubeletConfig = *kubeletCfg // Set kubelet config
 	return nil
 }
 
@@ -309,3 +314,19 @@ func loadSystemSpecFromFile(filename string) (*system.SysSpec, error) {
 	}
 	return spec, nil
 }
+
+// isNodeReady reports whether the node's first NodeReady condition has status ConditionTrue.
+func isNodeReady(node *v1.Node) bool {
+	for i := range node.Status.Conditions {
+		if cond := &node.Status.Conditions[i]; cond.Type == v1.NodeReady {
+			return cond.Status == v1.ConditionTrue
+		}
+	}
+	return false
+}
+
+func setExtraEnvs() {
+	for name, value := range framework.TestContext.ExtraEnvs { // export each configured pair into this process's environment
+		os.Setenv(name, value) // NOTE(review): Setenv's error is discarded; confirm that is intended
+	}
+}
diff --git a/test/e2e_node/image_list.go b/test/e2e_node/image_list.go
index d35bb5acaae2..a11b902142c1 100644
--- a/test/e2e_node/image_list.go
+++ b/test/e2e_node/image_list.go
@@ -18,6 +18,7 @@ package e2e_node
 
 import (
 	"fmt"
+	"os"
 	"os/exec"
 	"os/user"
 	"time"
@@ -46,7 +47,6 @@ var NodeImageWhiteList = sets.NewString(
 	"k8s.gcr.io/stress:v1",
 	busyboxImage,
 	"k8s.gcr.io/busybox@sha256:4bdd623e848417d96127e16037743f0cd8b528c026e9175e22a84f639eca58ff",
-	"k8s.gcr.io/node-problem-detector:v0.4.1",
 	imageutils.GetE2EImage(imageutils.NginxSlim),
 	imageutils.GetE2EImage(imageutils.ServeHostname),
 	imageutils.GetE2EImage(imageutils.Netexec),
@@ -55,9 +55,24 @@ var NodeImageWhiteList = sets.NewString(
 	framework.GetGPUDevicePluginImage(),
 )
 
-func init() {
+// updateImageWhiteList rebuilds framework.ImageWhiteList from
+// 1. the hard coded lists (NodeImageWhiteList and CommonImageWhiteList)
+// 2. the node-problem-detector image returned by getNodeProblemDetectorImage
+// So this function needs to be called after the extra envs are applied.
+func updateImageWhiteList() {
 	// Union NodeImageWhiteList and CommonImageWhiteList into the framework image white list.
 	framework.ImageWhiteList = NodeImageWhiteList.Union(commontest.CommonImageWhiteList)
+	// The NPD image is read from the process environment, which the extra envs can override.
+	framework.ImageWhiteList.Insert(getNodeProblemDetectorImage())
+}
+
+// getNodeProblemDetectorImage returns $NODE_PROBLEM_DETECTOR_IMAGE, or the built-in default image when it is unset or empty.
+func getNodeProblemDetectorImage() string {
+	const defaultImage = "k8s.gcr.io/node-problem-detector:v0.6.2"
+	if image := os.Getenv("NODE_PROBLEM_DETECTOR_IMAGE"); image != "" {
+		return image
+	}
+	return defaultImage
 }
 
 // puller represents a generic image puller
diff --git a/test/e2e_node/jenkins/conformance/conformance-jenkins.sh b/test/e2e_node/jenkins/conformance/conformance-jenkins.sh
index 9e8715287cf1..7758d0b2df66 100755
--- a/test/e2e_node/jenkins/conformance/conformance-jenkins.sh
+++ b/test/e2e_node/jenkins/conformance/conformance-jenkins.sh
@@ -40,4 +40,5 @@ go run test/e2e_node/runner/remote/run_remote.go  --test-suite=conformance \
   --results-dir="$ARTIFACTS" --test-timeout="$TIMEOUT" \
   --test_args="--kubelet-flags=\"$KUBELET_ARGS\"" \
   --instance-metadata="$GCE_INSTANCE_METADATA" \
-  --system-spec-name="$SYSTEM_SPEC_NAME"
+  --system-spec-name="$SYSTEM_SPEC_NAME" \
+  --extra-envs="$EXTRA_ENVS"
diff --git a/test/e2e_node/jenkins/e2e-node-jenkins.sh b/test/e2e_node/jenkins/e2e-node-jenkins.sh
index a1caae4ad95f..99a4ac14bc38 100755
--- a/test/e2e_node/jenkins/e2e-node-jenkins.sh
+++ b/test/e2e_node/jenkins/e2e-node-jenkins.sh
@@ -47,4 +47,5 @@ go run test/e2e_node/runner/remote/run_remote.go  --logtostderr --vmodule=*=4 \
   --image-config-file="$GCE_IMAGE_CONFIG_PATH" --cleanup="$CLEANUP" \
   --results-dir="$ARTIFACTS" --ginkgo-flags="--nodes=$PARALLELISM $GINKGO_FLAGS" \
   --test-timeout="$TIMEOUT" --test_args="$TEST_ARGS --kubelet-flags=\"$KUBELET_ARGS\"" \
-  --instance-metadata="$GCE_INSTANCE_METADATA" --system-spec-name="$SYSTEM_SPEC_NAME"
+  --instance-metadata="$GCE_INSTANCE_METADATA" --system-spec-name="$SYSTEM_SPEC_NAME" \
+  --extra-envs="$EXTRA_ENVS"
diff --git a/test/e2e_node/node_problem_detector_linux.go b/test/e2e_node/node_problem_detector_linux.go
index 36a63193c9e2..a6a248abcf97 100644
--- a/test/e2e_node/node_problem_detector_linux.go
+++ b/test/e2e_node/node_problem_detector_linux.go
@@ -45,13 +45,14 @@ var _ = framework.KubeDescribe("NodeProblemDetector [NodeFeature:NodeProblemDete
 		pollInterval   = 1 * time.Second
 		pollConsistent = 5 * time.Second
 		pollTimeout    = 1 * time.Minute
-		image          = "k8s.gcr.io/node-problem-detector:v0.4.1"
 	)
 	f := framework.NewDefaultFramework("node-problem-detector")
 	var c clientset.Interface
 	var uid string
 	var ns, name, configName, eventNamespace string
 	var bootTime, nodeTime time.Time
+	var image string
+
 	BeforeEach(func() {
 		c = f.ClientSet
 		ns = f.Namespace.Name
@@ -60,6 +61,8 @@ var _ = framework.KubeDescribe("NodeProblemDetector [NodeFeature:NodeProblemDete
 		configName = "node-problem-detector-config-" + uid
 		// There is no namespace for Node, event recorder will set default namespace for node events.
 		eventNamespace = metav1.NamespaceDefault
+		image = getNodeProblemDetectorImage()
+		By(fmt.Sprintf("Using node-problem-detector image: %s", image))
 	})
 
 	// Test system log monitor. We may add other tests if we have more problem daemons in the future.
@@ -245,7 +248,8 @@ var _ = framework.KubeDescribe("NodeProblemDetector [NodeFeature:NodeProblemDete
 				timestamp        time.Time
 				message          string
 				messageNum       int
-				events           int
+				tempEvents       int // Events for temp errors
+				totalEvents      int // Events for both temp errors and condition changes
 				conditionReason  string
 				conditionMessage string
 				conditionType    v1.ConditionStatus
@@ -279,7 +283,8 @@ var _ = framework.KubeDescribe("NodeProblemDetector [NodeFeature:NodeProblemDete
 					timestamp:        nodeTime,
 					message:          tempMessage,
 					messageNum:       3,
-					events:           3,
+					tempEvents:       3,
+					totalEvents:      3,
 					conditionReason:  defaultReason,
 					conditionMessage: defaultMessage,
 					conditionType:    v1.ConditionFalse,
@@ -289,7 +294,8 @@ var _ = framework.KubeDescribe("NodeProblemDetector [NodeFeature:NodeProblemDete
 					timestamp:        nodeTime,
 					message:          permMessage1,
 					messageNum:       1,
-					events:           3, // event number should not change
+					tempEvents:       3, // event number for temp errors should not change
+					totalEvents:      4, // add 1 event for condition change
 					conditionReason:  permReason1,
 					conditionMessage: permMessage1,
 					conditionType:    v1.ConditionTrue,
@@ -299,7 +305,8 @@ var _ = framework.KubeDescribe("NodeProblemDetector [NodeFeature:NodeProblemDete
 					timestamp:        nodeTime.Add(5 * time.Minute),
 					message:          tempMessage,
 					messageNum:       3,
-					events:           6,
+					tempEvents:       6, // add 3 events for temp errors
+					totalEvents:      7, // add 3 events for temp errors
 					conditionReason:  permReason1,
 					conditionMessage: permMessage1,
 					conditionType:    v1.ConditionTrue,
@@ -309,7 +316,8 @@ var _ = framework.KubeDescribe("NodeProblemDetector [NodeFeature:NodeProblemDete
 					timestamp:        nodeTime.Add(5 * time.Minute),
 					message:          permMessage1 + "different message",
 					messageNum:       1,
-					events:           6, // event number should not change
+					tempEvents:       6, // event number should not change
+					totalEvents:      7, // event number should not change
 					conditionReason:  permReason1,
 					conditionMessage: permMessage1,
 					conditionType:    v1.ConditionTrue,
@@ -319,7 +327,8 @@ var _ = framework.KubeDescribe("NodeProblemDetector [NodeFeature:NodeProblemDete
 					timestamp:        nodeTime.Add(5 * time.Minute),
 					message:          permMessage2,
 					messageNum:       1,
-					events:           6, // event number should not change
+					tempEvents:       6, // event number for temp errors should not change
+					totalEvents:      8, // add 1 event for condition change
 					conditionReason:  permReason2,
 					conditionMessage: permMessage2,
 					conditionType:    v1.ConditionTrue,
@@ -332,13 +341,17 @@ var _ = framework.KubeDescribe("NodeProblemDetector [NodeFeature:NodeProblemDete
 					Expect(err).NotTo(HaveOccurred())
 				}
 
-				By(fmt.Sprintf("Wait for %d events generated", test.events))
+				By(fmt.Sprintf("Wait for %d temp events generated", test.tempEvents))
+				Eventually(func() error {
+					return verifyEvents(c.CoreV1().Events(eventNamespace), eventListOptions, test.tempEvents, tempReason, tempMessage)
+				}, pollTimeout, pollInterval).Should(Succeed())
+				By(fmt.Sprintf("Wait for %d total events generated", test.totalEvents))
 				Eventually(func() error {
-					return verifyEvents(c.CoreV1().Events(eventNamespace), eventListOptions, test.events, tempReason, tempMessage)
+					return verifyTotalEvents(c.CoreV1().Events(eventNamespace), eventListOptions, test.totalEvents)
 				}, pollTimeout, pollInterval).Should(Succeed())
-				By(fmt.Sprintf("Make sure only %d events generated", test.events))
+				By(fmt.Sprintf("Make sure only %d total events generated", test.totalEvents))
 				Consistently(func() error {
-					return verifyEvents(c.CoreV1().Events(eventNamespace), eventListOptions, test.events, tempReason, tempMessage)
+					return verifyTotalEvents(c.CoreV1().Events(eventNamespace), eventListOptions, test.totalEvents)
 				}, pollConsistent, pollInterval).Should(Succeed())
 
 				By(fmt.Sprintf("Make sure node condition %q is set", condition))
@@ -390,7 +403,7 @@ func injectLog(file string, timestamp time.Time, log string, num int) error {
 	return nil
 }
 
-// verifyEvents verifies there are num specific events generated
+// verifyEvents verifies that num events with the given reason and message were generated; other events are ignored.
 func verifyEvents(e coreclientset.EventInterface, options metav1.ListOptions, num int, reason, message string) error {
 	events, err := e.List(options)
 	if err != nil {
@@ -399,7 +412,7 @@ func verifyEvents(e coreclientset.EventInterface, options metav1.ListOptions, nu
 	count := 0
 	for _, event := range events.Items {
 		if event.Reason != reason || event.Message != message {
-			return fmt.Errorf("unexpected event: %v", event)
+			continue
 		}
 		count += int(event.Count)
 	}
@@ -409,14 +422,18 @@ func verifyEvents(e coreclientset.EventInterface, options metav1.ListOptions, nu
 	return nil
 }
 
-// verifyNoEvents verifies there is no event generated
-func verifyNoEvents(e coreclientset.EventInterface, options metav1.ListOptions) error {
+// verifyTotalEvents verifies that the Count values of all listed events sum to num.
+func verifyTotalEvents(e coreclientset.EventInterface, options metav1.ListOptions, num int) error {
 	events, err := e.List(options)
 	if err != nil {
 		return err
 	}
-	if len(events.Items) != 0 {
-		return fmt.Errorf("unexpected events: %v", events.Items)
+	count := 0
+	for _, event := range events.Items {
+		count += int(event.Count) // add each event's Count, not 1 per list item
+	}
+	if count != num {
+		return fmt.Errorf("expect event number %d, got %d: %v", num, count, events.Items)
 	}
 	return nil
 }
diff --git a/test/e2e_node/remote/cadvisor_e2e.go b/test/e2e_node/remote/cadvisor_e2e.go
index 8bdb567d031a..76ae79aff504 100644
--- a/test/e2e_node/remote/cadvisor_e2e.go
+++ b/test/e2e_node/remote/cadvisor_e2e.go
@@ -63,7 +63,7 @@ func runCommand(command string, args ...string) error {
 }
 
 // RunTest implements TestSuite.RunTest
-func (n *CAdvisorE2ERemote) RunTest(host, workspace, results, imageDesc, junitFilePrefix, testArgs, ginkgoArgs, systemSpecName string, timeout time.Duration) (string, error) {
+func (n *CAdvisorE2ERemote) RunTest(host, workspace, results, imageDesc, junitFilePrefix, testArgs, ginkgoArgs, systemSpecName, extraEnvs string, timeout time.Duration) (string, error) {
 	// Kill any running node processes
 	cleanupNodeProcesses(host)
 
diff --git a/test/e2e_node/remote/node_conformance.go b/test/e2e_node/remote/node_conformance.go
index 9c78ae30887c..3a6cf98ae44a 100644
--- a/test/e2e_node/remote/node_conformance.go
+++ b/test/e2e_node/remote/node_conformance.go
@@ -259,7 +259,7 @@ func stopKubelet(host, workspace string) error {
 }
 
 // RunTest runs test on the node.
-func (c *ConformanceRemote) RunTest(host, workspace, results, imageDesc, junitFilePrefix, testArgs, _, systemSpecName string, timeout time.Duration) (string, error) {
+func (c *ConformanceRemote) RunTest(host, workspace, results, imageDesc, junitFilePrefix, testArgs, _, systemSpecName, extraEnvs string, timeout time.Duration) (string, error) {
 	// Install the cni plugins and add a basic CNI configuration.
 	if err := setupCNI(host, workspace); err != nil {
 		return "", err
@@ -293,8 +293,8 @@ func (c *ConformanceRemote) RunTest(host, workspace, results, imageDesc, junitFi
 	// Run the tests
 	glog.V(2).Infof("Starting tests on %q", host)
 	podManifestPath := getPodPath(workspace)
-	cmd := fmt.Sprintf("'timeout -k 30s %fs docker run --rm --privileged=true --net=host -v /:/rootfs -v %s:%s -v %s:/var/result -e TEST_ARGS=--report-prefix=%s %s'",
-		timeout.Seconds(), podManifestPath, podManifestPath, results, junitFilePrefix, getConformanceTestImageName(systemSpecName))
+	cmd := fmt.Sprintf("'timeout -k 30s %fs docker run --rm --privileged=true --net=host -v /:/rootfs -v %s:%s -v %s:/var/result -e TEST_ARGS=--report-prefix=%s -e EXTRA_ENVS=%s %s'",
+		timeout.Seconds(), podManifestPath, podManifestPath, results, junitFilePrefix, extraEnvs, getConformanceTestImageName(systemSpecName))
 	testOutput, err := SSH(host, "sh", "-c", cmd)
 	if err != nil {
 		return testOutput, err
diff --git a/test/e2e_node/remote/node_e2e.go b/test/e2e_node/remote/node_e2e.go
index d54b0d94b023..b3f58267273a 100644
--- a/test/e2e_node/remote/node_e2e.go
+++ b/test/e2e_node/remote/node_e2e.go
@@ -138,7 +138,7 @@ func updateOSSpecificKubeletFlags(args, host, workspace string) (string, error)
 }
 
 // RunTest runs test on the node.
-func (n *NodeE2ERemote) RunTest(host, workspace, results, imageDesc, junitFilePrefix, testArgs, ginkgoArgs, systemSpecName string, timeout time.Duration) (string, error) {
+func (n *NodeE2ERemote) RunTest(host, workspace, results, imageDesc, junitFilePrefix, testArgs, ginkgoArgs, systemSpecName, extraEnvs string, timeout time.Duration) (string, error) {
 	// Install the cni plugins and add a basic CNI configuration.
 	// TODO(random-liu): Do this in cloud init after we remove containervm test.
 	if err := setupCNI(host, workspace); err != nil {
@@ -167,8 +167,8 @@ func (n *NodeE2ERemote) RunTest(host, workspace, results, imageDesc, junitFilePr
 	glog.V(2).Infof("Starting tests on %q", host)
 	cmd := getSSHCommand(" && ",
 		fmt.Sprintf("cd %s", workspace),
-		fmt.Sprintf("timeout -k 30s %fs ./ginkgo %s ./e2e_node.test -- --system-spec-name=%s --system-spec-file=%s --logtostderr --v 4 --node-name=%s --report-dir=%s --report-prefix=%s --image-description=\"%s\" %s",
-			timeout.Seconds(), ginkgoArgs, systemSpecName, systemSpecFile, host, results, junitFilePrefix, imageDesc, testArgs),
+		fmt.Sprintf("timeout -k 30s %fs ./ginkgo %s ./e2e_node.test -- --system-spec-name=%s --system-spec-file=%s --extra-envs=%s --logtostderr --v 4 --node-name=%s --report-dir=%s --report-prefix=%s --image-description=\"%s\" %s",
+			timeout.Seconds(), ginkgoArgs, systemSpecName, systemSpecFile, extraEnvs, host, results, junitFilePrefix, imageDesc, testArgs),
 	)
 	return SSH(host, "sh", "-c", cmd)
 }
diff --git a/test/e2e_node/remote/remote.go b/test/e2e_node/remote/remote.go
index 746899f8b57b..47501d297748 100644
--- a/test/e2e_node/remote/remote.go
+++ b/test/e2e_node/remote/remote.go
@@ -65,7 +65,7 @@ func CreateTestArchive(suite TestSuite, systemSpecName string) (string, error) {
 
 // Returns the command output, whether the exit was ok, and any errors
 // TODO(random-liu): junitFilePrefix is not prefix actually, the file name is junit-junitFilePrefix.xml. Change the variable name.
-func RunRemote(suite TestSuite, archive string, host string, cleanup bool, imageDesc, junitFilePrefix string, testArgs string, ginkgoArgs string, systemSpecName string) (string, bool, error) {
+func RunRemote(suite TestSuite, archive string, host string, cleanup bool, imageDesc, junitFilePrefix string, testArgs string, ginkgoArgs string, systemSpecName string, extraEnvs string) (string, bool, error) {
 	// Create the temp staging directory
 	glog.V(2).Infof("Staging test binaries on %q", host)
 	workspace := newWorkspaceDir()
@@ -110,7 +110,7 @@ func RunRemote(suite TestSuite, archive string, host string, cleanup bool, image
 	}
 
 	glog.V(2).Infof("Running test on %q", host)
-	output, err := suite.RunTest(host, workspace, resultDir, imageDesc, junitFilePrefix, testArgs, ginkgoArgs, systemSpecName, *testTimeoutSeconds)
+	output, err := suite.RunTest(host, workspace, resultDir, imageDesc, junitFilePrefix, testArgs, ginkgoArgs, systemSpecName, extraEnvs, *testTimeoutSeconds)
 
 	aggErrs := []error{}
 	// Do not log the output here, let the caller deal with the test output.
diff --git a/test/e2e_node/remote/types.go b/test/e2e_node/remote/types.go
index f7e360f7440b..33d36fca5e9a 100644
--- a/test/e2e_node/remote/types.go
+++ b/test/e2e_node/remote/types.go
@@ -46,6 +46,7 @@ type TestSuite interface {
 	// * ginkgoArgs is the arguments passed to ginkgo.
 	// * systemSpecName is the name of the system spec used for validating the
 	//   image on which the test runs.
+	// * extraEnvs is the list of extra environment variables (key=value pairs) needed for node e2e tests.
 	// * timeout is the test timeout.
-	RunTest(host, workspace, results, imageDesc, junitFilePrefix, testArgs, ginkgoArgs, systemSpecName string, timeout time.Duration) (string, error)
+	RunTest(host, workspace, results, imageDesc, junitFilePrefix, testArgs, ginkgoArgs, systemSpecName, extraEnvs string, timeout time.Duration) (string, error)
 }
diff --git a/test/e2e_node/runner/local/run_local.go b/test/e2e_node/runner/local/run_local.go
index c2c169e87bc8..c2b179fde78f 100644
--- a/test/e2e_node/runner/local/run_local.go
+++ b/test/e2e_node/runner/local/run_local.go
@@ -34,6 +34,7 @@ var buildDependencies = flag.Bool("build-dependencies", true, "If true, build al
 var ginkgoFlags = flag.String("ginkgo-flags", "", "Space-separated list of arguments to pass to Ginkgo test runner.")
 var testFlags = flag.String("test-flags", "", "Space-separated list of arguments to pass to node e2e test.")
 var systemSpecName = flag.String("system-spec-name", "", "The name of the system spec used for validating the image in the node conformance test. The specs are at test/e2e_node/system/specs/. If unspecified, the default built-in spec (system.DefaultSpec) will be used.")
+var extraEnvs = flag.String("extra-envs", "", "The extra environment variables needed for node e2e tests. Format: a list of key=value pairs, e.g., env1=val1,env2=val2")
 
 const (
 	systemSpecPath = "test/e2e_node/system/specs"
@@ -65,7 +66,7 @@ func main() {
 			glog.Fatalf("Failed to get k8s root directory: %v", err)
 		}
 		systemSpecFile := filepath.Join(rootDir, systemSpecPath, *systemSpecName+".yaml")
-		args = append(args, fmt.Sprintf("--system-spec-name=%s --system-spec-file=%s", *systemSpecName, systemSpecFile))
+		args = append(args, fmt.Sprintf("--system-spec-name=%s --system-spec-file=%s  --extra-envs=%s", *systemSpecName, systemSpecFile, *extraEnvs))
 	}
 	if err := runCommand(ginkgo, args...); err != nil {
 		glog.Exitf("Test failed: %v", err)
diff --git a/test/e2e_node/runner/remote/run_remote.go b/test/e2e_node/runner/remote/run_remote.go
index 7c440ad96b65..998a3c468356 100644
--- a/test/e2e_node/runner/remote/run_remote.go
+++ b/test/e2e_node/runner/remote/run_remote.go
@@ -62,6 +62,7 @@ var instanceMetadata = flag.String("instance-metadata", "", "key/value metadata
 var gubernator = flag.Bool("gubernator", false, "If true, output Gubernator link to view logs")
 var ginkgoFlags = flag.String("ginkgo-flags", "", "Passed to ginkgo to specify additional flags such as --skip=.")
 var systemSpecName = flag.String("system-spec-name", "", "The name of the system spec used for validating the image in the node conformance test. The specs are at test/e2e_node/system/specs/. If unspecified, the default built-in spec (system.DefaultSpec) will be used.")
+var extraEnvs = flag.String("extra-envs", "", "The extra environment variables needed for node e2e tests. Format: a list of key=value pairs, e.g., env1=val1,env2=val2")
 
 // envs is the type used to collect all node envs. The key is the env name,
 // and the value is the env value
@@ -440,7 +441,7 @@ func testHost(host string, deleteFiles bool, imageDesc, junitFilePrefix, ginkgoF
 		}
 	}
 
-	output, exitOk, err := remote.RunRemote(suite, path, host, deleteFiles, imageDesc, junitFilePrefix, *testArgs, ginkgoFlagsStr, *systemSpecName)
+	output, exitOk, err := remote.RunRemote(suite, path, host, deleteFiles, imageDesc, junitFilePrefix, *testArgs, ginkgoFlagsStr, *systemSpecName, *extraEnvs)
 	return &TestResult{
 		output: output,
 		err:    err,