Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Wait on Kubernetes service in DNS pod #22253

Merged
merged 1 commit into from
Mar 4, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
4 changes: 4 additions & 0 deletions cluster/addons/dns/kube2sky/Changelog
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
## Version 1.13 (Mar 1 2016 Prashanth.B <beeps@google.com>)
- Synchronously wait for the Kubernetes service at startup.
- Add a SIGTERM/SIGINT handler.

## Version 1.12 (Dec 15 2015 Abhishek Shah <abshah@google.com>)
- Gave pods their own cache store. (034ecbd)
- Allow pods to have dns. (717660a)
Expand Down
2 changes: 1 addition & 1 deletion cluster/addons/dns/kube2sky/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

.PHONY: all kube2sky container push clean test

TAG = 1.12
TAG = 1.13
PREFIX = gcr.io/google_containers

all: container
Expand Down
70 changes: 68 additions & 2 deletions cluster/addons/dns/kube2sky/kube2sky.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,8 +26,10 @@ import (
"net/http"
"net/url"
"os"
"os/signal"
"strings"
"sync"
"syscall"
"time"

etcd "github.com/coreos/go-etcd/etcd"
Expand All @@ -47,12 +49,16 @@ import (
"k8s.io/kubernetes/pkg/util/wait"
)

// The name of the "master" Kubernetes Service.
const kubernetesSvcName = "kubernetes"

var (
argDomain = flag.String("domain", "cluster.local", "domain under which to create names")
argEtcdMutationTimeout = flag.Duration("etcd-mutation-timeout", 10*time.Second, "crash after retrying etcd mutation for a specified duration")
argEtcdServer = flag.String("etcd-server", "http://127.0.0.1:4001", "URL to etcd server")
argKubecfgFile = flag.String("kubecfg-file", "", "Location of kubecfg file for access to kubernetes master service; --kube-master-url overrides the URL part of this; if neither this nor --kube-master-url are provided, defaults to service account tokens")
argKubeMasterURL = flag.String("kube-master-url", "", "URL to reach kubernetes master. Env variables in this flag will be expanded.")
healthzPort = flag.Int("healthz-port", 8081, "port on which to serve a kube2sky HTTP readiness probe.")
)

const (
Expand Down Expand Up @@ -410,7 +416,11 @@ func (ks *kube2sky) removeService(obj interface{}) {
}

func (ks *kube2sky) updateService(oldObj, newObj interface{}) {
// TODO: Avoid unwanted updates.
// TODO: We shouldn't leave etcd in a state where it doesn't have a
// record for a Service. This removal is needed to completely clean
// the directory of a Service, which has SRV records and A records
// that are hashed according to oldObj. Unfortunately, this is the
// easiest way to purge the directory.
ks.removeService(oldObj)
ks.newService(newObj)
}
Expand Down Expand Up @@ -562,10 +572,56 @@ func getHash(text string) string {
return fmt.Sprintf("%x", h.Sum32())
}

// waitForKubernetesService waits for the "Kuberntes" master service.
// Since the health probe on the kube2sky container is essentially an nslookup
// of this service, we cannot serve any DNS records if it doesn't show up.
// Once the Service is found, we start replying on this containers readiness
// probe endpoint.
func waitForKubernetesService(client *kclient.Client) (svc *kapi.Service) {
name := fmt.Sprintf("%v/%v", kapi.NamespaceDefault, kubernetesSvcName)
glog.Infof("Waiting for service: %v", name)
var err error
servicePollInterval := 1 * time.Second
for {
svc, err = client.Services(kapi.NamespaceDefault).Get(kubernetesSvcName)
if err != nil || svc == nil {
glog.Infof("Ignoring error while waiting for service %v: %v. Sleeping %v before retrying.", name, err, servicePollInterval)
time.Sleep(servicePollInterval)
continue
}
break
}
return
}

// setupSignalHandlers runs a goroutine that waits on SIGINT or SIGTERM and logs it
// before exiting.
func setupSignalHandlers() {
sigChan := make(chan os.Signal)
signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
// This program should always exit gracefully logging that it received
// either a SIGINT or SIGTERM. Since kube2sky is run in a container
// without a liveness probe as part of the kube-dns pod, it shouldn't
// restart unless the pod is deleted. If it restarts without logging
// anything it means something is seriously wrong.
// TODO: Remove once #22290 is fixed.
go func() {
glog.Fatalf("Received signal %s", <-sigChan)
}()
}

// setupHealthzHandlers sets up a readiness and liveness endpoint for kube2sky.
func setupHealthzHandlers(ks *kube2sky) {
http.HandleFunc("/readiness", func(w http.ResponseWriter, req *http.Request) {
fmt.Fprintf(w, "ok\n")
})
}

func main() {
flag.CommandLine.SetNormalizeFunc(util.WarnWordSepNormalizeFunc)
flag.Parse()
var err error
setupSignalHandlers()
// TODO: Validate input flags.
domain := *argDomain
if !strings.HasSuffix(domain, ".") {
Expand All @@ -583,10 +639,20 @@ func main() {
if err != nil {
glog.Fatalf("Failed to create a kubernetes client: %v", err)
}
// Wait synchronously for the Kubernetes service and add a DNS record for it.
ks.newService(waitForKubernetesService(kubeClient))
glog.Infof("Successfully added DNS record for Kubernetes service.")

ks.endpointsStore = watchEndpoints(kubeClient, &ks)
ks.servicesStore = watchForServices(kubeClient, &ks)
ks.podsStore = watchPods(kubeClient, &ks)

select {}
// We declare kube2sky ready when:
// 1. It has retrieved the Kubernetes master service from the apiserver. If this
// doesn't happen skydns will fail its liveness probe assuming that it can't
// perform any cluster local DNS lookups.
// 2. It has setup the 3 watches above.
// Once ready this container never flips to not-ready.
setupHealthzHandlers(&ks)
glog.Fatal(http.ListenAndServe(fmt.Sprintf(":%d", *healthzPort), nil))
}
64 changes: 39 additions & 25 deletions cluster/addons/dns/skydns-rc.yaml.in
Original file line number Diff line number Diff line change
@@ -1,32 +1,35 @@
apiVersion: v1
kind: ReplicationController
metadata:
name: kube-dns-v10
name: kube-dns-v11
namespace: kube-system
labels:
k8s-app: kube-dns
version: v10
version: v11
kubernetes.io/cluster-service: "true"
spec:
replicas: {{ pillar['dns_replicas'] }}
selector:
k8s-app: kube-dns
version: v10
version: v11
template:
metadata:
labels:
k8s-app: kube-dns
version: v10
version: v11
kubernetes.io/cluster-service: "true"
spec:
containers:
- name: etcd
image: gcr.io/google_containers/etcd-amd64:2.2.1
resources:
# keep request = limit to keep this container in guaranteed class
# TODO: Set memory limits when we've profiled the container for large
# clusters, then set request = limit to keep this container in
# guaranteed class. Currently, this container falls into the
# "burstable" category so the kubelet doesn't backoff from restarting it.
limits:
cpu: 100m
memory: 50Mi
memory: 500Mi
requests:
cpu: 100m
memory: 50Mi
Expand All @@ -44,25 +47,50 @@ spec:
- name: etcd-storage
mountPath: /var/etcd/data
- name: kube2sky
image: gcr.io/google_containers/kube2sky:1.12
image: gcr.io/google_containers/kube2sky:1.13
resources:
# keep request = limit to keep this container in guaranteed class
# TODO: Set memory limits when we've profiled the container for large
# clusters, then set request = limit to keep this container in
# guaranteed class. Currently, this container falls into the
# "burstable" category so the kubelet doesn't backoff from restarting it.
limits:
cpu: 100m
memory: 50Mi
# Kube2sky watches all pods.
memory: 200Mi
requests:
cpu: 100m
memory: 50Mi
livenessProbe:
httpGet:
path: /healthz
port: 8080
scheme: HTTP
initialDelaySeconds: 60
timeoutSeconds: 5
successThreshold: 1
failureThreshold: 5
readinessProbe:
httpGet:
path: /readiness
port: 8081
scheme: HTTP
# we poll on pod startup for the Kubernetes master service and
# only setup the /readiness HTTP server once that's available.
initialDelaySeconds: 30
timeoutSeconds: 5
args:
# command = "/kube2sky"
- --domain={{ pillar['dns_domain'] }}
- name: skydns
image: gcr.io/google_containers/skydns:2015-10-13-8c72f8c
resources:
# keep request = limit to keep this container in guaranteed class
# TODO: Set memory limits when we've profiled the container for large
# clusters, then set request = limit to keep this container in
# guaranteed class. Currently, this container falls into the
# "burstable" category so the kubelet doesn't backoff from restarting it.
limits:
cpu: 100m
memory: 50Mi
memory: 200Mi
requests:
cpu: 100m
memory: 50Mi
Expand All @@ -79,20 +107,6 @@ spec:
- containerPort: 53
name: dns-tcp
protocol: TCP
livenessProbe:
httpGet:
path: /healthz
port: 8080
scheme: HTTP
initialDelaySeconds: 30
timeoutSeconds: 5
readinessProbe:
httpGet:
path: /healthz
port: 8080
scheme: HTTP
initialDelaySeconds: 1
timeoutSeconds: 5
- name: healthz
image: gcr.io/google_containers/exechealthz:1.0
resources:
Expand Down