Liveness Readiness Probe Istio #2628

Closed

jsenon opened this Issue Jan 16, 2018 · 53 comments

jsenon (Contributor) commented Jan 16, 2018

We are using Istio 0.4 with mutual TLS and automatic sidecar injection. We want to configure a liveness probe, but we receive:

[2018-01-15 15:29:03.199][13][warning][upstream] external/envoy/source/server/lds_subscription.cc:68] lds: fetch failure: error adding listener: 'tcp_100.108.29.243_5555' has duplicate address '100.108.29.243:5555' as existing listener
[2018-01-15 15:29:04.213][13][warning][config] external/envoy/source/server/listener_manager_impl.cc:245] error adding listener: 'tcp_100.108.29.243_5555' has duplicate address '100.108.29.243:5555' as existing listener

Our deployment is like this:

"containers": [
          {
            "name": "xxx-test",
            "image": "xxx/hello-world:latest",
            "command": [
              "/usr/bin/HelloWorld"
            ],
            "args": [
              "server"
            ],
            "ports": [
              {
                "containerPort": 5555,
                "protocol": "TCP"
              }
            ],
            "resources": {},
            "livenessProbe": {
              "httpGet": {
                "path": "/healthz",
                "port": 5555,
                "scheme": "HTTP"
              },
              "initialDelaySeconds": 20,
              "timeoutSeconds": 1,
              "periodSeconds": 10,
              "successThreshold": 1,
              "failureThreshold": 3
            },

If we open a new listener in our application container, don't declare it in the Service, and use this new port for the liveness probe, it works:

{
  "kind": "Deployment",
  "apiVersion": "extensions/v1beta1",
  "metadata": {
    "name": "xxx-test",
    "namespace": "mydemo",
    "selfLink": "/apis/extensions/v1beta1/namespaces/mydemo/deployments/xxx-test",
    "uid": "1fee6068-fa13-11e7-89c2-029743d47442",
    "resourceVersion": "1428024",
    "generation": 7,
    "creationTimestamp": "2018-01-15T16:42:50Z",
    "labels": {
      "app": "xxx-test",
      "version": "v1"
    },
    "annotations": {
      "deployment.kubernetes.io/revision": "6",
      "kubectl.kubernetes.io/last-applied-configuration": "{\"apiVersion\":\"extensions/v1beta1\",\"kind\":\"Deployment\",\"metadata\":{\"annotations\":{},\"name\":\"xxx-test\",\"namespace\":\"mydemo\"},\"spec\":{\"replicas\":1,\"template\":{\"metadata\":{\"labels\":{\"app\":\"xxx-test\",\"version\":\"v1\"}},\"spec\":{\"containers\":[{\"args\":[\"server\"],\"command\":[\"/usr/bin/HelloWorld\"],\"image\":\"xxx/hello-world:latest\",\"imagePullPolicy\":\"Always\",\"livenessProbe\":{\"httpGet\":{\"path\":\"/healthz\",\"port\":5556},\"initialDelaySeconds\":20,\"periodSeconds\":10},\"name\":\"xxx-test\",\"ports\":[{\"containerPort\":5555},{\"containerPort\":5556}]}],\"imagePullSecrets\":[{\"name\":\"regsecret-dev\"}]}}}}\n",
      "sidecar.istio.io/status": "injected-version-0.4.0"
    }
  },
  "spec": {
    "replicas": 1,
    "selector": {
      "matchLabels": {
        "app": "xxx-test",
        "version": "v1"
      }
    },
    "template": {
      "metadata": {
        "creationTimestamp": null,
        "labels": {
          "app": "xxx-test",
          "version": "v1"
        },
        "annotations": {
          "sidecar.istio.io/status": "injected-version-0.4.0"
        }
      },
      "spec": {
        "volumes": [
          {
            "name": "istio-envoy",
            "emptyDir": {
              "medium": "Memory"
            }
          },
          {
            "name": "istio-certs",
            "secret": {
              "secretName": "istio.default",
              "defaultMode": 420,
              "optional": true
            }
          }
        ],
        "initContainers": [
          {
            "name": "istio-init",
            "image": "docker.io/istio/proxy_init:0.4.0",
            "args": [
              "-p",
              "15001",
              "-u",
              "1337"
            ],
            "resources": {},
            "terminationMessagePath": "/dev/termination-log",
            "terminationMessagePolicy": "File",
            "imagePullPolicy": "IfNotPresent",
            "securityContext": {
              "capabilities": {
                "add": [
                  "NET_ADMIN"
                ]
              },
              "privileged": true
            }
          }
        ],
        "containers": [
          {
            "name": "xxx-test",
            "image": "xxx/hello-world:latest",
            "command": [
              "/usr/bin/HelloWorld"
            ],
            "args": [
              "server"
            ],
            "ports": [
              {
                "containerPort": 5555,
                "protocol": "TCP"
              },
              {
                "containerPort": 5556,
                "protocol": "TCP"
              }
            ],
            "resources": {},
            "livenessProbe": {
              "httpGet": {
                "path": "/healthz",
                "port": 5556,
                "scheme": "HTTP"
              },
              "initialDelaySeconds": 20,
              "timeoutSeconds": 1,
              "periodSeconds": 10,
              "successThreshold": 1,
              "failureThreshold": 3
            },
            "terminationMessagePath": "/dev/termination-log",
            "terminationMessagePolicy": "File",
            "imagePullPolicy": "Always"
          },
          {
            "name": "istio-proxy",
            "image": "docker.io/istio/proxy_debug:0.4.0",
            "args": [
              "proxy",
              "sidecar",
              "-v",
              "2",
              "--configPath",
              "/etc/istio/proxy",
              "--binaryPath",
              "/usr/local/bin/envoy",
              "--serviceCluster",
              "xxx-test",
              "--drainDuration",
              "45s",
              "--parentShutdownDuration",
              "1m0s",
              "--discoveryAddress",
              "istio-pilot.istio-system:15003",
              "--discoveryRefreshDelay",
              "1s",
              "--zipkinAddress",
              "zipkin.istio-system:9411",
              "--connectTimeout",
              "10s",
              "--statsdUdpAddress",
              "istio-mixer.istio-system:9125",
              "--proxyAdminPort",
              "15000",
              "--controlPlaneAuthPolicy",
              "MUTUAL_TLS"
            ],
            "env": [
              {
                "name": "POD_NAME",
                "valueFrom": {
                  "fieldRef": {
                    "apiVersion": "v1",
                    "fieldPath": "metadata.name"
                  }
                }
              },
              {
                "name": "POD_NAMESPACE",
                "valueFrom": {
                  "fieldRef": {
                    "apiVersion": "v1",
                    "fieldPath": "metadata.namespace"
                  }
                }
              },
              {
                "name": "INSTANCE_IP",
                "valueFrom": {
                  "fieldRef": {
                    "apiVersion": "v1",
                    "fieldPath": "status.podIP"
                  }
                }
              }
            ],
            "resources": {},
            "volumeMounts": [
              {
                "name": "istio-envoy",
                "mountPath": "/etc/istio/proxy"
              },
              {
                "name": "istio-certs",
                "readOnly": true,
                "mountPath": "/etc/certs/"
              }
            ],
            "terminationMessagePath": "/dev/termination-log",
            "terminationMessagePolicy": "File",
            "imagePullPolicy": "IfNotPresent",
            "securityContext": {
              "privileged": true,
              "runAsUser": 1337,
              "readOnlyRootFilesystem": false
            }
          }
        ],
        "restartPolicy": "Always",
        "terminationGracePeriodSeconds": 30,
        "dnsPolicy": "ClusterFirst",
        "securityContext": {},
        "imagePullSecrets": [
          {
            "name": "regsecret-dev"
          }
        ],
        "schedulerName": "default-scheduler"
      }
    },
    "strategy": {
      "type": "RollingUpdate",
      "rollingUpdate": {
        "maxUnavailable": 1,
        "maxSurge": 1
      }
    }
  },
  "status": {
    "observedGeneration": 7,
    "replicas": 1,
    "updatedReplicas": 1,
    "readyReplicas": 1,
    "availableReplicas": 1,
    "conditions": [
      {
        "type": "Available",
        "status": "True",
        "lastUpdateTime": "2018-01-15T16:42:50Z",
        "lastTransitionTime": "2018-01-15T16:42:50Z",
        "reason": "MinimumReplicasAvailable",
        "message": "Deployment has minimum availability."
      }
    ]
  }
}

Service:

apiVersion: v1
kind: Service
metadata:
  annotations:
    prometheus.io/scrape: 'true'
  name: xxx-test
  labels:
    app: xxx-test
    hub-type: rest-endpoint
spec:
  ports:
  - name: http
    port: 80
    targetPort: 5555
  selector:
    app: xxx-test

Our microservice is built with Go and we use a mux listener.

cmluciano (Member) commented Jan 16, 2018

jsenon (Contributor) commented Jan 16, 2018

Does the FAQ have the answer?

How can I use Kubernetes liveness and readiness for service health check with Istio Auth enabled?

If Istio Auth is enabled, http and tcp health check from kubelet will not work since they do not have Istio Auth issued certs. A workaround is to use a liveness command for health check, e.g., one can install curl in the service pod and curl itself within the pod. The Istio team is actively working on a solution.

An example liveness probe:

livenessProbe:
  exec:
    command:
    - curl
    - -f
    - http://localhost:8080/healthz # Replace port and URI by your actual health check
  initialDelaySeconds: 10
  periodSeconds: 5
ZackButcher (Contributor) commented Jan 16, 2018

If your health check is on the same port as your main application's serving port, and you have Istio Auth enabled (i.e. you have mTLS enabled between services in your mesh), then health checking will not work. This is because Envoy can't tell the difference between a health check and regular unencrypted traffic, and the kubelet performing the health check doesn't run with a sidecar that can perform mTLS for it.

However, splitting the two so that health checking occurs on one port and serving on a different one, and excluding the health check port from mTLS, works fine. When you wired up a listener for health checking on a new port and didn't specify it in the service, this is what you were doing. cc @wattli - what's the correct way to write config to describe this?

ZackButcher (Contributor) commented Jan 16, 2018

Ah, it is in our FAQ:

Starting with release 0.3, you can use service-level annotations to disable (or enable) Istio Auth for particular service-port. The annotation key should be auth.istio.io/{port_number}, and the value should be NONE (to disable), or MUTUAL_TLS (to enable).

Example: disable Istio Auth on port 9080 for service details.

kind: Service
metadata:
  name: details
  labels:
    app: details
  annotations:
    auth.istio.io/9080: NONE

So, the solution is to keep your health check on a different port from your main service, and in your service definition exempt the health check port from mTLS using the annotation "auth.istio.io/<SERVICE PORT>: NONE".
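
For the example in this issue, a minimal sketch of what that could look like, assuming the application exposes its health endpoint on 5556 as in the second Deployment above (the http-health port name and numbers are illustrative, not taken from the original Service):

apiVersion: v1
kind: Service
metadata:
  name: xxx-test
  labels:
    app: xxx-test
  annotations:
    prometheus.io/scrape: 'true'
    auth.istio.io/5556: NONE   # exempt the health-check service port from mTLS
spec:
  ports:
  - name: http
    port: 80
    targetPort: 5555
  - name: http-health         # illustrative: health-check port exposed separately
    port: 5556
    targetPort: 5556
  selector:
    app: xxx-test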

Feel free to re-open if there are other aspects of the question not answered, or if you hit more issues with the same.

wstrange commented Jan 16, 2018

There are many applications that will not support moving health checks to a different port.

It would be nice to have a better solution. The trick of using curl from within the pod is an option, but it does feel a little kludgy.

jsenon (Contributor) commented Jan 17, 2018

Agreed with @wstrange; the solutions using curl or a different port can work, but they seem hacky. Could we think about sharing the certificate with the k8s API, which needs it in order to work with mTLS? Nevertheless, thanks @ZackButcher for your quick answer.

ZackButcher (Contributor) commented Jan 17, 2018

I'll re-open since the core issue isn't really solved.

With some of the work we're landing to support Gateways, and also to support incremental mTLS adoption, we can probably work out a scheme to pass health checks through. (@wattli, what's the security team been thinking about on this front?)

ZackButcher reopened this Jan 17, 2018

PiotrSikora (Member) commented Jan 17, 2018

I think the proper solution would be to provide a valid certificate to the health checker, so that it can connect and properly authenticate (i.e. no unencrypted backdoors) to the service that it's health checking.

(Not sure if that's possible right now, K8s-wise).

ldemailly (Contributor) commented Jan 17, 2018

I believe if you use curl in your container it will work, as our iptables rules don't intercept localhost traffic.

wenchenglu (Contributor) commented Jan 17, 2018

Agreed with @PiotrSikora on handing the Istio CA cert to the k8s health checker. I think we have discussed this support with the k8s folks; @wattli may have an update on where we stand.

An alternative option is to support multiplexing both TLS and plaintext traffic on the same port, and use authorization to control what type of access is allowed. Piotr, do you know when we would have this multiplexing ready?

ZackButcher (Contributor) commented Jan 17, 2018

An alternative option is to support multiplexing both TLS and plaintext traffic on the same port, and use authorization to control what type of access is allowed. Piotr, do you know when we would have this multiplexing ready?

Yup, that's what I was alluding to: TLS sniffing + port multiplexing to allow health checks through in plain text. While not as nice a solution as giving the health checker a cert, it is good in that it's a general solution across health checkers.

ldemailly (Contributor) commented Jan 17, 2018

What is wrong with curl localhost?

diemtvu (Contributor) commented Jan 17, 2018

Port multiplexing could be a "convenient" solution, though I feel it may be too implicit and could cause a security leak (customers might not realize that their service accepts plaintext requests).

The other option, using a gateway (or alias port), could be better. In this approach, users create an alias port, configure it to use plaintext, and configure the health check to use that port. It's more work for users, but they get what they ask for.

curl could still be an option, though we may want to have Istio help install curl and convert httpGet to the equivalent command if it's going to be the canonical solution.

ZackButcher (Contributor) commented Jan 17, 2018

curl is fine for some use cases, but in the long run we need to support health checking services that call in to the workload. For example, Envoy's own cluster health checks are "active": they call a specific endpoint hosted by the workload. We also need health check services to work to enable look-aside load balancing features.

PiotrSikora (Member) commented Jan 17, 2018

@wenchenglu TLS sniffing should land this week (it's waiting for changes to listeners to be merged first), and multiplexing will follow right after that.

However, like @diemtvu said, leaving the unencrypted channel open only for the sake of health checker isn't a perfect solution, and we should have a plan to use mTLS there as well.

jsenon (Contributor) commented Jan 18, 2018

Agreed with @ZackButcher, @diemtvu and @PiotrSikora. If we can get all channels (app traffic plus the k8s API health check) covered with mTLS, that would be great.

prune998 (Contributor) commented Jan 18, 2018

Last time I checked, health checks (liveness/readiness) were not working if set on the same port as a "service" port, even without mTLS...

I don't see any release info/PR that would have made them work... am I wrong?

Example:

...
        ports:
          - containerPort: 8080
            name: http
        readinessProbe:
          httpGet:
            path: "/healthz"
            port: 8080
            scheme: HTTP
          initialDelaySeconds: 2
          periodSeconds: 15
          timeoutSeconds: 1
        livenessProbe:
          httpGet:
            path: "/healthz"
            port: 8080
            scheme: HTTP
          initialDelaySeconds: 2
          periodSeconds: 15
          timeoutSeconds: 1

ZackButcher (Contributor) commented Jan 18, 2018

Health checks should work fine if you don't have mTLS enabled, regardless of the port hosting the check. If you have a case where they're not working today, that would be valuable for us to use for debugging.

emedina commented Jan 21, 2018

I'm experiencing this issue without mTLS as well, like @prune998, on 0.4.0.

The only workaround is to disable the liveness and readiness probes in all my Deployments. Not ideal, though.

yudiandreanp commented Jan 22, 2018

Also hit the same issue without mTLS enabled: the health check connection to the same port exposed by the Service is intermittent, so the pods crash unintentionally. The workaround is to use curl or other shell commands as a health check, but it is a bit rough.

prune998 (Contributor) commented Jan 22, 2018

I think @emedina did a great job linking to similar issues here.
The bug is still present in 0.4.0, and it does not seem to come from pressure on istio-pilot...

This bug is a real pain, as it forces us to change all our deployments because of Istio...

ZackButcher (Contributor) commented Jan 23, 2018

I created a minimal setup with failing liveness probes; still digging to see exactly why it's failing.

ZackButcher (Contributor) commented Jan 30, 2018

So to close the loop here: with Istio 0.4.0 and both Kubernetes 1.8 and 1.9 I am not able to replicate the health check failing when mTLS is disabled. The above minimum setup passes health/liveness checks and starts up successfully with Istio deployed.

Can someone seeing failures please provide a minimum reproducible case that can be used for debugging (or make the repo I linked fail health checks)?

prune998 (Contributor) commented Jan 30, 2018

So, I'm going to debug this again, but it's obviously not working...
For the moment, here are the logs at startup:

istio-proxy 2018-01-30T19:49:29.941385457Z [2018-01-30 19:49:29.941][16][warning][upstream] external/envoy/source/server/lds_subscription.cc:68] lds: fetch failure: tcp proxy: unknown cluster 'out.orig-dst-cluster-tcp' in TCP route
istio-proxy 2018-01-30T19:49:30.961533907Z [2018-01-30 19:49:30.961][16][warning][upstream] external/envoy/source/server/lds_subscription.cc:68] lds: fetch failure: tcp proxy: unknown cluster 'out.orig-dst-cluster-tcp' in TCP route
istio-proxy 2018-01-30T19:49:36.384387293Z [2018-01-30T19:49:36.368Z] "GET /healthz HTTP/1.1" 404 NR 0 0 0 - "-" "kube-probe/1.8+" "031ab91f-246b-9784-a985-0388bbfa8b31" "10.20.8.22:12901" "-"
...
istio-proxy 2018-01-30T19:52:09.70219149Z [2018-01-30 19:52:09.701][16][warning][config] external/envoy/source/server/listener_manager_impl.cc:245] error adding listener: 'http_10.20.8.22_12901' has duplicate address '10.20.8.22:12901' as existing listener

Going to go into the istio-proxy pod and see what the RDS/LDS config is, but from my point of view nothing has changed with K8s 1.8 + Istio 0.4.0.

prune998 (Contributor) commented Jan 30, 2018

In fact, it's kind of working, but:

  • istio-proxy takes about 10s to be fully set up and allow network connections from the "application pod" to "external" (non-Istio but inside the cluster) ports
  • the service is working
  • no health check appears in the logs (should maybe turn debug on)
  • fetch failure: error adding listener: 'http_10.20.8.22_12901' has duplicate address '10.20.8.22:12901' as existing listener shows up in the logs every few seconds
mattatcha commented Jan 30, 2018

@ZackButcher I am able to reproduce the problem with your deployment.

I also found that as soon as I remove the name from the service port, the duplicate address error is no longer logged.
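
For clarity, a sketch of that change against the Service from the top of this thread (only the port name is dropped; as far as I know Istio uses the port name prefix to infer the protocol, so an unnamed port is treated as plain TCP rather than HTTP):

spec:
  ports:
  - port: 80            # previously "name: http"
    targetPort: 5555
  selector:
    app: xxx-test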

current versions:

  • Kubernetes 1.9.2
  • Istio 0.4.0
  • Calico 2.6.2
  • Docker 1.12.6
  • Host OS Debian GNU/Linux 9 (stretch)
wattli (Contributor) commented Mar 22, 2018

The HTTP prober for the liveness check actually works in Istio with mTLS disabled; the "duplicate listener" error is a red herring.

If you actually check pod status with:

$ kubectl get pod sleep-756488dc99-l8r9r
NAME                     READY     STATUS    RESTARTS   AGE
sleep-756488dc99-l8r9r   2/2       Running   0          21h

The restart count is actually 0, which means liveness works fine.

The duplicate listener is because:

"We explicitly create listeners for all health check ports. We also create listeners for all serving ports from the service spec. When the health/liveness port is the same as the normal serving port for the server, we emit an error (since we can't have two listeners on the same port). So long as the health check is HTTP, and the server is serving HTTP on the duplicated port, there's no problem and health checking passes (see the original issue, I posted a github repo showing this works; Tao has also verified this AFAIK)." quote from @ZackButcher

For mTLS, the short-term fix is to use exec instead of httpGet; the long-term fix is on the way. Closing this one; the duplicate listener issue can be tracked in #1194.
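
For the Deployment at the top of this issue, a minimal sketch of that short-term exec workaround, assuming curl is available in the application image (path and port taken from the original livenessProbe):

livenessProbe:
  exec:
    command:
    - curl
    - -f
    - http://localhost:5555/healthz   # same path and port as the original httpGet probe
  initialDelaySeconds: 20
  periodSeconds: 10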

wattli closed this Mar 22, 2018

prune998 (Contributor) commented Mar 22, 2018

Maybe you should link a merge request to this issue showing where the code was changed so the listener is not created twice and Envoy does not keep logging tons and tons of duplicate warnings.

I can confirm that it is in fact working (without mTLS), despite all the warnings.

ZhiminXiang commented May 3, 2018

curl localhost does not work well in our project for the readiness check. We want an end-to-end readiness check (something like kubelet -> istio-proxy sidecar -> container A -> container B). By using curl localhost, it seems we can only check the path container A -> container B, which caused a bunch of 503 errors at the beginning of starting Pods.

  1. I tried to use "curl podIP", but Envoy rejects the request. Does anyone know how I can configure Envoy to make it work?
  2. Is there any timeline on the Istio side to resolve this issue?
wattli (Contributor) commented May 3, 2018

  1. Do you really mean the workflow containerA -> sidecarA -> sidecarB -> containerB failed? If so, you can simply disable the health check, log into containerA manually, run 'curl podIP' and see what happens.

  2. It is likely to be ready in the 0.8.1 release, which I expect to happen in a month.

ZhiminXiang commented May 4, 2018

Thanks @wattli for confirming the timeline.

About question 1: containerA and containerB belong to the same pod, and containerA calls containerB. Originally we were using httpGet with containerA's port (portA) as the readiness probe, so the original readiness check workflow was kubelet -> istio-proxy sidecar -> containerA -> containerB. After replacing httpGet with "curl localhost:portA", the readiness check workflow changed to containerA -> containerB, which misses the path istio-proxy sidecar -> containerA.
This change can cause a Pod that is NOT really ready (the full workflow from the istio-proxy sidecar to containerB is NOT ready) to be considered ready (containerA -> containerB may already be ready) and start serving real traffic, which can cause 5XX errors at the beginning of serving real traffic.
We observed 5XX errors on real traffic at the beginning of starting Pods after we made the readiness probe change.

We are trying to use "curl podIP:portA" instead of "curl localhost:portA" for the readiness probe so that we have a similar end-to-end readiness check workflow (containerA -> istio-proxy sidecar -> containerA -> containerB). But it seems Envoy rejects the "curl podIP:portA" request. Not sure if there is any way to configure Envoy to make it work.

markns commented May 16, 2018

@ZhiminXiang did you manage to find a work-around for this issue? I believe I have the same problem, as described here: https://groups.google.com/forum/#!topic/istio-users/2nn5GasL46Q

ZhiminXiang commented May 16, 2018

@markns we have not found a workaround yet. Hopefully this issue can be fixed on the Istio side soon.

chandresh-pancholi commented May 25, 2018

We are facing the same problem with 0.6

sakshigoel12 (Contributor) commented Jun 4, 2018

@jsenon @prune998 @chandresh-pancholi @ZhiminXiang @markns would it be possible for you to upgrade to 0.8 and let us know if you still see this issue?

jsenon (Contributor) commented Jun 5, 2018

Do you prefer an upgrade, or a reinstall from scratch on 0.8?

markns commented Jun 5, 2018

@sakshigoel12 I still see an error if I try to connect immediately after the k8s deployment becomes available.

I'm using gRPC, and I see an RpcException raised in the client, as follows:
Grpc.Core.RpcException: Status(StatusCode=Unavailable, Detail="no healthy upstream")

The istio-proxy log shows:

[2018-06-05T17:12:18.601Z] "POST /myapi/myStream HTTP/2" 200 UH 5 0 2 - "10.132.0.6" "grpc-csharp/1.10.0 grpc-c/6.0.0 (windows; chttp2; glamorous)" "7c3abadf-8b6b-9313-8076-7dccce3ba2a1" "app.domain.site" "-"

Seems to be 200 status, but I guess the UH is relevant.

(clean install of 0.8)

sakshigoel12 (Contributor) commented Jun 5, 2018

@wattli adding you here

ZhiminXiang commented Jun 5, 2018

I also saw "no healthy upstream" error after I enabled mTLS in 0.8.0. But this kind of error is not persistent. This error happens for about 50% requests.
But seems like the health check probe works since I saw my server received health check request.

Is healthy check probe supposed to work in Istio 0.8.0 ? Anyone knows what caused this kind of "no healthy upstream" issue?

wattli (Contributor) commented Jun 5, 2018

@markns @ZhiminXiang, can you try something like:

  1. Do not enable mTLS globally: deploy Istio with install/kubernetes/istio-demo.yaml.
  2. Use an auth policy/destination rule to enable mTLS at the service/namespace level.
  3. For those services with a health check, do not enable mTLS on the health check port (you can do it via the auth policy; see the sketch below).
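
A minimal sketch of step 3 for the xxx-test service from earlier in this thread, assuming its main traffic is on service port 80 and the health check is on a separate port: the authentication Policy requests mTLS only for the serving port, leaving the health check port in plaintext (names and port numbers are illustrative):

apiVersion: authentication.istio.io/v1alpha1
kind: Policy
metadata:
  name: xxx-test-mtls
  namespace: mydemo
spec:
  targets:
  - name: xxx-test
    ports:
    - number: 80        # enable mTLS only on the main serving port
  peers:
  - mtls: {}

Clients would also need a matching DestinationRule with ISTIO_MUTUAL scoped to the same port, similar to the one @markns posts below.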
wattli (Contributor) commented Jun 5, 2018

Here is the doc about the auth Policy and DestinationRule: https://istio.io/docs/tasks/security/authn-policy/

chandresh-pancholi commented Jun 6, 2018

@sakshigoel12, could you please answer @jsenon's question?

markns commented Jun 6, 2018

Hey @wattli,

I disabled mTLS using: helm upgrade istio install/kubernetes/helm/istio --namespace istio-system --set global.mtls.enabled=false --set global.controlPlaneSecurityEnabled=true

Auth policy and destination rule are as follows:

apiVersion: authentication.istio.io/v1alpha1
kind: Policy
metadata:
  name: mystio
  namespace: mystio
spec:
  origins:
  - jwt:
      audiences:
      - https://gridarrow.site/mystio
      issuer: https://gridarrow.auth0.com/
      jwksUri: https://gridarrow.auth0.com/.well-known/jwks.json
  peers:
  - mtls: null
  principalBinding: USE_ORIGIN
  targets: null
---
apiVersion: networking.istio.io/v1alpha3
kind: DestinationRule
metadata:
  name: mystio-mtls
  namespace: mystio
spec:
  host: '*.mystio.svc.cluster.local'
  trafficPolicy:
    tls:
      mode: ISTIO_MUTUAL

And the result is... fewer "no healthy upstream" errors, but they are still there ~50% of the time. Previously it seemed that 100% of connection attempts needed to be retried. Note: my clients attempt to connect to the service as soon as the k8s deployment becomes "available".

BTW, I think you're referring to this when you mention health checking in 3); if so, I don't have any health checks enabled.

wattli (Contributor) commented Jun 6, 2018

@jsenon, generally we prefer a reinstall from scratch if possible.

wattli (Contributor) commented Jun 6, 2018

@markns, can I take a look at your application yaml (which should include the health check of the app)?

markns commented Jun 6, 2018

@wattli Here it is (with just a few application specifics removed):

apiVersion: v1
kind: Pod
metadata:
  annotations:
    sidecar.istio.io/status: '{"version":"2f8d1bac608518e837b2e4e3edccf0fc4dae57d7fe97c4ee87305668de753513","initContainers":["istio-init"],"containers":["istio-proxy"],"volumes":["istio-envoy","istio-certs"],"imagePullSecrets":null}'
  name: welcome-app-67487fcd85-xslss
  namespace: mystio
spec:
  containers:
  - image: eu.gcr.io/myapp...
    imagePullPolicy: IfNotPresent
    name: userapp
    ports:
    - containerPort: 50051
      name: grpc
      protocol: TCP
    readinessProbe:
      exec:
        command:
        - /bin/sh
        - -c
        - curl localhost:8001/healthz
      failureThreshold: 3
      initialDelaySeconds: 2
      periodSeconds: 2
      successThreshold: 1
      timeoutSeconds: 1
    securityContext:
      allowPrivilegeEscalation: false
      capabilities: {}
      privileged: false
      readOnlyRootFilesystem: true
      runAsNonRoot: true
      runAsUser: 1000
  - args:
    - proxy
    - sidecar
    - --configPath
    - /etc/istio/proxy
    - --binaryPath
    - /usr/local/bin/envoy
    - --serviceCluster
    - welcome-app
    - --drainDuration
    - 45s
    - --parentShutdownDuration
    - 1m0s
    - --discoveryAddress
    - istio-pilot.istio-system:15005
    - --discoveryRefreshDelay
    - 10s
    - --zipkinAddress
    - zipkin.istio-system:9411
    - --connectTimeout
    - 10s
    - --statsdUdpAddress
    - istio-statsd-prom-bridge.istio-system:9125
    - --proxyAdminPort
    - "15000"
    - --controlPlaneAuthPolicy
    - MUTUAL_TLS
    env:
    - name: POD_NAME
      valueFrom:
        fieldRef:
          apiVersion: v1
          fieldPath: metadata.name
    - name: POD_NAMESPACE
      valueFrom:
        fieldRef:
          apiVersion: v1
          fieldPath: metadata.namespace
    - name: INSTANCE_IP
      valueFrom:
        fieldRef:
          apiVersion: v1
          fieldPath: status.podIP
    - name: ISTIO_META_POD_NAME
      valueFrom:
        fieldRef:
          apiVersion: v1
          fieldPath: metadata.name
    - name: ISTIO_META_INTERCEPTION_MODE
      value: REDIRECT
    image: docker.io/istio/proxyv2:0.8.0
    imagePullPolicy: IfNotPresent
    lifecycle:
      preStop:
        exec:
          command:
          - bash
          - -c
          - sleep 2
    name: istio-proxy
    resources:
      limits:
        cpu: 100m
        memory: 128Mi
      requests:
        cpu: 100m
        memory: 128Mi
    securityContext:
      privileged: false
      readOnlyRootFilesystem: true
      runAsUser: 1337
    terminationMessagePath: /dev/termination-log
    terminationMessagePolicy: File
    volumeMounts:
    - mountPath: /etc/istio/proxy
      name: istio-envoy
    - mountPath: /etc/certs/
      name: istio-certs
      readOnly: true
  dnsPolicy: ClusterFirst
  initContainers:
  - args:
    - -p
    - "15001"
    - -u
    - "1337"
    - -m
    - REDIRECT
    - -i
    - 10.28.0.0/14,10.31.240.0/20
    - -x
    - ""
    - -b
    - 50051,
    - -d
    - ""
    image: docker.io/istio/proxy_init:0.8.0
    imagePullPolicy: IfNotPresent
    name: istio-init
    resources:
      limits:
        cpu: 100m
        memory: 128Mi
      requests:
        cpu: 100m
        memory: 128Mi
    securityContext:
      capabilities:
        add:
        - NET_ADMIN
      privileged: true
    terminationMessagePath: /dev/termination-log
    terminationMessagePolicy: File
  nodeName: gke-cluster-dev0-user-1-3739c992-pzvw
  restartPolicy: Always
  schedulerName: default-scheduler
  terminationGracePeriodSeconds: 30