Skip to content

Commit

Permalink
[FABG-916] Evict connection in TRANSIENT_FAILURE state
Browse files Browse the repository at this point in the history
When a peer is shut down, the connection to that peer goes into TRANSIENT_FAILURE state and we wait for it to be set to READY state. After the peer is started the connection will be in TRANSIENT_FAILURE state for several more seconds (even though the peer is up), potentially causing timeouts at the client.
With this patch, connections that are in TRANSIENT_FAIURE state are evicted from the cache. An error is immediately returned to the client so that it may retry connecting with a fresh connection.

Signed-off-by: Bob Stasyszyn <Bob.Stasyszyn@securekey.com>
  • Loading branch information
bstasyszyn committed Oct 22, 2019
1 parent 2f8e5d3 commit 6fa500f
Show file tree
Hide file tree
Showing 3 changed files with 30 additions and 1 deletion.
1 change: 1 addition & 0 deletions go.mod
Expand Up @@ -40,6 +40,7 @@ require (
github.com/stretchr/testify v1.3.0
golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2
golang.org/x/net v0.0.0-20190311183353-d8887717615a
google.golang.org/appengine v1.4.0 // indirect
google.golang.org/genproto v0.0.0-20190327125643-d831d65fe17d // indirect
google.golang.org/grpc v1.23.0
gopkg.in/check.v1 v1.0.0-20180628173108-788fd7840127 // indirect
Expand Down
9 changes: 8 additions & 1 deletion pkg/fab/comm/connector.go
Expand Up @@ -129,8 +129,9 @@ func (cc *CachingConnector) DialContext(ctx context.Context, target string, opts
if err := cc.openConn(ctx, c); err != nil {
cc.lock.Lock()
setClosed(c)
cc.removeConn(c)
cc.lock.Unlock()
return nil, errors.Errorf("dialing connection timed out [%s]", target)
return nil, errors.WithMessagef(err, "dialing connection on target [%s]", target)
}
return c.conn, nil
}
Expand Down Expand Up @@ -229,6 +230,12 @@ func waitConn(ctx context.Context, conn *grpc.ClientConn, targetState connectivi
if state == targetState {
break
}
if state == connectivity.TransientFailure {
// The server was probably restarted. It's better for the client to retry with a new connection rather
// than reusing a cached connection that's in TRANSIENT_FAILURE state since it takes much longer to
// recover while waiting for the state to change to READY - even if the server is up.
return errors.Errorf("connection is in %s", state)
}
if !conn.WaitForStateChange(ctx, state) {
return errors.Wrap(ctx.Err(), "waiting for connection failed")
}
Expand Down
21 changes: 21 additions & 0 deletions pkg/fab/comm/connector_test.go
Expand Up @@ -93,6 +93,27 @@ func TestDialAfterClose(t *testing.T) {
assert.Error(t, err, "expecting error when dialing after connector is closed")
}

func TestDialAfterRestart(t *testing.T) {
srvs, addrs, err := startEndorsers(1, endorserAddress)
require.NoError(t, err)
require.Len(t, addrs, 1)

addr := addrs[0]

connector := NewCachingConnector(normalSweepTime, normalIdleTime)

conn1, err := connector.DialContext(context.Background(), addr, grpc.WithInsecure())
require.NoError(t, err)
require.NotNil(t, conn1)
srvs[0].Stop()
time.Sleep(time.Second)

conn2, err := connector.DialContext(context.Background(), addr, grpc.WithInsecure())
require.Error(t, err)
require.Contains(t, err.Error(), connectivity.TransientFailure.String())
require.Nil(t, conn2)
}

func TestConnectorHappyFlushNumber1(t *testing.T) {
connector := NewCachingConnector(normalSweepTime, normalIdleTime)
defer connector.Close()
Expand Down

0 comments on commit 6fa500f

Please sign in to comment.