Skip to content

Commit

Permalink
give a long gregor reconnect backoff to devices that don't send chats
Browse files Browse the repository at this point in the history
This should mitigate some of the reconnect flood that the gregor servers
have to deal with when they restart. Most idle clients in the wild
aren't participating in chat, and don't need to reconnect very
aggressively.

There were a few different heuristics we could've used here, and others
we might want to use in the future. One in particular we almost chose
was "has this user ever received a message". However, we sometimes send
system-wide messages, like when Linux updates are broken, which could
confuse that heuristic. Chat sending is a surer sign of activity than
receiving, and it also has the benefit of being
individual-device-specific.

Two things had to change to make this work. First, we had to configure a
chat-activity-based backoff (using a couple new keys in LevelDB).
Second, we had to make sure that the backoff was respected on reconnect,
which required the new ForceInitialBackoff ConnectionOpts parameter
upstream, since we don't keep a persisitent Connection object after
disconnects.
  • Loading branch information
oconnor663 committed Dec 6, 2017
1 parent 9314f39 commit 668c717
Show file tree
Hide file tree
Showing 4 changed files with 141 additions and 9 deletions.
78 changes: 78 additions & 0 deletions go/chat/active.go
@@ -0,0 +1,78 @@
package chat

import (
"time"

"github.com/keybase/client/go/chat/globals"
"github.com/keybase/client/go/libkb"
)

// All devices are presumed active for the first 24 hours. See comment below.
var InitialAssumedActiveInterval = 24 * time.Hour
var ActiveIntervalAfterSend = 30 * 24 * time.Hour

func chatActiveDBKey(name string) libkb.DbKey {
return libkb.DbKey{
Typ: libkb.DBChatActive,
Key: name,
}
}

func firstQueryTimeDbKey() libkb.DbKey {
return chatActiveDBKey("first_query_time")
}

func lastSendTimeDbKey() libkb.DbKey {
return chatActiveDBKey("last_send_time")
}

// If no first query time is found in the local db, this function writes the
// current time.
func TouchFirstChatActiveQueryTime(g *globals.Context) time.Time {
now := time.Now()
var firstQueryUnixTime int64
found, err := g.LocalChatDb.GetInto(&firstQueryUnixTime, firstQueryTimeDbKey())
// Warn for errors and bail.
if err != nil {
g.Log.Debug("Failed to get chat active query time: %s", err)
return now
}
// If the first query time doesn't exist, store Now().
if !found {
g.Log.Debug("Chat active query time not found. Storing current time.")
err := g.LocalChatDb.PutObj(firstQueryTimeDbKey(), nil, now.Unix())
if err != nil {
g.Log.Debug("Failed to store chat active query time: %s", err)
}
return now
}
// Otherwise return what we found.
return time.Unix(firstQueryUnixTime, 0)
}

// Returns the zero time if there is no recorded last send time (either because
// the device has never sent a message, or because it hasn't sent one since we
// started recording).
func GetLastSendTime(g *globals.Context) time.Time {
var zeroTime time.Time
var lastSendUnixTime int64
found, err := g.LocalChatDb.GetInto(&lastSendUnixTime, lastSendTimeDbKey())
// Warn for errors and return zero.
if err != nil {
g.Log.Debug("Failed to get chat active last send time: %s", err)
return zeroTime
}
// If the last time doesn't exist, again return zero.
if !found {
return zeroTime
}
// Otherwise return what we found.
return time.Unix(lastSendUnixTime, 0)
}

func RecordChatSend(g *globals.Context) {
err := g.LocalChatDb.PutObj(lastSendTimeDbKey(), nil, time.Now().Unix())
if err != nil {
g.Log.Debug("Failed to store chat last send time: %s", err)
}
}
4 changes: 4 additions & 0 deletions go/chat/sender.go
Expand Up @@ -490,6 +490,10 @@ func (s *BlockingSender) Send(ctx context.Context, convID chat1.ConversationID,
msg chat1.MessagePlaintext, clientPrev chat1.MessageID, outboxID *chat1.OutboxID) (obid chat1.OutboxID, boxed *chat1.MessageBoxed, rl *chat1.RateLimit, err error) {
defer s.Trace(ctx, func() error { return err }, fmt.Sprintf("Send(%s)", convID))()

// Record that this user is "active in chat", which we use to determine
// gregor reconnect backoffs.
RecordChatSend(s.G())

ri := s.getRi()
if ri == nil {
return chat1.OutboxID{}, nil, nil, fmt.Errorf("Send(): no remote client found")
Expand Down
1 change: 1 addition & 0 deletions go/libkb/db.go
Expand Up @@ -182,6 +182,7 @@ const (
DBSigChainTailPublic = 0xe7
DBSigChainTailSemiprivate = 0xe8
DBSigChainTailEncrypted = 0xe9
DBChatActive = 0xea
DBMerkleRoot = 0xf0
DBTrackers = 0xf1
DBGregor = 0xf2
Expand Down
67 changes: 58 additions & 9 deletions go/service/gregor.go
Expand Up @@ -33,7 +33,8 @@ import (
)

const GregorRequestTimeout time.Duration = 30 * time.Second
const GregorConnectionRetryInterval time.Duration = 2 * time.Second
const GregorConnectionShortRetryInterval time.Duration = 2 * time.Second
const GregorConnectionLongRetryInterval time.Duration = 10 * time.Second
const GregorGetClientTimeout time.Duration = 4 * time.Second

type IdentifyUIHandler struct {
Expand Down Expand Up @@ -1410,6 +1411,50 @@ func (g *gregorHandler) pingLoop() {
}
}

// Our heuristic for figuring out whether your device is "active in chat" is
// whether you've sent a message in the last month. We're recording your last
// send as a simple timestamp, and we'll compare that to the current time.
// However, we want to avoid the situation where we ship this code for the
// first time, and suddenly *all* devices appear inactive, because no one has
// written the timestamp yet. So we add a second timestamp, which is the first
// time you ran any of this code. For 24 hours after the first time a device
// queries these keys, we treat all devices as active. We'll want to keep
// (something like) this code even after it's all stable in the wild, because
// it covers newly provisioned devices too.
func chatAwareReconnectInterval(ctx context.Context, g *globals.Context) time.Duration {
short := GregorConnectionShortRetryInterval
long := GregorConnectionLongRetryInterval
now := time.Now()
firstQueryTime := chat.TouchFirstChatActiveQueryTime(g)

// As a special case, always use a short backoff on mobile. Mobile devices
// aren't responsible for the thundering herd issues that this logic is
// trying to mitigate, and mobile users are much more likely to notice a
// connection delay.
if g.Env.GetAppType() == libkb.MobileAppType {
return short
}

// All devices are presumed active for the first 24 hours after they start
// checking this, and we give them a short backoff.
if now.Sub(firstQueryTime) < chat.InitialAssumedActiveInterval {
return short
}

// Otherwise, devices that haven't recorded a send in the last month are
// given a long backoff.
lastSendTime := chat.GetLastSendTime(g)
if now.Sub(lastSendTime) < chat.ActiveIntervalAfterSend {
return short
}
g.Log.CDebugf(ctx, "Device isn't active in chat. Using long reconnect backoff.")
return long
}

func chatAwareReconnectBackoff(ctx context.Context, g *globals.Context) backoff.BackOff {
return backoff.NewConstantBackOff(chatAwareReconnectInterval(ctx, g))
}

// connMutex must be locked before calling this
func (g *gregorHandler) connectTLS() error {
ctx := context.Background()
Expand All @@ -1428,11 +1473,13 @@ func (g *gregorHandler) connectTLS() error {
// Let people know we are trying to sync
g.G().NotifyRouter.HandleChatInboxSyncStarted(ctx, g.G().Env.GetUID())

constBackoff := backoff.NewConstantBackOff(GregorConnectionRetryInterval)
opts := rpc.ConnectionOpts{
TagsFunc: logger.LogTagsFromContextRPC,
WrapErrorFunc: libkb.MakeWrapError(g.G().ExternalG()),
ReconnectBackoff: func() backoff.BackOff { return constBackoff },
TagsFunc: logger.LogTagsFromContextRPC,
WrapErrorFunc: libkb.MakeWrapError(g.G().ExternalG()),
InitialReconnectBackoffWindow: func() time.Duration { return chatAwareReconnectInterval(ctx, g.G()) },
ReconnectBackoff: func() backoff.BackOff { return chatAwareReconnectBackoff(ctx, g.G()) },
// We deliberately avoid ForceInitialBackoff here, becuase we don't
// want to penalize mobile, which tears down its connection frequently.
}
g.conn = rpc.NewTLSConnection(rpc.NewFixedRemote(uri.HostPort),
[]byte(rawCA), libkb.NewContextifiedErrorUnwrapper(g.G().ExternalG()), g, libkb.NewRPCLogFactory(g.G().ExternalG()), g.G().Log, opts)
Expand Down Expand Up @@ -1465,11 +1512,13 @@ func (g *gregorHandler) connectNoTLS() error {
t := newConnTransport(g.G().ExternalG(), uri.HostPort)
g.transportForTesting = t

constBackoff := backoff.NewConstantBackOff(GregorConnectionRetryInterval)
opts := rpc.ConnectionOpts{
TagsFunc: logger.LogTagsFromContextRPC,
WrapErrorFunc: libkb.MakeWrapError(g.G().ExternalG()),
ReconnectBackoff: func() backoff.BackOff { return constBackoff },
TagsFunc: logger.LogTagsFromContextRPC,
WrapErrorFunc: libkb.MakeWrapError(g.G().ExternalG()),
InitialReconnectBackoffWindow: func() time.Duration { return chatAwareReconnectInterval(ctx, g.G()) },
ReconnectBackoff: func() backoff.BackOff { return chatAwareReconnectBackoff(ctx, g.G()) },
// We deliberately avoid ForceInitialBackoff here, becuase we don't
// want to penalize mobile, which tears down its connection frequently.
}
g.conn = rpc.NewConnectionWithTransport(g, t, libkb.NewContextifiedErrorUnwrapper(g.G().ExternalG()), g.G().Log, opts)

Expand Down

0 comments on commit 668c717

Please sign in to comment.