Skip to content

Commit

Permalink
scraper cache (#14946)
Browse files Browse the repository at this point in the history
* scraper cache

* Fix test

* use unfurl type in cache key
  • Loading branch information
joshblum committed Dec 7, 2018
1 parent 8f43379 commit 36d23df
Show file tree
Hide file tree
Showing 10 changed files with 177 additions and 13 deletions.
72 changes: 72 additions & 0 deletions go/chat/unfurl/cache.go
@@ -0,0 +1,72 @@
package unfurl

import (
"sync"
"time"

lru "github.com/hashicorp/golang-lru"
"github.com/keybase/client/go/protocol/gregor1"
"github.com/keybase/clockwork"
)

const defaultCacheLifetime = 10 * time.Minute
const defaultCacheSize = 1000

type cacheItem struct {
data interface{}
err error
ctime gregor1.Time
}

type unfurlCache struct {
sync.Mutex
cache *lru.Cache
clock clockwork.Clock
}

func newUnfurlCache() *unfurlCache {
cache, err := lru.New(defaultCacheSize)
if err != nil {
panic(err)
}
return &unfurlCache{
cache: cache,
clock: clockwork.NewRealClock(),
}
}

func (c *unfurlCache) setClock(clock clockwork.Clock) {
c.clock = clock
}

// get determines if the item is in the cache and newer than 10
// minutes. We don't want to cache this value indefinitely in case the page
// content changes.
func (c *unfurlCache) get(key string) (res cacheItem, ok bool) {
c.Lock()
defer c.Unlock()

item, ok := c.cache.Get(key)
if !ok {
return res, false
}
cacheItem, ok := item.(cacheItem)
if !ok {
return res, false
}
valid := c.clock.Now().Sub(cacheItem.ctime.Time()) <= defaultCacheLifetime
if !valid {
c.cache.Remove(key)
}
return cacheItem, valid
}

func (c *unfurlCache) put(key string, data interface{}, err error) {
c.Lock()
defer c.Unlock()
c.cache.Add(key, cacheItem{
err: err,
data: data,
ctime: gregor1.ToTime(c.clock.Now()),
})
}
25 changes: 25 additions & 0 deletions go/chat/unfurl/packager.go
Expand Up @@ -23,6 +23,7 @@ import (
type Packager struct {
utils.DebugLabeler

cache *unfurlCache
ri func() chat1.RemoteInterface
store attachments.Store
s3signer s3.Signer
Expand All @@ -33,6 +34,7 @@ func NewPackager(l logger.Logger, store attachments.Store, s3signer s3.Signer,
ri func() chat1.RemoteInterface) *Packager {
return &Packager{
DebugLabeler: utils.NewDebugLabeler(l, "Packager", false),
cache: newUnfurlCache(),
store: store,
ri: ri,
s3signer: s3signer,
Expand Down Expand Up @@ -232,9 +234,32 @@ func (p *Packager) packageGiphy(ctx context.Context, uid gregor1.UID, convID cha
return chat1.NewUnfurlWithGiphy(g), nil
}

func (p *Packager) cacheKey(uid gregor1.UID, convID chat1.ConversationID, raw chat1.UnfurlRaw) string {
url := raw.GetUrl()
if url == "" {
return ""
}
typ, err := raw.UnfurlType()
if err != nil {
return ""
}
return fmt.Sprintf("%s-%s-%s-%s", uid, convID, url, typ)
}

func (p *Packager) Package(ctx context.Context, uid gregor1.UID, convID chat1.ConversationID,
raw chat1.UnfurlRaw) (res chat1.Unfurl, err error) {
defer p.Trace(ctx, func() error { return err }, "Package")()

cacheKey := p.cacheKey(uid, convID, raw)
if item, valid := p.cache.get(cacheKey); cacheKey != "" && valid {
return item.data.(chat1.Unfurl), item.err
}
defer func() {
if cacheKey != "" {
p.cache.put(cacheKey, res, err)
}
}()

typ, err := raw.UnfurlType()
if err != nil {
return res, err
Expand Down
15 changes: 15 additions & 0 deletions go/chat/unfurl/packager_test.go
Expand Up @@ -17,6 +17,7 @@ import (
"github.com/keybase/client/go/logger"
"github.com/keybase/client/go/protocol/chat1"
"github.com/keybase/client/go/protocol/gregor1"
"github.com/keybase/clockwork"
"github.com/stretchr/testify/require"
)

Expand Down Expand Up @@ -44,6 +45,8 @@ func TestPackager(t *testing.T) {
s3Signer := &ptsigner{}
ri := func() chat1.RemoteInterface { return paramsRemote{} }
packager := NewPackager(log, store, s3Signer, ri)
clock := clockwork.NewFakeClock()
packager.cache.setClock(clock)
srvFile := func(w io.Writer, name string) {
file, err := os.Open(name)
require.NoError(t, err)
Expand All @@ -67,6 +70,7 @@ func TestPackager(t *testing.T) {
imageURL := fmt.Sprintf("http://%s/?typ=image", addr)
faviconURL := fmt.Sprintf("http://%s/?typ=favicon", addr)
raw := chat1.NewUnfurlRawWithGeneric(chat1.UnfurlGenericRaw{
Url: "https://example.com",
ImageUrl: &imageURL,
FaviconUrl: &faviconURL,
})
Expand All @@ -84,6 +88,17 @@ func TestPackager(t *testing.T) {
require.NotZero(t, favicon.Metadata.Image().Height)
require.NotZero(t, favicon.Metadata.Image().Width)

// test caching
cacheKey := packager.cacheKey(uid, convID, raw)
cachedRes, valid := packager.cache.get(cacheKey)
require.True(t, valid)
require.NoError(t, cachedRes.err)
require.Equal(t, res, cachedRes.data.(chat1.Unfurl))

clock.Advance(defaultCacheLifetime * 2)
cachedRes, valid = packager.cache.get(cacheKey)
require.False(t, valid)

compareSol := func(name string, resDat []byte) {
dat, err := ioutil.ReadFile(filepath.Join("testcases", name))
require.NoError(t, err)
Expand Down
22 changes: 17 additions & 5 deletions go/chat/unfurl/scraper.go
Expand Up @@ -12,11 +12,13 @@ import (

type Scraper struct {
utils.DebugLabeler
cache *unfurlCache
}

func NewScraper(logger logger.Logger) *Scraper {
return &Scraper{
DebugLabeler: utils.NewDebugLabeler(logger, "Scraper", false),
cache: newUnfurlCache(),
}
}

Expand All @@ -33,16 +35,26 @@ func (s *Scraper) makeCollector() *colly.Collector {

func (s *Scraper) Scrape(ctx context.Context, uri string, forceTyp *chat1.UnfurlType) (res chat1.UnfurlRaw, err error) {
defer s.Trace(ctx, func() error { return err }, "Scrape")()
// Check if we have a cached valued
if item, valid := s.cache.get(uri); valid {
return item.data.(chat1.UnfurlRaw), item.err
}
defer func() {
s.cache.put(uri, res, err)
}()

domain, err := GetDomain(uri)
if err != nil {
return res, err
}

var unfurlTyp chat1.UnfurlType
var domain string
if forceTyp != nil {
unfurlTyp = *forceTyp
} else {
var err error
if unfurlTyp, domain, err = ClassifyDomainFromURI(uri); err != nil {
return res, err
}
unfurlTyp = ClassifyDomain(domain)
}

switch unfurlTyp {
case chat1.UnfurlType_GENERIC:
return s.scrapeGeneric(ctx, uri, domain)
Expand Down
20 changes: 19 additions & 1 deletion go/chat/unfurl/scraper_test.go
Expand Up @@ -13,6 +13,7 @@ import (

"github.com/keybase/client/go/logger"
"github.com/keybase/client/go/protocol/chat1"
"github.com/keybase/clockwork"
"github.com/stretchr/testify/require"
)

Expand Down Expand Up @@ -81,19 +82,25 @@ func createTestCaseHTTPSrv(t *testing.T) *dummyHTTPSrv {

func TestScraper(t *testing.T) {
scraper := NewScraper(logger.NewTestLogger(t))

clock := clockwork.NewFakeClock()
scraper.cache.setClock(clock)

srv := createTestCaseHTTPSrv(t)
addr := srv.Start()
defer srv.Stop()
forceGiphy := new(chat1.UnfurlType)
*forceGiphy = chat1.UnfurlType_GIPHY
testCase := func(name string, expected chat1.UnfurlRaw, forceTyp *chat1.UnfurlType) {
res, err := scraper.Scrape(context.TODO(), "http://"+addr+"/?name="+name, forceTyp)
uri := fmt.Sprintf("http://%s/?name=%s", addr, name)
res, err := scraper.Scrape(context.TODO(), uri, forceTyp)
require.NoError(t, err)
etyp, err := expected.UnfurlType()
require.NoError(t, err)
rtyp, err := res.UnfurlType()
require.NoError(t, err)
require.Equal(t, etyp, rtyp)

t.Logf("expected:\n%v\n\nactual:\n%v", expected, res)
switch rtyp {
case chat1.UnfurlType_GENERIC:
Expand Down Expand Up @@ -134,7 +141,18 @@ func TestScraper(t *testing.T) {
default:
require.Fail(t, "unknown unfurl typ")
}

// test caching
cachedRes, valid := scraper.cache.get(uri)
require.True(t, valid)
require.NoError(t, cachedRes.err)
require.Equal(t, res, cachedRes.data.(chat1.UnfurlRaw))

clock.Advance(defaultCacheLifetime * 2)
cachedRes, valid = scraper.cache.get(uri)
require.False(t, valid)
}

testCase("cnn0", chat1.NewUnfurlRawWithGeneric(chat1.UnfurlGenericRaw{
Title: "Kanye West seeks separation from politics",
Url: "https://www.cnn.com/2018/10/30/entertainment/kanye-west-politics/index.html",
Expand Down
6 changes: 6 additions & 0 deletions go/chat/unfurl/unfurler.go
Expand Up @@ -17,6 +17,7 @@ import (
"github.com/keybase/client/go/libkb"
"github.com/keybase/client/go/protocol/chat1"
"github.com/keybase/client/go/protocol/gregor1"
"github.com/keybase/clockwork"
)

type unfurlTask struct {
Expand Down Expand Up @@ -69,6 +70,11 @@ func NewUnfurler(g *globals.Context, store attachments.Store, s3signer s3.Signer
}
}

func (u *Unfurler) SetClock(clock clockwork.Clock) {
u.scraper.cache.setClock(clock)
u.packager.cache.setClock(clock)
}

func (u *Unfurler) SetTestingRetryCh(ch chan struct{}) {
u.retryCh = ch
}
Expand Down
7 changes: 0 additions & 7 deletions go/chat/unfurl/utils.go
Expand Up @@ -58,10 +58,3 @@ func ClassifyDomain(domain string) chat1.UnfurlType {
}
return chat1.UnfurlType_GENERIC
}

func ClassifyDomainFromURI(uri string) (typ chat1.UnfurlType, domain string, err error) {
if domain, err = GetDomain(uri); err != nil {
return typ, domain, err
}
return ClassifyDomain(domain), domain, nil
}
7 changes: 7 additions & 0 deletions go/chat/unfurl_test.go
Expand Up @@ -17,6 +17,7 @@ import (

"github.com/keybase/client/go/chat/attachments"
"github.com/keybase/client/go/libkb"
"github.com/keybase/clockwork"

"github.com/keybase/client/go/chat/unfurl"
"github.com/keybase/client/go/protocol/chat1"
Expand Down Expand Up @@ -120,6 +121,8 @@ func TestChatSrvUnfurl(t *testing.T) {
unfurlCh := make(chan *chat1.Unfurl, 5)
unfurler.SetTestingRetryCh(retryCh)
unfurler.SetTestingUnfurlCh(unfurlCh)
clock := clockwork.NewFakeClock()
unfurler.SetClock(clock)
tc.ChatG.Unfurler = unfurler
fetcher := NewRemoteAttachmentFetcher(tc.Context(), store)
tc.ChatG.AttachmentURLSrv = NewAttachmentHTTPSrv(tc.Context(), fetcher,
Expand Down Expand Up @@ -241,7 +244,11 @@ func TestChatSrvUnfurl(t *testing.T) {
default:
}
}
// now that we we can succeed, clear our cached value so we don't serve
// back the cached error
httpSrv.succeed = true
clock.Advance(2 * 10 * time.Minute) // unfurl.DefaultCacheTime

tc.Context().MessageDeliverer.ForceDeliverLoop(context.TODO())
recvSingleRetry()
u := recvUnfurl()
Expand Down
14 changes: 14 additions & 0 deletions go/protocol/chat1/extras.go
Expand Up @@ -1874,6 +1874,20 @@ func (idx *ConversationIndex) PercentIndexed(conv Conversation) int {
return 100 * (1 - (len(missingIDs) / numMessages))
}

func (u UnfurlRaw) GetUrl() string {
typ, err := u.UnfurlType()
if err != nil {
return ""
}
switch typ {
case UnfurlType_GENERIC:
return u.Generic().Url
case UnfurlType_GIPHY:
return u.Giphy().ImageUrl
}
return ""
}

func (u UnfurlRaw) UnsafeDebugString() string {
typ, err := u.UnfurlType()
if err != nil {
Expand Down
2 changes: 2 additions & 0 deletions protocol/avdl/chat1/unfurl.avdl
Expand Up @@ -3,6 +3,8 @@
protocol unfurl {
import idl "common.avdl";

// NOTE if adding a new type here, also add the case to the
// `UnfurlRaw.GetURL` type since a URL is used as a key for caching
enum UnfurlType {
GENERIC_0,
YOUTUBE_1,
Expand Down

0 comments on commit 36d23df

Please sign in to comment.