Skip to content

Commit

Permalink
unfurling scraper part 1 CORE-9243 (#14506)
Browse files Browse the repository at this point in the history
  • Loading branch information
mmaxim committed Nov 1, 2018
1 parent d800482 commit 0115da7
Show file tree
Hide file tree
Showing 218 changed files with 245,724 additions and 0 deletions.
66 changes: 66 additions & 0 deletions go/chat/unfurl/scrape_generic.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
package unfurl

import (
"context"
"strings"
"time"

"github.com/gocolly/colly"
"github.com/keybase/client/go/protocol/chat1"
)

// fullURL turns a URL reference found in a page into an absolute URL.
// Scheme-relative references ("//host/path") get an "http:" scheme,
// root-relative references ("/path") are resolved against hostname over
// http, and anything else is returned unchanged.
func fullURL(hostname, path string) string {
	switch {
	case strings.HasPrefix(path, "//"):
		return "http:" + path
	case strings.HasPrefix(path, "/"):
		return "http://" + hostname + path
	default:
		return path
	}
}

// scrapeGeneric visits uri and assembles a generic unfurl from the page's
// <head>: the property/content <meta> tags og:description, og:image,
// og:site_name and og:pubdate, the <title> text, and any <link> whose rel
// contains "shortcut icon". domain is used as the site name unless the page
// supplies og:site_name. Image and favicon references are passed through
// fullURL with the hostname of uri. An error is returned when the hostname
// cannot be derived from uri or when the visit fails.
func (s *Scraper) scrapeGeneric(ctx context.Context, uri, domain string) (res chat1.UnfurlRaw, err error) {
	hostname, err := GetHostname(uri)
	if err != nil {
		return res, err
	}
	generic := chat1.UnfurlGenericRaw{
		Url:      uri,
		SiteName: domain,
	}

	collector := colly.NewCollector()
	collector.OnHTML("head meta[content][property]", func(e *colly.HTMLElement) {
		content := e.Attr("content")
		switch e.Attr("property") {
		case "og:description":
			generic.Description = &content
		case "og:image":
			imageURL := fullURL(hostname, content)
			generic.ImageUrl = &imageURL
		case "og:site_name":
			generic.SiteName = content
		case "og:pubdate":
			s.Debug(ctx, "pubdate: %s", content)
			published, parseErr := time.Parse("2006-01-02T15:04:05Z", content)
			if parseErr != nil {
				// A malformed publish date is logged and otherwise ignored.
				s.Debug(ctx, "scrapeGeneric: failed to parse pubdate: %s", parseErr)
				return
			}
			unixSecs := int(published.Unix())
			generic.PublishTime = &unixSecs
		}
	})
	collector.OnHTML("head title", func(e *colly.HTMLElement) {
		generic.Title = e.Text
	})
	collector.OnHTML("head link[rel][href]", func(e *colly.HTMLElement) {
		if !strings.Contains(strings.ToLower(e.Attr("rel")), "shortcut icon") {
			return
		}
		faviconURL := fullURL(hostname, e.Attr("href"))
		generic.FaviconUrl = &faviconURL
	})

	if err := collector.Visit(uri); err != nil {
		return res, err
	}
	return chat1.NewUnfurlRawWithGeneric(generic), nil
}
34 changes: 34 additions & 0 deletions go/chat/unfurl/scraper.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
package unfurl

import (
"context"

"github.com/keybase/client/go/chat/utils"
"github.com/keybase/client/go/logger"

"github.com/keybase/client/go/protocol/chat1"
)

// Scraper fetches a URL and extracts unfurl metadata from the page. It embeds
// utils.DebugLabeler for labeled debug logging.
type Scraper struct {
utils.DebugLabeler
}

// NewScraper returns a Scraper whose debug output goes to logger under the
// "Scraper" label.
func NewScraper(logger logger.Logger) *Scraper {
	s := new(Scraper)
	s.DebugLabeler = utils.NewDebugLabeler(logger, "Scraper", false)
	return s
}

// Scrape classifies uri by domain and runs the matching scraper, returning
// the raw unfurl it produced. Every unfurl type, including unrecognized ones,
// is currently handled by the generic scraper. The call is traced together
// with its final error.
func (s *Scraper) Scrape(ctx context.Context, uri string) (res chat1.UnfurlRaw, err error) {
	defer s.Trace(ctx, func() error { return err }, "Scrape(%s)", uri)()

	typ, domain, err := ClassifyDomainFromURI(uri)
	if err != nil {
		return res, err
	}

	switch typ {
	case chat1.UnfurlType_GENERIC:
		res, err = s.scrapeGeneric(ctx, uri, domain)
	default:
		res, err = s.scrapeGeneric(ctx, uri, domain)
	}
	return res, err
}
32 changes: 32 additions & 0 deletions go/chat/unfurl/scraper/main.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
package main

import (
"context"
"flag"
"fmt"
"os"

"github.com/keybase/client/go/chat/unfurl"
"github.com/keybase/client/go/logger"
logging "github.com/keybase/go-logging"
)

// main scrapes the single URL given on the command line and prints the
// resulting unfurl. It exits with status 3 when the argument count is wrong
// or when the scrape fails.
func main() {
	flag.Parse()
	if flag.NArg() != 1 {
		fmt.Printf("must supply a URL\n")
		os.Exit(3)
	}
	target := flag.Arg(0)

	log := logger.New("scraper")
	logging.Reset()

	res, err := unfurl.NewScraper(log).Scrape(context.TODO(), target)
	if err != nil {
		fmt.Printf("error scraping URL: %s\n", err)
		os.Exit(3)
	}
	fmt.Printf("%s\n", res)
}
124 changes: 124 additions & 0 deletions go/chat/unfurl/scraper_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
package unfurl

import (
"bytes"
"context"
"fmt"
"io"
"io/ioutil"
"net"
"net/http"
"path/filepath"
"testing"

"github.com/keybase/client/go/logger"
"github.com/keybase/client/go/protocol/chat1"
"github.com/stretchr/testify/require"
)

// dummyHTTPSrv is a local HTTP server used by the tests in this file to serve
// HTML fixtures read from the testcases directory.
type dummyHTTPSrv struct {
// t is the test that owns the server; failures are reported through it.
t *testing.T
// srv is the underlying HTTP server; it is assigned by Start.
srv *http.Server
}

// newDummyHTTPSrv returns an unstarted fixture server bound to test t.
func newDummyHTTPSrv(t *testing.T) *dummyHTTPSrv {
	d := new(dummyHTTPSrv)
	d.t = t
	return d
}

// Start opens a listener on an ephemeral port of 127.0.0.1, serves requests
// with d.handle in a background goroutine, and returns the "host:port"
// address the server can be reached at. A listen failure fails the test.
func (d *dummyHTTPSrv) Start() string {
	localhost := "127.0.0.1"
	listener, err := net.Listen("tcp", fmt.Sprintf("%s:0", localhost))
	require.NoError(d.t, err)
	port := listener.Addr().(*net.TCPAddr).Port

	// Route through a private mux instead of http.HandleFunc: registering "/"
	// on the process-wide DefaultServeMux panics the second time a server is
	// started in the same test binary.
	mux := http.NewServeMux()
	mux.HandleFunc("/", d.handle)
	d.srv = &http.Server{
		Addr:    fmt.Sprintf("%s:%d", localhost, port),
		Handler: mux,
	}
	go d.srv.Serve(listener)
	return d.srv.Addr
}

// Stop closes the underlying HTTP server and fails the test if closing
// returns an error.
func (d *dummyHTTPSrv) Stop() {
	err := d.srv.Close()
	require.NoError(d.t, err)
}

// handle serves the fixture testcases/<name>.html, where <name> is the
// request's "name" query parameter. A fixture that cannot be read is reported
// on the test and answered with 404 so the client sees the failure instead of
// an empty 200 response.
func (d *dummyHTTPSrv) handle(w http.ResponseWriter, r *http.Request) {
	name := r.URL.Query().Get("name")
	dat, err := ioutil.ReadFile(filepath.Join("testcases", name+".html"))
	if err != nil {
		// This runs on the server's goroutine, so use Errorf rather than
		// require: FailNow may only be called from the test goroutine.
		d.t.Errorf("dummyHTTPSrv: reading test case %q: %v", name, err)
		http.Error(w, err.Error(), http.StatusNotFound)
		return
	}
	// Commit to a success status only once the fixture is in hand.
	w.WriteHeader(http.StatusOK)
	if _, err := io.Copy(w, bytes.NewBuffer(dat)); err != nil {
		d.t.Errorf("dummyHTTPSrv: writing test case %q: %v", name, err)
	}
}

// strPtr returns a pointer to a fresh copy of s, for filling optional string
// fields in test expectations.
func strPtr(s string) *string {
	p := new(string)
	*p = s
	return p
}

// TestScraper serves saved HTML fixtures from a local HTTP server, scrapes
// each one, and compares the title, site name, description, image URL and
// favicon URL of the result against the expected generic unfurl.
func TestScraper(t *testing.T) {
	scraper := NewScraper(logger.NewTestLogger(t))
	srv := newDummyHTTPSrv(t)
	addr := srv.Start()
	defer srv.Stop()

	// requireSameOptional asserts that two optional strings are either both
	// absent or both present with equal contents.
	requireSameOptional := func(expected, actual *string) {
		require.True(t, (expected == nil && actual == nil) || (expected != nil && actual != nil))
		require.True(t, expected == nil || *expected == *actual)
	}

	fixtures := []struct {
		name     string
		expected chat1.UnfurlRaw
	}{
		{"wsj0", chat1.NewUnfurlRawWithGeneric(chat1.UnfurlGenericRaw{
			Title:       "U.S. Stocks Jump as Tough Month Sets to Wrap - WSJ",
			SiteName:    "WSJ",
			Description: strPtr("A surge in technology shares following Facebook’s latest earnings lifted U.S. stocks, helping major indexes trim some of their October declines following a punishing period for global investors."),
			ImageUrl:    strPtr("https://images.wsj.net/im-33925/social"),
			FaviconUrl:  strPtr("https://s.wsj.net/media/wsj_favicon-32x32.png"),
		})},
		{"nytimes0", chat1.NewUnfurlRawWithGeneric(chat1.UnfurlGenericRaw{
			Title:       "First Up if Democrats Win: Campaign and Ethics Changes, Infrastructure and Drug Prices - The New York Times",
			SiteName:    "0.1", // the default for these tests
			Description: strPtr("House Democratic leaders, for the first time, laid out an ambitious opening salvo of bills for a majority, including an overhaul of campaign and ethics laws."),
			ImageUrl:    strPtr("https://static01.nyt.com/images/2018/10/31/us/politics/31dc-dems/31dc-dems-facebookJumbo.jpg"),
			FaviconUrl:  strPtr("http://127.0.0.1/vi-assets/static-assets/favicon-4bf96cb6a1093748bf5b3c429accb9b4.ico"),
		})},
		{"github0", chat1.NewUnfurlRawWithGeneric(chat1.UnfurlGenericRaw{
			Title:       "GitHub - keybase/client: Keybase Go Library, Client, Service, OS X, iOS, Android, Electron",
			SiteName:    "GitHub",
			Description: strPtr("Keybase Go Library, Client, Service, OS X, iOS, Android, Electron - keybase/client"),
			ImageUrl:    strPtr("https://avatars1.githubusercontent.com/u/5400834?s=400&v=4"),
		})},
		{"youtube0", chat1.NewUnfurlRawWithGeneric(chat1.UnfurlGenericRaw{
			Title:       "Mario Kart Wii: The History of the Ultra Shortcut - YouTube",
			SiteName:    "YouTube",
			Description: strPtr("https://www.twitch.tv/summoningsalt https://twitter.com/summoningsalt Music List- https://docs.google.com/document/d/1p2qV31ZhtNuP7AAXtRjGNZr2QwMSolzuz2wX6wu..."),
			ImageUrl:    strPtr("https://i.ytimg.com/vi/mmJ_LT8bUj0/hqdefault.jpg"),
			FaviconUrl:  strPtr("https://s.ytimg.com/yts/img/favicon-vfl8qSV2F.ico"),
		})},
		{"twitter0", chat1.NewUnfurlRawWithGeneric(chat1.UnfurlGenericRaw{
			Title:       "Ars Technica on Twitter: \"Nintendo recommits to “keep the business going” for 3DS https://t.co/wTIJxmGTJH by @KyleOrl\"",
			SiteName:    "Twitter",
			Description: strPtr("“Nintendo recommits to “keep the business going” for 3DS https://t.co/wTIJxmGTJH by @KyleOrl”"),
			ImageUrl:    strPtr("https://pbs.twimg.com/profile_images/2215576731/ars-logo_400x400.png"),
			FaviconUrl:  strPtr("http://abs.twimg.com/favicons/favicon.ico"),
		})},
	}

	for _, fx := range fixtures {
		expected := fx.expected
		res, err := scraper.Scrape(context.TODO(), "http://"+addr+"/?name="+fx.name)
		require.NoError(t, err)
		etyp, err := expected.UnfurlType()
		require.NoError(t, err)
		rtyp, err := res.UnfurlType()
		require.NoError(t, err)
		require.Equal(t, etyp, rtyp)
		t.Logf("expected: %v res: %v", expected, res)

		switch rtyp {
		case chat1.UnfurlType_GENERIC:
			want := expected.Generic()
			got := res.Generic()
			require.Equal(t, want.Title, got.Title)
			require.Equal(t, want.SiteName, got.SiteName)
			requireSameOptional(want.Description, got.Description)
			requireSameOptional(want.ImageUrl, got.ImageUrl)
			requireSameOptional(want.FaviconUrl, got.FaviconUrl)
		default:
			require.Fail(t, "unknown unfurl typ")
		}
	}
}
34 changes: 34 additions & 0 deletions go/chat/unfurl/testcases/cnn0.html

Large diffs are not rendered by default.

Loading

0 comments on commit 0115da7

Please sign in to comment.