-
Notifications
You must be signed in to change notification settings - Fork 1.2k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
unfurling scraper part 1 CORE-9243 (#14506)
- Loading branch information
Showing
218 changed files
with
245,724 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
package unfurl | ||
|
||
import ( | ||
"context" | ||
"strings" | ||
"time" | ||
|
||
"github.com/gocolly/colly" | ||
"github.com/keybase/client/go/protocol/chat1" | ||
) | ||
|
||
// fullURL turns a URL reference taken from a scraped page into an absolute
// URL. A protocol-relative reference ("//host/x") is given the "http:"
// scheme, a root-relative reference ("/x") is prefixed with
// "http://" + hostname, and anything else is returned unchanged.
func fullURL(hostname, path string) string {
	switch {
	case strings.HasPrefix(path, "//"):
		return "http:" + path
	case strings.HasPrefix(path, "/"):
		return "http://" + hostname + path
	default:
		return path
	}
}
|
||
func (s *Scraper) scrapeGeneric(ctx context.Context, uri, domain string) (res chat1.UnfurlRaw, err error) { | ||
var generic chat1.UnfurlGenericRaw | ||
hostname, err := GetHostname(uri) | ||
if err != nil { | ||
return res, err | ||
} | ||
generic.Url = uri | ||
generic.SiteName = domain | ||
c := colly.NewCollector() | ||
c.OnHTML("head meta[content][property]", func(e *colly.HTMLElement) { | ||
prop := e.Attr("property") | ||
content := e.Attr("content") | ||
switch prop { | ||
case "og:description": | ||
generic.Description = &content | ||
case "og:image": | ||
generic.ImageUrl = new(string) | ||
*generic.ImageUrl = fullURL(hostname, content) | ||
case "og:site_name": | ||
generic.SiteName = content | ||
case "og:pubdate": | ||
s.Debug(ctx, "pubdate: %s", content) | ||
t, err := time.Parse("2006-01-02T15:04:05Z", content) | ||
if err == nil { | ||
generic.PublishTime = new(int) | ||
*generic.PublishTime = int(t.Unix()) | ||
} else { | ||
s.Debug(ctx, "scrapeGeneric: failed to parse pubdate: %s", err) | ||
} | ||
} | ||
}) | ||
c.OnHTML("head title", func(e *colly.HTMLElement) { | ||
generic.Title = e.Text | ||
}) | ||
c.OnHTML("head link[rel][href]", func(e *colly.HTMLElement) { | ||
rel := strings.ToLower(e.Attr("rel")) | ||
if strings.Contains(rel, "shortcut icon") { | ||
generic.FaviconUrl = new(string) | ||
*generic.FaviconUrl = fullURL(hostname, e.Attr("href")) | ||
} | ||
}) | ||
if err := c.Visit(uri); err != nil { | ||
return res, err | ||
} | ||
return chat1.NewUnfurlRawWithGeneric(generic), nil | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,34 @@ | ||
package unfurl | ||
|
||
import ( | ||
"context" | ||
|
||
"github.com/keybase/client/go/chat/utils" | ||
"github.com/keybase/client/go/logger" | ||
|
||
"github.com/keybase/client/go/protocol/chat1" | ||
) | ||
|
||
type Scraper struct { | ||
utils.DebugLabeler | ||
} | ||
|
||
func NewScraper(logger logger.Logger) *Scraper { | ||
return &Scraper{ | ||
DebugLabeler: utils.NewDebugLabeler(logger, "Scraper", false), | ||
} | ||
} | ||
|
||
func (s *Scraper) Scrape(ctx context.Context, uri string) (res chat1.UnfurlRaw, err error) { | ||
defer s.Trace(ctx, func() error { return err }, "Scrape(%s)", uri)() | ||
typ, domain, err := ClassifyDomainFromURI(uri) | ||
if err != nil { | ||
return res, err | ||
} | ||
switch typ { | ||
case chat1.UnfurlType_GENERIC: | ||
return s.scrapeGeneric(ctx, uri, domain) | ||
default: | ||
return s.scrapeGeneric(ctx, uri, domain) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
package main | ||
|
||
import ( | ||
"context" | ||
"flag" | ||
"fmt" | ||
"os" | ||
|
||
"github.com/keybase/client/go/chat/unfurl" | ||
"github.com/keybase/client/go/logger" | ||
logging "github.com/keybase/go-logging" | ||
) | ||
|
||
func main() { | ||
flag.Parse() | ||
args := flag.Args() | ||
if len(args) != 1 { | ||
fmt.Printf("must supply a URL\n") | ||
os.Exit(3) | ||
} | ||
|
||
logger := logger.New("scraper") | ||
logging.Reset() | ||
url := args[0] | ||
scraper := unfurl.NewScraper(logger) | ||
res, err := scraper.Scrape(context.TODO(), url) | ||
if err != nil { | ||
fmt.Printf("error scraping URL: %s\n", err) | ||
os.Exit(3) | ||
} | ||
fmt.Printf("%s\n", res) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,124 @@ | ||
package unfurl | ||
|
||
import ( | ||
"bytes" | ||
"context" | ||
"fmt" | ||
"io" | ||
"io/ioutil" | ||
"net" | ||
"net/http" | ||
"path/filepath" | ||
"testing" | ||
|
||
"github.com/keybase/client/go/logger" | ||
"github.com/keybase/client/go/protocol/chat1" | ||
"github.com/stretchr/testify/require" | ||
) | ||
|
||
// dummyHTTPSrv is a local HTTP server used by the scraper tests to serve
// HTML fixtures.
type dummyHTTPSrv struct {
	// t is the test that owns the server; failures are reported through it.
	t *testing.T
	// srv is the underlying server; it is set by Start and closed by Stop.
	srv *http.Server
}
|
||
func newDummyHTTPSrv(t *testing.T) *dummyHTTPSrv { | ||
return &dummyHTTPSrv{ | ||
t: t, | ||
} | ||
} | ||
|
||
func (d *dummyHTTPSrv) Start() string { | ||
localhost := "127.0.0.1" | ||
listener, err := net.Listen("tcp", fmt.Sprintf("%s:0", localhost)) | ||
require.NoError(d.t, err) | ||
port := listener.Addr().(*net.TCPAddr).Port | ||
d.srv = &http.Server{ | ||
Addr: fmt.Sprintf("%s:%d", localhost, port), | ||
} | ||
http.HandleFunc("/", d.handle) | ||
go d.srv.Serve(listener) | ||
return d.srv.Addr | ||
} | ||
|
||
func (d *dummyHTTPSrv) Stop() { | ||
require.NoError(d.t, d.srv.Close()) | ||
} | ||
|
||
func (d *dummyHTTPSrv) handle(w http.ResponseWriter, r *http.Request) { | ||
w.WriteHeader(200) | ||
name := r.URL.Query().Get("name") | ||
dat, err := ioutil.ReadFile(filepath.Join("testcases", name+".html")) | ||
require.NoError(d.t, err) | ||
_, err = io.Copy(w, bytes.NewBuffer(dat)) | ||
require.NoError(d.t, err) | ||
} | ||
|
||
// strPtr returns a pointer to a fresh copy of s; it exists so optional
// string fields can be populated inline in composite literals.
func strPtr(s string) *string {
	p := new(string)
	*p = s
	return p
}
|
||
func TestScraper(t *testing.T) { | ||
scraper := NewScraper(logger.NewTestLogger(t)) | ||
srv := newDummyHTTPSrv(t) | ||
addr := srv.Start() | ||
defer srv.Stop() | ||
testCase := func(name string, expected chat1.UnfurlRaw) { | ||
res, err := scraper.Scrape(context.TODO(), "http://"+addr+"/?name="+name) | ||
require.NoError(t, err) | ||
etyp, err := expected.UnfurlType() | ||
require.NoError(t, err) | ||
rtyp, err := res.UnfurlType() | ||
require.NoError(t, err) | ||
require.Equal(t, etyp, rtyp) | ||
t.Logf("expected: %v res: %v", expected, res) | ||
switch rtyp { | ||
case chat1.UnfurlType_GENERIC: | ||
e := expected.Generic() | ||
r := res.Generic() | ||
require.Equal(t, e.Title, r.Title) | ||
require.Equal(t, e.SiteName, r.SiteName) | ||
require.True(t, (e.Description == nil && r.Description == nil) || (e.Description != nil && r.Description != nil)) | ||
require.True(t, e.Description == nil || *e.Description == *r.Description) | ||
require.True(t, (e.ImageUrl == nil && r.ImageUrl == nil) || (e.ImageUrl != nil && r.ImageUrl != nil)) | ||
require.True(t, e.ImageUrl == nil || *e.ImageUrl == *r.ImageUrl) | ||
require.True(t, (e.FaviconUrl == nil && r.FaviconUrl == nil) || (e.FaviconUrl != nil && r.FaviconUrl != nil)) | ||
require.True(t, e.FaviconUrl == nil || *e.FaviconUrl == *r.FaviconUrl) | ||
default: | ||
require.Fail(t, "unknown unfurl typ") | ||
} | ||
} | ||
testCase("wsj0", chat1.NewUnfurlRawWithGeneric(chat1.UnfurlGenericRaw{ | ||
Title: "U.S. Stocks Jump as Tough Month Sets to Wrap - WSJ", | ||
SiteName: "WSJ", | ||
Description: strPtr("A surge in technology shares following Facebook’s latest earnings lifted U.S. stocks, helping major indexes trim some of their October declines following a punishing period for global investors."), | ||
ImageUrl: strPtr("https://images.wsj.net/im-33925/social"), | ||
FaviconUrl: strPtr("https://s.wsj.net/media/wsj_favicon-32x32.png"), | ||
})) | ||
testCase("nytimes0", chat1.NewUnfurlRawWithGeneric(chat1.UnfurlGenericRaw{ | ||
Title: "First Up if Democrats Win: Campaign and Ethics Changes, Infrastructure and Drug Prices - The New York Times", | ||
SiteName: "0.1", // the default for these tests | ||
Description: strPtr("House Democratic leaders, for the first time, laid out an ambitious opening salvo of bills for a majority, including an overhaul of campaign and ethics laws."), | ||
ImageUrl: strPtr("https://static01.nyt.com/images/2018/10/31/us/politics/31dc-dems/31dc-dems-facebookJumbo.jpg"), | ||
FaviconUrl: strPtr("http://127.0.0.1/vi-assets/static-assets/favicon-4bf96cb6a1093748bf5b3c429accb9b4.ico"), | ||
})) | ||
testCase("github0", chat1.NewUnfurlRawWithGeneric(chat1.UnfurlGenericRaw{ | ||
Title: "GitHub - keybase/client: Keybase Go Library, Client, Service, OS X, iOS, Android, Electron", | ||
SiteName: "GitHub", | ||
Description: strPtr("Keybase Go Library, Client, Service, OS X, iOS, Android, Electron - keybase/client"), | ||
ImageUrl: strPtr("https://avatars1.githubusercontent.com/u/5400834?s=400&v=4"), | ||
})) | ||
testCase("youtube0", chat1.NewUnfurlRawWithGeneric(chat1.UnfurlGenericRaw{ | ||
Title: "Mario Kart Wii: The History of the Ultra Shortcut - YouTube", | ||
SiteName: "YouTube", | ||
Description: strPtr("https://www.twitch.tv/summoningsalt https://twitter.com/summoningsalt Music List- https://docs.google.com/document/d/1p2qV31ZhtNuP7AAXtRjGNZr2QwMSolzuz2wX6wu..."), | ||
ImageUrl: strPtr("https://i.ytimg.com/vi/mmJ_LT8bUj0/hqdefault.jpg"), | ||
FaviconUrl: strPtr("https://s.ytimg.com/yts/img/favicon-vfl8qSV2F.ico"), | ||
})) | ||
testCase("twitter0", chat1.NewUnfurlRawWithGeneric(chat1.UnfurlGenericRaw{ | ||
Title: "Ars Technica on Twitter: \"Nintendo recommits to “keep the business going” for 3DS https://t.co/wTIJxmGTJH by @KyleOrl\"", | ||
SiteName: "Twitter", | ||
Description: strPtr("“Nintendo recommits to “keep the business going” for 3DS https://t.co/wTIJxmGTJH by @KyleOrl”"), | ||
ImageUrl: strPtr("https://pbs.twimg.com/profile_images/2215576731/ars-logo_400x400.png"), | ||
FaviconUrl: strPtr("http://abs.twimg.com/favicons/favicon.ico"), | ||
})) | ||
} |
Large diffs are not rendered by default.
Oops, something went wrong.
Oops, something went wrong.