-
-
Notifications
You must be signed in to change notification settings - Fork 429
/
commoncrawl.go
122 lines (102 loc) · 2.96 KB
/
commoncrawl.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
package commoncrawl
import (
"bufio"
"bytes"
"context"
"fmt"
"github.com/json-iterator/go"
"github.com/lc/gau/v2/pkg/httpclient"
"github.com/lc/gau/v2/pkg/providers"
"github.com/sirupsen/logrus"
)
const (
Name = "commoncrawl"
)
// verify interface compliance
var _ providers.Provider = (*Client)(nil)

// Client is the structure that holds the Filters and the Client's configuration
type Client struct {
	filters providers.Filters // URL filters, appended as query parameters in formatURL
	config  *providers.Config // shared gau configuration (HTTP client, retries, verbosity)
	apiURL  string            // CommonCrawl index API endpoint, resolved once in New
}
// New creates a commoncrawl Client. It queries the CommonCrawl collection
// listing (collinfo.json) to discover the current index API endpoint and
// stores it on the returned Client. It returns an error if the request
// fails, the response cannot be decoded, or no collections are listed.
func New(c *providers.Config, filters providers.Filters) (*Client, error) {
	client := &Client{config: c, filters: filters}

	// Fetch the list of available CommonCrawl Api URLs.
	resp, err := httpclient.MakeRequest(c.Client, "http://index.commoncrawl.org/collinfo.json", int(c.MaxRetries))
	if err != nil {
		return nil, err
	}

	var r apiResult
	if err = jsoniter.Unmarshal(resp, &r); err != nil {
		return nil, err
	}

	// Guard against an empty collection list: indexing r[0] unconditionally
	// would panic with an index-out-of-range at startup.
	if len(r) == 0 {
		return nil, fmt.Errorf("no commoncrawl collections returned")
	}

	// The first entry is used as the index to query (presumably the newest
	// collection — collinfo.json ordering; verify against the API docs).
	client.apiURL = r[0].API
	return client, nil
}
// Name returns this provider's identifier, "commoncrawl".
func (c *Client) Name() string {
	return Name
}
// Fetch fetches all urls for a given domain and sends them to a channel.
// It returns an error should one occur. Cancelling ctx stops pagination
// early and is not reported as an error.
func (c *Client) Fetch(ctx context.Context, domain string, results chan string) error {
	p, err := c.getPagination(domain)
	if err != nil {
		return err
	}

	// 0 pages means no results
	if p.Pages == 0 {
		if c.config.Verbose {
			logrus.WithFields(logrus.Fields{"provider": Name}).Infof("no results for %s", domain)
		}
		return nil
	}

paginate:
	for page := uint(0); page < p.Pages; page++ {
		select {
		case <-ctx.Done():
			// Cancellation is treated as a clean stop, not a failure.
			break paginate
		default:
			if c.config.Verbose {
				logrus.WithFields(logrus.Fields{"provider": Name, "page": page}).Infof("fetching %s", domain)
			}

			apiURL := c.formatURL(domain, page)
			resp, err := httpclient.MakeRequest(c.config.Client, apiURL, int(c.config.MaxRetries))
			if err != nil {
				return fmt.Errorf("failed to fetch commoncrawl(%d): %w", page, err)
			}

			// The index returns one JSON object per line; decode each line
			// individually and forward the URL field.
			sc := bufio.NewScanner(bytes.NewReader(resp))
			for sc.Scan() {
				var res apiResponse
				if err := jsoniter.Unmarshal(sc.Bytes(), &res); err != nil {
					return fmt.Errorf("failed to decode commoncrawl result: %w", err)
				}
				if res.Error != "" {
					return fmt.Errorf("received an error from commoncrawl: %s", res.Error)
				}
				results <- res.URL
			}
			// A scanner can stop early (e.g. a line exceeding its buffer
			// limit); the Scan loop alone swallows that failure silently.
			if err := sc.Err(); err != nil {
				return fmt.Errorf("failed to read commoncrawl(%d) response: %w", page, err)
			}
		}
	}
	return nil
}
// formatURL builds the index query URL for one page of results for domain,
// appending any configured filter parameters.
func (c *Client) formatURL(domain string, page uint) string {
	target := domain
	if c.config.IncludeSubdomains {
		target = "*." + domain
	}
	base := fmt.Sprintf("%s?url=%s/*&output=json&fl=url&page=%d", c.apiURL, target, page)
	return base + c.filters.GetParameters(false)
}
// getPagination fetches the number of result pages available for domain
// (via the index's showNumPages=true parameter) so Fetch knows how many
// pages to iterate. On any failure it returns the zero paginationResult
// together with the error.
func (c *Client) getPagination(domain string) (paginationResult, error) {
	url := fmt.Sprintf("%s&showNumPages=true", c.formatURL(domain, 0))

	resp, err := httpclient.MakeRequest(c.config.Client, url, int(c.config.MaxRetries))
	if err != nil {
		return paginationResult{}, err
	}

	var r paginationResult
	if err = jsoniter.Unmarshal(resp, &r); err != nil {
		// Return the zero value, not a partially-decoded r: a result paired
		// with a non-nil error must not be used by callers.
		return paginationResult{}, err
	}
	return r, nil
}