/
collector.go
93 lines (82 loc) · 2.4 KB
/
collector.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
package crawler
import (
	"context"
	"fmt"
	"net/http"
	"net/http/cookiejar"
	"net/url"
	"strings"

	"github.com/gocolly/colly/v2"
)
// Collector crawls the page at url and hands every stylesheet, script and
// image it references to Extractor, together with projectPath.
//
// The page is fetched by an asynchronous colly collector configured through
// setUpCollector with ctx, cookieJar, proxyString and userAgent. Right before
// the request for url itself is sent, HTMLExtractor is called for it.
//
// Collector blocks until the collector has finished. It returns the error
// from scheduling the visit, or the error reported while fetching the page;
// nil otherwise.
// TODO improve for better performance
func Collector(ctx context.Context, url string, projectPath string, cookieJar *cookiejar.Jar, proxyString string, userAgent string) error {
	// create a new collector
	c := colly.NewCollector(colly.Async(true))
	setUpCollector(c, ctx, cookieJar, proxyString, userAgent)

	// extract announces the asset that was found and passes its absolute URL
	// (resolved against the page being parsed) to Extractor.
	extract := func(e *colly.HTMLElement, label, link string) {
		fmt.Println(label, "-->", link)
		Extractor(e.Request.AbsoluteURL(link), projectPath)
	}

	// search for all link tags that have a rel attribute that is equal to stylesheet - CSS
	c.OnHTML("link[rel='stylesheet']", func(e *colly.HTMLElement) {
		extract(e, "Css found", e.Attr("href"))
	})

	// search for all script tags with src attribute -- JS
	c.OnHTML("script[src]", func(e *colly.HTMLElement) {
		extract(e, "Js found", e.Attr("src"))
	})

	// search for all img tags with src attribute -- Images
	c.OnHTML("img[src]", func(e *colly.HTMLElement) {
		link := e.Attr("src")
		// Inline and in-memory images have nothing to download.
		if strings.HasPrefix(link, "data:image") || strings.HasPrefix(link, "blob:") {
			return
		}
		extract(e, "Img found", link)
	})

	// Before making a request
	c.OnRequest(func(r *colly.Request) {
		link := r.URL.String()
		if url == link {
			HTMLExtractor(link, projectPath)
		}
	})

	// In async mode the fetch runs after Visit has returned, so a failed
	// request is only reported through OnError. Only the single request for
	// url is scheduled here, and Wait orders the write below before the read
	// after it, so a plain variable is sufficient.
	var fetchErr error
	c.OnError(func(_ *colly.Response, err error) {
		if fetchErr == nil {
			fetchErr = err
		}
	})

	// Visit the url and wait for stuff to load :)
	if err := c.Visit(url); err != nil {
		return err
	}
	c.Wait()

	if fetchErr != nil {
		return fmt.Errorf("fetching %q: %w", url, fetchErr)
	}
	return nil
}
// cancelableTransport is an http.RoundTripper that ties every request it
// forwards to a fixed context, so that cancelling the context aborts
// in-flight requests and rejects new ones.
type cancelableTransport struct {
	// ctx is attached to every forwarded request and checked before sending.
	ctx context.Context
	// transport performs the actual round trip.
	transport http.RoundTripper
}

// Compile-time check that cancelableTransport satisfies http.RoundTripper.
var _ http.RoundTripper = cancelableTransport{}

// RoundTrip sends req through the wrapped transport using t.ctx as the
// request context. If t.ctx is already done, the request is not sent and the
// context's error is returned.
func (t cancelableTransport) RoundTrip(req *http.Request) (*http.Response, error) {
	if err := t.ctx.Err(); err != nil {
		// The RoundTripper contract requires the request body to be closed
		// even when the round trip fails before anything is sent.
		if req.Body != nil {
			_ = req.Body.Close() // the context error is the one worth reporting
		}
		return nil, err
	}
	// WithContext returns a shallow copy, so the caller's request is not modified.
	return t.transport.RoundTrip(req.WithContext(t.ctx))
}
// setUpCollector applies the optional settings to c and installs a transport
// that stops issuing requests once ctx is done.
//
//   - cookieJar, when non-nil, becomes the collector's cookie jar.
//   - proxyString, when non-empty, is parsed as the proxy URL for all requests.
//     An unparsable value is reported on stdout and the collector falls back
//     to a direct connection, because this function has no error return.
//   - userAgent, when non-empty, overrides the collector's User-Agent.
//
// The cancelable transport is installed whether or not a proxy is used, so
// cancelling ctx also aborts proxied crawls.
func setUpCollector(c *colly.Collector, ctx context.Context, cookieJar *cookiejar.Jar, proxyString, userAgent string) {
	if cookieJar != nil {
		c.SetCookieJar(cookieJar)
	}

	var base http.RoundTripper = http.DefaultTransport
	if proxyString != "" {
		proxyURL, err := url.Parse(proxyString)
		if err != nil {
			fmt.Println("Invalid proxy, connecting directly", "-->", err)
		} else {
			// NOTE(review): intended to match the transport colly's SetProxy
			// builds (proxy set, keep-alives disabled) — confirm against the
			// colly version in go.mod.
			base = &http.Transport{
				Proxy:             http.ProxyURL(proxyURL),
				DisableKeepAlives: true,
			}
		}
	}
	c.WithTransport(cancelableTransport{ctx: ctx, transport: base})

	if userAgent != "" {
		c.UserAgent = userAgent
	}
}