From f4c4641d789a4ca48ae72c4856ff02a54a253ac4 Mon Sep 17 00:00:00 2001
From: Pulkit Kathuria
Date: Thu, 28 Mar 2024 11:17:00 +0900
Subject: [PATCH] (feat) relative URL fix and limit on email addresses

---
 README.md      |  5 ++++-
 main.go        |  3 +++
 pkg/crawl.go   | 16 +++++++---------
 pkg/strings.go | 12 ++++++++++++
 4 files changed, 26 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index ea67069..e400a2b 100644
--- a/README.md
+++ b/README.md
@@ -83,6 +83,8 @@ email_extractor -limit-urls=100 -url=kevincobain2000.github.io
         Note: pagination links are usually query params
         Set it to false, if you want to crawl such links (default true)
+  -limit-emails int
+        limit of emails to crawl (default 1000)
   -limit-urls int
         limit of urls to crawl (default 1000)
   -out string
@@ -114,4 +116,5 @@ It crawled `1000 urls`, and found `300 email addresses` in about `11 seconds`.
 - v1.0 - Python implementation to extract email addresses by crawling URLS. Installation using pip.
 - v2.0 - 100x performance improvement by using goroutines
 - v2.5 - 2x performance improvement by not opening the same url again
-- v2.6 - Added depth of crawling urls
\ No newline at end of file
+- v2.6 - Added depth of crawling urls
+- v2.7 - Limit email addresses, and possible fix for relative urls
\ No newline at end of file
diff --git a/main.go b/main.go
index bff3ca2..629c8fc 100644
--- a/main.go
+++ b/main.go
@@ -19,6 +19,7 @@ type Flags struct {
 	url         string
 	writeToFile string
 	limitUrls   int
+	limitEmails int
 	depth       int
 	timeout     int64
 	sleep       int64
@@ -40,6 +41,7 @@ func main() {
 	opt.TimeoutMillisecond = f.timeout
 	opt.SleepMillisecond = f.sleep
 	opt.LimitUrls = f.limitUrls
+	opt.LimitEmails = f.limitEmails
 	opt.WriteToFile = f.writeToFile
 	opt.URL = f.url
 	opt.Depth = f.depth
@@ -91,6 +93,7 @@ func SetupFlags() {
 	flag.StringVar(&f.writeToFile, "out", "emails.txt", "file to write to")

 	flag.IntVar(&f.limitUrls, "limit-urls", 1000, "limit of urls to crawl")
+	flag.IntVar(&f.limitEmails, "limit-emails", 1000, "limit of emails to crawl")

 	flag.IntVar(&f.depth, "depth", -1, `depth of urls to crawl.
 -1 for the url provided & all depths (default)
diff --git a/pkg/crawl.go b/pkg/crawl.go
index 7a3fd63..a3757a7 100644
--- a/pkg/crawl.go
+++ b/pkg/crawl.go
@@ -2,7 +2,6 @@ package pkg

 import (
 	"fmt"
-	"strings"
 	"sync"
 	"time"

@@ -19,6 +18,7 @@ type Options struct {
 	IgnoreQueries bool
 	Depth         int
 	LimitUrls     int
+	LimitEmails   int
 	WriteToFile   string
 }

@@ -61,6 +61,10 @@ func (hc *HTTPChallenge) CrawlRecursive(url string, wg *sync.WaitGroup) *HTTPCha
 		if len(hc.urls) >= hc.options.LimitUrls {
 			break
 		}
+		if len(hc.Emails) >= hc.options.LimitEmails {
+			hc.Emails = hc.Emails[:hc.options.LimitEmails]
+			break
+		}
 		if StringInSlice(u, hc.urls) {
 			continue
 		}
@@ -107,6 +111,7 @@ func (hc *HTTPChallenge) Crawl(url string) []string {
 	}
 	if hc.options.WriteToFile != "" {
 		hc.Emails = append(hc.Emails, emails...)
+		hc.Emails = UniqueStrings(hc.Emails)
 	}

 	// crawl the page and print all links
@@ -115,7 +120,7 @@ func (hc *HTTPChallenge) Crawl(url string) []string {
 		if !exists {
 			return
 		}
-		href = hc.relativeToAbsoluteURL(href)
+		href = RelativeToAbsoluteURL(href, url, GetBaseURL(url))

 		if hc.options.IgnoreQueries {
 			href = RemoveAnyQueryParam(href)
@@ -142,10 +147,3 @@ func (hc *HTTPChallenge) Crawl(url string) []string {
 	urls = UniqueStrings(urls)
 	return urls
 }
-
-func (hc *HTTPChallenge) relativeToAbsoluteURL(href string) string {
-	if !strings.HasPrefix(href, "http") && !strings.HasPrefix(href, "//") {
-		href = fmt.Sprintf("%s://%s%s", hc.browse.Url().Scheme, hc.browse.Url().Host, href)
-	}
-	return href
-}
diff --git a/pkg/strings.go b/pkg/strings.go
index 5aede1a..a6c9ff4 100644
--- a/pkg/strings.go
+++ b/pkg/strings.go
@@ -88,3 +88,15 @@ func ExtractEmailsFromText(text string) []string {

 	return emails
 }
+
+func RelativeToAbsoluteURL(href, currentURL, baseURL string) string {
+	if strings.HasPrefix(href, "http") {
+		return href
+	}
+
+	if strings.HasPrefix(href, "/") {
+		return baseURL + href
+	}
+
+	return currentURL + href
+}
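Review note: the sketch below shows, in isolation, how the new `RelativeToAbsoluteURL` helper resolves the three kinds of hrefs it distinguishes. The function body mirrors the one added to `pkg/strings.go` (comments added here); the `main` wrapper and sample URLs are illustrative only, and `GetBaseURL` (its companion at the call site in `pkg/crawl.go`) is assumed to return the scheme-and-host portion of the page being crawled.

```go
package main

import (
	"fmt"
	"strings"
)

// As added in pkg/strings.go by this patch.
func RelativeToAbsoluteURL(href, currentURL, baseURL string) string {
	if strings.HasPrefix(href, "http") {
		return href // already absolute
	}
	if strings.HasPrefix(href, "/") {
		return baseURL + href // root-relative: resolve against scheme + host
	}
	return currentURL + href // page-relative: resolve against the current page
}

func main() {
	base := "https://kevincobain2000.github.io"          // what GetBaseURL is assumed to return
	current := "https://kevincobain2000.github.io/blog/" // the page being crawled

	fmt.Println(RelativeToAbsoluteURL("https://example.com/a", current, base)) // unchanged
	fmt.Println(RelativeToAbsoluteURL("/contact", current, base))              // https://kevincobain2000.github.io/contact
	fmt.Println(RelativeToAbsoluteURL("page2.html", current, base))            // https://kevincobain2000.github.io/blog/page2.html
}
```

Compared with the removed `hc.relativeToAbsoluteURL` method, the helper no longer depends on `hc.browse`, so it can live in `pkg/strings.go` and be unit-tested on its own.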
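One caveat worth flagging in review: the page-relative branch (`currentURL + href`) assumes the current URL ends with a slash; for `https://example.com/blog/post` and `about.html` it would produce `https://example.com/blog/postabout.html`. If stricter resolution is ever wanted, the standard library's `net/url` already handles `./`, `../`, and missing trailing slashes. The sketch below is an alternative idea only, not part of this patch, and `resolveHref` is a hypothetical name.

```go
package main

import (
	"fmt"
	"net/url"
)

// resolveHref resolves href against the page it was found on using net/url.
// Hypothetical alternative; the patch uses string-prefix checks instead.
func resolveHref(href, currentURL string) (string, error) {
	base, err := url.Parse(currentURL)
	if err != nil {
		return "", err
	}
	ref, err := url.Parse(href)
	if err != nil {
		return "", err
	}
	return base.ResolveReference(ref).String(), nil
}

func main() {
	abs, _ := resolveHref("../contact", "https://kevincobain2000.github.io/blog/post")
	fmt.Println(abs) // https://kevincobain2000.github.io/contact
}
```

`ResolveReference` also normalizes `./` and `../` segments, which the prefix checks leave untouched.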
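On the `-limit-emails` side, the check added to `CrawlRecursive` stops the crawl once the collected addresses reach the limit and trims the slice to exactly `LimitEmails` entries. A minimal sketch of that truncation semantics, with made-up values:

```go
package main

import "fmt"

func main() {
	// Mirrors the check added to CrawlRecursive: stop once the limit is hit
	// and trim the slice to exactly LimitEmails entries. Values are made up.
	limitEmails := 3
	emails := []string{"a@x.io", "b@x.io", "c@x.io", "d@x.io"}

	if len(emails) >= limitEmails {
		emails = emails[:limitEmails]
	}
	fmt.Println(emails) // [a@x.io b@x.io c@x.io]
}
```

In use, the new flag pairs with the existing ones, e.g. `email_extractor -limit-emails=100 -limit-urls=100 -url=kevincobain2000.github.io`.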