(feat) relative url fix and adds limit on email addresses as well
kevincobain2000 committed Mar 28, 2024
1 parent b46109e commit f4c4641
Showing 4 changed files with 26 additions and 10 deletions.
5 changes: 4 additions & 1 deletion README.md
@@ -83,6 +83,8 @@ email_extractor -limit-urls=100 -url=kevincobain2000.github.io
       Note: pagination links are usually query params
       Set it to false, if you want to crawl such links
       (default true)
+  -limit-emails int
+      limit of emails to crawl (default 1000)
   -limit-urls int
       limit of urls to crawl (default 1000)
   -out string
@@ -114,4 +116,5 @@ It crawled `1000 urls`, and found `300 email addresses` in about `11 seconds`.
 - v1.0 - Python implementation to extract email addresses by crawling URLS. Installation using pip.
 - v2.0 - 100x performance improvement by using goroutines
 - v2.5 - 2x performance improvement by not opening the same url again
-- v2.6 - Added depth of crawling urls
+- v2.6 - Added depth of crawling urls
+- v2.7 - Limit emails addresses, and possible fix on relative urls
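For context, the new flag composes with the existing ones. A hypothetical invocation (binary name and URL reused from the README example in the hunk header above; the cap of 500 is an arbitrary illustration):

```sh
email_extractor -limit-emails=500 -limit-urls=100 -url=kevincobain2000.github.io
```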
3 changes: 3 additions & 0 deletions main.go
@@ -19,6 +19,7 @@ type Flags struct {
 	url         string
 	writeToFile string
 	limitUrls   int
+	limitEmails int
 	depth       int
 	timeout     int64
 	sleep       int64
@@ -40,6 +41,7 @@ func main() {
 	opt.TimeoutMillisecond = f.timeout
 	opt.SleepMillisecond = f.sleep
 	opt.LimitUrls = f.limitUrls
+	opt.LimitEmails = f.limitEmails
 	opt.WriteToFile = f.writeToFile
 	opt.URL = f.url
 	opt.Depth = f.depth
@@ -91,6 +93,7 @@ func SetupFlags() {
 	flag.StringVar(&f.writeToFile, "out", "emails.txt", "file to write to")
 
 	flag.IntVar(&f.limitUrls, "limit-urls", 1000, "limit of urls to crawl")
+	flag.IntVar(&f.limitEmails, "limit-emails", 1000, "limit of emails to crawl")
 
 	flag.IntVar(&f.depth, "depth", -1, `depth of urls to crawl.
 -1 for the url provided & all depths (default)
16 changes: 7 additions & 9 deletions pkg/crawl.go
@@ -2,7 +2,6 @@ package pkg
 
 import (
 	"fmt"
-	"strings"
 	"sync"
 	"time"
 
@@ -19,6 +18,7 @@ type Options struct {
 	IgnoreQueries bool
 	Depth         int
 	LimitUrls     int
+	LimitEmails   int
 	WriteToFile   string
 }
 
@@ -61,6 +61,10 @@ func (hc *HTTPChallenge) CrawlRecursive(url string, wg *sync.WaitGroup) *HTTPChallenge {
 		if len(hc.urls) >= hc.options.LimitUrls {
 			break
 		}
+		if len(hc.Emails) >= hc.options.LimitEmails {
+			hc.Emails = hc.Emails[:hc.options.LimitEmails]
+			break
+		}
 		if StringInSlice(u, hc.urls) {
 			continue
 		}
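The new guard both stops the crawl and trims any overshoot, since a single page can push the collected slice past the cap before the check runs. A minimal sketch of that truncate-and-stop pattern in isolation (all names here are illustrative, not the package's):

```go
package main

import "fmt"

func main() {
	collected := []string{"a@x.io", "b@x.io", "c@x.io"}
	limit := 2 // stands in for Options.LimitEmails

	if len(collected) >= limit {
		collected = collected[:limit] // re-slice: drop anything past the cap
		// in CrawlRecursive this is followed by `break` to stop the loop
	}
	fmt.Println(collected) // [a@x.io b@x.io]
}
```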
@@ -107,6 +111,7 @@ func (hc *HTTPChallenge) Crawl(url string) []string {
 	}
 	if hc.options.WriteToFile != "" {
 		hc.Emails = append(hc.Emails, emails...)
+		hc.Emails = UniqueStrings(hc.Emails)
 	}
 
 	// crawl the page and print all links
@@ -115,7 +120,7 @@ func (hc *HTTPChallenge) Crawl(url string) []string {
 		if !exists {
 			return
 		}
-		href = hc.relativeToAbsoluteURL(href)
+		href = RelativeToAbsoluteURL(href, url, GetBaseURL(url))
 
 		if hc.options.IgnoreQueries {
 			href = RemoveAnyQueryParam(href)
@@ -142,10 +147,3 @@
 	urls = UniqueStrings(urls)
 	return urls
 }
-
-func (hc *HTTPChallenge) relativeToAbsoluteURL(href string) string {
-	if !strings.HasPrefix(href, "http") && !strings.HasPrefix(href, "//") {
-		href = fmt.Sprintf("%s://%s%s", hc.browse.Url().Scheme, hc.browse.Url().Host, href)
-	}
-	return href
-}
12 changes: 12 additions & 0 deletions pkg/strings.go
@@ -88,3 +88,15 @@ func ExtractEmailsFromText(text string) []string {
 
 	return emails
 }
+
+func RelativeToAbsoluteURL(href, currentURL, baseURL string) string {
+	if strings.HasPrefix(href, "http") {
+		return href
+	}
+
+	if strings.HasPrefix(href, "/") {
+		return baseURL + href
+	}
+
+	return currentURL + href
+}
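A minimal sketch of a table-driven test for the new helper (the test file and its cases are assumptions, not part of this commit); the expected values follow directly from the three prefix branches above:

```go
package pkg

import "testing"

func TestRelativeToAbsoluteURL(t *testing.T) {
	cases := []struct{ href, current, base, want string }{
		// already absolute: returned untouched
		{"https://other.io/x", "https://site.io/docs/", "https://site.io", "https://other.io/x"},
		// root-relative: joined onto the base URL
		{"/contact", "https://site.io/docs/", "https://site.io", "https://site.io/contact"},
		// relative: appended to the current URL
		{"page.html", "https://site.io/docs/", "https://site.io", "https://site.io/docs/page.html"},
	}
	for _, c := range cases {
		if got := RelativeToAbsoluteURL(c.href, c.current, c.base); got != c.want {
			t.Errorf("RelativeToAbsoluteURL(%q, %q, %q) = %q, want %q",
				c.href, c.current, c.base, got, c.want)
		}
	}
}
```

One design note: the removed method special-cased protocol-relative hrefs (`//host/path`); in the new helper those now match the `/` branch and get the base URL prepended, which may be worth an extra case here.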
