From f4c4641d789a4ca48ae72c4856ff02a54a253ac4 Mon Sep 17 00:00:00 2001
From: Pulkit Kathuria
Date: Thu, 28 Mar 2024 11:17:00 +0900
Subject: [PATCH] (feat) relative URL fix and limit on email addresses

---
 README.md      |  5 ++++-
 main.go        |  3 +++
 pkg/crawl.go   | 16 +++++++---------
 pkg/strings.go | 12 ++++++++++++
 4 files changed, 26 insertions(+), 10 deletions(-)

diff --git a/README.md b/README.md
index ea67069..e400a2b 100644
--- a/README.md
+++ b/README.md
@@ -83,6 +83,8 @@ email_extractor -limit-urls=100 -url=kevincobain2000.github.io
         Note: pagination links are usually query params
         Set it to false, if you want to crawl such links (default true)
+  -limit-emails int
+        limit of emails to crawl (default 1000)
   -limit-urls int
         limit of urls to crawl (default 1000)
   -out string
@@ -114,4 +116,5 @@ It crawled `1000 urls`, and found `300 email addresses` in about `11 seconds`.
 - v1.0 - Python implementation to extract email addresses by crawling URLS. Installation using pip.
 - v2.0 - 100x performance improvement by using goroutines
 - v2.5 - 2x performance improvement by not opening the same url again
-- v2.6 - Added depth of crawling urls
\ No newline at end of file
+- v2.6 - Added depth of crawling urls
+- v2.7 - Limit email addresses, and possible fix for relative urls
\ No newline at end of file
diff --git a/main.go b/main.go
index bff3ca2..629c8fc 100644
--- a/main.go
+++ b/main.go
@@ -19,6 +19,7 @@ type Flags struct {
 	url         string
 	writeToFile string
 	limitUrls   int
+	limitEmails int
 	depth       int
 	timeout     int64
 	sleep       int64
@@ -40,6 +41,7 @@ func main() {
 	opt.TimeoutMillisecond = f.timeout
 	opt.SleepMillisecond = f.sleep
 	opt.LimitUrls = f.limitUrls
+	opt.LimitEmails = f.limitEmails
 	opt.WriteToFile = f.writeToFile
 	opt.URL = f.url
 	opt.Depth = f.depth
@@ -91,6 +93,7 @@ func SetupFlags() {
 	flag.StringVar(&f.writeToFile, "out", "emails.txt", "file to write to")

 	flag.IntVar(&f.limitUrls, "limit-urls", 1000, "limit of urls to crawl")
+	flag.IntVar(&f.limitEmails, "limit-emails", 1000, "limit of emails to crawl")

 	flag.IntVar(&f.depth, "depth", -1, `depth of urls to crawl.
 -1 for the url provided & all depths (default)
diff --git a/pkg/crawl.go b/pkg/crawl.go
index 7a3fd63..a3757a7 100644
--- a/pkg/crawl.go
+++ b/pkg/crawl.go
@@ -2,7 +2,6 @@ package pkg

 import (
 	"fmt"
-	"strings"
 	"sync"
 	"time"

@@ -19,6 +18,7 @@ type Options struct {
 	IgnoreQueries bool
 	Depth         int
 	LimitUrls     int
+	LimitEmails   int
 	WriteToFile   string
 }

@@ -61,6 +61,10 @@ func (hc *HTTPChallenge) CrawlRecursive(url string, wg *sync.WaitGroup) *HTTPCha
 		if len(hc.urls) >= hc.options.LimitUrls {
 			break
 		}
+		if len(hc.Emails) >= hc.options.LimitEmails {
+			hc.Emails = hc.Emails[:hc.options.LimitEmails]
+			break
+		}
 		if StringInSlice(u, hc.urls) {
 			continue
 		}
@@ -107,6 +111,7 @@ func (hc *HTTPChallenge) Crawl(url string) []string {
 	}
 	if hc.options.WriteToFile != "" {
 		hc.Emails = append(hc.Emails, emails...)
+		hc.Emails = UniqueStrings(hc.Emails)
 	}

 	// crawl the page and print all links
@@ -115,7 +120,7 @@ func (hc *HTTPChallenge) Crawl(url string) []string {
 		if !exists {
 			return
 		}
-		href = hc.relativeToAbsoluteURL(href)
+		href = RelativeToAbsoluteURL(href, url, GetBaseURL(url))

 		if hc.options.IgnoreQueries {
 			href = RemoveAnyQueryParam(href)
@@ -142,10 +147,3 @@ func (hc *HTTPChallenge) Crawl(url string) []string {
 	urls = UniqueStrings(urls)
 	return urls
 }
-
-func (hc *HTTPChallenge) relativeToAbsoluteURL(href string) string {
-	if !strings.HasPrefix(href, "http") && !strings.HasPrefix(href, "//") {
-		href = fmt.Sprintf("%s://%s%s", hc.browse.Url().Scheme, hc.browse.Url().Host, href)
-	}
-	return href
-}
diff --git a/pkg/strings.go b/pkg/strings.go
index 5aede1a..a6c9ff4 100644
--- a/pkg/strings.go
+++ b/pkg/strings.go
@@ -88,3 +88,15 @@ func ExtractEmailsFromText(text string) []string {

 	return emails
 }
+
+func RelativeToAbsoluteURL(href, currentURL, baseURL string) string {
+	if strings.HasPrefix(href, "http") {
+		return href
+	}
+
+	if strings.HasPrefix(href, "/") {
+		return baseURL + href
+	}
+
+	return currentURL + href
+}
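Review note: the sketch below shows, in isolation, how the new `RelativeToAbsoluteURL` helper resolves the three kinds of hrefs it distinguishes. The function body mirrors the one added to `pkg/strings.go` (comments added here); the `main` wrapper and sample URLs are illustrative only, and `GetBaseURL` (its companion at the call site in `pkg/crawl.go`) is assumed to return the scheme-and-host portion of the page being crawled.

```go
package main

import (
	"fmt"
	"strings"
)

// As added in pkg/strings.go by this patch.
func RelativeToAbsoluteURL(href, currentURL, baseURL string) string {
	if strings.HasPrefix(href, "http") {
		return href // already absolute
	}
	if strings.HasPrefix(href, "/") {
		return baseURL + href // root-relative: resolve against scheme + host
	}
	return currentURL + href // page-relative: resolve against the current page
}

func main() {
	base := "https://kevincobain2000.github.io"          // what GetBaseURL is assumed to return
	current := "https://kevincobain2000.github.io/blog/" // the page being crawled

	fmt.Println(RelativeToAbsoluteURL("https://example.com/a", current, base)) // unchanged
	fmt.Println(RelativeToAbsoluteURL("/contact", current, base))              // https://kevincobain2000.github.io/contact
	fmt.Println(RelativeToAbsoluteURL("page2.html", current, base))            // https://kevincobain2000.github.io/blog/page2.html
}
```

Compared with the removed `hc.relativeToAbsoluteURL` method, the helper no longer depends on `hc.browse`, so it can live in `pkg/strings.go` and be unit-tested on its own.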
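One caveat worth flagging in review: the page-relative branch (`currentURL + href`) assumes the current URL ends with a slash; for `https://example.com/blog/post` and `about.html` it would produce `https://example.com/blog/postabout.html`. If stricter resolution is ever wanted, the standard library's `net/url` already handles `./`, `../`, and missing trailing slashes. The sketch below is an alternative idea only, not part of this patch, and `resolveHref` is a hypothetical name.

```go
package main

import (
	"fmt"
	"net/url"
)

// resolveHref resolves href against the page it was found on using net/url.
// Hypothetical alternative; the patch uses string-prefix checks instead.
func resolveHref(href, currentURL string) (string, error) {
	base, err := url.Parse(currentURL)
	if err != nil {
		return "", err
	}
	ref, err := url.Parse(href)
	if err != nil {
		return "", err
	}
	return base.ResolveReference(ref).String(), nil
}

func main() {
	abs, _ := resolveHref("../contact", "https://kevincobain2000.github.io/blog/post")
	fmt.Println(abs) // https://kevincobain2000.github.io/contact
}
```

`ResolveReference` also normalizes `./` and `../` segments, which the prefix checks leave untouched.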
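On the `-limit-emails` side, the check added to `CrawlRecursive` stops the crawl once the collected addresses reach the limit and trims the slice to exactly `LimitEmails` entries. A minimal sketch of that truncation semantics, with made-up values:

```go
package main

import "fmt"

func main() {
	// Mirrors the check added to CrawlRecursive: stop once the limit is hit
	// and trim the slice to exactly LimitEmails entries. Values are made up.
	limitEmails := 3
	emails := []string{"a@x.io", "b@x.io", "c@x.io", "d@x.io"}

	if len(emails) >= limitEmails {
		emails = emails[:limitEmails]
	}
	fmt.Println(emails) // [a@x.io b@x.io c@x.io]
}
```

In use, the new flag pairs with the existing ones, e.g. `email_extractor -limit-emails=100 -limit-urls=100 -url=kevincobain2000.github.io`.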