
Commit

enenumxela committed Aug 18, 2021
2 parents b9f7995 + 149e3a3 commit e2342fd
Showing 2 changed files with 45 additions and 19 deletions.
11 changes: 7 additions & 4 deletions README.md
@@ -21,10 +21,13 @@ A passive reconnaissance tool for known URLs discovery - it gathers a list of UR

## Features

-* Fetches known URLs from **[AlienVault's OTX](https://otx.alienvault.com/)**, **[Common Crawl](https://commoncrawl.org/)**, **[URLScan](https://urlscan.io/)**, **[Github](https://github.com)** and the **[Wayback Machine](https://archive.org/web/)**.
-* Fetches disallowed paths from `robots.txt` found on your target domain and snapshotted by the Wayback Machine.
-* Regex filter URLs.
-* Save output to file.
+* Collect known URLs:
+    * Fetches from **[AlienVault's OTX](https://otx.alienvault.com/)**, **[Common Crawl](https://commoncrawl.org/)**, **[URLScan](https://urlscan.io/)**, **[Github](https://github.com)** and the **[Wayback Machine](https://archive.org/web/)**.
+    * Fetches disallowed paths from `robots.txt` found on your target domain and snapshotted by the Wayback Machine.
+* Reduce noise:
+    * Regex filter URLs.
+    * Removes duplicate pages, in the sense of URL patterns that are probably repetitive and point to the same web template.
+* Output to stdout for piping or save to file.

## Installation

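As a rough illustration of how the features above are consumed, here is a minimal, hypothetical sketch of driving the Runner API that this commit touches in pkg/sigurlfind3r/sigurlfind3r.go. It relies only on what the diff itself shows: New(options ...*Options), Run(ctx, domain) returning a channel of scraping.URL values with Source and Value fields, and an Options struct with an IncludeSubdomains field; the import path and any other Options fields (such as source API keys) are assumptions, not confirmed by this commit.

```go
package main

import (
	"context"
	"fmt"
	"log"

	// Assumed import path, inferred from the package layout in this repository.
	"github.com/signedsecurity/sigurlfind3r/pkg/sigurlfind3r"
)

func main() {
	// Sketch only: Options fields besides IncludeSubdomains (API keys,
	// regex filters, ...) are omitted and left at their zero values.
	runner := sigurlfind3r.New(&sigurlfind3r.Options{
		IncludeSubdomains: true,
	})

	URLs, err := runner.Run(context.Background(), "example.com")
	if err != nil {
		log.Fatal(err)
	}

	// Run streams results over a channel, so each URL can be printed
	// (or piped elsewhere) as soon as it is found.
	for URL := range URLs {
		fmt.Printf("[%s] %s\n", URL.Source, URL.Value)
	}
}
```

The channel-based Run is what makes the "output to stdout for piping" bullet cheap: nothing has to be buffered before printing.
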
53 changes: 38 additions & 15 deletions pkg/sigurlfind3r/sigurlfind3r.go
@@ -2,8 +2,11 @@ package sigurlfind3r

 import (
 	"context"
+	"fmt"
+	"net/url"
 	"regexp"
 
+	"github.com/enenumxela/urlx/pkg/urlx"
 	"github.com/signedsecurity/sigurlfind3r/pkg/sigurlfind3r/passive"
 	"github.com/signedsecurity/sigurlfind3r/pkg/sigurlfind3r/scraping"
 	"github.com/signedsecurity/sigurlfind3r/pkg/sigurlfind3r/session"
@@ -59,35 +62,55 @@ func New(options ...*Options) (runner *Runner) {
 func (runner *Runner) Run(ctx context.Context, domain string) (URLs chan scraping.URL, err error) {
 	URLs = make(chan scraping.URL)
 
-	// Create a unique map for filtering duplicate URLs out
-	uniqueMap := make(map[string]scraping.URL)
-	// Create a map to track sources for each URL
-	sourceMap := make(map[string]map[string]struct{})
-
 	results := runner.Passive.Run(domain, runner.FilterRegex, runner.Options.IncludeSubdomains, runner.Options.Keys)
 
+	deDupMap := make(map[string]url.Values)
+	uniqueMap := make(map[string]scraping.URL)
+
 	// Process the results in a separate goroutine
 	go func() {
 		defer close(URLs)
 
 		for result := range results {
-			URL := result.Value
-
-			if _, exists := uniqueMap[URL]; !exists {
-				sourceMap[URL] = make(map[string]struct{})
+			// unique urls - If the url already exists in the unique map
+			if _, exists := uniqueMap[result.Value]; exists {
+				continue
 			}
 
-			sourceMap[URL][result.Source] = struct{}{}
-
-			if _, exists := uniqueMap[URL]; exists {
+			parsedURL, err := urlx.Parse(result.Value)
+			if err != nil {
 				continue
 			}
 
-			hostEntry := scraping.URL{Source: result.Source, Value: URL}
+			// urls with query
+			if len(parsedURL.Query()) > 0 {
+				unique := false
+
+				key := fmt.Sprintf("%s://%s%s", parsedURL.Scheme, parsedURL.Domain, parsedURL.Path)
+
+				if _, exists := deDupMap[key]; exists {
+					for parameter := range parsedURL.Query() {
+						if _, exists := deDupMap[key][parameter]; !exists {
+							deDupMap[key][parameter] = []string{"sigurlfind3r"}
+							unique = true
+						}
+					}
+				} else {
+					deDupMap[key] = parsedURL.Query()
+					unique = true
+				}
+
+				if !unique {
+					continue
+				}
+			}
 
-			uniqueMap[URL] = hostEntry
+			uniqueMap[parsedURL.String()] = scraping.URL{
+				Source: result.Source,
+				Value: parsedURL.String(),
+			}
 
-			URLs <- hostEntry
+			URLs <- uniqueMap[parsedURL.String()]
 		}
 	}()
 

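The core of this change is deDupMap, keyed on scheme://host/path: a result whose query string differs only in parameter values is treated as another hit on the same page template and dropped, while a result that introduces a previously unseen parameter name for that key is kept. Below is a self-contained sketch of that idea written against the standard library's net/url; the committed code uses github.com/enenumxela/urlx (hence parsedURL.Domain) and stores url.Values in the map, but the intended filtering behaviour is the same.

```go
package main

import (
	"fmt"
	"net/url"
)

var (
	// seenParams maps "scheme://host/path" to the set of query-parameter
	// names already observed for that page template.
	seenParams = map[string]map[string]bool{}
	// seenURLs dedups exact URL strings, mirroring uniqueMap in the diff.
	seenURLs = map[string]bool{}
)

// keep reports whether a URL should be emitted: unseen as an exact string
// and, if it carries a query, contributing at least one new parameter name.
func keep(raw string) bool {
	if seenURLs[raw] {
		return false
	}

	u, err := url.Parse(raw)
	if err != nil {
		return false
	}

	if params := u.Query(); len(params) > 0 {
		key := fmt.Sprintf("%s://%s%s", u.Scheme, u.Host, u.Path)

		if seenParams[key] == nil {
			seenParams[key] = map[string]bool{}
		}

		unique := false
		for name := range params {
			if !seenParams[key][name] {
				seenParams[key][name] = true
				unique = true
			}
		}
		if !unique {
			return false
		}
	}

	seenURLs[raw] = true
	return true
}

func main() {
	for _, raw := range []string{
		"https://example.com/post?id=1",
		"https://example.com/post?id=2",         // same parameter set: dropped
		"https://example.com/post?id=3&lang=en", // new parameter "lang": kept
		"https://example.com/about",             // no query: kept once
		"https://example.com/about",             // exact duplicate: dropped
	} {
		fmt.Println(keep(raw), raw)
	}
}
```

Running it prints true for the first, third and fourth inputs and false for the rest: /post?id=2 adds no new parameter name for its page key, so it is discarded as a repeat of the same template, which is the noise the README's dedup bullet describes.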