Commit
feat: Extract URLs from wayback robots.txt files
This feature adds a new source, waybackrobots. It extracts endpoints marked as disallowed in robots.txt files found on your target domain and snapshotted by the Wayback Machine.
1 parent b4fc663 · commit 7784a37
Showing 3 changed files with 136 additions and 0 deletions.
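For context, here is a minimal, standalone sketch of the technique the new source builds on: query the Wayback Machine CDX API for archived robots.txt snapshots of a domain and walk the JSON response, whose first row is a header followed by [timestamp, original URL] pairs. The snippet uses only the standard library and a placeholder domain; it sits outside the project's session and scoping types, which the actual implementation below relies on.

package main

import (
	"encoding/json"
	"fmt"
	"io/ioutil"
	"net/http"
)

func main() {
	domain := "example.com" // placeholder domain, for illustration only

	// Ask the CDX API for archived robots.txt snapshots of the domain,
	// using the same query parameters as the new source.
	res, err := http.Get(fmt.Sprintf("https://web.archive.org/cdx/search/cdx?url=%s/robots.txt&output=json&fl=timestamp,original&filter=statuscode:200&collapse=digest", domain))
	if err != nil {
		return
	}
	defer res.Body.Close()

	data, err := ioutil.ReadAll(res.Body)
	if err != nil {
		return
	}

	// The JSON output is an array of 2-element arrays; the first row is a
	// header ("timestamp", "original"), the rest are snapshot records.
	rows := [][2]string{}
	if err := json.Unmarshal(data, &rows); err != nil {
		return
	}

	if len(rows) < 2 {
		return
	}

	for _, row := range rows[1:] {
		fmt.Printf("snapshot %s -> %s\n", row[0], row[1])
	}
}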
pkg/sigurlfind3r/scraping/sources/waybackrobots/waybackrobots.go
132 changes: 132 additions & 0 deletions
@@ -0,0 +1,132 @@
package waybackrobots

import (
	"bytes"
	"encoding/json"
	"fmt"
	"io/ioutil"
	"path/filepath"
	"regexp"
	"strings"
	"sync"

	"github.com/enenumxela/urlx/pkg/urlx"
	"github.com/signedsecurity/sigurlfind3r/pkg/sigurlfind3r/scraping"
	"github.com/signedsecurity/sigurlfind3r/pkg/sigurlfind3r/session"
)

type Source struct{}

func (source *Source) Run(domain string, ses *session.Session, includeSubs bool) chan scraping.URL {
	URLs := make(chan scraping.URL)

	go func() {
		defer close(URLs)

		// if includeSubs {
		// 	domain = "*." + domain
		// }

		res, err := ses.SimpleGet(fmt.Sprintf("https://web.archive.org/cdx/search/cdx?url=%s/robots.txt&output=json&fl=timestamp,original&filter=statuscode:200&collapse=digest", domain))
		if err != nil {
			ses.DiscardHTTPResponse(res)
			return
		}

		defer res.Body.Close()

		robotsURLsRows := [][2]string{}

		data, err := ioutil.ReadAll(res.Body)
		if err != nil {
			return
		}

		if err = json.Unmarshal(data, &robotsURLsRows); err != nil {
			return
		}

		if len(robotsURLsRows) < 2 {
			return
		}

		robotsURLsRows = robotsURLsRows[1:]

		wg := &sync.WaitGroup{}

		for _, row := range robotsURLsRows {
			wg.Add(1)

			go func(row [2]string) {
				defer wg.Done()

				res, err := ses.SimpleGet(fmt.Sprintf("https://web.archive.org/web/%sif_/%s", row[0], row[1]))
				if err != nil {
					// fmt.Println(err)
					ses.DiscardHTTPResponse(res)
					return
				}

				buf := new(bytes.Buffer)
				buf.ReadFrom(res.Body)

				pattern := regexp.MustCompile(`Disallow:\s?.+`)

				disallowed := pattern.FindAllStringSubmatch(buf.String(), -1)

				if len(disallowed) < 1 {
					return
				}

				for _, entry := range disallowed {
					temp := strings.Split(entry[0], "Disallow:")

					if len(temp) <= 1 {
						continue
					}

					endpoint := strings.Trim(temp[1], " ")

					if endpoint == "/" || endpoint == "*" || endpoint == "" {
						continue
					}

					endpoint = strings.Replace(endpoint, "*", "", -1)

					for strings.HasPrefix(endpoint, "/") {
						if len(endpoint) >= 1 {
							endpoint = endpoint[1:] // Ex. /*/test or /*/*/demo
						} else {
							continue
						}
					}

					for strings.HasSuffix(endpoint, "/") {
						if len(endpoint) >= 1 {
							endpoint = endpoint[0 : len(endpoint)-1]
						} else {
							continue
						}
					}

					parsedURL, _ := urlx.Parse(row[1])

					endpoint = filepath.Join(parsedURL.Host, endpoint)
					endpoint = parsedURL.Scheme + "://" + endpoint

					if URL, ok := scraping.NormalizeURL(endpoint, ses.Scope); ok {
						URLs <- scraping.URL{Source: source.Name(), Value: URL}
					}
				}
			}(row)
		}

		wg.Wait()
	}()

	return URLs
}

func (source *Source) Name() string {
	return "waybackrobots"
}
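Downstream, the new source would be consumed by ranging over the channel Run returns, which is closed once every snapshot has been processed. The sketch below is illustrative only: the wrapper package, the helper name, and the assumption that a *session.Session has already been constructed elsewhere are not part of this diff; only the Run signature and the scraping.URL fields are taken from it.

package example // hypothetical wrapper package, not part of the commit

import (
	"fmt"

	"github.com/signedsecurity/sigurlfind3r/pkg/sigurlfind3r/scraping/sources/waybackrobots"
	"github.com/signedsecurity/sigurlfind3r/pkg/sigurlfind3r/session"
)

// printWaybackRobotsURLs drains the channel returned by Run and prints each
// in-scope URL along with the source that produced it. It assumes the caller
// has already built a *session.Session (construction is not shown in the diff).
func printWaybackRobotsURLs(domain string, ses *session.Session) {
	source := &waybackrobots.Source{}

	// Run launches its own goroutines and closes the channel when finished,
	// so ranging over it terminates on its own.
	for URL := range source.Run(domain, ses, false) {
		fmt.Println(URL.Source, URL.Value)
	}
}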