feat: Extract URLs from wayback robots.txt files
This feature adds a new source, waybackrobots. It extracts endpoints
marked as disallowed in robots.txt files found on the target domain
and snapshotted by the Wayback Machine.
enenumxela committed Aug 12, 2021
1 parent b4fc663 commit 7784a37
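
For context, the source first enumerates archived copies of the target's robots.txt via the Wayback Machine CDX API (the query below is the one used in waybackrobots.go; the response rows are illustrative, not real data):

https://web.archive.org/cdx/search/cdx?url=example.com/robots.txt&output=json&fl=timestamp,original&filter=statuscode:200&collapse=digest

[["timestamp","original"],
 ["20210101000000","https://example.com/robots.txt"],
 ["20210315000000","http://example.com/robots.txt"]]

The first row is the CDX header and is skipped; each remaining [timestamp, original] pair is fetched as https://web.archive.org/web/&lt;timestamp&gt;if_/&lt;original&gt; and scanned for Disallow: rules.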
Showing 3 changed files with 136 additions and 0 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -22,6 +22,7 @@ sigurlfind3r is a passive reconnaissance tool, it fetches known URLs from **[Ali
## Features

* Fetches known URLs from AlienVault's OTX, Common Crawl, URLScan, Github & the Wayback Machine.
* Fetches disallowed paths from `robots.txt` found on your target domain and snapshotted by the Wayback Machine.
* Save output to file.
* Regex filter URLs.

3 changes: 3 additions & 0 deletions pkg/sigurlfind3r/passive/agent.go
@@ -7,6 +7,7 @@ import (
	"github.com/signedsecurity/sigurlfind3r/pkg/sigurlfind3r/scraping/sources/otx"
	"github.com/signedsecurity/sigurlfind3r/pkg/sigurlfind3r/scraping/sources/urlscan"
	"github.com/signedsecurity/sigurlfind3r/pkg/sigurlfind3r/scraping/sources/wayback"
	"github.com/signedsecurity/sigurlfind3r/pkg/sigurlfind3r/scraping/sources/waybackrobots"
)

// Agent is a struct for running passive url collection for a given host.
@@ -42,6 +43,8 @@ func (agent *Agent) addSources(sourcesToUse []string) {
			agent.sources[source] = &urlscan.Source{}
		case "wayback":
			agent.sources[source] = &wayback.Source{}
		case "waybackrobots":
			agent.sources[source] = &waybackrobots.Source{}
		}
	}
}
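
For illustration only, selecting the new source simply means including its name in the slice handed to addSources; the call below is a hypothetical sketch and assumes an already-constructed Agent (not shown in this diff):

	// hypothetical: agent is an already-constructed *Agent with its sources map initialised
	agent.addSources([]string{"wayback", "waybackrobots"})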
132 changes: 132 additions & 0 deletions pkg/sigurlfind3r/scraping/sources/waybackrobots/waybackrobots.go
@@ -0,0 +1,132 @@
package waybackrobots

import (
	"bytes"
	"encoding/json"
	"fmt"
	"io/ioutil"
	"path/filepath"
	"regexp"
	"strings"
	"sync"

	"github.com/enenumxela/urlx/pkg/urlx"
	"github.com/signedsecurity/sigurlfind3r/pkg/sigurlfind3r/scraping"
	"github.com/signedsecurity/sigurlfind3r/pkg/sigurlfind3r/session"
)

type Source struct{}

func (source *Source) Run(domain string, ses *session.Session, includeSubs bool) chan scraping.URL {
	URLs := make(chan scraping.URL)

	go func() {
		defer close(URLs)

		// if includeSubs {
		// 	domain = "*." + domain
		// }

		// Query the Wayback Machine CDX API for snapshots of the target's robots.txt.
		res, err := ses.SimpleGet(fmt.Sprintf("https://web.archive.org/cdx/search/cdx?url=%s/robots.txt&output=json&fl=timestamp,original&filter=statuscode:200&collapse=digest", domain))
		if err != nil {
			ses.DiscardHTTPResponse(res)
			return
		}

		defer res.Body.Close()

		robotsURLsRows := [][2]string{}

		data, err := ioutil.ReadAll(res.Body)
		if err != nil {
			return
		}

		if err = json.Unmarshal(data, &robotsURLsRows); err != nil {
			return
		}

		// The first row is the CDX field header (["timestamp", "original"]); skip it.
		if len(robotsURLsRows) < 2 {
			return
		}

		robotsURLsRows = robotsURLsRows[1:]

		wg := &sync.WaitGroup{}

		// Fetch each archived robots.txt snapshot concurrently.
		for _, row := range robotsURLsRows {
			wg.Add(1)

			go func(row [2]string) {
				defer wg.Done()

				res, err := ses.SimpleGet(fmt.Sprintf("https://web.archive.org/web/%sif_/%s", row[0], row[1]))
				if err != nil {
					ses.DiscardHTTPResponse(res)
					return
				}

				defer res.Body.Close()

				buf := new(bytes.Buffer)
				buf.ReadFrom(res.Body)

				// Collect every `Disallow:` rule in the snapshot.
				pattern := regexp.MustCompile(`Disallow:\s?.+`)

				disallowed := pattern.FindAllStringSubmatch(buf.String(), -1)

				if len(disallowed) < 1 {
					return
				}

				for _, entry := range disallowed {
					temp := strings.Split(entry[0], "Disallow:")

					if len(temp) <= 1 {
						continue
					}

					endpoint := strings.Trim(temp[1], " ")

					if endpoint == "/" || endpoint == "*" || endpoint == "" {
						continue
					}

					// Drop wildcards and surrounding slashes, e.g. /*/test or /*/*/demo.
					endpoint = strings.Replace(endpoint, "*", "", -1)

					for strings.HasPrefix(endpoint, "/") {
						endpoint = endpoint[1:]
					}

					for strings.HasSuffix(endpoint, "/") {
						endpoint = endpoint[0 : len(endpoint)-1]
					}

					parsedURL, err := urlx.Parse(row[1])
					if err != nil {
						continue
					}

					// Rebuild an absolute URL for the disallowed path on the original host.
					endpoint = filepath.Join(parsedURL.Host, endpoint)
					endpoint = parsedURL.Scheme + "://" + endpoint

					if URL, ok := scraping.NormalizeURL(endpoint, ses.Scope); ok {
						URLs <- scraping.URL{Source: source.Name(), Value: URL}
					}
				}
			}(row)
		}

		wg.Wait()
	}()

	return URLs
}

func (source *Source) Name() string {
	return "waybackrobots"
}
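
As a standalone illustration of the normalization performed above, here is a minimal sketch; the sample Disallow rule, the example.com host, and the use of path.Join (rather than filepath.Join) are this sketch's own assumptions:

package main

import (
	"fmt"
	"path"
	"strings"
)

func main() {
	entry := "Disallow: /*/admin/" // illustrative rule from an archived robots.txt

	endpoint := strings.TrimSpace(strings.TrimPrefix(entry, "Disallow:"))
	endpoint = strings.ReplaceAll(endpoint, "*", "") // drop wildcards
	endpoint = strings.Trim(endpoint, "/")           // drop leading/trailing slashes

	// Re-attach the snapshot's scheme and host, as the source does above.
	fmt.Println("https://" + path.Join("example.com", endpoint))
	// prints: https://example.com/admin
}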
